From 00f7cd4b36ae04237ffb062967e9dd36e62bf592 Mon Sep 17 00:00:00 2001 From: Paul Borman Date: Fri, 1 Jul 2011 12:16:56 -0400 Subject: [PATCH] csv: new package csv reader/writer based on RFC 4180 R=rsc, mattn.jp, r, dchest CC=golang-dev https://golang.org/cl/4629085 --- src/pkg/Makefile | 1 + src/pkg/csv/Makefile | 12 ++ src/pkg/csv/reader.go | 373 +++++++++++++++++++++++++++++++++++++ src/pkg/csv/reader_test.go | 265 ++++++++++++++++++++++++++ src/pkg/csv/writer.go | 123 ++++++++++++ src/pkg/csv/writer_test.go | 44 +++++ 6 files changed, 818 insertions(+) create mode 100644 src/pkg/csv/Makefile create mode 100644 src/pkg/csv/reader.go create mode 100644 src/pkg/csv/reader_test.go create mode 100644 src/pkg/csv/writer.go create mode 100644 src/pkg/csv/writer_test.go diff --git a/src/pkg/Makefile b/src/pkg/Makefile index 575f51fec6..7338399c2c 100644 --- a/src/pkg/Makefile +++ b/src/pkg/Makefile @@ -62,6 +62,7 @@ DIRS=\ crypto/x509\ crypto/x509/pkix\ crypto/xtea\ + csv\ debug/dwarf\ debug/macho\ debug/elf\ diff --git a/src/pkg/csv/Makefile b/src/pkg/csv/Makefile new file mode 100644 index 0000000000..e364d51d23 --- /dev/null +++ b/src/pkg/csv/Makefile @@ -0,0 +1,12 @@ +# Copyright 2011 The Go Authors. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +include ../../Make.inc + +TARG=csv +GOFILES=\ + reader.go\ + writer.go\ + +include ../../Make.pkg diff --git a/src/pkg/csv/reader.go b/src/pkg/csv/reader.go new file mode 100644 index 0000000000..1f4b61cf9c --- /dev/null +++ b/src/pkg/csv/reader.go @@ -0,0 +1,373 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package csv reads and writes comma-separated values (CSV) files. +// +// A csv file contains zero or more records of one or more fields per record. +// Each record is separated by the newline character. The final record may +// optionally be followed by a newline character. +// +// field1,field2,field3 +// +// White space is considered part of a field. +// +// Carriage returns before newline characters are silently removed. +// +// Blank lines are ignored. A line with only whitespace characters (excluding +// the ending newline character) is not considered a blank line. +// +// Fields which start and stop with the quote character " are called +// quoted-fields. The beginning and ending quote are not part of the +// field. +// +// The source: +// +// normal string,"quoted-field" +// +// results in the fields +// +// {`normal string`, `quoted-field`} +// +// Within a quoted-field a quote character followed by a second quote +// character is considered a single quote. +// +// "the ""word"" is true","a ""quoted-field""" +// +// results in +// +// {`the "word" is true`, `a "quoted-field"`} +// +// Newlines and commas may be included in a quoted-field +// +// "Multi-line +// field","comma is ," +// +// results in +// +// {`Multi-line +// field`, `comma is ,`} +package csv + +import ( + "bufio" + "bytes" + "fmt" + "io" + "os" + "unicode" +) + +// A ParseError is returned for parsing errors. +// The first line is 1. The first column is 0. +type ParseError struct { + Line int // Line where the error occurred + Column int // Column (rune index) where the error occurred + Error os.Error // The actual error +} + +func (e *ParseError) String() string { + return fmt.Sprintf("line %d, column %d: %s", e.Line, e.Column, e.Error) +} + +// These are the errors that can be returned in ParseError.Error +var ( + ErrTrailingComma = os.NewError("extra delimiter at end of line") + ErrBareQuote = os.NewError("bare \" in non-quoted-field") + ErrQuote = os.NewError("extraneous \" in field") + ErrFieldCount = os.NewError("wrong number of fields in line") +) + +// A Reader reads records from a CSV-encoded file. +// +// As returned by NewReader, a Reader expects input conforming to RFC 4180. +// The exported fields can be changed to customize the details before the +// first call to Read or ReadAll. +// +// Comma is the field delimiter. It defaults to ','. +// +// Comment, if not 0, is the comment character. Lines beginning with the +// Comment character is ignored. +// +// If FieldsPerRecord is positive, Read requires each record to +// have the given number of fields. If FieldsPerRecord is 0, Read sets it to +// the number of fields in the first record, so that future records must +// have the same field count. +// +// If LazyQuotes is true, a quote may appear in an unquoted field and a +// non-doubled quote may appear in a quoted field. +// +// If TrailingComma is true, the last field may be a unquoted empty field. +// +// If TrimLeadingSpace is true, leading white space in a field is ignored. +type Reader struct { + Comma int // Field delimiter (set to ',' by NewReader) + Comment int // Comment character for start of line + FieldsPerRecord int // Number of expected fields per record + LazyQuotes bool // Allow lazy quotes + TrailingComma bool // Allow trailing comma + TrimLeadingSpace bool // Trim leading space + line int + column int + r *bufio.Reader + field bytes.Buffer +} + +// NewReader returns a new Reader that reads from r. +func NewReader(r io.Reader) *Reader { + return &Reader{ + Comma: ',', + r: bufio.NewReader(r), + } +} + +// error creates a new ParseError based on err. +func (r *Reader) error(err os.Error) os.Error { + return &ParseError{ + Line: r.line, + Column: r.column, + Error: err, + } +} + +// Read reads one record from r. The record is a slice of strings with each +// string representing one field. +func (r *Reader) Read() (record []string, err os.Error) { + for { + record, err = r.parseRecord() + if record != nil { + break + } + if err != nil { + return nil, err + } + } + + if r.FieldsPerRecord > 0 { + if len(record) != r.FieldsPerRecord { + r.column = 0 // report at start of record + return record, r.error(ErrFieldCount) + } + } else if r.FieldsPerRecord == 0 { + r.FieldsPerRecord = len(record) + } + return record, nil +} + +// ReadAll reads all the remaining records from r. +// Each record is a slice of fields. +func (r *Reader) ReadAll() (records [][]string, err os.Error) { + for { + record, err := r.Read() + if err == os.EOF { + return records, nil + } + if err != nil { + return nil, err + } + records = append(records, record) + } + panic("unreachable") +} + +// readRune reads one rune from r, folding \r\n to \n and keeping track +// of our far into the line we have read. r.column will point to the start +// of this rune, not the end of this rune. +func (r *Reader) readRune() (int, os.Error) { + rune, _, err := r.r.ReadRune() + + // Handle \r\n here. We make the simplifying assumption that + // anytime \r is followed by \n that it can be folded to \n. + // We will not detect files which contain both \r\n and bare \n. + if rune == '\r' { + rune, _, err = r.r.ReadRune() + if err == nil { + if rune != '\n' { + r.r.UnreadRune() + rune = '\r' + } + } + } + r.column++ + return rune, err +} + +// unreadRune puts the last rune read from r back. +func (r *Reader) unreadRune() { + r.r.UnreadRune() + r.column-- +} + +// skip reads runes up to and including the rune delim or until error. +func (r *Reader) skip(delim int) os.Error { + for { + rune, err := r.readRune() + if err != nil { + return err + } + if rune == delim { + return nil + } + } + panic("unreachable") +} + +// parseRecord reads and parses a single csv record from r. +func (r *Reader) parseRecord() (fields []string, err os.Error) { + // Each record starts on a new line. We increment our line + // number (lines start at 1, not 0) and set column to -1 + // so as we increment in readRune it points to the character we read. + r.line++ + r.column = -1 + + // Peek at the first rune. If it is an error we are done. + // If we are support comments and it is the comment character + // the skip to the end of line. + + rune, _, err := r.r.ReadRune() + if err != nil { + return nil, err + } + + if r.Comment != 0 && rune == r.Comment { + return nil, r.skip('\n') + } + r.r.UnreadRune() + + // At this point we have at least one field. + for { + haveField, delim, err := r.parseField() + if haveField { + fields = append(fields, r.field.String()) + } + if delim == '\n' || err == os.EOF { + return fields, err + } else if err != nil { + return nil, err + } + } + panic("unreachable") +} + + +// parseField parses the next field in the record. The read field is +// located in r.field. Delim is the first character not part of the field +// (r.Comma or '\n'). +func (r *Reader) parseField() (haveField bool, delim int, err os.Error) { + r.field.Reset() + + rune, err := r.readRune() + if err != nil { + // If we have EOF and are not at the start of a line + // then we return the empty field. We have already + // checked for trailing commas if needed. + if err == os.EOF && r.column != 0 { + return true, 0, err + } + return false, 0, err + } + + if r.TrimLeadingSpace { + for unicode.IsSpace(rune) { + rune, err = r.readRune() + if err != nil { + return false, 0, err + } + } + } + + switch rune { + case r.Comma: + // will check below + + case '\n': + // We are a trailing empty field or a blank linke + if r.column == 0 { + return false, rune, nil + } + return true, rune, nil + + case '"': + // quoted field + Quoted: + for { + rune, err = r.readRune() + if err != nil { + if err == os.EOF { + if r.LazyQuotes { + return true, 0, err + } + return false, 0, r.error(ErrQuote) + } + return false, 0, err + } + switch rune { + case '"': + rune, err = r.readRune() + if err != nil || rune == r.Comma { + break Quoted + } + if rune == '\n' { + return true, rune, nil + } + if rune != '"' { + if !r.LazyQuotes { + r.column-- + return false, 0, r.error(ErrQuote) + } + // accept the bare quote + r.field.WriteRune('"') + } + case '\n': + r.line++ + r.column = -1 + } + r.field.WriteRune(rune) + } + + default: + // unquoted field + for { + r.field.WriteRune(rune) + rune, err = r.readRune() + if err != nil || rune == r.Comma { + break + } + if rune == '\n' { + return true, rune, nil + } + if !r.LazyQuotes && rune == '"' { + return false, 0, r.error(ErrBareQuote) + } + } + } + + if err != nil { + if err == os.EOF { + return true, 0, err + } + return false, 0, err + } + + if !r.TrailingComma { + // We don't allow trailing commas. See if we + // are at the end of the line (being mindful + // of triming spaces + c := r.column + rune, err = r.readRune() + if r.TrimLeadingSpace { + for unicode.IsSpace(rune) { + rune, err = r.readRune() + if err != nil { + break + } + } + } + if err == os.EOF || rune == '\n' { + r.column = c // report the comma + return false, 0, r.error(ErrTrailingComma) + } + r.unreadRune() + } + return true, rune, nil +} diff --git a/src/pkg/csv/reader_test.go b/src/pkg/csv/reader_test.go new file mode 100644 index 0000000000..0068bad1db --- /dev/null +++ b/src/pkg/csv/reader_test.go @@ -0,0 +1,265 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package csv + +import ( + "reflect" + "strings" + "testing" +) + +var readTests = []struct { + Name string + Input string + Output [][]string + UseFieldsPerRecord bool // false (default) means FieldsPerRecord is -1 + + // These fields are copied into the Reader + Comma int + Comment int + FieldsPerRecord int + LazyQuotes bool + TrailingComma bool + TrimLeadingSpace bool + + Error string + Line int // Expected error line if != 0 + Column int // Expected error column if line != 0 +}{ + { + Name: "Simple", + Input: "a,b,c\n", + Output: [][]string{{"a", "b", "c"}}, + }, + { + Name: "CRLF", + Input: "a,b\r\nc,d\r\n", + Output: [][]string{{"a", "b"}, {"c", "d"}}, + }, + { + Name: "BareCR", + Input: "a,b\rc,d\r\n", + Output: [][]string{{"a", "b\rc", "d"}}, + }, + { + Name: "RFC4180test", + UseFieldsPerRecord: true, + Input: `#field1,field2,field3 +"aaa","bb +b","ccc" +"a,a","b""bb","ccc" +zzz,yyy,xxx +`, + Output: [][]string{ + {"#field1", "field2", "field3"}, + {"aaa", "bb\nb", "ccc"}, + {"a,a", `b"bb`, "ccc"}, + {"zzz", "yyy", "xxx"}, + }, + }, + { + Name: "NoEOLTest", + Input: "a,b,c", + Output: [][]string{{"a", "b", "c"}}, + }, + { + Name: "Semicolon", + Comma: ';', + Input: "a;b;c\n", + Output: [][]string{{"a", "b", "c"}}, + }, + { + Name: "MultiLine", + Input: `"two +line","one line","three +line +field"`, + Output: [][]string{{"two\nline", "one line", "three\nline\nfield"}}, + }, + { + Name: "BlankLine", + Input: "a,b,c\n\nd,e,f\n\n", + Output: [][]string{ + {"a", "b", "c"}, + {"d", "e", "f"}, + }, + }, + { + Name: "TrimSpace", + Input: " a, b, c\n", + TrimLeadingSpace: true, + Output: [][]string{{"a", "b", "c"}}, + }, + { + Name: "LeadingSpace", + Input: " a, b, c\n", + Output: [][]string{{" a", " b", " c"}}, + }, + { + Name: "Comment", + Comment: '#', + Input: "#1,2,3\na,b,c\n#comment", + Output: [][]string{{"a", "b", "c"}}, + }, + { + Name: "NoComment", + Input: "#1,2,3\na,b,c", + Output: [][]string{{"#1", "2", "3"}, {"a", "b", "c"}}, + }, + { + Name: "LazyQuotes", + LazyQuotes: true, + Input: `a "word","1"2",a","b`, + Output: [][]string{{`a "word"`, `1"2`, `a"`, `b`}}, + }, + { + Name: "BareQuotes", + LazyQuotes: true, + Input: `a "word","1"2",a"`, + Output: [][]string{{`a "word"`, `1"2`, `a"`}}, + }, + { + Name: "BareDoubleQuotes", + LazyQuotes: true, + Input: `a""b,c`, + Output: [][]string{{`a""b`, `c`}}, + }, + { + Name: "BadDoubleQuotes", + Input: `a""b,c`, + Output: [][]string{{`a""b`, `c`}}, + Error: `bare " in non-quoted-field`, Line: 1, Column: 1, + }, + { + Name: "TrimQuote", + Input: ` "a"," b",c`, + TrimLeadingSpace: true, + Output: [][]string{{"a", " b", "c"}}, + }, + { + Name: "BadBareQuote", + Input: `a "word","b"`, + Error: `bare " in non-quoted-field`, Line: 1, Column: 2, + }, + { + Name: "BadTrailingQuote", + Input: `"a word",b"`, + Error: `bare " in non-quoted-field`, Line: 1, Column: 10, + }, + { + Name: "ExtraneousQuote", + Input: `"a "word","b"`, + Error: `extraneous " in field`, Line: 1, Column: 3, + }, + { + Name: "BadFieldCount", + UseFieldsPerRecord: true, + Input: "a,b,c\nd,e", + Error: "wrong number of fields", Line: 2, + }, + { + Name: "BadFieldCount1", + UseFieldsPerRecord: true, + FieldsPerRecord: 2, + Input: `a,b,c`, + Error: "wrong number of fields", Line: 1, + }, + { + Name: "FieldCount", + Input: "a,b,c\nd,e", + Output: [][]string{{"a", "b", "c"}, {"d", "e"}}, + }, + { + Name: "BadTrailingCommaEOF", + Input: "a,b,c,", + Error: "extra delimiter at end of line", Line: 1, Column: 5, + }, + { + Name: "BadTrailingCommaEOL", + Input: "a,b,c,\n", + Error: "extra delimiter at end of line", Line: 1, Column: 5, + }, + { + Name: "BadTrailingCommaSpaceEOF", + TrimLeadingSpace: true, + Input: "a,b,c, ", + Error: "extra delimiter at end of line", Line: 1, Column: 5, + }, + { + Name: "BadTrailingCommaSpaceEOL", + TrimLeadingSpace: true, + Input: "a,b,c, \n", + Error: "extra delimiter at end of line", Line: 1, Column: 5, + }, + { + Name: "BadTrailingCommaLine3", + TrimLeadingSpace: true, + Input: "a,b,c\nd,e,f\ng,hi,", + Error: "extra delimiter at end of line", Line: 3, Column: 4, + }, + { + Name: "NotTrailingComma3", + Input: "a,b,c, \n", + Output: [][]string{{"a", "b", "c", " "}}, + }, + { + Name: "CommaFieldTest", + TrailingComma: true, + Input: `x,y,z,w +x,y,z, +x,y,, +x,,, +,,, +"x","y","z","w" +"x","y","z","" +"x","y","","" +"x","","","" +"","","","" +`, + Output: [][]string{ + {"x", "y", "z", "w"}, + {"x", "y", "z", ""}, + {"x", "y", "", ""}, + {"x", "", "", ""}, + {"", "", "", ""}, + {"x", "y", "z", "w"}, + {"x", "y", "z", ""}, + {"x", "y", "", ""}, + {"x", "", "", ""}, + {"", "", "", ""}, + }, + }, +} + +func TestRead(t *testing.T) { + for _, tt := range readTests { + r := NewReader(strings.NewReader(tt.Input)) + r.Comment = tt.Comment + if tt.UseFieldsPerRecord { + r.FieldsPerRecord = tt.FieldsPerRecord + } else { + r.FieldsPerRecord = -1 + } + r.LazyQuotes = tt.LazyQuotes + r.TrailingComma = tt.TrailingComma + r.TrimLeadingSpace = tt.TrimLeadingSpace + if tt.Comma != 0 { + r.Comma = tt.Comma + } + out, err := r.ReadAll() + perr, _ := err.(*ParseError) + if tt.Error != "" { + if err == nil || !strings.Contains(err.String(), tt.Error) { + t.Errorf("%s: error %v, want error %q", tt.Name, err, tt.Error) + } else if tt.Line != 0 && (tt.Line != perr.Line || tt.Column != perr.Column) { + t.Errorf("%s: error at %d:%d expected %d:%d", tt.Name, perr.Line, perr.Column, tt.Line, tt.Column) + } + } else if err != nil { + t.Errorf("%s: unexpected error %v", tt.Name, err) + } else if !reflect.DeepEqual(out, tt.Output) { + t.Errorf("%s: out=%q want %q", tt.Name, out, tt.Output) + } + } +} diff --git a/src/pkg/csv/writer.go b/src/pkg/csv/writer.go new file mode 100644 index 0000000000..01386da197 --- /dev/null +++ b/src/pkg/csv/writer.go @@ -0,0 +1,123 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package csv + +import ( + "bufio" + "io" + "os" + "strings" + "unicode" + "utf8" +) + +// A Writer writes records to a CSV encoded file. +// +// As returned by NewWriter, a Writer writes records terminated by a +// newline and uses ',' as the field delimiter. The exported fields can be +// changed to customize the details before the first call to Write or WriteAll. +// +// Comma is the field delimiter. +// +// If UseCRLF is true, the Writer ends each record with \r\n instead of \n. +// just \n is written. +type Writer struct { + Comma int // Field delimiter (set to to ',' by NewWriter) + UseCRLF bool // True to use \r\n as the line terminator + w *bufio.Writer +} + +// NewWriter returns a new Writer that writes to w. +func NewWriter(w io.Writer) *Writer { + return &Writer{ + Comma: ',', + w: bufio.NewWriter(w), + } +} + +// Writer writes a single CSV record to w along with any necessary quoting. +// A record is a slice of strings with each string being one field. +func (w *Writer) Write(record []string) (err os.Error) { + for n, field := range record { + if n > 0 { + if _, err = w.w.WriteRune(w.Comma); err != nil { + return + } + } + + // If we don't have to have a quoted field then just + // write out the field and continue to the next field. + if !w.fieldNeedsQuotes(field) { + if _, err = w.w.WriteString(field); err != nil { + return + } + continue + } + if err = w.w.WriteByte('"'); err != nil { + return + } + + for _, rune := range field { + switch rune { + case '"': + _, err = w.w.WriteString(`""`) + case '\r': + if !w.UseCRLF { + err = w.w.WriteByte('\r') + } + case '\n': + if w.UseCRLF { + _, err = w.w.WriteString("\r\n") + } else { + err = w.w.WriteByte('\n') + } + default: + _, err = w.w.WriteRune(rune) + } + if err != nil { + return + } + } + + if err = w.w.WriteByte('"'); err != nil { + return + } + } + if w.UseCRLF { + _, err = w.w.WriteString("\r\n") + } else { + err = w.w.WriteByte('\n') + } + return +} + +// Flush writes any buffered data to the underlying io.Writer. +func (w *Writer) Flush() { + w.w.Flush() +} + +// WriteAll writes multiple CSV records to w using Write and then calls Flush. +func (w *Writer) WriteAll(records [][]string) (err os.Error) { + for _, record := range records { + err = w.Write(record) + if err != nil { + break + } + } + w.Flush() + return nil +} + +// fieldNeedsQuotes returns true if our field must be enclosed in quotes. +// Empty fields, files with a Comma, fields with a quote or newline, and +// fields which start with a space must be enclosed in quotes. +func (w *Writer) fieldNeedsQuotes(field string) bool { + if len(field) == 0 || strings.IndexRune(field, w.Comma) >= 0 || strings.IndexAny(field, "\"\r\n") >= 0 { + return true + } + + rune, _ := utf8.DecodeRuneInString(field) + return unicode.IsSpace(rune) +} diff --git a/src/pkg/csv/writer_test.go b/src/pkg/csv/writer_test.go new file mode 100644 index 0000000000..578959007f --- /dev/null +++ b/src/pkg/csv/writer_test.go @@ -0,0 +1,44 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package csv + +import ( + "bytes" + "testing" +) + +var writeTests = []struct { + Input [][]string + Output string + UseCRLF bool +}{ + {Input: [][]string{{"abc"}}, Output: "abc\n"}, + {Input: [][]string{{"abc"}}, Output: "abc\r\n", UseCRLF: true}, + {Input: [][]string{{`"abc"`}}, Output: `"""abc"""` + "\n"}, + {Input: [][]string{{`a"b`}}, Output: `"a""b"` + "\n"}, + {Input: [][]string{{`"a"b"`}}, Output: `"""a""b"""` + "\n"}, + {Input: [][]string{{" abc"}}, Output: `" abc"` + "\n"}, + {Input: [][]string{{"abc,def"}}, Output: `"abc,def"` + "\n"}, + {Input: [][]string{{"abc", "def"}}, Output: "abc,def\n"}, + {Input: [][]string{{"abc"}, {"def"}}, Output: "abc\ndef\n"}, + {Input: [][]string{{"abc\ndef"}}, Output: "\"abc\ndef\"\n"}, + {Input: [][]string{{"abc\ndef"}}, Output: "\"abc\r\ndef\"\r\n", UseCRLF: true}, +} + +func TestWrite(t *testing.T) { + for n, tt := range writeTests { + b := &bytes.Buffer{} + f := NewWriter(b) + f.UseCRLF = tt.UseCRLF + err := f.WriteAll(tt.Input) + if err != nil { + t.Errorf("Unexpected error: %s\n", err) + } + out := b.String() + if out != tt.Output { + t.Errorf("#%d: out=%q want %q", n, out, tt.Output) + } + } +}