diff --git a/packages/csv-parse/lib/api/index.js b/packages/csv-parse/lib/api/index.js index 3f694d89..6cd8e3fe 100644 --- a/packages/csv-parse/lib/api/index.js +++ b/packages/csv-parse/lib/api/index.js @@ -2,6 +2,7 @@ import { normalize_columns_array } from "./normalize_columns_array.js"; import { init_state } from "./init_state.js"; import { normalize_options } from "./normalize_options.js"; import { CsvError } from "./CsvError.js"; +import { delimiter_discover } from "../utils/delimiter_discover.js"; const isRecordEmpty = function (record) { return record.every( @@ -70,6 +71,7 @@ const transform = function (original_options = {}) { const { bom, comment_no_infix, + delimiter_auto, encoding, from_line, ltrim, @@ -82,7 +84,45 @@ const transform = function (original_options = {}) { to_line, } = this.options; let { comment, escape, quote, record_delimiter } = this.options; - const { bomSkipped, previousBuf, rawBuffer, escapeIsQuote } = this.state; + const { + bomSkipped, + delimiterDiscovered, + delimiterBufPrevious, + rawBuffer, + escapeIsQuote, + } = this.state; + // Automatic delimiter discovery + if (!delimiterDiscovered && delimiter_auto) { + let delimiterBuf; + if (delimiterBufPrevious === undefined) { + delimiterBuf = nextBuf; + } else if ( + delimiterBufPrevious !== undefined && + nextBuf === undefined + ) { + delimiterBuf = delimiterBufPrevious; + } else { + delimiterBuf = Buffer.concat([delimiterBufPrevious, nextBuf]); + } + // Ensure that nextBuf is not concatenated a second time during buffer reconciliation + nextBuf = undefined; + // this.delimiterBufPrevious = delimiterBuf; + if (end || delimiterBuf.length > delimiter_auto.size) { + this.options.delimiter = [ + Buffer.from( + delimiter_discover(delimiterBuf, this.options.delimiter_auto), + ), + ]; + this.state.previousBuf = delimiterBuf; + this.state.delimiterBufPrevious = undefined; + this.state.delimiterDiscovered = true; + } else { + this.state.delimiterBufPrevious = delimiterBuf; + return; + } + } + 
// Previous buffers reconciliation + const { previousBuf } = this.state; let buf; if (previousBuf === undefined) { if (nextBuf === undefined) { diff --git a/packages/csv-parse/lib/api/init_state.js b/packages/csv-parse/lib/api/init_state.js index 0ce050c3..50b9e025 100644 --- a/packages/csv-parse/lib/api/init_state.js +++ b/packages/csv-parse/lib/api/init_state.js @@ -59,6 +59,8 @@ const init_state = function (options) { bufBytesStart: 0, castField: options.cast_function, commenting: false, + delimiterBufPrevious: undefined, + delimiterDiscovered: false, // Current error encountered by a record error: undefined, enabled: options.from_line === 1, @@ -77,7 +79,11 @@ const init_state = function (options) { // Skip if the remaining buffer smaller than comment options.comment !== null ? options.comment.length : 0, // Skip if the remaining buffer can be delimiter - ...options.delimiter.map((delimiter) => delimiter.length), + ...(options.delimiter + ? options.delimiter.map((delimiter) => delimiter.length) + : []), + // Auto discovery of delimiter is limited to 1 character + options.delimiter_auto ? 1 : 0, // Skip if the remaining buffer can be escape sequence options.quote !== null ? 
options.quote.length : 0, // Skip if the remaining buffer can be a multi-byte trim character diff --git a/packages/csv-parse/lib/api/normalize_options.js b/packages/csv-parse/lib/api/normalize_options.js index 440addfd..dbfaed9b 100644 --- a/packages/csv-parse/lib/api/normalize_options.js +++ b/packages/csv-parse/lib/api/normalize_options.js @@ -1,6 +1,7 @@ import { normalize_columns_array } from "./normalize_columns_array.js"; import { CsvError } from "./CsvError.js"; import { underscore } from "../utils/underscore.js"; +import { is_object } from "../utils/is_object.js"; const normalize_options = function (opts) { const options = {}; @@ -190,25 +191,57 @@ const normalize_options = function (opts) { options, ); } - // Normalize option `delimiter` - const delimiter_json = JSON.stringify(options.delimiter); - if (!Array.isArray(options.delimiter)) - options.delimiter = [options.delimiter]; - if (options.delimiter.length === 0) { + // Normalize option `delimiter_auto` + if ( + options.delimiter_auto === undefined || + options.delimiter_auto === null || + options.delimiter_auto === false + ) { + options.delimiter_auto = false; + } else if (options.delimiter_auto === true) { + options.delimiter_auto = {}; + } else if (!is_object(options.delimiter_auto)) { throw new CsvError( - "CSV_INVALID_OPTION_DELIMITER", + "CSV_INVALID_OPTION_DELIMITER_AUTO", [ - "Invalid option delimiter:", - "delimiter must be a non empty string or buffer or array of string|buffer,", - `got ${delimiter_json}`, + "Invalid option delimiter_auto:", + "delimiter_auto must be a boolean or a configuration object,", + `got ${JSON.stringify(options.delimiter_auto)}`, ], options, ); } - options.delimiter = options.delimiter.map(function (delimiter) { - if (delimiter === undefined || delimiter === null || delimiter === false) { - return Buffer.from(",", options.encoding); + if (options.delimiter_auto) { + options.delimiter_auto.preferred ??= { + [",".charCodeAt(0)]: 1.8, + ["\t".charCodeAt(0)]: 1.8, + 
[";".charCodeAt(0)]: 1.6, + [" ".charCodeAt(0)]: 1.6, + [":".charCodeAt(0)]: 1.5, + [".".charCodeAt(0)]: 1.4, + ["/".charCodeAt(0)]: 1.4, + }; + options.delimiter_auto.score ??= (info, options) => { + return (info.total - info.std) * (options.preferred[info.char_code] ?? 1); + }; + options.delimiter_auto.size ??= 2048; + } + // Normalize option `delimiter` + const delimiter_json = JSON.stringify(options.delimiter); + if (options.delimiter_auto) { + options.delimiter ??= []; + } + if (!Array.isArray(options.delimiter)) { + if ( + options.delimiter === undefined || + options.delimiter === null || + options.delimiter === false + ) { + options.delimiter = Buffer.from(",", options.encoding); } + options.delimiter = [options.delimiter]; + } + options.delimiter = options.delimiter.map(function (delimiter) { if (typeof delimiter === "string") { delimiter = Buffer.from(delimiter, options.encoding); } diff --git a/packages/csv-parse/lib/index.d.ts b/packages/csv-parse/lib/index.d.ts index f1d6bbf9..7119d61e 100644 --- a/packages/csv-parse/lib/index.d.ts +++ b/packages/csv-parse/lib/index.d.ts @@ -93,6 +93,12 @@ export type ColumnOption = | false | { name: K }; +export interface OptionDelimiterAuto { + preferred: Record; + score: () => number; + size: number; +} + export interface OptionsNormalized { auto_parse?: boolean | CastingFunction; auto_parse_date?: boolean | CastingDateFunction; @@ -143,6 +149,10 @@ export interface OptionsNormalized { * Set the field delimiter. One character only, defaults to comma. */ delimiter: Buffer[]; + /** + * Discover the field delimiter. + */ + delimiter_auto: OptionDelimiterAuto; /** * Set the source and destination encoding, a value of `null` returns buffer instead of strings. */ @@ -318,6 +328,10 @@ export interface Options { * Set the field delimiter. One character only, defaults to comma. 
*/ delimiter?: OptionsNormalized["delimiter"] | string | string[] | Buffer; + /** + * Discover the field delimiter + */ + delimiter_auto?: boolean | Partial; /** * Set the source and destination encoding, a value of `null` returns buffer instead of strings. */ diff --git a/packages/csv-parse/lib/index.js b/packages/csv-parse/lib/index.js index 0fd69eac..4aefe2a2 100644 --- a/packages/csv-parse/lib/index.js +++ b/packages/csv-parse/lib/index.js @@ -104,8 +104,6 @@ const parse = function () { options === undefined || options.objname === undefined ? [] : Object.create(null); - // const records = - // options === undefined || options.objname === undefined ? [] : {}; parser.on("readable", function () { let record; while ((record = this.read()) !== null) { @@ -143,4 +141,4 @@ const parse = function () { return parser; }; -export { parse, Parser, CsvError, normalize_options }; +export { CsvError, parse, Parser, normalize_options }; diff --git a/packages/csv-parse/lib/stream.d.ts b/packages/csv-parse/lib/stream.d.ts index 2bddf336..7ba940ad 100644 --- a/packages/csv-parse/lib/stream.d.ts +++ b/packages/csv-parse/lib/stream.d.ts @@ -19,4 +19,5 @@ export { InfoField, CsvErrorCode, CsvError, + normalize_options, } from "./index.js"; diff --git a/packages/csv-parse/lib/stream.js b/packages/csv-parse/lib/stream.js index aef0ed15..b74c902e 100644 --- a/packages/csv-parse/lib/stream.js +++ b/packages/csv-parse/lib/stream.js @@ -1,5 +1,6 @@ import { TransformStream, CountQueuingStrategy } from "node:stream/web"; -import { transform } from "./api/index.js"; +import { CsvError, transform } from "./api/index.js"; +import { normalize_options } from "./api/normalize_options.js"; const parse = (opts) => { const api = transform(opts); @@ -33,4 +34,4 @@ const parse = (opts) => { ); }; -export { parse }; +export { parse, CsvError, normalize_options }; diff --git a/packages/csv-parse/lib/sync.d.ts b/packages/csv-parse/lib/sync.d.ts index 0b58066e..846b68fe 100644 --- 
// Discussed in [issue #400](https://github.com/adaltas/node-csv/issues/400)
// See https://github.com/python/cpython/blob/ea1b1c579f600cc85d145c60862b2e6b98701b24/Lib/csv.py#L349

// Number of tracked character codes (0 to 126). Discovery is limited to
// single-byte delimiters, any character outside this range is never a
// candidate.
const CHAR_CODE_COUNT = 127;

/**
 * Discover the most likely field delimiter of a CSV sample.
 *
 * Every character with a code below `CHAR_CODE_COUNT` is a candidate. For
 * each candidate, occurrences are counted per line, then `options.score` is
 * called with the candidate statistics and the highest score wins. On equal
 * scores, the candidate with the highest character code is retained.
 *
 * @param {string|Buffer|Array<Array<string>>} records - Raw CSV data, or an
 *   array of records where the first field of each record is a raw line.
 * @param {object} [options] - Normalized `delimiter_auto` configuration. When
 *   omitted, the defaults of `normalize_options({ delimiter_auto: true })`
 *   are used.
 * @param {object} options.preferred - Score multipliers indexed by char code.
 * @param {function} options.score - Called as `score(info, options)` where
 *   `info` exposes `char_code`, `lines`, `std`, `total` and `preferred`. It
 *   returns the candidate score.
 * @returns {string} The discovered delimiter, always one character.
 * @throws The error returned by the parser when raw data cannot be split
 *   into lines.
 */
const delimiter_discover = function (records, options) {
  // Normalize the configuration
  if (!options) {
    ({ delimiter_auto: options } = normalize_options({ delimiter_auto: true }));
  }
  // Convert String to Buffer
  if (typeof records === "string") {
    records = Buffer.from(records);
  }
  // Convert Buffer to an array of records, without any delimiter, each
  // record has a single field holding the whole line
  if (Buffer.isBuffer(records)) {
    records = ((data) => {
      const lines = [];
      const parser = transform({ delimiter: [] });
      const push = (record) => lines.push(record);
      const close = () => {};
      const error = parser.parse(data, true, push, close);
      if (error !== undefined) throw error;
      return lines;
    })(records);
  }
  // Info array initialization, one entry per char code
  const info = Array.from({ length: CHAR_CODE_COUNT }, () => ({ lines: [] }));
  // Traverse each record, count occurrences per char code
  records.forEach(([record], line) => {
    // Nothing to count in a record without a textual first field
    if (typeof record !== "string") return;
    for (let i = 0; i < record.length; i++) {
      const code = record.charCodeAt(i);
      // `charCodeAt` returns any UTF-16 code unit, characters which are not
      // tracked (DEL and non-ASCII) must be skipped or `info[code]` is
      // undefined
      if (code >= CHAR_CODE_COUNT) continue;
      const { lines } = info[code];
      lines[line] = (lines[line] ?? 0) + 1;
    }
  });
  // Traverse each char code, compute the score
  info.forEach((candidate, code) => {
    candidate.char_code = code;
    candidate.std = std(candidate.lines);
    candidate.total = candidate.lines.reduce((acc, val) => acc + val, 0);
    candidate.preferred = Boolean(options.preferred[code]);
    candidate.score = options.score(candidate, options);
  });
  // Extract the dominant character, the strict comparison lets the latest
  // candidate win when scores are equal
  const result = info.reduce(
    (best, candidate) => (best.score > candidate.score ? best : candidate),
    {},
  );
  return String.fromCharCode(result.char_code);
};

/**
 * Population standard deviation of the per-line counts of a character.
 *
 * The array may be sparse: lines preceding the last occurrence where the
 * character is absent are empty slots. They are part of `array.length` but
 * are skipped by `reduce`, so they lower the mean without adding any
 * deviation of their own.
 *
 * @param {number[]} array - Occurrences per line, possibly sparse.
 * @returns {number} The standard deviation, `0` for an empty array.
 */
const std = function (array) {
  const n = array.length;
  if (n === 0) return 0;
  const mean = array.reduce((sum, value) => sum + value, 0) / n;
  const squares = array.reduce((sum, value) => sum + (value - mean) ** 2, 0);
  return Math.sqrt(squares / n);
};

export { delimiter_discover };
9e8f11d1..4663eca5 100644 --- a/packages/csv-parse/test/option.delimiter.ts +++ b/packages/csv-parse/test/option.delimiter.ts @@ -1,10 +1,14 @@ import "should"; -import { parse } from "../lib/index.js"; +import { parse, normalize_options } from "../lib/index.js"; describe("Option `delimiter`", function () { it("validation", function () { parse("", { delimiter: "," }, () => {}); - parse("", { delimiter: [",", ","] }, () => {}); + normalize_options({ delimiter: [] }).delimiter.should.eql([]); + normalize_options({ delimiter: [".", ","] }).delimiter.should.eql([ + Buffer.from("."), + Buffer.from(","), + ]); parse("", { delimiter: Buffer.from(",") }, () => {}); parse("", { delimiter: [Buffer.from(","), Buffer.from(",")] }, () => {}); (() => { @@ -21,13 +25,6 @@ describe("Option `delimiter`", function () { 'Invalid option delimiter: delimiter must be a non empty string or buffer or array of string|buffer, got {"type":"Buffer","data":[]}', code: "CSV_INVALID_OPTION_DELIMITER", }); - (() => { - parse("", { delimiter: [] }, () => {}); - }).should.throw({ - message: - "Invalid option delimiter: delimiter must be a non empty string or buffer or array of string|buffer, got []", - code: "CSV_INVALID_OPTION_DELIMITER", - }); (() => { parse("", { delimiter: [""] }, () => {}); }).should.throw({ @@ -59,6 +56,19 @@ describe("Option `delimiter`", function () { parser.end(); }); + it("default to comma", function () { + const options = normalize_options({}); + options.delimiter.should.eql([Buffer.from(",")]); + }); + + it("empty array create a single field", function (next) { + parse("abc,,123,\n,def,,", { delimiter: [] }, (err, records) => { + if (err) return next(err); + records.should.eql([["abc,,123,"], [",def,,"]]); + next(); + }); + }); + it("using default comma", function (next) { parse("abc,,123,\n,def,,", (err, records) => { if (err) return next(err); diff --git a/packages/csv-parse/test/option.delimiter_auto.ts b/packages/csv-parse/test/option.delimiter_auto.ts new file mode 
100644 index 00000000..4bd7e1f8 --- /dev/null +++ b/packages/csv-parse/test/option.delimiter_auto.ts @@ -0,0 +1,113 @@ +import should from "should"; +import { parse, normalize_options } from "../lib/index.js"; +import { parse as parse_sync } from "../lib/sync.js"; + +describe("Option `delimiter_auto`", function () { + it("validation", function () { + parse("", { delimiter_auto: true }, () => {}); + parse("", { delimiter_auto: false }, () => {}); + }); + + it("default to false", function () { + const options = normalize_options({}); + options.delimiter_auto.should.eql(false); + options.delimiter.should.eql([Buffer.from(",")]); + }); + + it("set delimiter to empty array when true", function () { + const options = normalize_options({ delimiter_auto: true }); + options.delimiter_auto.should.match({ + preferred: (it: object) => it.should.be.an.Object(), + score: (it: () => void) => it.should.be.a.Function(), + }); + should(options.delimiter).eql([]); + }); + + it("sync empty", function () { + parse_sync("", { + delimiter_auto: true, + }).should.eql([]); + }); + + it("sync small", function () { + parse_sync("a.b,c.d\ne,f.g.h", { + delimiter_auto: true, + }).should.eql([ + ["a", "b,c", "d"], + ["e,f", "g", "h"], + ]); + }); + + it("stream smaller than size", function (next) { + let content = ""; + for (let i = 0; i < 10; i++) { + content += i + ":abc:def:hij:klm:nop:qrs:tuv:wxyz" + "\n"; + } + // Size is greater than the max discovery size + const options = normalize_options({ delimiter_auto: true }); + content.length.should.be.lessThan(options.delimiter_auto.size); + // Data parsing + const parser = parse({ delimiter_auto: true }, (err, data) => { + if (err) return next(err); + data + .map((r) => r.join(":")) + .join("\n") + .should.eql(content.trim()); + next(); + }); + // Data writing + for (let i = 0; i < content.length; i++) { + parser.write(content.slice(i, i + 1)); + } + parser.end(); + }); + + it("stream greater than size small version", function (next) { + let 
content = ""; + for (let i = 0; i < 3; i++) { + content += i + ":abc:def:hij" + "\n"; + } + // Size is greater than the max discovery size + const options = normalize_options({ delimiter_auto: { size: 20 } }); + content.length.should.be.greaterThan(options.delimiter_auto.size); + // Data parsing + const parser = parse({ delimiter_auto: { size: 20 } }, (err, data) => { + if (err) return next(err); + data + .map((r) => r.join(":")) + .join("\n") + .should.eql(content.trim()); + next(); + }); + // Data writing + for (let i = 0; i < content.length; i++) { + parser.write(content.slice(i, i + 1)); + // process.stdout.write(content.slice(i, i + 1)); + } + parser.end(); + }); + + it("stream greater than size large version", function (next) { + let content = ""; + for (let i = 0; i < 100; i++) { + content += i + ":abc:def:hij:klm:nop:qrs:tuv:wxyz" + "\n"; + } + // Size is greater than the max discovery size + const options = normalize_options({ delimiter_auto: true }); + content.length.should.be.greaterThan(options.delimiter_auto.size); + // Data parsing + const parser = parse({ delimiter_auto: true }, (err, data) => { + if (err) return next(err); + data + .map((r) => r.join(":")) + .join("\n") + .should.eql(content.trim()); + next(); + }); + // Data writing + for (let i = 0; i < content.length; i++) { + parser.write(content.slice(i, i + 1)); + } + parser.end(); + }); +}); diff --git a/packages/csv-parse/test/ResizableBuffer.js b/packages/csv-parse/test/utils.ResizableBuffer.js similarity index 100% rename from packages/csv-parse/test/ResizableBuffer.js rename to packages/csv-parse/test/utils.ResizableBuffer.js diff --git a/packages/csv-parse/test/utils.delimiter_discover.js b/packages/csv-parse/test/utils.delimiter_discover.js new file mode 100644 index 00000000..4157fb8d --- /dev/null +++ b/packages/csv-parse/test/utils.delimiter_discover.js @@ -0,0 +1,31 @@ +import "should"; +import { delimiter_discover } from "../lib/utils/delimiter_discover.js"; + +describe("Option 
`delimiter_auto`", function () { + it("dominent characters has the most occurences", function () { + // There is more `.` than `,` + delimiter_discover("a.b,c.d\ne,f.g.h").should.eql("."); + delimiter_discover([["a.b,c.d"], ["e,f.g.h"]]).should.eql("."); + }); + + it("dominent characters has a smaller deviance", function () { + // There is more `x` than `.` but `.` has always 2 occurrences + delimiter_discover("xa.xb,c.d\nxe,xf.g.h\nxe,xf.xg.h").should.eql("."); + delimiter_discover([ + ["xa.xb,c.d"], + ["xe,xf.g.h"], + ["xe,xf.xg.h"], + ]).should.eql("."); + }); + + it("dominent characters are boosted if prefered", function () { + // `,` is more boosted than `.` + delimiter_discover("1,a.b,c.d\n2,e,f.g.h").should.eql(","); + delimiter_discover([["1,a.b,c.d"], ["2,e,f.g.h"]]).should.eql(","); + }); + + it("handle inconsistent field length", function () { + delimiter_discover("1,a.b,c.d\n2,e\n3,f,g").should.eql(","); + delimiter_discover([["1,a.b,c.d"], ["2,e"], ["3,f,g"]]).should.eql(","); + }); +});