Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 41 additions & 1 deletion packages/csv-parse/lib/api/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { normalize_columns_array } from "./normalize_columns_array.js";
import { init_state } from "./init_state.js";
import { normalize_options } from "./normalize_options.js";
import { CsvError } from "./CsvError.js";
import { delimiter_discover } from "../utils/delimiter_discover.js";

const isRecordEmpty = function (record) {
return record.every(
Expand Down Expand Up @@ -70,6 +71,7 @@ const transform = function (original_options = {}) {
const {
bom,
comment_no_infix,
delimiter_auto,
encoding,
from_line,
ltrim,
Expand All @@ -82,7 +84,45 @@ const transform = function (original_options = {}) {
to_line,
} = this.options;
let { comment, escape, quote, record_delimiter } = this.options;
const { bomSkipped, previousBuf, rawBuffer, escapeIsQuote } = this.state;
const {
bomSkipped,
delimiterDiscovered,
delimiterBufPrevious,
rawBuffer,
escapeIsQuote,
} = this.state;
// Automatic delimiter discovery
if (!delimiterDiscovered && delimiter_auto) {
let delimiterBuf;
if (delimiterBufPrevious === undefined) {
delimiterBuf = nextBuf;
} else if (
delimiterBufPrevious !== undefined &&
nextBuf === undefined
) {
delimiterBuf = delimiterBufPrevious;
} else {
delimiterBuf = Buffer.concat([delimiterBufPrevious, nextBuf]);
}
// Ensure that nextBuf is not concatenated a second time during buffer reconciliation
nextBuf = undefined;
// this.delimiterBufPrevious = delimiterBuf;
if (end || delimiterBuf.length > delimiter_auto.size) {
this.options.delimiter = [
Buffer.from(
delimiter_discover(delimiterBuf, this.options.delimiter_auto),
),
];
this.state.previousBuf = delimiterBuf;
this.state.delimiterBufPrevious = undefined;
this.state.delimiterDiscovered = true;
} else {
this.state.delimiterBufPrevious = delimiterBuf;
return;
}
}
// Previous buffers reconciliation
const { previousBuf } = this.state;
let buf;
if (previousBuf === undefined) {
if (nextBuf === undefined) {
Expand Down
8 changes: 7 additions & 1 deletion packages/csv-parse/lib/api/init_state.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ const init_state = function (options) {
bufBytesStart: 0,
castField: options.cast_function,
commenting: false,
delimiterBufPrevious: undefined,
delimiterDiscovered: false,
// Current error encountered by a record
error: undefined,
enabled: options.from_line === 1,
Expand All @@ -77,7 +79,11 @@ const init_state = function (options) {
// Skip if the remaining buffer smaller than comment
options.comment !== null ? options.comment.length : 0,
// Skip if the remaining buffer can be delimiter
...options.delimiter.map((delimiter) => delimiter.length),
...(options.delimiter
? options.delimiter.map((delimiter) => delimiter.length)
: []),
// Auto discovery of delimiter is limited to 1 character
options.delimiter_auto ? 1 : 0,
// Skip if the remaining buffer can be escape sequence
options.quote !== null ? options.quote.length : 0,
// Skip if the remaining buffer can be a multi-byte trim character
Expand Down
57 changes: 45 additions & 12 deletions packages/csv-parse/lib/api/normalize_options.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { normalize_columns_array } from "./normalize_columns_array.js";
import { CsvError } from "./CsvError.js";
import { underscore } from "../utils/underscore.js";
import { is_object } from "../utils/is_object.js";

const normalize_options = function (opts) {
const options = {};
Expand Down Expand Up @@ -190,25 +191,57 @@ const normalize_options = function (opts) {
options,
);
}
// Normalize option `delimiter`
const delimiter_json = JSON.stringify(options.delimiter);
if (!Array.isArray(options.delimiter))
options.delimiter = [options.delimiter];
if (options.delimiter.length === 0) {
// Normalize option `delimiter_auto`
if (
options.delimiter_auto === undefined ||
options.delimiter_auto === null ||
options.delimiter_auto === false
) {
options.delimiter_auto = false;
} else if (options.delimiter_auto === true) {
options.delimiter_auto = {};
} else if (!is_object(options.delimiter_auto)) {
throw new CsvError(
"CSV_INVALID_OPTION_DELIMITER",
"CSV_INVALID_OPTION_DELIMITER_AUTO",
[
"Invalid option delimiter:",
"delimiter must be a non empty string or buffer or array of string|buffer,",
`got ${delimiter_json}`,
"Invalid option delimiter_auto:",
"delimiter_auto must be a boolean or a configuration object,",
`got ${JSON.stringify(options.delimiter_auto)}`,
],
options,
);
}
options.delimiter = options.delimiter.map(function (delimiter) {
if (delimiter === undefined || delimiter === null || delimiter === false) {
return Buffer.from(",", options.encoding);
if (options.delimiter_auto) {
options.delimiter_auto.preferred ??= {
[",".charCodeAt(0)]: 1.8,
["\t".charCodeAt(0)]: 1.8,
[";".charCodeAt(0)]: 1.6,
[" ".charCodeAt(0)]: 1.6,
[":".charCodeAt(0)]: 1.5,
[".".charCodeAt(0)]: 1.4,
["/".charCodeAt(0)]: 1.4,
};
options.delimiter_auto.score ??= (info, options) => {
return (info.total - info.std) * (options.preferred[info.char_code] ?? 1);
};
options.delimiter_auto.size ??= 2048;
}
// Normalize option `delimiter`
const delimiter_json = JSON.stringify(options.delimiter);
if (options.delimiter_auto) {
options.delimiter ??= [];
}
if (!Array.isArray(options.delimiter)) {
if (
options.delimiter === undefined ||
options.delimiter === null ||
options.delimiter === false
) {
options.delimiter = Buffer.from(",", options.encoding);
}
options.delimiter = [options.delimiter];
}
options.delimiter = options.delimiter.map(function (delimiter) {
if (typeof delimiter === "string") {
delimiter = Buffer.from(delimiter, options.encoding);
}
Expand Down
14 changes: 14 additions & 0 deletions packages/csv-parse/lib/index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@ export type ColumnOption<K = string> =
| false
| { name: K };

/**
 * Configuration of the automatic delimiter discovery (`delimiter_auto`).
 */
export interface OptionDelimiterAuto {
  /**
   * Weight applied to the score of a candidate character, keyed by its
   * character code. Characters without an entry use a weight of 1 in the
   * default scoring function.
   */
  preferred: Record<string, number>;
  /**
   * Compute the score of a candidate character; the character with the
   * highest score is selected as the delimiter. The function receives the
   * statistics collected for the character and the `delimiter_auto`
   * configuration itself.
   */
  score: (
    info: {
      /** Character code of the candidate. */
      char_code: number;
      /** Occurrences of the character, indexed by line. */
      lines: number[];
      /** Whether the character has a truthy entry in `preferred`. */
      preferred: boolean;
      /** Standard deviation of the per-line occurrences. */
      std: number;
      /** Total number of occurrences in the sample. */
      total: number;
    },
    options: OptionDelimiterAuto,
  ) => number;
  /**
   * Number of bytes the buffered sample must exceed before discovery runs,
   * unless the input ends first.
   */
  size: number;
}

export interface OptionsNormalized<T = string[], U = T> {
auto_parse?: boolean | CastingFunction;
auto_parse_date?: boolean | CastingDateFunction;
Expand Down Expand Up @@ -143,6 +149,10 @@ export interface OptionsNormalized<T = string[], U = T> {
* Set the field delimiter. One character only, defaults to comma.
*/
delimiter: Buffer[];
/**
* Discover the field delimiter.
*/
delimiter_auto: OptionDelimiterAuto;
/**
* Set the source and destination encoding, a value of `null` returns buffer instead of strings.
*/
Expand Down Expand Up @@ -318,6 +328,10 @@ export interface Options<T = string[], U = T> {
* Set the field delimiter. One character only, defaults to comma.
*/
delimiter?: OptionsNormalized["delimiter"] | string | string[] | Buffer;
/**
* Discover the field delimiter.
*/
delimiter_auto?: boolean | Partial<OptionsNormalized["delimiter_auto"]>;
/**
* Set the source and destination encoding, a value of `null` returns buffer instead of strings.
*/
Expand Down
4 changes: 1 addition & 3 deletions packages/csv-parse/lib/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,6 @@ const parse = function () {
options === undefined || options.objname === undefined
? []
: Object.create(null);
// const records =
// options === undefined || options.objname === undefined ? [] : {};
parser.on("readable", function () {
let record;
while ((record = this.read()) !== null) {
Expand Down Expand Up @@ -143,4 +141,4 @@ const parse = function () {
return parser;
};

export { parse, Parser, CsvError, normalize_options };
export { CsvError, parse, Parser, normalize_options };
1 change: 1 addition & 0 deletions packages/csv-parse/lib/stream.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@ export {
InfoField,
CsvErrorCode,
CsvError,
normalize_options,
} from "./index.js";
5 changes: 3 additions & 2 deletions packages/csv-parse/lib/stream.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { TransformStream, CountQueuingStrategy } from "node:stream/web";
import { transform } from "./api/index.js";
import { CsvError, transform } from "./api/index.js";
import { normalize_options } from "./api/normalize_options.js";

const parse = (opts) => {
const api = transform(opts);
Expand Down Expand Up @@ -33,4 +34,4 @@ const parse = (opts) => {
);
};

export { parse };
export { parse, CsvError, normalize_options };
1 change: 1 addition & 0 deletions packages/csv-parse/lib/sync.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,5 @@ export {
InfoField,
CsvErrorCode,
CsvError,
normalize_options,
} from "./index.js";
4 changes: 2 additions & 2 deletions packages/csv-parse/lib/sync.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { CsvError, transform } from "./api/index.js";
import { normalize_options } from "./api/normalize_options.js";

const parse = function (data, opts = {}) {
if (typeof data === "string") {
Expand All @@ -18,5 +19,4 @@ const parse = function (data, opts = {}) {
return records;
};

export { parse };
export { CsvError };
export { parse, CsvError, normalize_options };
65 changes: 65 additions & 0 deletions packages/csv-parse/lib/utils/delimiter_discover.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import { normalize_options } from "../api/normalize_options.js";
import { transform } from "../api/index.js";

// Discussed in [issue #400](https://github.com/adaltas/node-csv/issues/400)
// See https://github.com/python/cpython/blob/ea1b1c579f600cc85d145c60862b2e6b98701b24/Lib/csv.py#L349
/**
 * Guess the field delimiter used by a CSV sample.
 *
 * Every character code below 127 is a candidate. For each candidate, the
 * number of occurrences per line is collected, then `options.score` turns
 * those statistics into a score. The candidate with the highest score wins;
 * on a tie, the highest character code wins.
 *
 * @param {string|Buffer|Array<Array<string>>} records - The sample to
 *   inspect. A string or a Buffer is first parsed with an empty delimiter
 *   list; only the first field of each record is inspected.
 * @param {object} [options] - The normalized `delimiter_auto` configuration
 *   (`preferred` weights and `score` function). When omitted, the defaults
 *   produced by `normalize_options({ delimiter_auto: true })` are used.
 * @returns {string} The single character selected as the delimiter.
 * @throws The error returned by the parser when the sample cannot be parsed.
 */
const delimiter_discover = function (records, options) {
  // Normalize the configuration
  if (!options) {
    ({ delimiter_auto: options } = normalize_options({ delimiter_auto: true }));
  }
  // Convert String to Buffer
  if (typeof records === "string") {
    records = Buffer.from(records);
  }
  // Convert Buffer to an array of records
  if (Buffer.isBuffer(records)) {
    records = ((data) => {
      const parsed = [];
      const parser = transform({ delimiter: [] });
      const push = (record) => parsed.push(record);
      const close = () => {};
      const error = parser.parse(data, true, push, close);
      if (error !== undefined) throw error;
      return parsed;
    })(records);
  }
  // Info array initialization, 127 entries, one per char code
  const info = Array.from({ length: 127 }, () => ({ lines: [] }));
  // Traverse each record, count occurrences per char code
  records.forEach(([record = ""], line) => {
    for (let i = 0, l = record.length; i < l; i++) {
      const code = record.charCodeAt(i);
      // Only char codes covered by the info array are candidates. Without
      // this guard, any non-ASCII character indexes past the end of the
      // array and throws a TypeError.
      if (code >= info.length) continue;
      // Count the character frequency
      // NOTE(review): lines where the character is absent are left as holes
      // in the array, not zeros; confirm this is what the scoring expects.
      info[code].lines[line] ??= 0;
      info[code].lines[line]++;
    }
  });
  // Traverse each char code, compute the score
  info.forEach((entry, i) => {
    entry.char_code = i;
    entry.std = std(entry.lines);
    entry.total = entry.lines.reduce((acc, val) => acc + val, 0);
    entry.preferred = !!options.preferred[i];
    entry.score = options.score(entry, options);
  });
  // Extract the dominant character, later candidates win on equal score
  const result = info.reduce(
    (acc, entry) => (acc.score > entry.score ? acc : entry),
    {},
  );
  return String.fromCharCode(result.char_code);
};

/**
 * Compute the population standard deviation of a list of numbers.
 *
 * @param {number[]} array - The values to measure.
 * @returns {number} The standard deviation, or 0 for an empty array.
 */
const std = function (array) {
  const { length } = array;
  // No values, no deviation; this also avoids reducing an empty array
  if (length === 0) return 0;
  const add = (left, right) => left + right;
  const mean = array.reduce(add) / length;
  const squaredDeviations = array.map((value) => Math.pow(value - mean, 2));
  return Math.sqrt(squaredDeviations.reduce(add) / length);
};

export { delimiter_discover };
1 change: 0 additions & 1 deletion packages/csv-parse/test/api.stream.finished.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ describe("API stream.finished", function () {
}
});
await stream.finished(parser);
console.log(records);
records.length.should.eql(3);
});

Expand Down
1 change: 1 addition & 0 deletions packages/csv-parse/test/api.types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ describe("API Types", function () {
"comment",
"comment_no_infix",
"delimiter",
"delimiter_auto",
"encoding",
"escape",
"from",
Expand Down
28 changes: 19 additions & 9 deletions packages/csv-parse/test/option.delimiter.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import "should";
import { parse } from "../lib/index.js";
import { parse, normalize_options } from "../lib/index.js";

describe("Option `delimiter`", function () {
it("validation", function () {
parse("", { delimiter: "," }, () => {});
parse("", { delimiter: [",", ","] }, () => {});
normalize_options({ delimiter: [] }).delimiter.should.eql([]);
normalize_options({ delimiter: [".", ","] }).delimiter.should.eql([
Buffer.from("."),
Buffer.from(","),
]);
parse("", { delimiter: Buffer.from(",") }, () => {});
parse("", { delimiter: [Buffer.from(","), Buffer.from(",")] }, () => {});
(() => {
Expand All @@ -21,13 +25,6 @@ describe("Option `delimiter`", function () {
'Invalid option delimiter: delimiter must be a non empty string or buffer or array of string|buffer, got {"type":"Buffer","data":[]}',
code: "CSV_INVALID_OPTION_DELIMITER",
});
(() => {
parse("", { delimiter: [] }, () => {});
}).should.throw({
message:
"Invalid option delimiter: delimiter must be a non empty string or buffer or array of string|buffer, got []",
code: "CSV_INVALID_OPTION_DELIMITER",
});
(() => {
parse("", { delimiter: [""] }, () => {});
}).should.throw({
Expand Down Expand Up @@ -59,6 +56,19 @@ describe("Option `delimiter`", function () {
parser.end();
});

it("default to comma", function () {
const options = normalize_options({});
options.delimiter.should.eql([Buffer.from(",")]);
});

it("empty array create a single field", function (next) {
parse("abc,,123,\n,def,,", { delimiter: [] }, (err, records) => {
if (err) return next(err);
records.should.eql([["abc,,123,"], [",def,,"]]);
next();
});
});

it("using default comma", function (next) {
parse("abc,,123,\n,def,,", (err, records) => {
if (err) return next(err);
Expand Down
Loading
Loading