From 224637ab92e75d8d476dcae59b737e2dcc85e412 Mon Sep 17 00:00:00 2001 From: mprammer Date: Thu, 11 Jun 2026 15:32:33 +0100 Subject: [PATCH] Resync docs/v1 snapshot to current truth Promote the drift that the Amazon dataset PR (#16) held back to stay scoped: BI-CommonGovernment's authoritative sources.json description, fresher Open Food Facts (4,466,927 -> 4,517,492 rows) and OSM Germany Relations (889,712 -> 890,059) local builds with their sizes and row-group counts, and recorded row-group / vortex metadata for Spambase and uci-iris. Verified against the on-disk parquet/vortex. Co-Authored-By: Claude Signed-off-by: mprammer --- docs/v1/datasets.md | 8 ++++---- docs/v1/snapshot.json | 26 +++++++++++++++++++------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/docs/v1/datasets.md b/docs/v1/datasets.md index 280af02..6b21800 100644 --- a/docs/v1/datasets.md +++ b/docs/v1/datasets.md @@ -53,7 +53,7 @@ | BI-Bimbo | Public BI Benchmark — Bimbo | Public BI workload `Bimbo` — 49.7M rows × 12 columns (10 integer, 2 decimal/float). Notable columns: `Agencia_ID`, `Canal_ID`, `Cliente_ID`, `Demanda_uni_equil`. Background: Grupo Bimbo (Mexican multinational bakery) sales / inventory demand data — `Agencia_ID`, `Cliente_ID`, `Demanda_uni_equil`; widely circulated via the 2016 Bimbo Inventory Demand Kaggle competition. One of CWI's 46 real-world BI-dashboard workloads in the Public BI Benchmark — pipe-delimited CSV partitions used to stress columnar engines on production data quirks: inconsistent encodings, mixed quoting, sparse columns, real-world cardinalities. | https://github.com/cwida/public_bi_benchmark/tree/master/benchmark/Bimbo | Custom | MIT | 74,180,464 | — | 368.6 MB | 436.5 MB | | BI-CityMaxCapita | Public BI Benchmark — CityMaxCapita | Public BI workload `CityMaxCapita` — 913k rows × 31 columns (18 string, 7 integer, 6 decimal/float). Notable columns: `City/Admin`, `City/State`, `City`, `Created Date/Time`. 
Background: Twitter analytics — `FF Ratio` (follower / following), `Favorites`, `First Link in Tweet`. The workbook name is not domain-revealing. One of CWI's 46 real-world BI-dashboard workloads in the Public BI Benchmark — pipe-delimited CSV partitions used to stress columnar engines on production data quirks: inconsistent encodings, mixed quoting, sparse columns, real-world cardinalities. | https://github.com/cwida/public_bi_benchmark/tree/master/benchmark/CityMaxCapita | Custom | MIT | 912,657 | — | 102.0 MB | 136.6 MB | | BI-CMSprovider | Public BI Benchmark — CMSprovider | Public BI workload `CMSprovider` — 13.0M rows × 26 columns (14 string, 7 decimal/float, 5 integer). Notable columns: `AVERAGE_MEDICARE_ALLOWED_AMT`, `AVERAGE_MEDICARE_PAYMENT_AMT`, `AVERAGE_SUBMITTED_CHRG_AMT`, `BENE_DAY_SRVC_CNT`. Background: US Centers for Medicare & Medicaid Services — physician/provider utilization and Medicare payment data; columns include AVERAGE_MEDICARE_ALLOWED_AMT, BENE_UNIQUE_CNT, HCPCS_CODE. One of CWI's 46 real-world BI-dashboard workloads in the Public BI Benchmark — pipe-delimited CSV partitions used to stress columnar engines on production data quirks: inconsistent encodings, mixed quoting, sparse columns, real-world cardinalities. | https://github.com/cwida/public_bi_benchmark/tree/master/benchmark/CMSprovider | Custom | MIT | 18,575,754 | — | 798.3 MB | 804.8 MB | -| BI-CommonGovernment | Public BI Benchmark — CommonGovernment | Public BI Benchmark workload `CommonGovernment` — pipe-delimited CSV partitions drawn from a real-world BI dashboard. Part of CWI's 47-workload corpus assembled to stress-test columnar query engines on quirky production data: inconsistent encodings, mixed quoting, sparse columns, real-world cardinalities. Each workload shares a schema across its partitions and ships as raw CSV (no parquet upstream); raincloud merges them via `public_bi_merge`. 
| https://github.com/cwida/public_bi_benchmark/tree/master/benchmark/CommonGovernment | Custom | MIT | 141,123,827 | 6,568 | 6,358.3 MB | — | +| BI-CommonGovernment | Public BI Benchmark — CommonGovernment | Public BI workload `CommonGovernment` — 141.1M rows × 56 columns (37 string, 10 decimal/float, 9 integer). Notable columns: `a_aid_acontid_agencyid`, `a_aid_acontid_piid`, `ag_name`, `agency_code`. Background: US federal contract / grant awards data of the kind published on USAspending.gov. One of CWI's 46 real-world BI-dashboard workloads in the Public BI Benchmark — pipe-delimited CSV partitions used to stress columnar engines on production data quirks: inconsistent encodings, mixed quoting, sparse columns, real-world cardinalities. | https://github.com/cwida/public_bi_benchmark/tree/master/benchmark/CommonGovernment | Custom | MIT | 141,123,827 | 6,568 | 6,358.3 MB | — | | BI-Corporations | Public BI Benchmark — Corporations | Public BI workload `Corporations` — 742k rows × 27 columns (21 string, 6 integer). Notable columns: `Id1`, `angelco_account`, `business_model`, `city`. Background: Startup / company database — fields include `angelco_account` (AngelList) and `crunchbase_account` identifiers, plus `business_model`, `city`, `continent`, `country`. One of CWI's 46 real-world BI-dashboard workloads in the Public BI Benchmark — pipe-delimited CSV partitions used to stress columnar engines on production data quirks: inconsistent encodings, mixed quoting, sparse columns, real-world cardinalities. | https://github.com/cwida/public_bi_benchmark/tree/master/benchmark/Corporations | Custom | MIT | 741,723 | — | 53.6 MB | 67.9 MB | | BI-Eixo | Public BI Benchmark — Eixo | Public BI workload `Eixo` — 7.6M rows × 80 columns (59 string, 12 integer, 7 temporal, 2 bool). Notable columns: `Calculation_838513981443702785`, `Calculation_838513981462429699`, `Codigo Diploma/Certificado`, `Cor da Pele`. 
Background: Brazilian federal education / training program data (same schema family as `mulheresmil` — diploma codes, skin-colour demographic, certificate dates). The workbook name ("Eixo" = axis) doesn't reflect the contents. One of CWI's 46 real-world BI-dashboard workloads in the Public BI Benchmark — pipe-delimited CSV partitions used to stress columnar engines on production data quirks: inconsistent encodings, mixed quoting, sparse columns, real-world cardinalities. | https://github.com/cwida/public_bi_benchmark/tree/master/benchmark/Eixo | Custom | MIT | 7,559,227 | — | 463.1 MB | 616.6 MB | | BI-Euro2016 | Public BI Benchmark — Euro2016 | Public BI workload `Euro2016` — 2.1M rows × 11 columns (6 string, 2 integer, 2 decimal/float, 1 temporal). Notable columns: `id`, `lang`, `latitude`, `longitude`. Background: Twitter sentiment data *about* UEFA Euro 2016 — `lang`, `latitude` / `longitude`, `polarity`. Not match / player statistics despite the workbook name. One of CWI's 46 real-world BI-dashboard workloads in the Public BI Benchmark — pipe-delimited CSV partitions used to stress columnar engines on production data quirks: inconsistent encodings, mixed quoting, sparse columns, real-world cardinalities. | https://github.com/cwida/public_bi_benchmark/tree/master/benchmark/Euro2016 | Custom | MIT | 2,052,497 | — | 127.7 MB | 156.9 MB | @@ -199,7 +199,7 @@ | Online Retail | UCI ML Repository — Online Retail | This is a transactional data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. This is a transactional data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail.The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers. 
| https://archive.ics.uci.edu/dataset/352/online+retail | Tabular (CSV) | CC-BY-4.0 | 541,909 | 1 | 2.9 MB | 3.3 MB | | Online Retail II | UCI ML Repository — Online Retail II | A real online retail transaction data set of two years. This Online Retail II data set contains all the transactions occurring for a UK-based and registered, non-store online retail between 01/12/2009 and 09/12/2011.The company mainly sells unique all-occasion gift-ware. Many customers of the company are wholesalers. | https://archive.ics.uci.edu/dataset/502/online+retail+ii | Custom | CC-BY-4.0 | 1,067,371 | — | 5.9 MB | 6.9 MB | | Online Shoppers Purchasing Intention Dataset | UCI ML Repository — Online Shoppers Purchasing Intention Dataset | Of the 12,330 sessions in the dataset, 84.5% (10,422) were negative class samples that did not end with shopping, and the rest (1908) were positive class samples ending with shopping. The dataset consists of feature vectors belonging to 12,330 sessions. The dataset was formed so that each session would belong to a different user in a 1-year period to avoid any tendency to a specific campaign, special day, user profile, or period. | https://archive.ics.uci.edu/dataset/468/online+shoppers+purchasing+intention+dataset | Tabular (CSV) | CC-BY-4.0 | 12,330 | — | 0.2 MB | 0.3 MB | -| Open Food Facts | Open Food Facts product database | Crowd-sourced product facts for 4.4M packaged food products. Each row carries deeply nested nutrition, ingredient, allergen, and labelling metadata (origin, packaging, traffic-light scores). One of the larger heavily-nested JSON-shaped corpora in the catalog; the current build ships the canonical JSONL as a single `raw_json: string` column. | https://world.openfoodfacts.org/data | Custom | ODbL-1.0 | 4,466,927 | — | 12,910.8 MB | 36,431.3 MB | +| Open Food Facts | Open Food Facts product database | Crowd-sourced product facts for 4.5M packaged food products. 
Each row carries deeply nested nutrition, ingredient, allergen, and labelling metadata (origin, packaging, traffic-light scores). One of the larger heavily-nested JSON-shaped corpora in the catalog; the current build ships the canonical JSONL as a single `raw_json: string` column. | https://world.openfoodfacts.org/data | Custom | ODbL-1.0 | 4,517,492 | 46 | 13,045.6 MB | 36,744.8 MB | | OpenAssistant Conversations (oasst1) | OpenAssistant Conversations Release 1 (Köpf et al., NeurIPS 2023) | 85k crowd-authored assistant conversation messages organized as tree-structured threads, spanning 35 languages (Köpf et al., NeurIPS 2023). Each row carries quality, toxicity, and emoji-feedback labels alongside the message text and parent pointer. First public RLHF-grade conversation corpus; powers OpenAssistant and many downstream fine-tunes. | https://huggingface.co/datasets/OpenAssistant/oasst1 | Tabular (Parquet) | Apache-2.0 | 88,838 | — | 27.0 MB | 43.2 MB | | OpenLibrary Authors | Internet Archive OpenLibrary — Authors | Bibliographic records for book authors — name variants, birth/death dates, Wikidata cross-references, biographical notes. Joins to `openlibrary-works` via author keys. Sourced from OpenLibrary's monthly data dumps. | https://openlibrary.org/developers/dumps | Custom | CC0-1.0 | 15,177,329 | — | 809.1 MB | 2,061.5 MB | | OpenLibrary Editions | Internet Archive OpenLibrary — Editions | Bibliographic records for individual book editions (ISBN, publisher, language, page count, physical format). Each edition ties back to a `works` row via the work key. Sourced from OpenLibrary's monthly data dumps. | https://openlibrary.org/developers/dumps | Custom | CC0-1.0 | 55,962,700 | — | 12,931.6 MB | 28,298.0 MB | @@ -208,7 +208,7 @@ | OpenPowerlifting | OpenPowerlifting meet results | 3.9M competition-lift records from powerlifting meets worldwide, maintained by openpowerlifting.org. 
One row per lift attempt with lifter, federation, weight class, equipment, and the four scores (squat / bench / deadlift / total). | https://www.openpowerlifting.org/data | Tabular (CSV) | CC0-1.0 | 3,916,281 | — | 104.8 MB | 156.5 MB | | Optical Recognition of Handwritten Digits | UCI ML Repository — Optical Recognition of Handwritten Digits | Two versions of this database available; see folder We used preprocessing programs made available by NIST to extract normalized bitmaps of handwritten digits from a preprinted form. From a total of 43 people, 30 contributed to the training set and different 13 to the test set. 32x32 bitmaps are divided into nonoverlapping blocks of 4x4 and the number of on pixels are counted in each block. This generates an input matrix of 8x8 where each element is an integer in the range 0..16. This reduces dimensionality and gives invariance to small distortions. For info on NIST preprocessing routines, see M. D. Garris, J. L. Blue, G. T. Candela, D. L. | https://archive.ics.uci.edu/dataset/80/optical+recognition+of+handwritten+digits | Tabular (CSV) | CC-BY-4.0 | 5,620 | — | 0.2 MB | 0.2 MB | | OSM Germany Nodes | OpenStreetMap Germany — Nodes | OSM nodes for Germany from the Geofabrik extract — point features (addresses, POIs, traffic signals, etc.) with their geographic coordinates and tag bag. Emitted as GeoParquet 1.1 with WKB geometry. | https://www.openstreetmap.org/copyright | Geo (OSM PBF) | ODbL-1.0 | 432,906,290 | — | 15,335.9 MB | 21,061.1 MB | -| OSM Germany Relations | OpenStreetMap Germany — Relations | OSM relations (composite features) for Germany — multi-polygon land covers, route memberships, administrative boundaries. The richest of the three OSM-Germany slugs in tag complexity. Emitted as GeoParquet 1.1 with WKB geometry. 
| https://www.openstreetmap.org/copyright | Geo (OSM PBF) | ODbL-1.0 | 889,712 | — | 91.2 MB | 148.5 MB | +| OSM Germany Relations | OpenStreetMap Germany — Relations | OSM relations (composite features) for Germany — multi-polygon land covers, route memberships, administrative boundaries. The richest of the three OSM-Germany slugs in tag complexity. Emitted as GeoParquet 1.1 with WKB geometry. | https://www.openstreetmap.org/copyright | Geo (OSM PBF) | ODbL-1.0 | 890,059 | 9 | 91.3 MB | 148.4 MB | | OSM Germany Ways | OpenStreetMap Germany — Ways | OSM ways (linear features) for Germany from the Geofabrik extract — roads, paths, rivers, building outlines, railway lines. Each row carries a tag bag plus the ordered node references. Emitted as GeoParquet 1.1 with WKB LineString / Polygon geometry. | https://www.openstreetmap.org/copyright | Geo (OSM PBF) | ODbL-1.0 | 70,097,667 | 701 | 9,514.0 MB | 14,789.9 MB | | OSMI Mental Health in Tech 2016 | OSMI 2016 Mental Health in Tech Survey | Data on prevalence and attitudes towards mental health among tech workers. OSMI Mental Health in Tech Survey 2016 Currently over 1400 responses, the ongoing 2016 survey aims to measure attitudes towards mental health in the tech workplace, and examine the frequency of mental health disorders among tech workers. How Will This Data Be Used? We are interested in gauging how mental health is viewed within the tech/IT workplace, and the prevalence of certain mental health disorders within the tech industry. — adapted from the dataset's Kaggle description (osmi/mental-health-in-tech-2016). | https://osmhhelp.org/research.html | Tabular (CSV) | CC-BY-SA-4.0 | 1,433 | — | 0.1 MB | 0.3 MB | | OSMI Mental Health in Tech 2017 | OSMI 2017 Mental Health in Tech Survey | Data on prevalence and attitudes towards mental health among tech workers. 
OSMI Mental Health in Tech Survey 2017 The 2017 survey aims to measure attitudes towards mental health in the tech workplace, and examine the frequency of mental health disorders among tech workers. How Will This Data Be Used? We are interested in gauging how mental health is viewed within the tech/IT workplace, and the prevalence of certain mental health disorders within the tech industry. — adapted from the dataset's Kaggle description (osmihelp/osmi-mental-health-in-tech-survey-2017). | https://osmhhelp.org/research.html | Tabular (CSV) | CC-BY-SA-4.0 | 756 | — | 0.2 MB | 0.4 MB | @@ -233,7 +233,7 @@ | Seoul Bike Sharing Demand | UCI ML Repository — Seoul Bike Sharing Demand | The dataset contains count of public bicycles rented per hour in the Seoul Bike Sharing System, with corresponding weather data and holiday information Currently Rental bikes are introduced in many urban cities for the enhancement of mobility comfort. It is important to make the rental bike available and accessible to the public at the right time as it lessens the waiting time. Eventually, providing the city with a stable supply of rental bikes becomes a major concern. The crucial part is the prediction of bike count required at each hour for the stable supply of rental bikes. | https://archive.ics.uci.edu/dataset/560/seoul+bike+sharing+demand | Tabular (CSV) | CC-BY-4.0 | 8,760 | — | 0.1 MB | 0.1 MB | | SF Salaries | SF Salaries | Explore San Francisco city employee salary data. One way to understand how a city government works is by looking at who it employs and how its employees are compensated. This data contains the names, job title, and compensation for San Francisco city employees on an annual basis from 2011 to 2014. [](https://www.kaggle.com/benhamner/d/kaggle/sf-salaries/exploring-the-sf-city-salary-data) Exploration Ideas To help get you started, here are some data exploration ideas: - How have salaries changed over time between different groups of people? 
— adapted from the dataset's Kaggle description (kaggle/sf-salaries). | https://data.sfgov.org/City-Management-and-Ethics/Employee-Compensation/88g8-5mnd | Tabular (CSV) | CC0-1.0 | 1,096,102 | 2 | 59.4 MB | 51.2 MB | | SMS Spam Collection | UCI ML Repository — SMS Spam Collection | The SMS Spam Collection is a public set of SMS labeled messages that have been collected for mobile phone spam research. This corpus has been collected from free or free for research sources at the Internet: -> A collection of 425 SMS spam messages was manually extracted from the Grumbletext Web site. This is a UK forum in which cell phone users make public claims about SMS spam messages, most of them without reporting the very spam message received. The identification of the text of spam messages in the claims is a very hard and time-consuming task, and it involved carefully scanning hundreds of web pages. | https://archive.ics.uci.edu/dataset/228/sms+spam+collection | Tabular (CSV) | CC-BY-4.0 | 5,574 | — | 0.2 MB | 0.3 MB | -| Spambase | UCI ML Repository — Spambase | Classifying Email as Spam or Non-Spam The "spam" concept is diverse: advertisements for products/web sites, make money fast schemes, chain letters, pornography... The classification task for this dataset is to determine whether a given email is spam or not. Our collection of spam e-mails came from our postmaster and individuals who had filed spam. Our collection of non-spam e-mails came from filed work and personal e-mails, and hence the word 'george' and the area code '650' are indicators of non-spam. These are useful when constructing a personalized spam filter. | https://archive.ics.uci.edu/dataset/94/spambase | Tabular (CSV) | CC-BY-4.0 | 4,601 | — | 0.2 MB | 0.4 MB | +| Spambase | UCI ML Repository — Spambase | Classifying Email as Spam or Non-Spam The "spam" concept is diverse: advertisements for products/web sites, make money fast schemes, chain letters, pornography... 
The classification task for this dataset is to determine whether a given email is spam or not. Our collection of spam e-mails came from our postmaster and individuals who had filed spam. Our collection of non-spam e-mails came from filed work and personal e-mails, and hence the word 'george' and the area code '650' are indicators of non-spam. These are useful when constructing a personalized spam filter. | https://archive.ics.uci.edu/dataset/94/spambase | Tabular (CSV) | CC-BY-4.0 | 4,601 | 1 | 0.2 MB | 0.4 MB | | SQuAD v2 | Stanford Question Answering Dataset v2.0 | Stanford Question Answering Dataset v2 — 130k crowdsourced questions about Wikipedia paragraphs, with 50k of them deliberately unanswerable from the given context. Each row pairs a question, the passage it was asked about, and the canonical answer span(s). The v2 release added the unanswerable subset specifically to test whether models know when to abstain — a longstanding QA-eval weakness. | https://huggingface.co/datasets/rajpurkar/squad_v2 | Tabular (Parquet) | CC-BY-SA-4.0 | 142,192 | — | 11.1 MB | 16.5 MB | | Stack Overflow 2018 Developer Survey | Stack Overflow 2018 Developer Survey | Individual responses on the 2018 Developer Survey fielded by Stack Overflow. Each year, we at Stack Overflow ask the developer community about everything from their favorite technologies to their job preferences. This year marks the eighth year we’ve published our Annual Developer Survey results—with the largest number of respondents yet. Over 100,000 developers took the 30-minute survey in January 2018. — adapted from the dataset's Kaggle description (stackoverflow/stack-overflow-2018-developer-survey). | https://insights.stackoverflow.com/survey/2018 | Tabular (CSV) | DbCL-1.0 | 98,855 | — | 6.9 MB | 9.5 MB | | Stack Overflow Badges | Stack Exchange Data Dump — Stack Overflow Badges | Badges earned by Stack Overflow users — badge name, class (gold/silver/bronze), tag-based vs activity-based, awarded timestamp. 
Joins to `stackoverflow-users` via `user_id`. | https://archive.org/details/stackexchange | Structured (XML) | CC-BY-SA-4.0 | 51,289,973 | — | 583.8 MB | 487.6 MB | diff --git a/docs/v1/snapshot.json b/docs/v1/snapshot.json index c87b85c..d8ff359 100644 --- a/docs/v1/snapshot.json +++ b/docs/v1/snapshot.json @@ -1777,19 +1777,31 @@ }, "open-food-facts": { "expected_rows": null, - "last_built_rows": 4466927, - "parquet_bytes": 13537973388, - "vortex_bytes": 38200971468, + "last_built_rows": 4517492, + "last_built_row_groups": 46, + "parquet_bytes": 13679279833, + "vortex_bytes": 38529751820, + "parquet_sha256": "1e7a61b7bebab4a2e68007d468377aa793b37e17c2482deff7487a66967931d7", + "vortex_sha256": "cec166ea889d97baf9172ea0dc694134182901133255c9c2204fefd931c0a090", "columns": [ { "name": "raw_json", "type": "string", - "length": 13537965664, + "length": 13679271988, "null_count": 0, "min": null, "max": null } - ] + ], + "size_bucket": "xl", + "shape_traits": { + "has_nested": false, + "has_timestamp": false, + "has_variant": false, + "string_heavy": true, + "wide_row": false, + "high_cardinality_present": null + } }, "nyc-311": { "expected_rows": null, @@ -30080,9 +30092,9 @@ "last_built_rows": 150, "last_built_row_groups": 1, "parquet_bytes": 2732, - "vortex_bytes": 10056, + "vortex_bytes": 10024, "parquet_sha256": "72ed0dd70848acef0c2c94ad58385144a50f21484fa012c18b15dfa53ed25538", - "vortex_sha256": "4d3e683f6d911a2d3876e28cda03e22d029b051427ef3cfd65e3e367de4d5300", + "vortex_sha256": "e5e82d2e4672917a61fea0a686ed57c6026008c0cddaa54d546acc50bc7e7076", "columns": [ { "name": "sepal_length",