
Commit a590896

SaulLu, thomasw21, and pre-commit-ci[bot] authored
Apply cleaning: remove deduplicated exact lines (#408)
* update `pseudo_crawl_seed_to_lm_dset`
* temporary hack to launch
* slurm script
* modify yacine script
* update slurm script
* duplicate script
* update new script
* Nit
* Count lines across documents instead of within
* Minimum of characters per line
* Export skipped lines in another file
* Update slurm script
* Remove none text
* Woops
* Fit Thomas setup
* Woops
* Woops
* Woops
* Woops
* Woops
* Have a save batch size
* Deactivate old code
* Woops added more cores
* Woops added more cores
* Debug why np.choice fails
* I'm stupid
* I'm stupid
* I'm stupid
* I'm stupid
* Remove columns as the dataset changes size
* Woops
* Lower memory footprint
* Got OOM
* Add simple log
* Change strategy to move folders after
* Woops
* Fix caching
* Try lowering memory footprint by 20
* Remove columns before map to reduce memory footprint
* Reduce the number of processes
* Ask for maximum time
* Use list comprehention to make it faster
* Revert "Use list comprehention to make it faster" (this reverts commit 5ccc341)
* Hopefully this helps
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci

Co-authored-by: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent b4be13f commit a590896

4 files changed: 482 additions & 25 deletions


cc_pseudo_crawl/python_scripts/load_all_seed_ids.py

Lines changed: 9 additions & 8 deletions
@@ -1,13 +1,12 @@
 import csv
 from argparse import ArgumentParser
-from pathlib import Path


 def get_args():
     parser = ArgumentParser()
     parser.add_argument(
-        "--seed-path",
-        type=str,
+        "--seed-paths",
+        type=lambda x: x.split(","),
         required=True,
         help="Seed full path. e.g. 'xxx/seeds.csv'",
     )
@@ -20,11 +19,13 @@ def get_args():
 def main():
     args = get_args()

-    with open(args.seed_path, "r") as fi:
-        data = csv.reader(fi)
-        # First line is all the headers that we remove.
-        seed_ids = [row[0] for row_id, row in enumerate(data) if row_id > 0]
-    print(seed_ids[args.seed_index])
+    seed_ids = []
+    for seed_path in args.seed_paths:
+        with open(seed_path, "r") as fi:
+            data = csv.reader(fi)
+            # First line is all the headers that we remove.
+            seed_ids += [row[0] for row_id, row in enumerate(data) if row_id > 0]
+    print(seed_ids[args.seed_index])


 if __name__ == "__main__":
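For reference, a minimal sketch of the comma-separated list idiom the new `--seed-paths` option uses; the file names here are invented for illustration:

```python
from argparse import ArgumentParser

parser = ArgumentParser()
# argparse applies `type` to the raw string, so splitting on commas
# turns "a.csv,b.csv" into ["a.csv", "b.csv"].
parser.add_argument("--seed-paths", type=lambda x: x.split(","), required=True)

# Invented file names, for illustration only.
args = parser.parse_args(["--seed-paths", "seeds_a.csv,seeds_b.csv"])
print(args.seed_paths)  # ['seeds_a.csv', 'seeds_b.csv']
```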

cc_pseudo_crawl/pseudo_crawl_seed_to_lm_dset.py renamed to cc_pseudo_crawl/python_scripts/pseudo_crawl_seed_to_lm_dset.py

Lines changed: 57 additions & 17 deletions
@@ -1,16 +1,23 @@
+import os
 import argparse
+import logging
 import gzip
 import json

 import datasets
+import pandas as pd
 from datasets import Features, load_dataset
 from huggingface_hub import HfApi
 from tqdm import tqdm
+from datasets.utils.logging import set_verbosity_info

 ###
 # features of the pseudocrawl seeds
 ###

+set_verbosity_info()
+logger = logging.getLogger(__name__)
+
 _PATH_TO_PSEUDO_CRAWL = "pseudo_crawl"

 null = None
@@ -123,8 +130,20 @@ def get_lines_to_skip(dset):


 # create a private repository and push processed seed in jsonl format
-def make_seed_jsonl(dset, language, name, skip_lines_dict, min_chars=32, gzipped=False):
-    repo_name = f"lm_{language}_pseudocrawl_{name}"
+def make_seed_jsonl(
+    dset,
+    language,
+    name,
+    skip_lines_dict,
+    seed_id,
+    min_chars=32,
+    gzipped=False,
+    save_dir=None,
+):
+    repo_name = f"lm_{language}_seed_id_{seed_id}_pseudocrawl_{name}"
+    if save_dir is not None:
+        repo_name = os.path.join(save_dir, repo_name)
+    logger.info(f"the dataset will be saved at {repo_name}")
     # process and write to file
     if gzipped:
         file_name = f"{repo_name}.jsonl.gz"
@@ -137,7 +156,9 @@ def make_seed_jsonl(dset, language, name, skip_lines_dict, min_chars=32, gzipped
         processed_dct = process_page(article, skip_lines_dict)
         txt = processed_dct["text"].strip().lower()
         if len(processed_dct["text"]) > min_chars and txt not in duplicated:
-            _ = f.write((json.dumps(processed_dct) + "\n").encode("utf-8"))
+            _ = f.write(
+                (json.dumps(processed_dct) + "\n").encode("utf-8").decode("utf-8")
+            )
     f.close()
     return file_name, repo_name

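A self-contained sketch of the filter-and-write loop this hunk adjusts, with invented `records` data and threshold; note that `gzip.open` in text mode ("wt") accepts str directly, which is one way to sidestep manual encode/decode round-trips:

```python
import gzip
import json

# Invented records; the real script builds these from processed pages.
records = [{"text": "too short"}, {"text": "a sufficiently long example"}]
min_chars = 10
seen = set()

# gzip.open in text mode ("wt") accepts str, so JSON lines can be
# written without manually encoding to bytes first.
with gzip.open("example.jsonl.gz", "wt", encoding="utf-8") as f:
    for record in records:
        key = record["text"].strip().lower()
        # keep only long-enough texts that have not been seen yet
        if len(record["text"]) > min_chars and key not in seen:
            seen.add(key)
            f.write(json.dumps(record) + "\n")
```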
@@ -160,31 +181,45 @@ def push_jsonl_to_hub(file_name, repo_name, token):
     return file_loc


+def get_dataset_name_and_lang_id_from_seed_id(seed_id, seed_id_info_path):
+    df = pd.read_csv(seed_id_info_path)
+    sub_df = df[df["id"] == seed_id]
+    if len(sub_df) != 1:
+        raise ValueError("You should have only one match per seed id")
+    name = sub_df.name[0]
+    lang_id = sub_df.lang_id[0]
+    return name, lang_id
+
+
+def get_dataset_name_and_lang_id_from_seed_id_fake(seed_id, seed_id_info_path):
+    return "change_name", "change_lang_id"
+
+
 ###
 # combine everything
 ###
 def main():
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
     parser = argparse.ArgumentParser(description="Load seed and upload to hub")
     parser.add_argument(
         "-sid",
-        "--seed_id",
+        "--seed-id",
         help="seed ID",
         required=True,
         type=int,
     )
     parser.add_argument(
-        "-ln",
-        "--language_code",
-        help="language code used on the repo",
-        required=True,
+        "--seed-id-info-path",
+        help="The path to a csv containing the seed id and the corresponding lang-id and name",
+        # required=True,
         type=str,
     )
     parser.add_argument(
-        "-n",
-        "--name",
-        help="name of the website",
-        required=True,
-        type=str,
+        "--save-dir", required=True, type=str, help="Where to save the datasets."
     )
     parser.add_argument(
         "-pc_path",
@@ -196,7 +231,7 @@ def main():
     parser.add_argument(
         "-gz",
         "--gzipped",
-        help="Write file directly in jsonl.gz compresed format",
+        help="Write file directly in jsonl.gz compressed format",
         action="store_true",
     )
     parser.add_argument(
@@ -223,16 +258,21 @@
             f"{args.pseudo_crawl_path}/seed_id={args.seed_id}/text__html/*.jsonl.gz"
         ],
         features=final_features,
-        cache_dir=f"cache_seed_{args.seed_id}",
+    )
+
+    name, language_code = get_dataset_name_and_lang_id_from_seed_id_fake(
+        args.seed_id, args.seed_id_info_path
     )
     skip_lines_dict = get_lines_to_skip(dset)
     file_name, repo_name = make_seed_jsonl(
         dset,
-        language=args.language_code,
-        name=args.name,
+        language=language_code,
+        name=name,
+        seed_id=args.seed_id,
         skip_lines_dict=skip_lines_dict,
         min_chars=128,  # only keep examples with at least 128 characters
         gzipped=args.gzipped,
+        save_dir=args.save_dir,
     )
     if args.push_to_hub:
         push_jsonl_to_hub(file_name, repo_name, args.token)
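The seed-id lookup added in this commit reads a CSV mapping ids to names and language ids; note that `main` still calls the `_fake` variant, so the lookup is wired in but returns placeholders for now. Below is a hedged sketch of the same lookup with invented data, using position-based `iloc[0]` so the matched row's index label does not matter:

```python
import io

import pandas as pd

# Invented example data; the real seed-id info CSV lives elsewhere.
csv_text = """id,name,lang_id
12,example_site,en
13,autre_site,fr
"""

def lookup_name_and_lang_id(seed_id, csv_file):
    df = pd.read_csv(csv_file)
    sub_df = df[df["id"] == seed_id]
    if len(sub_df) != 1:
        raise ValueError("You should have only one match per seed id")
    # iloc[0] is position-based, so it works whatever the row's index label is.
    return sub_df["name"].iloc[0], sub_df["lang_id"].iloc[0]

print(lookup_name_and_lang_id(13, io.StringIO(csv_text)))  # ('autre_site', 'fr')
```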
