
Commit a590896

SaulLu, thomasw21, and pre-commit-ci[bot] authored
Apply cleaning: remove deduplicated exact lines (#408)
* update `pseudo_crawl_seed_to_lm_dset`
* temporary hack to launch
* slurm script
* modify yacine script
* update slurm script
* duplicate script
* update new script
* Nit
* Count lines across documents instead of within
* Minimum of characters per line
* Export skipped lines in another file
* Update slurm script
* Remove none text
* Woops
* Fit Thomas setup
* Woops
* Woops
* Woops
* Woops
* Woops
* Have a save batch size
* Deactivate old code
* Woops added more cores
* Woops added more cores
* Debug why np.choice fails
* I'm stupid
* I'm stupid
* I'm stupid
* I'm stupid
* Remove columns as the dataset changes size
* Woops
* Lower memory footprint
* Got OOM
* Add simple log
* Change strategy to move folders after
* Woops
* Fix caching
* Try lowering memory footprint by 20
* Remove columns before map to reduce memory footprint
* Reduce the number of processes
* Ask for maximum time
* Use list comprehention to make it faster
* Revert "Use list comprehention to make it faster" (this reverts commit 5ccc341)
* Hopefully this helps
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci

Co-authored-by: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent b4be13f commit a590896

4 files changed: 482 additions & 25 deletions


cc_pseudo_crawl/python_scripts/load_all_seed_ids.py

Lines changed: 9 additions & 8 deletions
@@ -1,13 +1,12 @@
 import csv
 from argparse import ArgumentParser
-from pathlib import Path


 def get_args():
     parser = ArgumentParser()
     parser.add_argument(
-        "--seed-path",
-        type=str,
+        "--seed-paths",
+        type=lambda x: x.split(","),
         required=True,
         help="Seed full path. e.g. 'xxx/seeds.csv'",
     )
@@ -20,11 +19,13 @@ def get_args():
 def main():
     args = get_args()

-    with open(args.seed_path, "r") as fi:
-        data = csv.reader(fi)
-        # First line is all the headers that we remove.
-        seed_ids = [row[0] for row_id, row in enumerate(data) if row_id > 0]
-    print(seed_ids[args.seed_index])
+    seed_ids = []
+    for seed_path in args.seed_paths:
+        with open(seed_path, "r") as fi:
+            data = csv.reader(fi)
+            # First line is all the headers that we remove.
+            seed_ids += [row[0] for row_id, row in enumerate(data) if row_id > 0]
+    print(seed_ids[args.seed_index])


 if __name__ == "__main__":
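For reference, a minimal sketch of the comma-separated list idiom the new `--seed-paths` option uses; the file names here are invented for illustration:

```python
from argparse import ArgumentParser

parser = ArgumentParser()
# argparse applies `type` to the raw string, so splitting on commas
# turns "a.csv,b.csv" into ["a.csv", "b.csv"].
parser.add_argument("--seed-paths", type=lambda x: x.split(","), required=True)

# Invented file names, for illustration only.
args = parser.parse_args(["--seed-paths", "seeds_a.csv,seeds_b.csv"])
print(args.seed_paths)  # ['seeds_a.csv', 'seeds_b.csv']
```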

cc_pseudo_crawl/pseudo_crawl_seed_to_lm_dset.py renamed to cc_pseudo_crawl/python_scripts/pseudo_crawl_seed_to_lm_dset.py

Lines changed: 57 additions & 17 deletions
@@ -1,16 +1,23 @@
+import os
 import argparse
+import logging
 import gzip
 import json

 import datasets
+import pandas as pd
 from datasets import Features, load_dataset
 from huggingface_hub import HfApi
 from tqdm import tqdm
+from datasets.utils.logging import set_verbosity_info

 ###
 # features of the pseudocrawl seeds
 ###

+set_verbosity_info()
+logger = logging.getLogger(__name__)
+
 _PATH_TO_PSEUDO_CRAWL = "pseudo_crawl"

 null = None
@@ -123,8 +130,20 @@ def get_lines_to_skip(dset):


 # create a private repository and push processed seed in jsonl format
-def make_seed_jsonl(dset, language, name, skip_lines_dict, min_chars=32, gzipped=False):
-    repo_name = f"lm_{language}_pseudocrawl_{name}"
+def make_seed_jsonl(
+    dset,
+    language,
+    name,
+    skip_lines_dict,
+    seed_id,
+    min_chars=32,
+    gzipped=False,
+    save_dir=None,
+):
+    repo_name = f"lm_{language}_seed_id_{seed_id}_pseudocrawl_{name}"
+    if save_dir is not None:
+        repo_name = os.path.join(save_dir, repo_name)
+    logger.info(f"the dataset will be saved at {repo_name}")
     # process and write to file
     if gzipped:
         file_name = f"{repo_name}.jsonl.gz"
@@ -137,7 +156,9 @@ def make_seed_jsonl(dset, language, name, skip_lines_dict, min_chars=32, gzipped
         processed_dct = process_page(article, skip_lines_dict)
         txt = processed_dct["text"].strip().lower()
         if len(processed_dct["text"]) > min_chars and txt not in duplicated:
-            _ = f.write((json.dumps(processed_dct) + "\n").encode("utf-8"))
+            _ = f.write(
+                (json.dumps(processed_dct) + "\n").encode("utf-8").decode("utf-8")
+            )
     f.close()
     return file_name, repo_name

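A self-contained sketch of the filter-and-write loop this hunk adjusts, with invented `records` data and threshold; note that `gzip.open` in text mode ("wt") accepts str directly, which is one way to sidestep manual encode/decode round-trips:

```python
import gzip
import json

# Invented records; the real script builds these from processed pages.
records = [{"text": "too short"}, {"text": "a sufficiently long example"}]
min_chars = 10
seen = set()

# gzip.open in text mode ("wt") accepts str, so JSON lines can be
# written without manually encoding to bytes first.
with gzip.open("example.jsonl.gz", "wt", encoding="utf-8") as f:
    for record in records:
        key = record["text"].strip().lower()
        # keep only long-enough texts that have not been seen yet
        if len(record["text"]) > min_chars and key not in seen:
            seen.add(key)
            f.write(json.dumps(record) + "\n")
```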
@@ -160,31 +181,45 @@ def push_jsonl_to_hub(file_name, repo_name, token):
     return file_loc


+def get_dataset_name_and_lang_id_from_seed_id(seed_id, seed_id_info_path):
+    df = pd.read_csv(seed_id_info_path)
+    sub_df = df[df["id"] == seed_id]
+    if len(sub_df) != 1:
+        raise ValueError("You should have only one match per seed id")
+    name = sub_df.name[0]
+    lang_id = sub_df.lang_id[0]
+    return name, lang_id
+
+
+def get_dataset_name_and_lang_id_from_seed_id_fake(seed_id, seed_id_info_path):
+    return "change_name", "change_lang_id"
+
+
 ###
 # combine everything
 ###
 def main():
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
     parser = argparse.ArgumentParser(description="Load seed and upload to hub")
     parser.add_argument(
         "-sid",
-        "--seed_id",
+        "--seed-id",
         help="seed ID",
         required=True,
         type=int,
     )
     parser.add_argument(
-        "-ln",
-        "--language_code",
-        help="language code used on the repo",
-        required=True,
+        "--seed-id-info-path",
+        help="The path to a csv containing the seed id and the corresponding lang-id and name",
+        # required=True,
         type=str,
     )
     parser.add_argument(
-        "-n",
-        "--name",
-        help="name of the website",
-        required=True,
-        type=str,
+        "--save-dir", required=True, type=str, help="Where to save the datasets."
     )
     parser.add_argument(
         "-pc_path",
@@ -196,7 +231,7 @@ def main():
     parser.add_argument(
         "-gz",
         "--gzipped",
-        help="Write file directly in jsonl.gz compresed format",
+        help="Write file directly in jsonl.gz compressed format",
         action="store_true",
     )
     parser.add_argument(
@@ -223,16 +258,21 @@
             f"{args.pseudo_crawl_path}/seed_id={args.seed_id}/text__html/*.jsonl.gz"
         ],
         features=final_features,
-        cache_dir=f"cache_seed_{args.seed_id}",
+    )
+
+    name, language_code = get_dataset_name_and_lang_id_from_seed_id_fake(
+        args.seed_id, args.seed_id_info_path
     )
     skip_lines_dict = get_lines_to_skip(dset)
     file_name, repo_name = make_seed_jsonl(
         dset,
-        language=args.language_code,
-        name=args.name,
+        language=language_code,
+        name=name,
+        seed_id=args.seed_id,
         skip_lines_dict=skip_lines_dict,
         min_chars=128,  # only keep examples with at least 128 characters
         gzipped=args.gzipped,
+        save_dir=args.save_dir,
     )
     if args.push_to_hub:
         push_jsonl_to_hub(file_name, repo_name, args.token)
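The seed-id lookup added in this commit reads a CSV mapping ids to names and language ids; note that `main` still calls the `_fake` variant, so the lookup is wired in but returns placeholders for now. Below is a hedged sketch of the same lookup with invented data, using position-based `iloc[0]` so the matched row's index label does not matter:

```python
import io

import pandas as pd

# Invented example data; the real seed-id info CSV lives elsewhere.
csv_text = """id,name,lang_id
12,example_site,en
13,autre_site,fr
"""

def lookup_name_and_lang_id(seed_id, csv_file):
    df = pd.read_csv(csv_file)
    sub_df = df[df["id"] == seed_id]
    if len(sub_df) != 1:
        raise ValueError("You should have only one match per seed id")
    # iloc[0] is position-based, so it works whatever the row's index label is.
    return sub_df["name"].iloc[0], sub_df["lang_id"].iloc[0]

print(lookup_name_and_lang_id(13, io.StringIO(csv_text)))  # ('autre_site', 'fr')
```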
