"""Streaming deduplicator.

Reads a dataset split as a stream (so the corpus never has to fit in
memory), keeps only the first occurrence of each document — keyed on its
word tokens, so whitespace/punctuation differences do not defeat duplicate
detection — and writes the survivors to an lm_dataformat archive.

Usage:
    python deduplication_streaming.py --data_dir DIR --output_dir DIR \
        [--archive_commit_freq N]
"""

import argparse
import hashlib
import re

import datasets
import lm_dataformat
import tqdm

# Compiled once at module level so the hot per-example loop does not pay
# the re-module pattern-cache lookup on every call.
_NON_WORD = re.compile(r"\W+")


def get_variables(example):
    """Return the normalized token string used as the deduplication key.

    Splits ``example["text"]`` on runs of non-word characters and rejoins
    the pieces with single spaces.
    """
    return " ".join(_NON_WORD.split(example["text"]))


def _dedup_key(example):
    """Fixed-size SHA-1 digest of the token string.

    Storing a 20-byte digest per document keeps the seen-set small and
    bounded, instead of retaining every document's full token string in
    memory (the original approach's main memory/speed problem).
    """
    return hashlib.sha1(get_variables(example).encode("utf-8")).digest()


def main():
    """Parse CLI args, stream the dataset, and archive unique documents."""
    parser = argparse.ArgumentParser(description="Deduplicate a list of files")
    parser.add_argument("--data_dir", type=str, required=True)
    parser.add_argument("--output_dir", type=str, required=True)
    parser.add_argument("--archive_commit_freq", type=int, default=10_000)
    args = parser.parse_args()

    # streaming=True yields examples one at a time instead of materializing
    # the whole split.
    dataset = datasets.load_dataset(
        "script.py", data_dir=args.data_dir, split="train", streaming=True
    )

    seen = set()
    ar = lm_dataformat.Archive(args.output_dir)
    n_unique = 0

    for example in tqdm.tqdm(dataset):
        key = _dedup_key(example)
        if key in seen:
            continue  # duplicate: skip
        seen.add(key)

        code = example["text"]
        del example["text"]  # the remaining fields travel as metadata
        ar.add_data(code, meta=example)

        n_unique += 1
        # Periodic commit bounds the amount of buffered, un-flushed data.
        if n_unique % args.archive_commit_freq == 0:
            ar.commit()

    ar.commit()  # flush the final partial batch
    print(n_unique)


if __name__ == "__main__":
    main()