diff --git a/deduplication/duplicate_detector.py b/deduplication/duplicate_detector.py index b3ffdb0..6332cd7 100644 --- a/deduplication/duplicate_detector.py +++ b/deduplication/duplicate_detector.py @@ -4,6 +4,7 @@ from SetSimilaritySearch import all_pairs import numpy as np import tqdm from typing import List, Set, Tuple +import joblib class DocumentID: @@ -70,14 +71,20 @@ class DuplicateDetector: similarity_func_name="jaccard", similarity_threshold=self.set_similarity_threshold, ) - for index_1, index_2, _ in tqdm.tqdm( - similar_pairs, desc="computing duplicates..." - ): - if ( - self.get_multiset_jaccard_similarity(index_1, index_2) - >= self.multiset_similarity_threshold - ): + + def worker(index_1, index_2): + similarity = self.get_multiset_jaccard_similarity(index_1, index_2) + if similarity > self.multiset_similarity_threshold: yield index_1, index_2 + else: + yield None, None + + parallel_pool = joblib.Parallel(n_jobs=joblib.cpu_count(), verbose=100)( + joblib.delayed(worker)(index_1, index_2) + for index_1, index_2, _ in similar_pairs + ) + + yield from parallel_pool def get_multiset_jaccard_similarity(self, index_1: int, index_2: int) -> float: """Calculate the multiset Jaccard similarity between two documents.""" @@ -95,12 +102,12 @@ class DuplicateDetector: # stores duplicate clusters, list of set of DocumentID duplicate_clusters = [] - # get the duplicate pairs + # get pairwise relationships from duplicate pairs pairwise_relationships = collections.defaultdict(list) for index_1, index_2 in self.get_duplicate_pairs(): - assert index_1 != index_2 - pairwise_relationships[index_1].append(index_2) - pairwise_relationships[index_2].append(index_1) + if index_1 is not None and index_2 is not None: + pairwise_relationships[index_1].append(index_2) + pairwise_relationships[index_2].append(index_1) # set of which documents have duplicates documents_with_duplicates = set(pairwise_relationships.keys())