update code with joblib

2024-10-26 09:17:45 +03:00 · 2021-07-14 12:56:29 +01:00 · 2021-07-14 12:56:29 +01:00 · 5aee2c9197
commit 5aee2c9197
parent 47bb3895bc
1 changed files with 18 additions and 11 deletions
--- a/deduplication/duplicate_detector.py
+++ b/deduplication/duplicate_detector.py
@ -4,6 +4,7 @@ from SetSimilaritySearch import all_pairs
 import numpy as np
 import tqdm
 from typing import List, Set, Tuple
 import joblib
 class DocumentID:
@ -70,14 +71,20 @@ class DuplicateDetector:
            similarity_func_name="jaccard",
            similarity_threshold=self.set_similarity_threshold,
        )
-        for index_1, index_2, _ in tqdm.tqdm(
+
-            similar_pairs, desc="computing duplicates..."
+        def worker(index_1, index_2):
-        ):
+            similarity = self.get_multiset_jaccard_similarity(index_1, index_2)
-            if (
+            if similarity > self.multiset_similarity_threshold:
                self.get_multiset_jaccard_similarity(index_1, index_2)
                >= self.multiset_similarity_threshold
            ):
                yield index_1, index_2
            else:
                yield None, None
        parallel_pool = joblib.Parallel(n_jobs=joblib.cpu_count(), verbose=100)(
            joblib.delayed(worker)(index_1, index_2)
            for index_1, index_2, _ in similar_pairs
        )
        yield from parallel_pool
    def get_multiset_jaccard_similarity(self, index_1: int, index_2: int) -> float:
        """Calculate the multiset Jaccard similarity between two documents."""
@ -95,12 +102,12 @@ class DuplicateDetector:
        # stores duplicate clusters, list of set of DocumentID
        duplicate_clusters = []
-        # get the duplicate pairs
+        # get pairwise relationships from duplicate pairs
        pairwise_relationships = collections.defaultdict(list)
        for index_1, index_2 in self.get_duplicate_pairs():
-            assert index_1 != index_2
+            if index_1 is not None and index_2 is not None:
-            pairwise_relationships[index_1].append(index_2)
+                pairwise_relationships[index_1].append(index_2)
-            pairwise_relationships[index_2].append(index_1)
+                pairwise_relationships[index_2].append(index_1)
        # set of which documents have duplicates
        documents_with_duplicates = set(pairwise_relationships.keys())