mirror of
https://github.com/CodedotAl/gpt-code-clippy.git
synced 2024-10-26 09:17:45 +03:00
update code with joblib
This commit is contained in:
parent
47bb3895bc
commit
5aee2c9197
@ -4,6 +4,7 @@ from SetSimilaritySearch import all_pairs
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import tqdm
|
import tqdm
|
||||||
from typing import List, Set, Tuple
|
from typing import List, Set, Tuple
|
||||||
|
import joblib
|
||||||
|
|
||||||
|
|
||||||
class DocumentID:
|
class DocumentID:
|
||||||
@ -70,14 +71,20 @@ class DuplicateDetector:
|
|||||||
similarity_func_name="jaccard",
|
similarity_func_name="jaccard",
|
||||||
similarity_threshold=self.set_similarity_threshold,
|
similarity_threshold=self.set_similarity_threshold,
|
||||||
)
|
)
|
||||||
for index_1, index_2, _ in tqdm.tqdm(
|
|
||||||
similar_pairs, desc="computing duplicates..."
|
def worker(index_1, index_2):
|
||||||
):
|
similarity = self.get_multiset_jaccard_similarity(index_1, index_2)
|
||||||
if (
|
if similarity > self.multiset_similarity_threshold:
|
||||||
self.get_multiset_jaccard_similarity(index_1, index_2)
|
|
||||||
>= self.multiset_similarity_threshold
|
|
||||||
):
|
|
||||||
yield index_1, index_2
|
yield index_1, index_2
|
||||||
|
else:
|
||||||
|
yield None, None
|
||||||
|
|
||||||
|
parallel_pool = joblib.Parallel(n_jobs=joblib.cpu_count(), verbose=100)(
|
||||||
|
joblib.delayed(worker)(index_1, index_2)
|
||||||
|
for index_1, index_2, _ in similar_pairs
|
||||||
|
)
|
||||||
|
|
||||||
|
yield from parallel_pool
|
||||||
|
|
||||||
def get_multiset_jaccard_similarity(self, index_1: int, index_2: int) -> float:
|
def get_multiset_jaccard_similarity(self, index_1: int, index_2: int) -> float:
|
||||||
"""Calculate the multiset Jaccard similarity between two documents."""
|
"""Calculate the multiset Jaccard similarity between two documents."""
|
||||||
@ -95,12 +102,12 @@ class DuplicateDetector:
|
|||||||
# stores duplicate clusters, list of set of DocumentID
|
# stores duplicate clusters, list of set of DocumentID
|
||||||
duplicate_clusters = []
|
duplicate_clusters = []
|
||||||
|
|
||||||
# get the duplicate pairs
|
# get pairwise relationships from duplicate pairs
|
||||||
pairwise_relationships = collections.defaultdict(list)
|
pairwise_relationships = collections.defaultdict(list)
|
||||||
for index_1, index_2 in self.get_duplicate_pairs():
|
for index_1, index_2 in self.get_duplicate_pairs():
|
||||||
assert index_1 != index_2
|
if index_1 is not None and index_2 is not None:
|
||||||
pairwise_relationships[index_1].append(index_2)
|
pairwise_relationships[index_1].append(index_2)
|
||||||
pairwise_relationships[index_2].append(index_1)
|
pairwise_relationships[index_2].append(index_1)
|
||||||
|
|
||||||
# set of which documents have duplicates
|
# set of which documents have duplicates
|
||||||
documents_with_duplicates = set(pairwise_relationships.keys())
|
documents_with_duplicates = set(pairwise_relationships.keys())
|
||||||
|
Loading…
Reference in New Issue
Block a user