🚑 Use multithreading instead of multiprocessing for the container running in ECS

2024-09-11 13:15:41 +03:00 · 2023-07-05 18:06:53 +02:00 · 2023-07-05 18:06:53 +02:00 · f799d1afd6
commit f799d1afd6
parent 22e8189057
1 changed file with 10 additions and 17 deletions
--- a/backend/utils/vectors.py
+++ b/backend/utils/vectors.py
@@ -1,4 +1,4 @@
-import multiprocessing as mp
+from concurrent.futures import ThreadPoolExecutor
 from typing import List

 from langchain.embeddings.openai import OpenAIEmbeddings
@@ -100,26 +100,19 @@ def get_unique_files_from_vector_ids(vectors_ids: List[int]):
    """
    print("vectors_ids", vectors_ids)

-    manager = mp.Manager()
-    vectors_responses = manager.list()
-
    # constants
    BATCH_SIZE = 5

-    # if __name__ == '__main__':
-    # multiprocessing pool initialization
-
-    pool = mp.Pool()
-    results = []
-    for i in range(0, len(vectors_ids), BATCH_SIZE):
-        batch_ids = vectors_ids[i:i + BATCH_SIZE]
-        result = pool.apply_async(process_batch, args=(batch_ids,), error_callback=error_callback)
-        results.append(result)
-    # Retrieve the results
-    vectors_responses = [result.get() for result in results]
-    pool.close()
-    pool.join()
+    with ThreadPoolExecutor() as executor:
+        futures = []
+        for i in range(0, len(vectors_ids), BATCH_SIZE):
+            batch_ids = vectors_ids[i:i + BATCH_SIZE]
+            future = executor.submit(process_batch, batch_ids)
+            futures.append(future)

+        # Retrieve the results
+        vectors_responses = [future.result() for future in futures]
+   
    documents = [item for sublist in vectors_responses for item in sublist]
    print('document', documents)
    unique_files = [dict(t) for t in set(tuple(d.items()) for d in documents)]