🚑 use multithreading instead of multiprocessing for container in ECS (#525)

Authored by Zineb El Bachiri on 2023-07-05 18:15:18 +02:00, committed by GitHub
parent 22e8189057
commit 0edc4f628c


@@ -1,4 +1,4 @@
-import multiprocessing as mp
+from concurrent.futures import ThreadPoolExecutor
 from typing import List
 from langchain.embeddings.openai import OpenAIEmbeddings
@@ -100,25 +100,18 @@ def get_unique_files_from_vector_ids(vectors_ids: List[int]):
     """
     print("vectors_ids", vectors_ids)
-    manager = mp.Manager()
-    vectors_responses = manager.list()
     # constants
     BATCH_SIZE = 5
-    # if __name__ == '__main__':
-    # multiprocessing pool initialization
-    pool = mp.Pool()
-    results = []
-    for i in range(0, len(vectors_ids), BATCH_SIZE):
-        batch_ids = vectors_ids[i:i + BATCH_SIZE]
-        result = pool.apply_async(process_batch, args=(batch_ids,), error_callback=error_callback)
-        results.append(result)
-    # Retrieve the results
-    vectors_responses = [result.get() for result in results]
-    pool.close()
-    pool.join()
+    with ThreadPoolExecutor() as executor:
+        futures = []
+        for i in range(0, len(vectors_ids), BATCH_SIZE):
+            batch_ids = vectors_ids[i:i + BATCH_SIZE]
+            future = executor.submit(process_batch, batch_ids)
+            futures.append(future)
+        # Retrieve the results
+        vectors_responses = [future.result() for future in futures]
     documents = [item for sublist in vectors_responses for item in sublist]
     print('document', documents)
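
For reference, below is a minimal standalone sketch of the thread-pool batching pattern this commit introduces. The process_batch stub and the get_documents wrapper are placeholders invented for illustration; in the actual module, process_batch is defined elsewhere and is assumed to return a list of results per batch of vector IDs, which is what the flattening step at the end relies on.

from concurrent.futures import ThreadPoolExecutor
from typing import List

BATCH_SIZE = 5

def process_batch(batch_ids: List[int]) -> List[str]:
    # Placeholder stand-in for the real lookup: returns one result per vector ID.
    return [f"vector-{vector_id}" for vector_id in batch_ids]

def get_documents(vectors_ids: List[int]) -> List[str]:
    # Submit each fixed-size batch of IDs to the thread pool.
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(process_batch, vectors_ids[i:i + BATCH_SIZE])
            for i in range(0, len(vectors_ids), BATCH_SIZE)
        ]
        # Collect results in submission order; future.result() re-raises any
        # exception raised inside a worker thread.
        vectors_responses = [future.result() for future in futures]
    # Flatten the per-batch lists into a single list, as in the diff above.
    return [item for sublist in vectors_responses for item in sublist]

if __name__ == "__main__":
    print(get_documents(list(range(1, 12))))

Unlike the removed multiprocessing version, no Manager or explicit pool.close()/pool.join() is needed: the with block shuts the executor down once all futures complete. Threads also share memory and avoid the pickling and shared-memory requirements of mp.Pool, a plausible reason the multiprocessing version was problematic inside an ECS container.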