mirror of
https://github.com/QuivrHQ/quivr.git
synced 2024-10-26 15:18:16 +03:00
feat: Normalize file names in sync module (#2661)
This commit adds a new utility function, `remove_special_characters`, in the `normalize.py` module of the `sync/utils` directory. The function decomposes the input string with Unicode NFD normalization and then uses a regular expression to remove every character that is not a word character (letter, digit, or underscore), whitespace, or a period; this strips accents and other diacritics along with punctuation and symbols. The function is then used in the `list_files.py` module in the `sync/utils` directory to normalize the names of files retrieved from Google Drive and Azure Drive. This ensures that the file names are free of special characters, improving consistency and compatibility with other parts of the system. Co-authored-by: Stan Girard <stan@quivr.app>
This commit is contained in:
parent
a04ceea899
commit
8e5af2c8c5
@ -7,6 +7,7 @@ from fastapi import HTTPException
|
||||
from google.auth.transport.requests import Request as GoogleRequest
|
||||
from google.oauth2.credentials import Credentials
|
||||
from googleapiclient.discovery import build
|
||||
from modules.sync.utils.normalize import remove_special_characters
|
||||
from logger import get_logger
|
||||
from requests import HTTPError
|
||||
|
||||
@ -53,6 +54,8 @@ def get_google_drive_files_by_id(credentials: dict, file_ids: List[str]):
|
||||
)
|
||||
|
||||
logger.info("Google Drive files retrieved successfully: %s", len(files))
|
||||
for file in files:
|
||||
file["name"] = remove_special_characters(file["name"])
|
||||
return files
|
||||
except HTTPError as error:
|
||||
logger.error("An error occurred while retrieving Google Drive files: %s", error)
|
||||
@ -138,6 +141,9 @@ def get_google_drive_files(
|
||||
break
|
||||
|
||||
logger.info("Google Drive files retrieved successfully: %s", len(files))
|
||||
|
||||
for file in files:
|
||||
file["name"] = remove_special_characters(file["name"])
|
||||
return files
|
||||
except HTTPError as error:
|
||||
logger.error("An error occurred while retrieving Google Drive files: %s", error)
|
||||
@ -225,7 +231,8 @@ def list_azure_files(credentials, folder_id=None, recursive=False):
|
||||
)
|
||||
|
||||
files.extend(folder_files)
|
||||
|
||||
for file in files:
|
||||
file["name"] = remove_special_characters(file["name"])
|
||||
logger.info("Azure Drive files retrieved successfully: %s", len(files))
|
||||
return files
|
||||
|
||||
@ -271,5 +278,7 @@ def get_azure_files_by_id(credentials: dict, file_ids: List[str]):
|
||||
}
|
||||
)
|
||||
|
||||
for file in files:
|
||||
file["name"] = remove_special_characters(file["name"])
|
||||
logger.info("Azure Drive files retrieved successfully: %s", len(files))
|
||||
return files
|
||||
|
15
backend/modules/sync/utils/normalize.py
Normal file
15
backend/modules/sync/utils/normalize.py
Normal file
@ -0,0 +1,15 @@
|
||||
import unicodedata
|
||||
import re
|
||||
from logger import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
def remove_special_characters(input: str) -> str:
    """Return *input* with accents and special characters removed.

    The string is decomposed (NFD) so that accented letters split into a base
    letter plus combining marks. Every character that is not a word
    character, whitespace, or a period is then dropped, which removes the
    combining marks together with punctuation and symbols. Finally the text
    is recomposed (NFC) so that characters whose decomposition consists of
    letters rather than marks (for example Hangul syllables) are not returned
    in decomposed form.

    Args:
        input: The file name to clean. The parameter name shadows the
            builtin; it is kept so existing keyword callers keep working.

    Returns:
        The cleaned string, or ``input`` unchanged when it cannot be
        processed (for example when it is not a ``str``).
    """
    try:
        # Only these two calls can fail, and only on non-string input.
        decomposed = unicodedata.normalize("NFD", input)
        stripped = re.sub(r"[^\w\s.]", "", decomposed)
    except TypeError as e:
        # Best effort: a name that cannot be normalized is passed through.
        logger.error("Error removing special characters: %s", e)
        return input

    # Recompose so the result is in canonical composed form; the combining
    # marks were already stripped above, so accents do not come back.
    normalized_string = unicodedata.normalize("NFC", stripped)
    logger.info("Input: %s, Normalized: %s", input, normalized_string)
    return normalized_string
|
Loading…
Reference in New Issue
Block a user