From 8e5af2c8c52f80ccb0006362042bda66fb9817ba Mon Sep 17 00:00:00 2001
From: Stan Girard <girard.stanislas@gmail.com>
Date: Wed, 12 Jun 2024 11:08:48 +0200
Subject: [PATCH] feat: Normalize file names in sync module (#2661)

This commit adds a new utility function `remove_special_characters` to
the `normalize.py` module in the `sync/utils` directory. The function
decomposes accented characters (Unicode NFD normalization) and then uses
a regular expression to remove every character that is not a word
character (letter, digit, or underscore), whitespace, or a dot, so
accents and punctuation are stripped while file extensions survive.

The function is then used in the `list_files.py` module in the
`sync/utils` directory to normalize the names of files retrieved from
Google Drive and Azure Drive. This ensures that the file names are free
of special characters, improving consistency and compatibility with
other parts of the system.

Co-authored-by: Stan Girard <stan@quivr.app>
---
 backend/modules/sync/utils/list_files.py | 13 +++++++++++--
 backend/modules/sync/utils/normalize.py  | 15 +++++++++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)
 create mode 100644 backend/modules/sync/utils/normalize.py

diff --git a/backend/modules/sync/utils/list_files.py b/backend/modules/sync/utils/list_files.py
index 984a842f0..6c1201ea8 100644
--- a/backend/modules/sync/utils/list_files.py
+++ b/backend/modules/sync/utils/list_files.py
@@ -7,6 +7,7 @@ from fastapi import HTTPException
 from google.auth.transport.requests import Request as GoogleRequest
 from google.oauth2.credentials import Credentials
 from googleapiclient.discovery import build
+from modules.sync.utils.normalize import remove_special_characters
 from logger import get_logger
 from requests import HTTPError
 
@@ -53,6 +54,8 @@ def get_google_drive_files_by_id(credentials: dict, file_ids: List[str]):
             )
 
         logger.info("Google Drive files retrieved successfully: %s", len(files))
+        for file in files:
+            file["name"] = remove_special_characters(file["name"])
         return files
     except HTTPError as error:
         logger.error("An error occurred while retrieving Google Drive files: %s", error)
@@ -138,6 +141,9 @@ def get_google_drive_files(
                 break
 
         logger.info("Google Drive files retrieved successfully: %s", len(files))
+        
+        for file in files:
+            file["name"] = remove_special_characters(file["name"])
         return files
     except HTTPError as error:
         logger.error("An error occurred while retrieving Google Drive files: %s", error)
@@ -225,7 +231,8 @@ def list_azure_files(credentials, folder_id=None, recursive=False):
             )
 
             files.extend(folder_files)
-
+    for file in files:
+        file["name"] = remove_special_characters(file["name"])
     logger.info("Azure Drive files retrieved successfully: %s", len(files))
     return files
 
@@ -270,6 +277,8 @@ def get_azure_files_by_id(credentials: dict, file_ids: List[str]):
                 "mime_type": result.get("file", {}).get("mimeType", "folder"),
             }
         )
-
+    
+    for file in files:
+        file["name"] = remove_special_characters(file["name"])
     logger.info("Azure Drive files retrieved successfully: %s", len(files))
     return files
diff --git a/backend/modules/sync/utils/normalize.py b/backend/modules/sync/utils/normalize.py
new file mode 100644
index 000000000..154e8eb10
--- /dev/null
+++ b/backend/modules/sync/utils/normalize.py
@@ -0,0 +1,15 @@
+import unicodedata
+import re
+from logger import get_logger
+
+logger = get_logger(__name__)
+
+def remove_special_characters(input):
+    """Strip accents and special characters from `input`, keeping word characters, whitespace and dots."""
+    if not isinstance(input, str):  # best effort: unusable input is logged and handed back unchanged
+        logger.error("Error removing special characters: expected str, got %s", type(input).__name__)
+        return input
+    stripped = re.sub(r'[^\w\s.]', '', unicodedata.normalize('NFD', input))  # NFD exposes accents as separate marks, which the regex drops
+    normalized_string = unicodedata.normalize('NFC', stripped)  # re-compose what survived, e.g. Hangul syllables split by NFD
+    logger.info("Input: %s, Normalized: %s", input, normalized_string)
+    return normalized_string
\ No newline at end of file