quivr/backend/loaders/telegram.py
Stan Girard 5eed1a05cb
feat: 🎸 telegram (#1555)
added loaders and compatibility

# Description

Please include a summary of the changes and the related issue. Please
also include relevant motivation and context.

## Checklist before requesting a review

Please delete options that are not relevant.

- [ ] My code follows the style guidelines of this project
- [ ] I have performed a self-review of my code
- [ ] I have commented hard-to-understand areas
- [ ] I have ideally added tests that prove my fix is effective or that
my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged

## Screenshots (if appropriate):
2023-11-01 22:09:04 +01:00

66 lines
1.8 KiB
Python

from __future__ import annotations
import json
from pathlib import Path
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
def _entity_text(entity) -> str:
    """Return the plain text carried by one element of a list-valued ``text`` field.

    An element is either a bare string or a dict holding its text under the
    ``"text"`` key; any other shape contributes nothing.
    """
    if isinstance(entity, str):
        return entity
    if isinstance(entity, dict) and "text" in entity:
        return entity["text"]
    return ""


def concatenate_rows(row: dict) -> str:
    """Combine message information in a readable format ready to be used.

    Args:
        row: A single message dict. ``"date"`` is required. ``"from"`` and
            ``"text"`` are optional; ``"text"`` may be a plain string or a
            list mixing strings and dicts that carry a ``"text"`` key.

    Returns:
        A line of the form ``<sender> on <date>: <text>`` followed by a blank
        line, or an empty string when the message has no non-whitespace text.

    Raises:
        KeyError: If ``row`` has no ``"date"`` key.
    """
    date = row["date"]

    # A .get() default only covers a *missing* key. A "from" that is present
    # but null would otherwise be rendered as the literal string "None".
    sender = row.get("from")
    if sender is None:
        sender = "Unknown"

    text_content = row.get("text", "")

    # Normalise the two supported shapes of "text" into one flat string.
    if isinstance(text_content, str):
        text = text_content
    elif isinstance(text_content, list):
        text = "".join(_entity_text(item) for item in text_content)
    else:
        text = ""

    # Skip messages with empty text.
    if not text.strip():
        return ""

    return f"{sender} on {date}: {text}\n\n"
class TelegramChatFileLoader(BaseLoader):
    """Load from `Telegram chat` dump.

    The dump is a JSON file whose top-level object has a ``"messages"`` list.
    Every entry of type ``"message"`` with string or list text is rendered by
    ``concatenate_rows`` and the results are joined into a single document.
    """

    def __init__(self, path: str):
        """Initialize with a path.

        Args:
            path: Location of the JSON chat dump to read.
        """
        self.file_path = path

    def load(self) -> List[Document]:
        """Load documents.

        Returns:
            A one-element list holding a ``Document`` whose ``page_content``
            is the concatenated text of all kept messages and whose metadata
            records the file path under ``"source"``.

        Raises:
            KeyError: If the JSON has no ``"messages"`` key, or a kept
                message has no ``"date"`` (raised by ``concatenate_rows``).
        """
        p = Path(self.file_path)
        with open(p, encoding="utf8") as f:
            d = json.load(f)

        # "messages" stays a hard requirement: a file without it is not a
        # chat dump and should fail loudly. Individual entries are read with
        # .get() so that one entry lacking "type" or "text" is skipped
        # instead of aborting the whole load with a KeyError.
        text = "".join(
            concatenate_rows(message)
            for message in d["messages"]
            if message.get("type") == "message"
            and isinstance(message.get("text"), (str, list))
        )

        metadata = {"source": str(p)}
        return [Document(page_content=text, metadata=metadata)]