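"""Helpers for augmenting prompts with DuckDuckGo web search results.

Searches DuckDuckGo, optionally scrapes each result page for its text,
and formats everything into a prompt block for a language model.
"""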
from __future__ import annotations

import asyncio

from aiohttp import ClientSession, ClientTimeout
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS


class SearchResults:
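    """Iterable collection of search result entries, printable as prompt text."""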

    def __init__(self, results: list):
        self.results = results

    def __iter__(self):
        yield from self.results

    def __str__(self):
        search = ""
        for idx, result in enumerate(self.results):
            if search:
                search += "\n\n\n"
            search += f"Title: {result.title}\n\n"
            if result.text:
                search += result.text
            else:
                search += result.snippet
            search += f"\n\nSource: [[{idx}]]({result.url})"
        return search
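
    # Example of the rendered output (illustrative):
    #
    #   Title: Example Domain
    #
    #   <scraped text or snippet>
    #
    #   Source: [[0]](https://example.com)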


class SearchResultEntry:
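    """A single search hit: title, URL, snippet, and optional scraped page text."""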

    def __init__(self, title: str, url: str, snippet: str, text: str = None):
        self.title = title
        self.url = url
        self.snippet = snippet
        self.text = text

    def set_text(self, text: str):
        self.text = text


def scrape_text(html: str, max_words: int = None) -> str:
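    """Extract readable paragraph text from an HTML document.

    Parsing is narrowed to a main-content container when one is found,
    and collection stops once ``max_words`` words have been gathered.
    """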
    soup = BeautifulSoup(html, "html.parser")
    # Drop non-content tags before extracting text.
    for exclude in soup(["script", "style"]):
        exclude.extract()
    # Prefer a recognizable main-content container over the whole page.
    for selector in [
        "main",
        ".main-content-wrapper",
        ".main-content",
        ".emt-container-inner",
        ".content-wrapper",
        "#content",
        "#mainContent",
    ]:
        select = soup.select_one(selector)
        if select:
            soup = select
            break
    # Site-specific cleanup: ZDNet's affiliate disclosure box.
    for remove in [".c-globalDisclosure"]:
        select = soup.select_one(remove)
        if select:
            select.extract()
    clean_text = ""
    for paragraph in soup.select("p"):
        text = paragraph.get_text()
        for line in text.splitlines():
            # Normalize whitespace and drop empty tokens.
            words = []
            for word in line.replace("\t", " ").split(" "):
                if word:
                    words.append(word)
            count = len(words)
            if not count:
                continue
            if max_words is not None:
                max_words -= count
                # Return early once the word budget is spent; a plain
                # `break` would only leave the inner loop and, at exactly
                # zero, would silently disable the limit.
                if max_words <= 0:
                    return clean_text
            if clean_text:
                clean_text += "\n"
            clean_text += " ".join(words)

    return clean_text


async def fetch_and_scrape(session: ClientSession, url: str, max_words: int = None) -> str:
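    """Fetch ``url`` and return its scraped text, or ``None`` on any failure."""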
    try:
        async with session.get(url) as response:
            if response.status == 200:
                html = await response.text()
                return scrape_text(html, max_words)
    except Exception:
        # Network errors, timeouts, and decoding failures all just skip this page.
        return None


async def search(query: str, n_results: int = 5, max_words: int = 2500, add_text: bool = True) -> SearchResults:
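    """Query DuckDuckGo and optionally scrape the text of each result page."""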
    with DDGS() as ddgs:
        results = []
        for result in ddgs.text(
            query,
            region="wt-wt",
            safesearch="moderate",
            timelimit="y",
        ):
            results.append(SearchResultEntry(
                result["title"],
                result["href"],
                result["body"]
            ))
            if len(results) >= n_results:
                break

        if add_text:
            requests = []
            async with ClientSession(timeout=ClientTimeout(5)) as session:
                for entry in results:
                    # Give each page a share of the word budget; the max()
                    # guards against division by zero when n_results == 1.
                    requests.append(fetch_and_scrape(
                        session, entry.url, int(max_words / max(n_results - 1, 1))
                    ))
                texts = await asyncio.gather(*requests)

        formatted_results = []
        left_words = max_words
        for i, entry in enumerate(results):
            if add_text:
                entry.text = texts[i]
            # Stop adding entries once the overall word budget is spent.
            if left_words:
                left_words -= entry.title.count(" ") + 5
                if entry.text:
                    left_words -= entry.text.count(" ")
                else:
                    left_words -= entry.snippet.count(" ")
                if left_words < 0:
                    break
            formatted_results.append(entry)

        return SearchResults(formatted_results)


def get_search_message(prompt: str) -> str:
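    """Wrap ``prompt`` in a message that embeds live web search results.

    Falls back to returning ``prompt`` unchanged if the search fails.
    """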
    try:
        search_results = asyncio.run(search(prompt))
        message = f"""
{search_results}


Instruction: Using the provided web search results, write a comprehensive reply to the user request.
Make sure to cite sources using [[Number]](Url) notation after the reference. Example: [[0]](http://google.com)
If the provided search results refer to multiple subjects with the same name, write separate answers for each subject.

User request:
{prompt}
"""
        return message
    except Exception as e:
        print("Couldn't do web search:", e)
        return prompt
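

# A minimal usage sketch (illustrative, not part of the original module).
# Running it performs a live DuckDuckGo search, so it assumes network
# access and the `duckduckgo_search` package being installed.
if __name__ == "__main__":
    enriched_prompt = get_search_message("latest stable Python release")
    print(enriched_prompt)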