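"""Helpers for augmenting prompts with DuckDuckGo web search results.

Searches DuckDuckGo, optionally scrapes each result page for its text,
and formats everything into a prompt block for a language model.
"""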
from __future__ import annotations

import asyncio

from aiohttp import ClientSession, ClientTimeout
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS


class SearchResults:
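    """Iterable collection of search result entries, printable as prompt text."""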

    def __init__(self, results: list):
        self.results = results

    def __iter__(self):
        yield from self.results

    def __str__(self):
        search = ""
        for idx, result in enumerate(self.results):
            if search:
                search += "\n\n\n"
            search += f"Title: {result.title}\n\n"
            if result.text:
                search += result.text
            else:
                search += result.snippet
            search += f"\n\nSource: [[{idx}]]({result.url})"
        return search
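
    # Example of the rendered output (illustrative):
    #
    #   Title: Example Domain
    #
    #   <scraped text or snippet>
    #
    #   Source: [[0]](https://example.com)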


class SearchResultEntry:
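    """A single search hit: title, URL, snippet, and optional scraped page text."""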

    def __init__(self, title: str, url: str, snippet: str, text: str = None):
        self.title = title
        self.url = url
        self.snippet = snippet
        self.text = text

    def set_text(self, text: str):
        self.text = text


def scrape_text(html: str, max_words: int = None) -> str:
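    """Extract readable paragraph text from an HTML document.

    Parsing is narrowed to a main-content container when one is found,
    and collection stops once ``max_words`` words have been gathered.
    """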
    soup = BeautifulSoup(html, "html.parser")
    # Drop non-content tags before extracting text.
    for exclude in soup(["script", "style"]):
        exclude.extract()
    # Prefer a recognizable main-content container over the whole page.
    for selector in [
        "main",
        ".main-content-wrapper",
        ".main-content",
        ".emt-container-inner",
        ".content-wrapper",
        "#content",
        "#mainContent",
    ]:
        select = soup.select_one(selector)
        if select:
            soup = select
            break
    # Site-specific cleanup: ZDNet's affiliate disclosure box.
    for remove in [".c-globalDisclosure"]:
        select = soup.select_one(remove)
        if select:
            select.extract()
    clean_text = ""
    for paragraph in soup.select("p"):
        text = paragraph.get_text()
        for line in text.splitlines():
            # Normalize whitespace and drop empty tokens.
            words = []
            for word in line.replace("\t", " ").split(" "):
                if word:
                    words.append(word)
            count = len(words)
            if not count:
                continue
            if max_words is not None:
                max_words -= count
                # Return early once the word budget is spent; a plain
                # `break` would only leave the inner loop and, at exactly
                # zero, would silently disable the limit.
                if max_words <= 0:
                    return clean_text
            if clean_text:
                clean_text += "\n"
            clean_text += " ".join(words)

    return clean_text


async def fetch_and_scrape(session: ClientSession, url: str, max_words: int = None) -> str:
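    """Fetch ``url`` and return its scraped text, or ``None`` on any failure."""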
    try:
        async with session.get(url) as response:
            if response.status == 200:
                html = await response.text()
                return scrape_text(html, max_words)
    except Exception:
        # Network errors, timeouts, and decoding failures all just skip this page.
        return None


async def search(query: str, n_results: int = 5, max_words: int = 2500, add_text: bool = True) -> SearchResults:
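    """Query DuckDuckGo and optionally scrape the text of each result page."""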
    with DDGS() as ddgs:
        results = []
        for result in ddgs.text(
            query,
            region="wt-wt",
            safesearch="moderate",
            timelimit="y",
        ):
            results.append(SearchResultEntry(
                result["title"],
                result["href"],
                result["body"]
            ))
            if len(results) >= n_results:
                break

        if add_text:
            requests = []
            async with ClientSession(timeout=ClientTimeout(5)) as session:
                for entry in results:
                    # Give each page a share of the word budget; the max()
                    # guards against division by zero when n_results == 1.
                    requests.append(fetch_and_scrape(
                        session, entry.url, int(max_words / max(n_results - 1, 1))
                    ))
                texts = await asyncio.gather(*requests)

        formatted_results = []
        left_words = max_words
        for i, entry in enumerate(results):
            if add_text:
                entry.text = texts[i]
            # Stop adding entries once the overall word budget is spent.
            if left_words:
                left_words -= entry.title.count(" ") + 5
                if entry.text:
                    left_words -= entry.text.count(" ")
                else:
                    left_words -= entry.snippet.count(" ")
                if left_words < 0:
                    break
            formatted_results.append(entry)

        return SearchResults(formatted_results)


def get_search_message(prompt: str) -> str:
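    """Wrap ``prompt`` in a message that embeds live web search results.

    Falls back to returning ``prompt`` unchanged if the search fails.
    """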
    try:
        search_results = asyncio.run(search(prompt))
        message = f"""
{search_results}


Instruction: Using the provided web search results, write a comprehensive reply to the user request.
Make sure to cite sources using [[Number]](Url) notation after the reference. Example: [[0]](http://google.com)
If the provided search results refer to multiple subjects with the same name, write separate answers for each subject.

User request:
{prompt}
"""
        return message
    except Exception as e:
        print("Couldn't do web search:", e)
        return prompt
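

# A minimal usage sketch (illustrative, not part of the original module).
# Running it performs a live DuckDuckGo search, so it assumes network
# access and the `duckduckgo_search` package being installed.
if __name__ == "__main__":
    enriched_prompt = get_search_message("latest stable Python release")
    print(enriched_prompt)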