ChatDev/camel/web_spider.py
2023-12-29 14:22:05 +08:00

89 lines
2.7 KiB
Python

import requests
from bs4 import BeautifulSoup
import openai
from openai import OpenAI
import wikipediaapi
import os
import time
self_api_key = os.environ.get('OPENAI_API_KEY')
BASE_URL = os.environ.get('BASE_URL')
if BASE_URL:
client = openai.OpenAI(
api_key=self_api_key,
base_url=BASE_URL,
)
else:
client = openai.OpenAI(
api_key=self_api_key
)
def get_baidu_baike_content(keyword):
# design api by the baidubaike
url = f'https://baike.baidu.com/item/{keyword}'
# post request
response = requests.get(url)
# Beautiful Soup part for the html content
soup = BeautifulSoup(response.content, 'html.parser')
# find the main content in the page
# main_content = soup.find('div', class_='lemma-summary')
main_content = soup.contents[-1].contents[0].contents[4].attrs['content']
# find the target content
# content_text = main_content.get_text().strip()
return main_content
def get_wiki_content(keyword):
# Wikipedia API ready
wiki_wiki = wikipediaapi.Wikipedia('MyProjectName (merlin@example.com)', 'en')
#the topic content which you want to spider
search_topic = keyword
# get the page content
page_py = wiki_wiki.page(search_topic)
# check the existence of the content in the page
if page_py.exists():
print("Page - Title:", page_py.title)
print("Page - Summary:", page_py.summary)
else:
print("Page not found.")
return page_py.summary
def modal_trans(task_dsp):
try:
task_in ="'" + task_dsp + \
"'Just give me the most important keyword about this sentence without explaining it and your answer should be only one keyword."
messages = [{"role": "user", "content": task_in}]
response = client.chat.completions.create(messages=messages,
model="gpt-3.5-turbo-16k",
temperature=0.2,
top_p=1.0,
n=1,
stream=False,
frequency_penalty=0.0,
presence_penalty=0.0,
logit_bias={})
response_text = response.choices[0].message.content
spider_content = get_wiki_content(response_text)
# time.sleep(1)
task_in = "'" + spider_content + \
"',Summarize this paragraph and return the key information."
messages = [{"role": "user", "content": task_in}]
response = client.chat.completions.create(messages=messages,
model="gpt-3.5-turbo-16k",
temperature=0.2,
top_p=1.0,
n=1,
stream=False,
frequency_penalty=0.0,
presence_penalty=0.0,
logit_bias={})
result = response.choices[0].message.content
print("web spider content:", result)
except:
result = ''
print("the content is none")
return result