# gogo2/agent-py-bot/agents/webagent.py
import json
import os
from datetime import datetime

import feedparser
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
def search_duckduckgo(topic):
    # Instant Answer API result (JSON), flattened to plain text.
    url = f"http://api.duckduckgo.com/?q={topic}&format=json"
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    page_text = soup.get_text(separator='\n', strip=True)

    # News-search page for the same topic, e.g.
    # https://duckduckgo.com/?q=tesla&iar=news&ia=news
    url = f"https://duckduckgo.com/?q={topic}&iar=news&ia=news"
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    page_text2 = soup.get_text(separator='\n', strip=True)
    return page_text + page_text2
def search_newsapi(topic, api_key):
    # NewsAPI "everything" endpoint; returns a list of article dicts.
    url = f"https://newsapi.org/v2/everything?q={topic}&apiKey={api_key}"
    response = requests.get(url, timeout=10)
    articles = response.json().get('articles', [])
    return articles
def parse_rss_feed(feed_url):
    # feedparser handles both RSS and Atom feeds.
    feed = feedparser.parse(feed_url)
    articles = [{'title': entry.title, 'link': entry.link} for entry in feed.entries]
    return articles
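# Example usage (illustrative only; the feed URL is an assumption, not
# one the module itself uses):
#   articles = parse_rss_feed('https://hnrss.org/frontpage')
#   print(articles[0]['title'])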
def search_google_news(topic):
    # Headless Chrome is used because Google results are rendered
    # client-side and may sit behind consent screens.
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(f"https://www.google.com/search?q={topic}&tbm=nws")
        # Code to accept cookies or terms goes here
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        page_text = soup.get_text(separator='\n', strip=True)
        return page_text
    finally:
        driver.quit()
def get_google_search_results(query):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        response = requests.get(f"https://www.google.com/search?q={query}", headers=headers, timeout=10)
        response.raise_for_status()
        # Ensure the correct character set is used
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        page_text = soup.get_text(separator='\n', strip=True)
        return page_text
    except requests.RequestException as e:
        return f"Request error: {e}"
def search_news(topic):
    # DuckDuckGo Results
    duck_results = search_duckduckgo(topic)

    # NewsAPI Results (read the key from the environment rather than
    # hard-coding it in source control)
    newsapi_key = os.environ.get("NEWSAPI_KEY", "")
    newsapi_results = search_newsapi(topic, newsapi_key)

    # RSS Feed Results
    rss_feeds = ['http://feeds.reuters.com/Reuters/worldNews']  # Add more RSS URLs
    rss_results = []
    for feed_url in rss_feeds:
        rss_results.extend(parse_rss_feed(feed_url))

    # Google News Results
    # google_results = search_google_news(topic)
    google_results = get_google_search_results(topic)

    return {
        "duckduckgo": duck_results,
        "newsapi": newsapi_results,
        "rss": rss_results,
        "google": google_results,
    }
def save_data(data, folder):
    if not os.path.exists(folder):
        os.makedirs(folder)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    file_path = os.path.join(folder, f"data_{timestamp}.json")
    with open(file_path, 'w') as file:
        json.dump(data, file)
def summarize_data(data):
    summary = "Summarized information"  # Replace with actual summarization logic
    return summary
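# A minimal sketch of what summarize_data could do instead of the stub
# above: join the first few NewsAPI headlines into one line. The helper
# name, the default limit, and the choice to summarize only the
# 'newsapi' slice are assumptions, not part of the original module.
def summarize_headlines(news_data, limit=5):
    # news_data is the dict returned by search_news(); 'newsapi' holds
    # the article dicts produced by search_newsapi().
    titles = [a.get('title', '') for a in news_data.get('newsapi', [])[:limit]]
    return '; '.join(t for t in titles if t)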
def run_web_agent(topic, folder):
    print(f"Running web agent for topic: {topic}")
    news_data = search_news(topic)
    save_data(news_data, folder)
    summary = summarize_data(news_data)
    with open(os.path.join(folder, "summary_log.txt"), 'a') as log_file:
        log_file.write(f"{datetime.now()}: {summary}\n")
    return news_data
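# Illustrative entry point (not in the original file): run the agent once
# for a sample topic. The topic string and output folder are assumptions.
if __name__ == "__main__":
    run_web_agent("tesla", "./news_data")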