do web searches in ddg, google and google news
This commit is contained in:
parent 317a876ec4
commit 7c0dd39227
@@ -204,17 +204,23 @@ async def async_main():
 def sync_main():
     # Synchronous part for scheduling
-    topic = "TSLA"
+    topic = "tesla news"
     interval = 1 # in minutes
     folder = "agent-py-bot/scrape/raw"

     schedule.every(interval).minutes.do(run_web_agent, topic=topic, folder=folder)
     # Run once at the start
-    run_web_agent(topic=topic, folder=folder)
+    news_json = run_web_agent(topic=topic, folder=folder)

     while True:
         schedule.run_pending()
         time.sleep(1)
+        # Check if there's new data obtained from web agent
+        new_data, new_summary = run_web_agent(topic=topic, folder=folder)
+
+        # Use the new data to call the async function
+        user_message = f"New data received: {new_data}"
+        query_result = query_llm(user_message)

 if __name__ == '__main__':
     loop = asyncio.get_event_loop()
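Note: the while-loop above calls run_web_agent on every pass, in addition to the job registered with schedule, so the one-minute interval is effectively bypassed. A minimal sketch of the more conventional pattern, where only the scheduled job scrapes and the loop just pumps the scheduler (assuming run_web_agent and query_llm keep the signatures used above; scheduled_job is a hypothetical helper, not part of this commit):

    def scheduled_job(topic, folder):
        # scrape once, then hand the fresh data to the LLM
        news_data = run_web_agent(topic=topic, folder=folder)
        query_llm(f"New data received: {news_data}")

    schedule.every(interval).minutes.do(scheduled_job, topic=topic, folder=folder)
    scheduled_job(topic=topic, folder=folder)  # run once at the start

    while True:
        schedule.run_pending()  # fires the job only once the interval has elapsed
        time.sleep(1)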
@@ -3,14 +3,102 @@ from bs4 import BeautifulSoup
 import os
 import json
 from datetime import datetime
+import feedparser
+
+
+def search_duckduckgo(topic):
+    # try with https://duckduckgo.com/?q=tesla&iar=news&ia=news
+    url = f"http://api.duckduckgo.com/?q={topic}&format=json"
+    response = requests.get(url)
+    #results = response.json().get('RelatedTopics', [])
+
+    soup = BeautifulSoup(response.text, 'html.parser')
+    page_text = soup.get_text(separator='\n', strip=True)
+
+    url = f"https://duckduckgo.com/?q={topic}&iar=news&ia=news"
+    soup = BeautifulSoup(response.text, 'html.parser')
+    page_text2 = soup.get_text(separator='\n', strip=True)
+
+    return page_text + page_text2
+
+
+def search_newsapi(topic, api_key):
+    url = f"https://newsapi.org/v2/everything?q={topic}&apiKey={api_key}"
+    response = requests.get(url)
+    articles = response.json().get('articles', [])
+    return articles
+
+
+def parse_rss_feed(feed_url):
+    feed = feedparser.parse(feed_url)
+    articles = [{'title': entry.title, 'link': entry.link} for entry in feed.entries]
+    return articles
+
+
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+
+
+def search_google_news(topic):
+    options = Options()
+    options.headless = True
+    driver = webdriver.Chrome(options=options)
+
+    try:
+        driver.get(f"https://www.google.com/search?q={topic}&tbm=nws")
+        # Code to accept cookies or terms goes here
+
+        soup = BeautifulSoup(driver.page_source, 'html.parser')
+        page_text = soup.get_text(separator='\n', strip=True)
+        return page_text
+    finally:
+        driver.quit()
+
+
+def get_google_search_results(query):
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
+
+    # if response.status_code == 200:
+    #     soup = BeautifulSoup(response.text, 'html.parser')
+    #     page_text = soup.get_text(separator='\n', strip=True)
+    #     return page_text
+    # else:
+    #     return f"Error: {response.status_code}"
+
+    try:
+        response = requests.get(f"https://www.google.com/search?q={query}", headers=headers)
+        response.raise_for_status()
+
+        # Ensure the correct character set is used
+        response.encoding = response.apparent_encoding
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+        page_text = soup.get_text(separator='\n', strip=True)
+        return page_text
+    except Exception as e:
+        return f"Parsing Error: {e}"
+
+
 def search_news(topic):
-    url = f"https://www.google.com/search?q={topic}"
-    response = requests.get(url)
-    soup = BeautifulSoup(response.text, 'html.parser')
+    # DuckDuckGo Results
+    duck_results = search_duckduckgo(topic)
+
-    news_data = [] # Extract relevant information here
-    return news_data
+    # NewsAPI Results
+    newsapi_key = "44721311c40147ea9fe19080621cdb8a"
+    newsapi_results = search_newsapi(topic, newsapi_key)
+
+    # RSS Feed Results
+    rss_feeds = ['http://feeds.reuters.com/Reuters/worldNews',] # Add more RSS URLs
+    rss_results = []
+    for feed_url in rss_feeds:
+        rss_results.extend(parse_rss_feed(feed_url))
+
+    # Google News Results
+    # google_results = search_google_news(topic)
+    google_results = get_google_search_results(topic)
+
+    return {
+        "duckduckgo": duck_results,
+        "newsapi": newsapi_results,
+        "rss": rss_results,
+        "google": google_results
+    }
+
+
 def save_data(data, folder):
     if not os.path.exists(folder):
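In search_duckduckgo above, the second URL (the ia=news page) is assigned but never fetched, so page_text2 is parsed from the first response again and the function returns the Instant Answer text twice. A minimal sketch of the presumably intended behaviour, using the same requests and BeautifulSoup calls:

    def search_duckduckgo(topic):
        # Instant Answer API
        api_response = requests.get(f"http://api.duckduckgo.com/?q={topic}&format=json")
        page_text = BeautifulSoup(api_response.text, 'html.parser').get_text(separator='\n', strip=True)

        # HTML news results page -- this second request is missing above
        news_response = requests.get(f"https://duckduckgo.com/?q={topic}&iar=news&ia=news")
        page_text2 = BeautifulSoup(news_response.text, 'html.parser').get_text(separator='\n', strip=True)

        return page_text + page_text2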
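The NewsAPI key is hard-coded inside search_news. A small alternative sketch reads it from the environment instead (the variable name NEWSAPI_KEY is an assumption, not something defined in this repo):

    import os

    newsapi_key = os.environ.get("NEWSAPI_KEY", "")  # assumed env var; empty string if unset
    newsapi_results = search_newsapi(topic, newsapi_key)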
@@ -33,3 +121,4 @@ def run_web_agent(topic, folder):
     summary = summarize_data(news_data)
     with open(os.path.join(folder, "summary_log.txt"), 'a') as log_file:
         log_file.write(f"{datetime.now()}: {summary}\n")
+    return news_data
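run_web_agent now returns only news_data, while sync_main above unpacks two values (new_data, new_summary = run_web_agent(...)), which will fail unless news_data happens to be a two-item sequence. One way to reconcile the two, sketched on the assumption that the summary computed here is what the caller wants as the second value:

    summary = summarize_data(news_data)
    with open(os.path.join(folder, "summary_log.txt"), 'a') as log_file:
        log_file.write(f"{datetime.now()}: {summary}\n")
    return news_data, summary  # lets callers unpack (data, summary)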
agent-py-bot/scrape/raw/data_20231225_170201.json (new file, 1 line)
File diff suppressed because one or more lines are too long

agent-py-bot/scrape/raw/data_20240108_131204.json (new file, 1 line)
File diff suppressed because one or more lines are too long

agent-py-bot/scrape/raw/summary_log.txt (new file, 11 lines)
@@ -0,0 +1,11 @@
+2023-12-23 01:18:42.922812: Summarized information
+2023-12-25 17:02:01.477567: Summarized information
+2024-01-08 13:12:04.190959: Summarized information
+2024-01-08 13:13:03.437567: Summarized information
+2024-01-08 13:14:04.749784: Summarized information
+2024-01-08 13:15:06.100403: Summarized information
+2024-01-08 13:16:07.387491: Summarized information
+2024-01-08 13:17:09.016139: Summarized information
+2024-01-08 13:18:10.384559: Summarized information
+2024-01-08 13:19:12.129203: Summarized information
+2024-01-08 13:20:13.569597: Summarized information