do web searches in ddg, google and google news

2024-01-08 17:52:31 +02:00
parent 317a876ec4
commit 7c0dd39227
5 changed files with 115 additions and 7 deletions
--- a/agent-py-bot/agent.py
+++ b/agent-py-bot/agent.py
@ -204,17 +204,23 @@ async def async_main():

 def sync_main():
+    """Scrape news for a fixed topic on a schedule and hand each result to the LLM.
+
+    One scrape runs immediately and then one every `interval` minutes.
+    This function never returns: the final loop polls the scheduler once
+    per second.
+    """
    # Synchronous part for scheduling
-    topic = "TSLA"
+    topic = "tesla news"
    interval = 1  # in minutes
    folder = "agent-py-bot/scrape/raw"

-    schedule.every(interval).minutes.do(run_web_agent, topic=topic, folder=folder)
+    def scrape_and_query():
+        # run_web_agent returns one news-data object, so it must not be
+        # unpacked into a (data, summary) pair.
+        new_data = run_web_agent(topic=topic, folder=folder)
+
+        # Use the new data to query the LLM
+        user_message = f"New data received: {new_data}"
+        # NOTE(review): query_llm is defined outside this hunk; if it is a
+        # coroutine function this call must be awaited / run on the event loop.
+        query_llm(user_message)
+
+    # Scraping lives in the scheduled job only; calling it from the polling
+    # loop below would hit every source once per second.
+    schedule.every(interval).minutes.do(scrape_and_query)
    # Run once at the start
-    run_web_agent(topic=topic, folder=folder)
+    scrape_and_query()

    while True:
        schedule.run_pending()
        time.sleep(1)

 if __name__ == '__main__':
    loop = asyncio.get_event_loop()
--- a/agent-py-bot/agents/webagent.py
+++ b/agent-py-bot/agents/webagent.py
@ -3,14 +3,102 @@ from bs4 import BeautifulSoup
 import os
 import json
 from datetime import datetime
+import feedparser
+
+def search_duckduckgo(topic):
+    """Return the visible text of DuckDuckGo's instant-answer and news pages.
+
+    Args:
+        topic: Free-text search query.
+
+    Returns:
+        The text of the instant-answer API response followed directly by
+        the text of the news results page, as one string.
+    """
+    def fetch_text(url, params):
+        # Passing `params` lets requests URL-encode the topic (spaces, '&', '#').
+        response = requests.get(url, params=params, timeout=10)
+        soup = BeautifulSoup(response.text, 'html.parser')
+        return soup.get_text(separator='\n', strip=True)
+
+    page_text = fetch_text("http://api.duckduckgo.com/", {"q": topic, "format": "json"})
+
+    # The news page needs its own request; re-parsing the first response
+    # would only append a second copy of page_text.
+    page_text2 = fetch_text("https://duckduckgo.com/", {"q": topic, "iar": "news", "ia": "news"})
+
+    return page_text + page_text2
+
+def search_newsapi(topic, api_key):
+    """Query NewsAPI's `everything` endpoint for articles matching `topic`.
+
+    Args:
+        topic: Free-text search query.
+        api_key: NewsAPI key sent as the `apiKey` query parameter.
+
+    Returns:
+        The value of the `articles` field of the JSON response, or an empty
+        list when the response has no such field.
+    """
+    # `params` URL-encodes both values instead of splicing them into the URL.
+    response = requests.get(
+        "https://newsapi.org/v2/everything",
+        params={"q": topic, "apiKey": api_key},
+        timeout=10,
+    )
+    return response.json().get('articles', [])
+
+def parse_rss_feed(feed_url):
+    """Parse an RSS/Atom feed into a list of `{'title', 'link'}` dicts.
+
+    Args:
+        feed_url: URL of the feed to parse.
+
+    Returns:
+        One dict per feed entry. A value is None when the entry does not
+        carry that element.
+    """
+    feed = feedparser.parse(feed_url)
+    # Entries are dict-like; .get() avoids an AttributeError on entries that
+    # lack a title or link, which would otherwise discard the whole feed.
+    return [
+        {'title': entry.get('title'), 'link': entry.get('link')}
+        for entry in feed.entries
+    ]
+
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+
+def search_google_news(topic):
+    """Return the visible text of the Google News results page for `topic`.
+
+    Loads the page in a headless Chrome session so that script-rendered
+    content is present; the browser is always closed before returning.
+
+    Args:
+        topic: Free-text search query.
+
+    Returns:
+        The page text with one text node per line.
+    """
+    from urllib.parse import quote_plus
+
+    options = Options()
+    # The `headless` property was deprecated and later removed in Selenium 4;
+    # assigning it has no effect there, so pass the browser flag directly.
+    options.add_argument("--headless=new")
+    driver = webdriver.Chrome(options=options)
+
+    try:
+        driver.get(f"https://www.google.com/search?q={quote_plus(topic)}&tbm=nws")
+        # Code to accept cookies or terms goes here
+
+        soup = BeautifulSoup(driver.page_source, 'html.parser')
+        return soup.get_text(separator='\n', strip=True)
+    finally:
+        driver.quit()
+
+
+
+def get_google_search_results(query):
+    """Return the visible text of the Google results page for `query`.
+
+    Args:
+        query: Free-text search query.
+
+    Returns:
+        The page text with one text node per line. On any failure (network
+        error, HTTP error status, parse error) the string
+        "Parsing Error: <exception>" is returned instead of raising.
+    """
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
+
+    try:
+        # `params` URL-encodes the query instead of splicing it into the URL.
+        response = requests.get(
+            "https://www.google.com/search",
+            params={"q": query},
+            headers=headers,
+            timeout=10,
+        )
+        response.raise_for_status()
+
+        # Ensure the correct character set is used
+        response.encoding = response.apparent_encoding
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+        return soup.get_text(separator='\n', strip=True)
+    except Exception as e:
+        # Best effort: the caller stores whatever string comes back.
+        return f"Parsing Error: {e}"

 def search_news(topic):
-    url = f"https://www.google.com/search?q={topic}"
-    response = requests.get(url)
-    soup = BeautifulSoup(response.text, 'html.parser')
+    """Collect news about `topic` from every configured source.
+
+    Args:
+        topic: Free-text search query.
+
+    Returns:
+        A dict with the keys "duckduckgo", "newsapi", "rss" and "google",
+        each holding that source's results.
+    """
+    # DuckDuckGo Results
+    duck_results = search_duckduckgo(topic)

-    news_data = [] # Extract relevant information here
-    return news_data
+    # NewsAPI Results
+    # The key comes from the NEWSAPI_KEY environment variable so the secret
+    # is not committed with the source; without it NewsAPI is skipped.
+    newsapi_key = os.environ.get("NEWSAPI_KEY")
+    newsapi_results = search_newsapi(topic, newsapi_key) if newsapi_key else []
+
+    # RSS Feed Results (whole feeds; they are not filtered by topic)
+    rss_feeds = ['http://feeds.reuters.com/Reuters/worldNews',]  # Add more RSS URLs
+    rss_results = []
+    for feed_url in rss_feeds:
+        rss_results.extend(parse_rss_feed(feed_url))
+
+    # Google News Results
+    # google_results = search_google_news(topic)
+    google_results = get_google_search_results(topic)
+
+    return {
+        "duckduckgo": duck_results,
+        "newsapi": newsapi_results,
+        "rss": rss_results,
+        "google": google_results
+    }

 def save_data(data, folder):
    if not os.path.exists(folder):
@ -33,3 +121,4 @@ def run_web_agent(topic, folder):
    summary = summarize_data(news_data)
    with open(os.path.join(folder, "summary_log.txt"), 'a') as log_file:
        log_file.write(f"{datetime.now()}: {summary}\n")
+    return news_data
--- a/agent-py-bot/scrape/raw/data_20231225_170201.json
+++ b/agent-py-bot/scrape/raw/data_20231225_170201.json
--- a/agent-py-bot/scrape/raw/data_20240108_131204.json
+++ b/agent-py-bot/scrape/raw/data_20240108_131204.json
--- a/agent-py-bot/scrape/raw/summary_log.txt
+++ b/agent-py-bot/scrape/raw/summary_log.txt
@ -0,0 +1,11 @@
+2023-12-23 01:18:42.922812: Summarized information
+2023-12-25 17:02:01.477567: Summarized information
+2024-01-08 13:12:04.190959: Summarized information
+2024-01-08 13:13:03.437567: Summarized information
+2024-01-08 13:14:04.749784: Summarized information
+2024-01-08 13:15:06.100403: Summarized information
+2024-01-08 13:16:07.387491: Summarized information
+2024-01-08 13:17:09.016139: Summarized information
+2024-01-08 13:18:10.384559: Summarized information
+2024-01-08 13:19:12.129203: Summarized information
+2024-01-08 13:20:13.569597: Summarized information