fix google search - using api

2024-01-08 17:53:18 +02:00
parent 7c0dd39227
commit 3d114d1a76
2 changed files with 118 additions and 23 deletions
--- a/agent-py-bot/agent.py
+++ b/agent-py-bot/agent.py
@@ -190,9 +190,21 @@ async def bad_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> Non
 #------------------------- webagent --------------------------# 
 import schedule
 import time
-from agents.webagent import run_web_agent
-
+from agents.webagent import run_web_agent, save_data

+async def run_web_agent_and_process_result(topic, folder):
+    print(f"Running web agent for topic: {topic}")
+    news_data = run_web_agent(topic, folder)
+    save_data(news_data, folder)
+    # summary = summarize_data(news_data)
+    
+    # with open(os.path.join(folder, "summary_log.txt"), 'a') as log_file:
+    #     log_file.write(f"{datetime.now()}: {summary}\n")
+    
+    # Process the result immediately after obtaining it
+    user_message = f"New data received: {news_data}"
+    query_result = await query_llm(user_message)
+    # TODO: query_result is not used yet; process it here as needed

 async def async_main():
    # Assuming this is your asynchronous main function with its full details
@@ -205,22 +217,17 @@ async def async_main():
 def sync_main():
    # Synchronous part for scheduling
    topic = "tesla news"
-    interval = 1  # in minutes
+    interval = 8  # in hours (passed to schedule.every(interval).hours below)
    folder = "agent-py-bot/scrape/raw"

-    schedule.every(interval).minutes.do(run_web_agent, topic=topic, folder=folder)
+    # schedule.every(interval).minutes.do(run_web_agent_and_process_result, topic=topic, folder=folder)
+    schedule.every(interval).hours.do(run_web_agent_and_process_result, topic=topic, folder=folder)
    # Run once at the start
-    news_json = run_web_agent(topic=topic, folder=folder)
+    news_json = await run_web_agent_and_process_result(topic=topic, folder=folder)

    while True:
        schedule.run_pending()
        time.sleep(1)
-        # Check if there's new data obtained from web agent
-        new_data, new_summary = run_web_agent(topic=topic, folder=folder)
-        
-        # Use the new data to call the async function
-        user_message = f"New data received: {new_data}"
-        query_result = query_llm(user_message)

 if __name__ == '__main__':
    loop = asyncio.get_event_loop()
--- a/agent-py-bot/agents/webagent.py
+++ b/agent-py-bot/agents/webagent.py
@@ -2,7 +2,7 @@ import requests
 from bs4 import BeautifulSoup
 import os
 import json
-from datetime import datetime
+from datetime import datetime, timedelta
 import feedparser

 def search_duckduckgo(topic):
@@ -20,10 +20,25 @@ def search_duckduckgo(topic):

    return page_text + page_text2

-def search_newsapi(topic, api_key):
-    url = f"https://newsapi.org/v2/everything?q={topic}&apiKey={api_key}"
-    response = requests.get(url)
+def search_newsapi(topic, api_key, from_param=None):
+    endpoint = "https://newsapi.org/v2/everything"
+
+    # Set up parameters including your API key and query parameters
+    params = {
+        'apiKey': api_key,
+        'q': topic,
+        'from': from_param,  # Specify the date in the format "YYYY-MM-DD"
+        'sortBy': 'publishedAt',
+        'language': 'en',
+    }
+
+    # 'from' is already present in params above (possibly as None); this only re-assigns it when 'from_param' is provided
+    if from_param:
+        params['from'] = from_param
+
+    response = requests.get(endpoint, params=params)
    articles = response.json().get('articles', [])
+    headlines = [article.get('title', '') for article in articles]
    return articles

 def parse_rss_feed(feed_url):
@@ -51,7 +66,7 @@ def search_google_news(topic):



-def get_google_search_results(query):
+def get_google_search_results_old_requiresLogin(query):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    
@@ -61,7 +76,6 @@ def get_google_search_results(query):
    #     return page_text
    # else:
    #     return f"Error: {response.status_code}"
-    
    try:
        response = requests.get(f"https://www.google.com/search?q={query}", headers=headers)
        response.raise_for_status()
@@ -75,13 +89,76 @@ def get_google_search_results(query):
    except Exception as e:
        return f"Parsing Error: {e}"

+
+def google_search_api_headlines(query, api_key, cx, daterange=None):
+    try:
+        # Set up the API endpoint
+        endpoint = "https://www.googleapis.com/customsearch/v1"
+
+        # Set up parameters including your API key and custom search engine ID (cx)
+        params = {
+            'key': api_key,
+            'cx': cx,
+            'q': query
+        }        
+        
+        # Add 'dateRestrict' parameter if provided
+        if daterange:
+            params['dateRestrict'] = daterange
+
+        # Make the request to the Custom Search API
+        response = requests.get(endpoint, params=params)
+        response.raise_for_status()
+
+        # Parse the JSON response
+        search_results = response.json()
+
+        # Extract and return headlines from the response
+        items = search_results.get('items', [])
+        headlines = [item.get('title', '') for item in items]
+        return headlines
+    except Exception as e:
+        return f"API Request Error: {e}"
+
+def get_news_api_results(query, api_key, from_param):
+    try:
+        # Set up the API endpoint
+        endpoint = "https://newsapi.org/v2/everything"
+
+        # Set up parameters including your API key and query parameters
+        params = {
+            'apiKey': api_key,
+            'q': query,
+            'from': from_param,  # Specify the date in the format "YYYY-MM-DD"
+            'sortBy': 'publishedAt',
+            'language': 'en',
+        }
+
+        # Make the request to the News API
+        response = requests.get(endpoint, params=params)
+        response.raise_for_status()
+
+        # Parse the JSON response
+        news_results = response.json()
+
+        # Extract and return relevant information from the response
+        articles = news_results.get('articles', [])
+        headlines = [article.get('title', '') for article in articles]
+        return headlines
+    except Exception as e:
+        return f"API Request Error: {e}"
+
 def search_news(topic):
    # DuckDuckGo Results
    duck_results = search_duckduckgo(topic)

    # NewsAPI Results
+    current_date = datetime.now()
+    from_date = current_date - timedelta(days=2)
+    from_param = from_date.strftime('%Y-%m-%d')
+
    newsapi_key = "44721311c40147ea9fe19080621cdb8a"
-    newsapi_results = search_newsapi(topic, newsapi_key)
+    newsapi_results = search_newsapi(topic, newsapi_key, from_param)

    # RSS Feed Results
    rss_feeds = ['http://feeds.reuters.com/Reuters/worldNews',]  # Add more RSS URLs
@@ -91,7 +168,18 @@ def search_news(topic):

    # Google News Results
    # google_results = search_google_news(topic)
-    google_results = get_google_search_results(topic)
+    # google_results = get_google_search_results(topic)
+
+    #  # //t-air: AIzaSyBC5-h1-WFqwKeHhagB-I1pcjRVEkvUZp4 https://console.cloud.google.com/apis/credentials/key/15ab8371-c67b-4d3a-a9af-7106cb4015e5?authuser=0&project=t-air-1704714414235 cx=049ff6d98d29c4e67
+    api_key = "AIzaSyBC5-h1-WFqwKeHhagB-I1pcjRVEkvUZp4"
+    cx = "049ff6d98d29c4e67"
+    query = topic
+    daterange = "d1"  # Sent as the Custom Search API `dateRestrict` parameter
+    # d1: past 24 hours
+    # w1: past week
+    # daterange_8_hours = "h8"  # NOTE(review): documented units are d/w/m/y; "h8" is likely unsupported — confirm
+    # daterange_3_days = "d3"
+    google_results = google_search_api_headlines(query, api_key, cx, daterange)

    return {
        "duckduckgo": duck_results,
@@ -117,8 +205,8 @@ def summarize_data(data):
 def run_web_agent(topic, folder):
    print(f"Running web agent for topic: {topic}")
    news_data = search_news(topic)
-    save_data(news_data, folder)
-    summary = summarize_data(news_data)
-    with open(os.path.join(folder, "summary_log.txt"), 'a') as log_file:
-        log_file.write(f"{datetime.now()}: {summary}\n")
+    # save_data(news_data, folder)
+    # summary = summarize_data(news_data)
+    # with open(os.path.join(folder, "summary_log.txt"), 'a') as log_file:
+    #     log_file.write(f"{datetime.now()}: {summary}\n")
    return news_data