From 3d114d1a766238eb69c650410aa03cc611e8afbb Mon Sep 17 00:00:00 2001
From: Dobromir Popov
Date: Mon, 8 Jan 2024 17:53:18 +0200
Subject: [PATCH] fix google search - using api

---
 agent-py-bot/agent.py           |  29 +++++----
 agent-py-bot/agents/webagent.py | 112 ++++++++++++++++++++++++++++----
 2 files changed, 118 insertions(+), 23 deletions(-)

diff --git a/agent-py-bot/agent.py b/agent-py-bot/agent.py
index 0bce74e..f0fad9a 100644
--- a/agent-py-bot/agent.py
+++ b/agent-py-bot/agent.py
@@ -190,9 +190,21 @@ async def bad_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None
 #------------------------- webagent --------------------------#
 import schedule
 import time
-from agents.webagent import run_web_agent
-
+from agents.webagent import run_web_agent, save_data
+async def run_web_agent_and_process_result(topic, folder):
+    print(f"Running web agent for topic: {topic}")
+    news_data = run_web_agent(topic, folder)
+    save_data(news_data, folder)
+    # summary = summarize_data(news_data)
+
+    # with open(os.path.join(folder, "summary_log.txt"), 'a') as log_file:
+    #     log_file.write(f"{datetime.now()}: {summary}\n")
+
+    # Process the result immediately after obtaining it
+    user_message = f"New data received: {news_data}"
+    query_result = await query_llm(user_message)
+    # Process the query_result as needed
 
 
 async def async_main():
     # Assuming this is your asynchronous main function with its full details
@@ -205,22 +217,17 @@ async def async_main():
 def sync_main():
     # Synchronous part for scheduling
     topic = "tesla news"
-    interval = 1 # in minutes
+    interval = 8 # in hours
     folder = "agent-py-bot/scrape/raw"
-    schedule.every(interval).minutes.do(run_web_agent, topic=topic, folder=folder)
+    # schedule.every(interval).minutes.do(run_web_agent_and_process_result, topic=topic, folder=folder)
+    schedule.every(interval).hours.do(lambda: asyncio.run(run_web_agent_and_process_result(topic=topic, folder=folder)))
 
     # Run once at the start
-    news_json = run_web_agent(topic=topic, folder=folder)
+    news_json = asyncio.run(run_web_agent_and_process_result(topic=topic, folder=folder))
 
     while True:
         schedule.run_pending()
         time.sleep(1)
-        # Check if there's new data obtained from web agent
-        new_data, new_summary = run_web_agent(topic=topic, folder=folder)
-
-        # Use the new data to call the async function
-        user_message = f"New data received: {new_data}"
-        query_result = query_llm(user_message)
 
 if __name__ == '__main__':
     loop = asyncio.get_event_loop()
diff --git a/agent-py-bot/agents/webagent.py b/agent-py-bot/agents/webagent.py
index 3759a3e..41e6e97 100644
--- a/agent-py-bot/agents/webagent.py
+++ b/agent-py-bot/agents/webagent.py
@@ -2,7 +2,7 @@ import requests
 from bs4 import BeautifulSoup
 import os
 import json
-from datetime import datetime
+from datetime import datetime, timedelta
 import feedparser
 
 def search_duckduckgo(topic):
@@ -20,10 +20,25 @@ def search_duckduckgo(topic):
 
     return page_text + page_text2
 
-def search_newsapi(topic, api_key):
-    url = f"https://newsapi.org/v2/everything?q={topic}&apiKey={api_key}"
-    response = requests.get(url)
+def search_newsapi(topic, api_key, from_param=None):
+    endpoint = "https://newsapi.org/v2/everything"
+
+    # Set up parameters including your API key and query parameters
+    params = {
+        'apiKey': api_key,
+        'q': topic,
+        'from': from_param, # Specify the date in the format "YYYY-MM-DD"
+        'sortBy': 'publishedAt',
+        'language': 'en',
+    }
+
+    # Add 'from' parameter only if 'from_param' is provided
+    if from_param:
+        params['from'] = from_param
+
+    response = requests.get(endpoint, params=params)
     articles = response.json().get('articles', [])
+    headlines = [article.get('title', '') for article in articles]
     return articles
 
 def parse_rss_feed(feed_url):
@@ -51,7 +66,7 @@ def search_google_news(topic):
 
 
 
-def get_google_search_results(query):
+def get_google_search_results_old_requiresLogin(query):
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
 
@@ -61,7 +76,6 @@ def get_google_search_results(query):
     # return page_text
     # else:
     #     return f"Error: {response.status_code}"
-
    try:
        response = requests.get(f"https://www.google.com/search?q={query}", headers=headers)
        response.raise_for_status()
@@ -75,13 +89,76 @@
     except Exception as e:
         return f"Parsing Error: {e}"
 
+
+def google_search_api_headlines(query, api_key, cx, daterange=None):
+    try:
+        # Set up the API endpoint
+        endpoint = "https://www.googleapis.com/customsearch/v1"
+
+        # Set up parameters including your API key and custom search engine ID (cx)
+        params = {
+            'key': api_key,
+            'cx': cx,
+            'q': query
+        }
+
+        # Add 'dateRestrict' parameter if provided
+        if daterange:
+            params['dateRestrict'] = daterange
+
+        # Make the request to the Custom Search API
+        response = requests.get(endpoint, params=params)
+        response.raise_for_status()
+
+        # Parse the JSON response
+        search_results = response.json()
+
+        # Extract and return headlines from the response
+        items = search_results.get('items', [])
+        headlines = [item.get('title', '') for item in items]
+        return headlines
+    except Exception as e:
+        return f"API Request Error: {e}"
+
+def get_news_api_results(query, api_key, from_param):
+    try:
+        # Set up the API endpoint
+        endpoint = "https://newsapi.org/v2/everything"
+
+        # Set up parameters including your API key and query parameters
+        params = {
+            'apiKey': api_key,
+            'q': query,
+            'from': from_param, # Specify the date in the format "YYYY-MM-DD"
+            'sortBy': 'publishedAt',
+            'language': 'en',
+        }
+
+        # Make the request to the News API
+        response = requests.get(endpoint, params=params)
+        response.raise_for_status()
+
+        # Parse the JSON response
+        news_results = response.json()
+
+        # Extract and return relevant information from the response
+        articles = news_results.get('articles', [])
+        headlines = [article.get('title', '') for article in articles]
+        return headlines
+    except Exception as e:
+        return f"API Request Error: {e}"
+
 def search_news(topic):
     # DuckDuckGo Results
     duck_results = search_duckduckgo(topic)
 
     # NewsAPI Results
+    current_date = datetime.now()
+    from_date = current_date - timedelta(days=2)
+    from_param = from_date.strftime('%Y-%m-%d')
+
     newsapi_key = "44721311c40147ea9fe19080621cdb8a"
-    newsapi_results = search_newsapi(topic, newsapi_key)
+    newsapi_results = search_newsapi(topic, newsapi_key, from_param)
 
     # RSS Feed Results
     rss_feeds = ['http://feeds.reuters.com/Reuters/worldNews',] # Add more RSS URLs
@@ -91,7 +168,18 @@
 
     # Google News Results
     # google_results = search_google_news(topic)
-    google_results = get_google_search_results(topic)
+    # google_results = get_google_search_results(topic)
+
+    # # //t-air: AIzaSyBC5-h1-WFqwKeHhagB-I1pcjRVEkvUZp4 https://console.cloud.google.com/apis/credentials/key/15ab8371-c67b-4d3a-a9af-7106cb4015e5?authuser=0&project=t-air-1704714414235 cx=049ff6d98d29c4e67
+    api_key = "AIzaSyBC5-h1-WFqwKeHhagB-I1pcjRVEkvUZp4"
+    cx = "049ff6d98d29c4e67"
+    query = topic
+    daterange = "d1" # Specify the date range according to Google's search syntax
+    #                  d1: Past 24 hours
+    #                  w1: Past week
+    # daterange_8_hours = "h8"
+    # daterange_3_days = "d3"
+    google_results = google_search_api_headlines(query, api_key, cx, daterange)
 
     return {
         "duckduckgo": duck_results,
@@ -117,8 +205,8 @@ def summarize_data(data):
 def run_web_agent(topic, folder):
     print(f"Running web agent for topic: {topic}")
     news_data = search_news(topic)
-    save_data(news_data, folder)
-    summary = summarize_data(news_data)
-    with open(os.path.join(folder, "summary_log.txt"), 'a') as log_file:
-        log_file.write(f"{datetime.now()}: {summary}\n")
+    # save_data(news_data, folder)
+    # summary = summarize_data(news_data)
+    # with open(os.path.join(folder, "summary_log.txt"), 'a') as log_file:
+    #     log_file.write(f"{datetime.now()}: {summary}\n")
     return news_data
\ No newline at end of file
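
Two of the agent.py lines above deserve a closer look. `schedule` only knows how to call plain synchronous callables, so the coroutine `run_web_agent_and_process_result` has to be driven explicitly; the patched `sync_main()` does this with `asyncio.run(...)`. A minimal self-contained sketch of that pattern, with a stub coroutine standing in for the real job (the helper names below are illustrative, not from the repository):

    import asyncio
    import time

    import schedule

    async def scrape_job(topic, folder):
        # Stand-in for run_web_agent_and_process_result(topic, folder).
        print(f"scraping '{topic}' into {folder}")

    def run_job(topic, folder):
        # schedule invokes this plain function; it drives the coroutine
        # to completion on a fresh event loop each time it fires.
        asyncio.run(scrape_job(topic, folder))

    def main():
        topic, folder = "tesla news", "agent-py-bot/scrape/raw"
        schedule.every(8).hours.do(run_job, topic=topic, folder=folder)
        run_job(topic, folder)  # run once at start
        while True:
            schedule.run_pending()
            time.sleep(1)

    if __name__ == "__main__":
        main()

Note that `asyncio.run()` creates and closes its own event loop, so this only works while no other loop is running in the same thread; if `sync_main()` ever runs alongside the bot's main event loop, the job should instead be handed to that loop, e.g. via `asyncio.run_coroutine_threadsafe(...)`.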
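
On the NewsAPI side, `search_news` now computes a rolling two-day window and passes it as `from`. One detail worth knowing: `requests` silently drops query parameters whose value is `None`, which is why `search_newsapi` can put `'from': from_param` straight into the dict and still behave sensibly when no date is given (the later `if from_param:` guard is redundant but harmless). A compact equivalent sketch, using the same endpoint and parameters as the patch (the helper name is illustrative):

    import requests
    from datetime import datetime, timedelta

    def newsapi_headlines(topic, api_key, days_back=2):
        # Only articles published within the last `days_back` days.
        from_param = (datetime.now() - timedelta(days=days_back)).strftime("%Y-%m-%d")
        params = {
            "apiKey": api_key,
            "q": topic,
            "from": from_param,      # "YYYY-MM-DD"
            "sortBy": "publishedAt",
            "language": "en",
        }
        resp = requests.get("https://newsapi.org/v2/everything", params=params, timeout=30)
        resp.raise_for_status()
        return [a.get("title", "") for a in resp.json().get("articles", [])]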
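
Finally, `google_search_api_headlines` is a thin wrapper over the Custom Search JSON API. A hedged usage sketch (the key and cx below are placeholders; real values come from the Google Cloud console and the Programmable Search Engine dashboard):

    import requests

    def fetch_headlines(query, api_key, cx, date_restrict="d1"):
        # dateRestrict accepts d<N>, w<N>, m<N>, y<N>:
        # d1 = past day, w1 = past week, m1 = past month, y1 = past year.
        params = {"key": api_key, "cx": cx, "q": query, "dateRestrict": date_restrict}
        resp = requests.get("https://www.googleapis.com/customsearch/v1",
                            params=params, timeout=30)
        resp.raise_for_status()
        return [item.get("title", "") for item in resp.json().get("items", [])]

    # Placeholder credentials; substitute your own key and engine ID.
    print(fetch_headlines("tesla news", api_key="YOUR_API_KEY", cx="YOUR_CX"))

The commented-out `daterange_8_hours = "h8"` in the patch would need verification before use; hour-level granularity is not part of the documented dateRestrict syntax.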