fix google search - using api

Dobromir Popov 2024-01-08 17:53:18 +02:00
parent 7c0dd39227
commit 3d114d1a76
2 changed files with 118 additions and 23 deletions


@@ -190,9 +190,21 @@ async def bad_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None
 #------------------------- webagent --------------------------#
 import schedule
 import time
-from agents.webagent import run_web_agent
+from agents.webagent import run_web_agent, save_data
+
+async def run_web_agent_and_process_result(topic, folder):
+    print(f"Running web agent for topic: {topic}")
+    news_data = run_web_agent(topic, folder)
+    save_data(news_data, folder)
+    # summary = summarize_data(news_data)
+    # with open(os.path.join(folder, "summary_log.txt"), 'a') as log_file:
+    #     log_file.write(f"{datetime.now()}: {summary}\n")
+
+    # Process the result immediately after obtaining it
+    user_message = f"New data received: {news_data}"
+    query_result = await query_llm(user_message)
+    # Process the query_result as needed
 
 async def async_main():
     # Assuming this is your asynchronous main function with its full details
@@ -205,22 +217,17 @@ async def async_main():
 def sync_main():
     # Synchronous part for scheduling
     topic = "tesla news"
-    interval = 1  # in minutes
+    interval = 8  # in minutes
     folder = "agent-py-bot/scrape/raw"
 
-    schedule.every(interval).minutes.do(run_web_agent, topic=topic, folder=folder)
+    # schedule.every(interval).minutes.do(run_web_agent_and_process_result, topic=topic, folder=folder)
+    schedule.every(interval).hours.do(run_web_agent_and_process_result, topic=topic, folder=folder)
 
     # Run once at the start
-    news_json = run_web_agent(topic=topic, folder=folder)
+    news_json = await run_web_agent_and_process_result(topic=topic, folder=folder)
 
     while True:
         schedule.run_pending()
         time.sleep(1)
-
-        # Check if there's new data obtained from web agent
-        new_data, new_summary = run_web_agent(topic=topic, folder=folder)
-
-        # Use the new data to call the async function
-        user_message = f"New data received: {new_data}"
-        query_result = query_llm(user_message)
 
 if __name__ == '__main__':
     loop = asyncio.get_event_loop()
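A note on the new scheduling path: `schedule` invokes its jobs synchronously, so the coroutine `run_web_agent_and_process_result` cannot be handed directly to `schedule.every(...).do(...)`, and `await` inside the synchronous `sync_main()` is not valid Python. Below is a minimal sketch of one way to bridge the two, assuming the function and constants from the hunk above; the wrapper name `run_web_agent_job` is illustrative, not part of this commit.

import asyncio
import schedule
import time

def run_web_agent_job(topic, folder):
    # schedule calls this plain function; asyncio.run starts an event loop,
    # drives the coroutine to completion, and closes the loop again.
    asyncio.run(run_web_agent_and_process_result(topic=topic, folder=folder))

def sync_main():
    topic = "tesla news"
    folder = "agent-py-bot/scrape/raw"
    schedule.every(8).hours.do(run_web_agent_job, topic=topic, folder=folder)
    run_web_agent_job(topic, folder)  # run once at the start
    while True:
        schedule.run_pending()
        time.sleep(1)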


@@ -2,7 +2,7 @@ import requests
 from bs4 import BeautifulSoup
 import os
 import json
-from datetime import datetime
+from datetime import datetime, timedelta
 import feedparser
 
 def search_duckduckgo(topic):
@@ -20,10 +20,25 @@ def search_duckduckgo(topic):
     return page_text + page_text2
 
-def search_newsapi(topic, api_key):
-    url = f"https://newsapi.org/v2/everything?q={topic}&apiKey={api_key}"
-    response = requests.get(url)
+def search_newsapi(topic, api_key, from_param=None):
+    endpoint = "https://newsapi.org/v2/everything"
+
+    # Set up parameters including your API key and query parameters
+    params = {
+        'apiKey': api_key,
+        'q': topic,
+        'from': from_param,  # Specify the date in the format "YYYY-MM-DD"
+        'sortBy': 'publishedAt',
+        'language': 'en',
+    }
+
+    # Add 'from' parameter only if 'from_param' is provided
+    if from_param:
+        params['from'] = from_param
+
+    response = requests.get(endpoint, params=params)
     articles = response.json().get('articles', [])
+    headlines = [article.get('title', '') for article in articles]
     return articles
 
 def parse_rss_feed(feed_url):
@@ -51,7 +66,7 @@ def search_google_news(topic):
 
-def get_google_search_results(query):
+def get_google_search_results_old_requiresLogin(query):
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
@@ -61,7 +76,6 @@ def get_google_search_results(query):
     # return page_text
     # else:
     #     return f"Error: {response.status_code}"
     try:
         response = requests.get(f"https://www.google.com/search?q={query}", headers=headers)
         response.raise_for_status()
@@ -75,13 +89,76 @@ def get_google_search_results(query):
     except Exception as e:
         return f"Parsing Error: {e}"
 
+def google_search_api_headlines(query, api_key, cx, daterange=None):
+    try:
+        # Set up the API endpoint
+        endpoint = "https://www.googleapis.com/customsearch/v1"
+
+        # Set up parameters including your API key and custom search engine ID (cx)
+        params = {
+            'key': api_key,
+            'cx': cx,
+            'q': query
+        }
+
+        # Add 'dateRestrict' parameter if provided
+        if daterange:
+            params['dateRestrict'] = daterange
+
+        # Make the request to the Custom Search API
+        response = requests.get(endpoint, params=params)
+        response.raise_for_status()
+
+        # Parse the JSON response
+        search_results = response.json()
+
+        # Extract and return headlines from the response
+        items = search_results.get('items', [])
+        headlines = [item.get('title', '') for item in items]
+        return headlines
+    except Exception as e:
+        return f"API Request Error: {e}"
+
+def get_news_api_results(query, api_key, from_param):
+    try:
+        # Set up the API endpoint
+        endpoint = "https://newsapi.org/v2/everything"
+
+        # Set up parameters including your API key and query parameters
+        params = {
+            'apiKey': api_key,
+            'q': query,
+            'from': from_param,  # Specify the date in the format "YYYY-MM-DD"
+            'sortBy': 'publishedAt',
+            'language': 'en',
+        }
+
+        # Make the request to the News API
+        response = requests.get(endpoint, params=params)
+        response.raise_for_status()
+
+        # Parse the JSON response
+        news_results = response.json()
+
+        # Extract and return relevant information from the response
+        articles = news_results.get('articles', [])
+        headlines = [article.get('title', '') for article in articles]
+        return headlines
+    except Exception as e:
+        return f"API Request Error: {e}"
+
 def search_news(topic):
     # DuckDuckGo Results
     duck_results = search_duckduckgo(topic)
 
     # NewsAPI Results
+    current_date = datetime.now()
+    from_date = current_date - timedelta(days=2)
+    from_param = from_date.strftime('%Y-%m-%d')
     newsapi_key = "44721311c40147ea9fe19080621cdb8a"
-    newsapi_results = search_newsapi(topic, newsapi_key)
+    newsapi_results = search_newsapi(topic, newsapi_key, from_param)
 
     # RSS Feed Results
     rss_feeds = ['http://feeds.reuters.com/Reuters/worldNews',]  # Add more RSS URLs
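For reference, a standalone sketch of exercising the new Custom Search helper outside `search_news`; the credentials below are placeholders, and the `dateRestrict` values follow the Custom Search JSON API convention (`d1` past day, `w1` past week, `m1` past month).

api_key = "YOUR_GOOGLE_API_KEY"   # placeholder, not a real key
cx = "YOUR_SEARCH_ENGINE_ID"      # placeholder Programmable Search Engine ID

headlines = google_search_api_headlines("tesla news", api_key, cx, daterange="d1")
if isinstance(headlines, list):
    for title in headlines:
        print(title)
else:
    # the helper returns an error string instead of raising
    print(headlines)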
@@ -91,7 +168,18 @@ def search_news(topic):
     # Google News Results
     # google_results = search_google_news(topic)
-    google_results = get_google_search_results(topic)
+    # google_results = get_google_search_results(topic)
+    # //t-air: AIzaSyBC5-h1-WFqwKeHhagB-I1pcjRVEkvUZp4 https://console.cloud.google.com/apis/credentials/key/15ab8371-c67b-4d3a-a9af-7106cb4015e5?authuser=0&project=t-air-1704714414235 cx=049ff6d98d29c4e67
+    api_key = "AIzaSyBC5-h1-WFqwKeHhagB-I1pcjRVEkvUZp4"
+    cx = "049ff6d98d29c4e67"
+    query = topic
+    daterange = "d1"  # Specify the date range according to Google's search syntax
+    # d1: Past 24 hours
+    # w1: Past week
+    # daterange_8_hours = "h8"
+    # daterange_3_days = "d3"
+    google_results = google_search_api_headlines(query, api_key, cx, daterange)
 
     return {
         "duckduckgo": duck_results,
@@ -117,8 +205,8 @@ def summarize_data(data):
 def run_web_agent(topic, folder):
     print(f"Running web agent for topic: {topic}")
     news_data = search_news(topic)
-    save_data(news_data, folder)
-    summary = summarize_data(news_data)
-    with open(os.path.join(folder, "summary_log.txt"), 'a') as log_file:
-        log_file.write(f"{datetime.now()}: {summary}\n")
+    # save_data(news_data, folder)
+    # summary = summarize_data(news_data)
+    # with open(os.path.join(folder, "summary_log.txt"), 'a') as log_file:
+    #     log_file.write(f"{datetime.now()}: {summary}\n")
     return news_data
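`save_data` is imported alongside `run_web_agent` in the bot file but its body is not part of this diff; a plausible minimal sketch (purely an assumption about its shape) would write one timestamped JSON snapshot per run into the scrape folder, using the `os`, `json`, and `datetime` imports already present in this module.

def save_data(data, folder):
    # Sketch only: persist the combined search results as a JSON snapshot.
    os.makedirs(folder, exist_ok=True)
    filename = datetime.now().strftime("news_%Y%m%d_%H%M%S.json")
    with open(os.path.join(folder, filename), 'w') as out_file:
        json.dump(data, out_file, indent=2, default=str)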