import os
import json
from datetime import datetime, timedelta

import requests
import feedparser
import yfinance as yf
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def search_duckduckgo(topic):
    query = requests.utils.quote(topic)

    # Instant Answer API response (JSON), scraped as plain text.
    url = f"https://api.duckduckgo.com/?q={query}&format=json"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    page_text = soup.get_text(separator='\n', strip=True)

    # News tab, e.g. https://duckduckgo.com/?q=tesla&iar=news&ia=news
    news_url = f"https://duckduckgo.com/?q={query}&iar=news&ia=news"
    news_response = requests.get(news_url)
    news_soup = BeautifulSoup(news_response.text, 'html.parser')
    page_text2 = news_soup.get_text(separator='\n', strip=True)

    return page_text + page_text2


def search_newsapi(topic, api_key, from_param=None):
    endpoint = "https://newsapi.org/v2/everything"

    # API key plus query parameters.
    params = {
        'apiKey': api_key,
        'q': topic,
        'sortBy': 'publishedAt',
        'language': 'en',
    }

    # Add 'from' only if a date is provided, in the format "YYYY-MM-DD".
    if from_param:
        params['from'] = from_param

    response = requests.get(endpoint, params=params)
    articles = response.json().get('articles', [])
    return articles


def parse_rss_feed(feed_url):
    feed = feedparser.parse(feed_url)
    articles = [{'title': entry.title, 'link': entry.link} for entry in feed.entries]
    return articles


def fetch_stock_data(ticker, interval='1d', period='1mo'):
    # Pull OHLCV price history for a ticker via yfinance.
    stock = yf.Ticker(ticker)
    hist = stock.history(interval=interval, period=period)
    return hist

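
# A minimal usage sketch for fetch_stock_data (ticker and ranges are
# illustrative); yfinance returns a pandas DataFrame indexed by date:
#
#     hist = fetch_stock_data("TSLA", interval="1d", period="1mo")
#     print(hist[["Open", "Close"]].tail())

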
def search_google_news(topic):
    options = Options()
    # Selenium 4 deprecated Options.headless; pass the flag as an argument.
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(f"https://www.google.com/search?q={topic}&tbm=nws")
        # Code to accept cookies or terms goes here

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        page_text = soup.get_text(separator='\n', strip=True)
        return page_text
    finally:
        driver.quit()


def get_google_search_results_old_requiresLogin(query):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

    try:
        response = requests.get(f"https://www.google.com/search?q={query}", headers=headers)
        response.raise_for_status()

        # Ensure the correct character set is used
        response.encoding = response.apparent_encoding

        soup = BeautifulSoup(response.text, 'html.parser')
        page_text = soup.get_text(separator='\n', strip=True)
        return page_text
    except Exception as e:
        return f"Parsing Error: {e}"


def google_search_api_headlines(query, api_key, cx, daterange=None):
    try:
        # Set up the Custom Search API endpoint.
        endpoint = "https://www.googleapis.com/customsearch/v1"

        # Parameters include the API key and custom search engine ID (cx).
        params = {
            'key': api_key,
            'cx': cx,
            'q': query
        }

        # Add 'dateRestrict' parameter if provided.
        if daterange:
            params['dateRestrict'] = daterange

        # Make the request to the Custom Search API.
        response = requests.get(endpoint, params=params)
        response.raise_for_status()

        # Parse the JSON response.
        search_results = response.json()

        # Extract and return headlines from the response.
        items = search_results.get('items', [])
        headlines = [item.get('title', '') for item in items]
        return headlines
    except Exception as e:
        return f"API Request Error: {e}"


def get_news_api_results(query, api_key, from_param):
    # Like search_newsapi, but returns only the headlines.
    try:
        endpoint = "https://newsapi.org/v2/everything"

        # API key plus query parameters.
        params = {
            'apiKey': api_key,
            'q': query,
            'from': from_param,  # Date in the format "YYYY-MM-DD"
            'sortBy': 'publishedAt',
            'language': 'en',
        }

        # Make the request to the News API.
        response = requests.get(endpoint, params=params)
        response.raise_for_status()

        # Parse the JSON response and extract the headlines.
        news_results = response.json()
        articles = news_results.get('articles', [])
        headlines = [article.get('title', '') for article in articles]
        return headlines
    except Exception as e:
        return f"API Request Error: {e}"


def search_tavily(topic, api_key):
    # Tavily's documented REST endpoint takes a POST with the query in the
    # JSON body and the API key as a Bearer token.
    url = "https://api.tavily.com/search"
    headers = {
        "Authorization": f"Bearer {api_key}"
    }
    response = requests.post(url, headers=headers, json={"query": topic})
    if response.status_code == 200:
        return response.json()
    else:
        return {"error": response.text}


def search_news(topic):
    # DuckDuckGo results
    duck_results = search_duckduckgo(topic)

    # NewsAPI results, restricted to the last two days
    current_date = datetime.now()
    from_date = current_date - timedelta(days=2)
    from_param = from_date.strftime('%Y-%m-%d')

    # API keys are read from environment variables; set NEWSAPI_KEY,
    # GOOGLE_API_KEY, and GOOGLE_CSE_ID before running.
    newsapi_key = os.environ.get("NEWSAPI_KEY", "")
    newsapi_results = search_newsapi(topic, newsapi_key, from_param)

    # RSS feed results. Note that Reuters has discontinued its public RSS
    # feeds, so this URL may no longer resolve.
    rss_feeds = ['http://feeds.reuters.com/Reuters/worldNews',]  # Add more RSS URLs
    rss_results = []
    for feed_url in rss_feeds:
        rss_results.extend(parse_rss_feed(feed_url))

    # Google News results via the Custom Search API;
    # search_google_news(topic) is the Selenium-based alternative.
    api_key = os.environ.get("GOOGLE_API_KEY", "")
    cx = os.environ.get("GOOGLE_CSE_ID", "")
    query = topic
    # dateRestrict syntax: h8 = past 8 hours, d1 = past 24 hours,
    # d3 = past 3 days, w1 = past week.
    daterange = "d1"
    google_results = google_search_api_headlines(query, api_key, cx, daterange)

    return {
        "duckduckgo": duck_results,
        "newsapi": newsapi_results,
        "rss": rss_results,
        "google": google_results
    }


def save_data(data, folder):
    os.makedirs(folder, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    file_path = os.path.join(folder, f"data_{timestamp}.json")

    with open(file_path, 'w') as file:
        json.dump(data, file)


def summarize_data(data):
    summary = "Summarized information"  # Replace with actual summarization logic
    return summary


def run_web_agent(topic, folder):
    print(f"[{datetime.now()}] Running web agent for topic: {topic}")
    news_data = search_news(topic)

    tavily_api_key = os.environ.get("TAVILY_API_KEY", "")
    tavily_results = search_tavily(topic, tavily_api_key)
    news_data["tavily"] = tavily_results

    # Persist the combined results to the target folder.
    save_data(news_data, folder)
    return news_data
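

# A minimal entry point sketch. The topic, output folder, and the
# environment variables used above are assumptions of this sketch,
# not fixed values.
if __name__ == "__main__":
    results = run_web_agent("tesla", "web_agent_data")
    print(f"Collected {len(results)} result sets")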