# gogo2/agent-py-bot/agents/webagent.py

import requests
from bs4 import BeautifulSoup
import os
import json
from datetime import datetime, timedelta
import feedparser

def search_duckduckgo(topic, timeout=10):
    """Fetch DuckDuckGo results for ``topic`` and return them as plain text.

    Two requests are made: one to ``api.duckduckgo.com`` with ``format=json``
    and one to the regular search page with the news tab selected
    (``iar=news&ia=news``). Each response body is run through BeautifulSoup
    and reduced to newline-separated text.

    Args:
        topic: Search phrase. It is passed via ``params`` so ``requests``
            URL-encodes it.
        timeout: Seconds ``requests`` waits on each request before raising
            a timeout error.

    Returns:
        The text of the first response immediately followed by the text of
        the second one (no separator is inserted between the two).

    Raises:
        requests.RequestException: If either request fails or times out.
    """
    api_response = requests.get(
        "http://api.duckduckgo.com/",
        params={'q': topic, 'format': 'json'},
        timeout=timeout,
    )
    api_soup = BeautifulSoup(api_response.text, 'html.parser')
    page_text = api_soup.get_text(separator='\n', strip=True)

    # The news page needs its own request; re-parsing the first response
    # here would only return the same text twice.
    news_response = requests.get(
        "https://duckduckgo.com/",
        params={'q': topic, 'iar': 'news', 'ia': 'news'},
        timeout=timeout,
    )
    news_soup = BeautifulSoup(news_response.text, 'html.parser')
    page_text2 = news_soup.get_text(separator='\n', strip=True)

    return page_text + page_text2

def search_newsapi(topic, api_key, from_param=None, timeout=10):
    """Query the NewsAPI ``/v2/everything`` endpoint for articles on ``topic``.

    Results are requested in English, sorted by publication time.

    Args:
        topic: Search phrase sent as the ``q`` parameter.
        api_key: NewsAPI key sent as the ``apiKey`` parameter.
        from_param: Optional oldest publication date in "YYYY-MM-DD" format.
            The ``from`` parameter is sent only when this is truthy.
        timeout: Seconds ``requests`` waits before raising a timeout error.

    Returns:
        The list stored under the ``articles`` key of the JSON response, or
        an empty list when that key is absent.

    Raises:
        requests.RequestException: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
    """
    params = {
        'apiKey': api_key,
        'q': topic,
        'sortBy': 'publishedAt',
        'language': 'en',
    }

    # Add 'from' only when a date was supplied, so an empty value is never
    # sent to the API as a bare "from=".
    if from_param:
        params['from'] = from_param

    response = requests.get(
        "https://newsapi.org/v2/everything",
        params=params,
        timeout=timeout,
    )
    return response.json().get('articles', [])

def parse_rss_feed(feed_url):
    """Parse the feed at ``feed_url`` and return its entries as title/link dicts.

    Args:
        feed_url: Feed location handed to ``feedparser.parse``.

    Returns:
        A list with one ``{'title': ..., 'link': ...}`` dict per feed entry.
        A field that an entry does not provide is returned as ``''``.
    """
    feed = feedparser.parse(feed_url)
    # Fall back to '' for a missing field so that one incomplete entry does
    # not raise AttributeError and discard the rest of the feed.
    return [
        {
            'title': getattr(entry, 'title', ''),
            'link': getattr(entry, 'link', ''),
        }
        for entry in feed.entries
    ]

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def search_google_news(topic):
    """Load the Google News search page for ``topic`` in headless Chrome.

    Args:
        topic: Search phrase; it is URL-encoded before being placed in the
            query string.

    Returns:
        The visible text of the rendered page, one text node per line.
        Cookie/consent dialogs are not handled, so the text may be that of
        a consent page rather than of the results.
    """
    from urllib.parse import quote_plus

    options = Options()
    # Pass the switch to Chrome directly: the ``Options.headless`` setter was
    # deprecated and later removed in Selenium 4, where assigning it no
    # longer makes the browser headless.
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(
            f"https://www.google.com/search?q={quote_plus(str(topic))}&tbm=nws"
        )
        # Code to accept cookies or terms goes here

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        return soup.get_text(separator='\n', strip=True)
    finally:
        # Always shut the browser down, even if loading or parsing failed.
        driver.quit()


def get_google_search_results_old_requiresLogin(query, timeout=10):
    """Fetch a Google search results page with ``requests`` and return its text.

    NOTE(review): the function name suggests this approach was retired
    because Google started requiring a login - confirm before relying on it.

    Args:
        query: Search phrase. It is passed via ``params`` so characters such
            as ``&`` or ``#`` cannot corrupt the URL.
        timeout: Seconds ``requests`` waits before raising a timeout error.

    Returns:
        The page text with one text node per line on success. On any
        failure (network error, timeout, HTTP error status, parsing error)
        the string ``"Parsing Error: <details>"`` is returned instead of
        raising.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

    try:
        response = requests.get(
            "https://www.google.com/search",
            params={'q': query},
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()

        # Ensure the correct character set is used
        response.encoding = response.apparent_encoding

        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text(separator='\n', strip=True)
    except Exception as e:
        # Callers receive the error as a string rather than an exception.
        return f"Parsing Error: {e}"


def google_search_api_headlines(query, api_key, cx, daterange=None):
    """Return result titles from the Google Custom Search JSON API.

    Args:
        query: Search phrase sent as ``q``.
        api_key: API key sent as ``key``.
        cx: Custom search engine ID.
        daterange: Optional value for the ``dateRestrict`` parameter; it is
            sent only when truthy.

    Returns:
        A list with the ``title`` of every entry under ``items`` (``''`` for
        an entry without one). If anything raises, the string
        ``"API Request Error: <details>"`` is returned instead.
    """
    try:
        request_args = {'key': api_key, 'cx': cx, 'q': query}
        if daterange:
            request_args['dateRestrict'] = daterange

        reply = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params=request_args,
        )
        # Turn HTTP error statuses into exceptions handled below.
        reply.raise_for_status()

        titles = []
        for hit in reply.json().get('items', []):
            titles.append(hit.get('title', ''))
        return titles
    except Exception as err:
        return f"API Request Error: {err}"

def get_news_api_results(query, api_key, from_param):
    """Return article titles from the NewsAPI ``/v2/everything`` endpoint.

    Args:
        query: Search phrase sent as ``q``.
        api_key: NewsAPI key sent as ``apiKey``.
        from_param: Oldest publication date, in "YYYY-MM-DD" format.

    Returns:
        A list with the ``title`` of every entry under ``articles`` (``''``
        for an entry without one). If anything raises, the string
        ``"API Request Error: <details>"`` is returned instead.
    """
    try:
        reply = requests.get(
            "https://newsapi.org/v2/everything",
            params={
                'apiKey': api_key,
                'q': query,
                'from': from_param,
                'sortBy': 'publishedAt',
                'language': 'en',
            },
        )
        # Turn HTTP error statuses into exceptions handled below.
        reply.raise_for_status()

        payload = reply.json()
        titles = []
        for story in payload.get('articles', []):
            titles.append(story.get('title', ''))
        return titles
    except Exception as err:
        return f"API Request Error: {err}"

def search_news(topic, newsapi_key=None, google_api_key=None, google_cx=None,
                rss_feeds=None, daterange="d1"):
    """Collect news about ``topic`` from DuckDuckGo, NewsAPI, RSS and Google.

    Args:
        topic: Search phrase forwarded to every topic-based source.
        newsapi_key: NewsAPI key. Falls back to the ``NEWSAPI_KEY``
            environment variable, then to the legacy built-in key.
        google_api_key: Google Custom Search API key. Falls back to the
            ``GOOGLE_API_KEY`` environment variable, then to the legacy
            built-in key.
        google_cx: Custom search engine ID. Falls back to the
            ``GOOGLE_CSE_ID`` environment variable, then to the legacy
            built-in ID.
        rss_feeds: Iterable of RSS feed URLs to read; defaults to the
            Reuters world news feed.
        daterange: Value for Google's ``dateRestrict`` parameter, e.g.
            "d1" for the past day or "w1" for the past week.

    Returns:
        A dict with four keys:
        ``duckduckgo`` (text from ``search_duckduckgo``),
        ``newsapi`` (articles from ``search_newsapi``, limited to the last
        two days),
        ``rss`` (combined entries from ``parse_rss_feed`` for every feed),
        ``google`` (whatever ``google_search_api_headlines`` returns: a
        list of titles, or its error string on failure).
    """
    # SECURITY: these credentials used to exist only as literals committed
    # in this file. Explicit arguments and environment variables now take
    # precedence; the literals remain purely as a backward-compatible
    # fallback and should be rotated and then deleted.
    newsapi_key = (
        newsapi_key
        or os.environ.get("NEWSAPI_KEY")
        or "44721311c40147ea9fe19080621cdb8a"
    )
    google_api_key = (
        google_api_key
        or os.environ.get("GOOGLE_API_KEY")
        or "AIzaSyBC5-h1-WFqwKeHhagB-I1pcjRVEkvUZp4"
    )
    google_cx = (
        google_cx
        or os.environ.get("GOOGLE_CSE_ID")
        or "049ff6d98d29c4e67"
    )
    if rss_feeds is None:
        rss_feeds = ['http://feeds.reuters.com/Reuters/worldNews']

    # DuckDuckGo Results
    duck_results = search_duckduckgo(topic)

    # NewsAPI Results: only articles published within the last two days.
    from_param = (datetime.now() - timedelta(days=2)).strftime('%Y-%m-%d')
    newsapi_results = search_newsapi(topic, newsapi_key, from_param)

    # RSS Feed Results (not filtered by topic)
    rss_results = []
    for feed_url in rss_feeds:
        rss_results.extend(parse_rss_feed(feed_url))

    # Google Results via the Custom Search API
    google_results = google_search_api_headlines(
        topic, google_api_key, google_cx, daterange
    )

    return {
        "duckduckgo": duck_results,
        "newsapi": newsapi_results,
        "rss": rss_results,
        "google": google_results
    }

def save_data(data, folder):
    """Write ``data`` as JSON to a timestamped file inside ``folder``.

    The folder (including missing parents) is created when necessary. The
    file is named ``data_<YYYYmmdd_HHMMSS>.json``, so two calls within the
    same second write to the same file.

    Args:
        data: Any object ``json.dump`` can serialize.
        folder: Directory that receives the file.

    Returns:
        The path of the file that was written.

    Raises:
        TypeError: If ``data`` is not JSON-serializable.
        OSError: If the folder or file cannot be created.
    """
    # exist_ok avoids the check-then-create race of testing for the folder
    # first and creating it afterwards.
    os.makedirs(folder, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    file_path = os.path.join(folder, f"data_{timestamp}.json")

    # Explicit encoding so the output does not depend on the platform's
    # default locale.
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file)

    return file_path

def summarize_data(data):
    """Placeholder summarizer: ignores ``data`` and returns a fixed string."""
    # Replace with actual summarization logic
    return "Summarized information"

def run_web_agent(topic, folder):
    """Announce the run on stdout and return the news gathered for ``topic``.

    Args:
        topic: Search phrase handed to ``search_news``.
        folder: Currently unused; the steps that would save the data and
            append a summary to a log in this folder are disabled.

    Returns:
        The value returned by ``search_news(topic)``.
    """
    print(f"Running web agent for topic: {topic}")
    # Persisting the results and logging a summary are switched off for now,
    # so the collected data is handed straight back to the caller.
    return search_news(topic)