Google API

13 Posts
4 Users
9 Reactions
164 Views
(@amr-atef)
Posts: 12
Active Member Customer
Topic starter
 

Hey awesome programmers,

I’ve started my first project using the Google Search API, but I’m facing a restriction: it only allows a maximum of 100 results per query. Given that I have $300 in credit, it doesn’t make sense to be stuck with this limit. Could anyone help me find a way to remove or work around this restriction?

Also, what about proxies? Could anybody show me how to use them for this?

import requests
from urllib.parse import urlparse
import pandas as pd
from datetime import datetime
import os
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Font, Alignment


# Load the CSV file into a DataFrame
csv_file_path = r'D:\OneDrive - Hewar group\GitHub\test_verto_search\pr_values_for_search_engine.csv'
pr_values_df = pd.read_csv(csv_file_path)

# Ensure the 'website' and 'domain' columns are treated as strings to avoid any potential issues
pr_values_df['website'] = pr_values_df['website'].astype(str)
pr_values_df['domain'] = pr_values_df['domain'].astype(str)


def query_google(query, api_key, cse_id, start_page=1, num_results=10, date_range=None):
    url = "https://www.googleapis.com/customsearch/v1"
    results = []
    while len(results) < num_results:
        params = {
            'q': query,
            'key': api_key,
            'cx': cse_id,
            'num': 10,           # the API returns at most 10 items per request
            'start': start_page  # 1-based index of the first result on this page
        }
        if date_range:
            params['dateRestrict'] = date_range

        response = requests.get(url, params=params)
        if response.status_code == 200:
            response_data = response.json()
            results.extend(response_data.get("items", []))
            start_page += 10
            total_results = int(response_data.get('searchInformation', {}).get('totalResults', "0"))
            if start_page > total_results:
                break
        else:
            error_info = response.json()
            print(f"Error occurred: {response.status_code}, Details: {error_info}")
            break

    return results[:num_results]

def extract_domain(url):
    parsed_url = urlparse(url)
    domain_parts = parsed_url.netloc.split('.')
    if len(domain_parts) >= 2:
        return domain_parts[-2].capitalize()
    return "Unknown"

def extract_search_results(results, pr_values_df):
    extracted_results = []
    for item in results:
        link = item.get('link')
        parsed_domain = urlparse(link).netloc
        
        # Try to match with 'domain' column first
        # regex=False so dots in the domain are matched literally; na=False skips missing values
        match = pr_values_df[(pr_values_df['domain'].str.contains(parsed_domain, case=False, regex=False, na=False)) |
                             (pr_values_df['website'].str.contains(parsed_domain, case=False, regex=False, na=False))]
        
        if not match.empty:
            publication_info = match.iloc[0]
        else:
            publication_info = {'publication': 'N/A', 'type of publication': 'N/A', 'ave': 0, 'circulation': 0, 'impression': 0}

        extracted_data = {
            'date': item.get('pagemap', {}).get('metatags', [{}])[0].get('article:published_time', 'Unknown'),
            'title': item.get('title'),
            'link': link,
            'source_name': urlparse(link).netloc,
            'search_engine_name': 'Google',
            'publication': publication_info['publication'],
            'type_of_publication': publication_info['type of publication'],
            'ave': publication_info.get('ave', 'N/A'),
            'circulation': publication_info.get('circulation', 'N/A'),
            'impression': publication_info.get('impression', 'N/A')
        }
        extracted_results.append(extracted_data)
    return extracted_results


def sanitize_filename(filename):
    invalid_chars = '<>:"/\\|?*'
    for char in invalid_chars:
        filename = filename.replace(char, '')
    return filename




# ... (Other parts of the script) ...


def save_to_excel(data, search_query, directory_path=''):
    today_date = datetime.now().strftime('%Y-%m-%d')
    sanitized_query = sanitize_filename(search_query.replace(' ', '_'))
    base_filename = f"{sanitized_query}_{today_date}"
    extension = ".xlsx"
    filename = base_filename + extension
    full_path = os.path.join(directory_path, filename)

    # Check if the file exists and change the filename by appending a number in brackets
    counter = 2
    while os.path.exists(full_path):
        filename = f"{base_filename}({counter}){extension}"
        full_path = os.path.join(directory_path, filename)
        counter += 1

    wb = Workbook()
    ws = wb.active

    # Convert DataFrame to rows
    df = pd.DataFrame(data)
    rows = dataframe_to_rows(df, index=False, header=True)

    # Write rows, set hyperlink, and format headers
    for r_idx, row in enumerate(rows, 1):
        for c_idx, value in enumerate(row, 1):
            cell = ws.cell(row=r_idx, column=c_idx, value=value)
            if r_idx == 1:
                # For headers, make the text bold and capitalize
                cell.font = Font(bold=True)
                cell.value = cell.value.title()  # Capitalize every first character of each word
            elif r_idx > 1 and c_idx == df.columns.get_loc('link') + 1:  # Hyperlink column
                cell.hyperlink = value
                cell.font = Font(color='0000FF', underline='single')
            # Apply formatting for numeric fields to ensure they are displayed correctly
            elif r_idx > 1 and c_idx in [df.columns.get_loc('ave') + 1, df.columns.get_loc('circulation') + 1, df.columns.get_loc('impression') + 1]:
                cell.number_format = '#,##0'

    wb.save(full_path)
    print(f"Results saved to {full_path}")





def main():
    api_key = 'AIzaSyDNbi-e_zv_tCPB-gh9x9MB_9cVRVPbXgY'
    cse_id = '81b2064e655924c82'
    user_query = input("Enter your search query: ")
    num_results = int(input("Enter the number of results you want: "))
    date_range_input = input("Enter the date range (e.g., 'd5' for last 5 days): ")
    directory_path = input("Enter the directory path to save the file (leave empty for current directory): ")
    pr_values_df = pd.read_csv(r'D:\OneDrive - Hewar group\GitHub\test_verto_search\pr_values_for_search_engine.csv') # Ensure this path is correct

    results = query_google(user_query, api_key, cse_id, num_results=num_results, date_range=date_range_input)
    if results:
        search_results = extract_search_results(results, pr_values_df)
        save_to_excel(search_results, user_query, directory_path)

if __name__ == "__main__":
    main()
 
Posted : 06/02/2024 2:08 pm
(@husein)
Posts: 536
Member Moderator
 

@amr-atef Yeah, the API allows a maximum of 100 results per query; your account balance doesn't matter. 
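
For context, the cap is built into the Custom Search JSON API itself: each request returns at most 10 items (the num parameter), and the docs say the API never serves more than 100 results for a single query, so a start value past the ceiling should come back as an error regardless of billing. A tiny sketch of that behaviour (YOUR_API_KEY, YOUR_CSE_ID, and the query are placeholders):

import requests

API_KEY = "YOUR_API_KEY"  # placeholder
CSE_ID = "YOUR_CSE_ID"    # placeholder
URL = "https://www.googleapis.com/customsearch/v1"

for start in (1, 91, 101):  # 91 is the last start index that stays within 100 results
    resp = requests.get(URL, params={
        "q": "example query",
        "key": API_KEY,
        "cx": CSE_ID,
        "num": 10,
        "start": start,
    })
    # start=1 and start=91 should return normal pages; start=101 should be rejected (HTTP 400)
    print(start, resp.status_code)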

 
Posted : 06/03/2024 12:58 pm
Amr Atef reacted
Hasan Aboul Hasan
(@admin)
Posts: 1257
Member Admin
 

you will need to use paging to get more than 100 results

 
Posted : 06/03/2024 1:56 pm
SSAdvisor and Amr Atef reacted
(@amr-atef)
Posts: 12
Active Member Customer
Topic starter
 

@admin sir, could you please show me the right way to do this so I can get reliable, sustainable results?

 
Posted : 06/04/2024 3:12 am
Hasan Aboul Hasan
(@admin)
Posts: 1257
Member Admin
 

@amr-atef paging is a feature in the API. Check the documentation here: Using REST to Invoke the API  |  Programmable Search Engine  |  Google for Developers

You will see nextPage in the metadata values; you can use it to load the next page, and so on.
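
A minimal sketch of what that paging loop could look like, just as an illustration (not the code from the course); replace YOUR_API_KEY and YOUR_CSE_ID with your own values:

import requests

def fetch_results(query, api_key, cse_id, max_results=100):
    # The JSON API serves at most 100 results per query, so max_results is capped at 100 in practice.
    url = "https://www.googleapis.com/customsearch/v1"
    params = {"q": query, "key": api_key, "cx": cse_id, "num": 10}
    items = []
    while len(items) < max_results:
        response = requests.get(url, params=params)
        response.raise_for_status()
        data = response.json()
        items.extend(data.get("items", []))
        # queries.nextPage disappears from the response when there is nothing left to load
        next_page = data.get("queries", {}).get("nextPage")
        if not next_page or next_page[0]["startIndex"] > 91:
            break
        params["start"] = next_page[0]["startIndex"]
    return items[:max_results]

results = fetch_results("your search", "YOUR_API_KEY", "YOUR_CSE_ID", max_results=50)
print(len(results))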

did you write the code you shared?

 
Posted : 06/04/2024 10:38 am
(@amr-atef)
Posts: 12
Active Member Customer
Topic starter
 

@admin no, of course not, ChatGPT wrote it 😆 , that's why I can't get any further and can't find a solution 😪

 
Posted : 06/04/2024 11:57 am
(@husein)
Posts: 536
Member Moderator
 

@amr-atef So, you don't have any experience in coding and you can't read code?

 
Posted : 06/04/2024 12:37 pm
(@amr-atef)
Posts: 12
Active Member Customer
Topic starter
 

@husein no, I can read code and make some modifications thanks to the Python scripting mini course in your prompt engineering course... isn't that enough?

 
Posted : 06/04/2024 2:35 pm
(@husein)
Posts: 536
Member Moderator
 

@amr-atef It is enough for now, but if you want to get better at writing code and building bigger projects, of course you'll have to keep practicing to improve.

But for now, the skill set you gained from the course should be enough for you to modify the code in line with what @admin mentioned above:

Posted by: @admin

@amr-atef paging is a feature in the API. Check the documentation here: Using REST to Invoke the API  |  Programmable Search Engine  |  Google for Developers

You will see nextPage in the metadata values; you can use it to load the next page, and so on.

did you write the code you shared?

So, try doing that, and if you face any problems we'll help you with it.

 

 
Posted : 06/04/2024 2:45 pm
Amr Atef and SSAdvisor reacted
(@amr-atef)
Posts: 12
Active Member Customer
Topic starter
 

@husein I'm interested in getting better at coding. Do you have a roadmap, a method, a website, or any advice for improving at writing and developing code?

Thank you a lot, bro, for your reply and support ❤️ ❤️ ❤️ 

 
Posted : 06/05/2024 2:04 pm
SSAdvisor
(@ssadvisor)
Posts: 1139
Noble Member
 

@amr-atef you can check https://freecodecamp.org for some lessons.

Regards,
Earnie Boyd, CEO
Seasoned Solutions Advisor LLC
Schedule 1-on-1 help
Join me on Slack

 
Posted : 06/05/2024 6:43 pm
Amr Atef reacted
Hasan Aboul Hasan
(@admin)
Posts: 1257
Member Admin
 

@amr-atef Soon we will have a new Python course with coding projects to help you practice more.

For now, you can pick a project idea that you like and start building it, and we are here to help. Creating a real-world project is the best way to become better at coding and programming.

 
Posted : 06/06/2024 11:51 am
Amr Atef reacted
(@husein)
Posts: 536
Member Moderator
 

@amr-atef I don't have anything in mind right now, but if I come across a good roadmap I'll let you know. Hopefully @admin will upload a roadmap soon with more practice exercises; that would help you.

 
Posted : 06/06/2024 12:49 pm
Amr Atef reacted