Google API
Hey awesome programmers,
I’ve started my first project using the Google Custom Search JSON API, but I’m facing a restriction: it only allows a maximum of 100 results per query. Given that I have $300 in my wallet, it doesn’t make sense to have this limit. Could anyone help me find a way to remove or work around this restriction?
What about proxies as well? Could anybody tell me how to use them?
import requests
from urllib.parse import urlparse
import pandas as pd
from datetime import datetime
import os
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Font, Alignment

# Load the CSV file into a DataFrame
csv_file_path = r'D:\OneDrive - Hewar group\GitHub\test_verto_search\pr_values_for_search_engine.csv'
pr_values_df = pd.read_csv(csv_file_path)

# Ensure the 'website' and 'domain' columns are treated as strings to avoid any potential issues
pr_values_df['website'] = pr_values_df['website'].astype(str)
pr_values_df['domain'] = pr_values_df['domain'].astype(str)


def query_google(query, api_key, cse_id, start_page=1, num_results=10, date_range=None):
    url = "https://www.googleapis.com/customsearch/v1"
    results = []
    while len(results) < num_results:
        params = {
            'q': query,
            'key': api_key,
            'cx': cse_id,
            'start': start_page
        }
        if date_range:
            params['dateRestrict'] = date_range
        response = requests.get(url, params=params)
        if response.status_code == 200:
            response_data = response.json()
            results.extend(response_data.get("items", []))
            start_page += 10
            total_results = int(response_data.get('searchInformation', {}).get('totalResults', "0"))
            if start_page > total_results:
                break
        else:
            error_info = response.json()
            print(f"Error occurred: {response.status_code}, Details: {error_info}")
            break
    return results[:num_results]


def extract_domain(url):
    parsed_url = urlparse(url)
    domain_parts = parsed_url.netloc.split('.')
    if len(domain_parts) >= 2:
        return domain_parts[-2].capitalize()
    return "Unknown"


def extract_search_results(results, pr_values_df):
    extracted_results = []
    for item in results:
        link = item.get('link')
        parsed_domain = urlparse(link).netloc
        # Try to match with 'domain' column first
        match = pr_values_df[(pr_values_df['domain'].str.contains(parsed_domain, case=False)) |
                             (pr_values_df['website'].str.contains(parsed_domain, case=False))]
        if not match.empty:
            publication_info = match.iloc[0]
        else:
            publication_info = {'publication': 'N/A', 'type of publication': 'N/A',
                                'ave': 0, 'circulation': 0, 'impression': 0}
        extracted_data = {
            'date': item.get('pagemap', {}).get('metatags', [{}])[0].get('article:published_time', 'Unknown'),
            'title': item.get('title'),
            'link': link,
            'source_name': urlparse(link).netloc,
            'search_engine_name': 'Google',
            'publication': publication_info['publication'],
            'type_of_publication': publication_info['type of publication'],
            'ave': publication_info.get('ave', 'N/A'),
            'circulation': publication_info.get('circulation', 'N/A'),
            'impression': publication_info.get('impression', 'N/A')
        }
        extracted_results.append(extracted_data)
    return extracted_results


def sanitize_filename(filename):
    invalid_chars = '<>:"/\\|?*'
    for char in invalid_chars:
        filename = filename.replace(char, '')
    return filename

# ... (Other parts of the script) ...
def save_to_excel(data, search_query, directory_path=''):
    today_date = datetime.now().strftime('%Y-%m-%d')
    sanitized_query = sanitize_filename(search_query.replace(' ', '_'))
    base_filename = f"{sanitized_query}_{today_date}"
    extension = ".xlsx"
    filename = base_filename + extension
    full_path = os.path.join(directory_path, filename)

    # Check if the file exists and change the filename by appending a number in brackets
    counter = 2
    while os.path.exists(full_path):
        filename = f"{base_filename}({counter}){extension}"
        full_path = os.path.join(directory_path, filename)
        counter += 1

    wb = Workbook()
    ws = wb.active

    # Convert DataFrame to rows
    df = pd.DataFrame(data)
    rows = dataframe_to_rows(df, index=False, header=True)

    # Write rows, set hyperlink, and format headers
    for r_idx, row in enumerate(rows, 1):
        for c_idx, value in enumerate(row, 1):
            cell = ws.cell(row=r_idx, column=c_idx, value=value)
            if r_idx == 1:
                # For headers, make the text bold and capitalize
                cell.font = Font(bold=True)
                cell.value = cell.value.title()  # Capitalize every first character of each word
            elif r_idx > 1 and c_idx == df.columns.get_loc('link') + 1:
                # Hyperlink column
                cell.hyperlink = value
                cell.font = Font(color='0000FF', underline='single')
            # Apply formatting for numeric fields to ensure they are displayed correctly
            elif r_idx > 1 and c_idx in [df.columns.get_loc('ave') + 1,
                                         df.columns.get_loc('circulation') + 1,
                                         df.columns.get_loc('impression') + 1]:
                cell.number_format = '#,##0'

    wb.save(full_path)
    print(f"Results saved to {full_path}")


def main():
    api_key = 'AIzaSyDNbi-e_zv_tCPB-gh9x9MB_9cVRVPbXgY'
    cse_id = '81b2064e655924c82'
    user_query = input("Enter your search query: ")
    num_results = int(input("Enter the number of results you want: "))
    date_range_input = input("Enter the date range (e.g., 'd5' for last 5 days): ")
    directory_path = input("Enter the directory path to save the file (leave empty for current directory): ")
    pr_values_df = pd.read_csv(r'D:\OneDrive - Hewar group\GitHub\test_verto_search\pr_values_for_search_engine.csv')  # Ensure this path is correct
    results = query_google(user_query, api_key, cse_id, num_results=num_results, date_range=date_range_input)
    if results:
        search_results = extract_search_results(results, pr_values_df)
        save_to_excel(search_results, user_query, directory_path)


if __name__ == "__main__":
    main()
@amr-atef Paging is a feature of the API. Check the documentation here: Using REST to Invoke the API | Programmable Search Engine | Google for Developers
You will see nextPage in the metadata values; you can use it to load the next page, and so on.
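For example, something like this rough, untested sketch (the key and CSE id here are placeholders; the queries.nextPage and startIndex fields come from the response metadata described in the docs):

import requests

# Rough, untested sketch: the 'queries' object in the response tells you where the next page starts.
api_key = 'YOUR_API_KEY'   # placeholder
cse_id = 'YOUR_CSE_ID'     # placeholder

data = requests.get(
    "https://www.googleapis.com/customsearch/v1",
    params={'q': 'example search', 'key': api_key, 'cx': cse_id},
).json()

next_page = data.get('queries', {}).get('nextPage')   # absent once there are no more pages
if next_page:
    next_start = next_page[0]['startIndex']           # pass this as 'start' in the next request
    print(f"Next page starts at index {next_start}")

Keep in mind the API only serves the first results of a query (around 100), so paging gets you up to that cap, not past it.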
did you write the code you shared?
@amr-atef So, you don't have any experience in coding and you can't read code?
@amr-atef It is enough for now, but if you want to get better at writing code and building bigger projects, of course you'll have to keep practicing to improve.
But for now, the skillset you gained from the course should be enough for you to modify the code in accordance with what @admin mentioned above:
Posted by: @admin
@amr-atef Paging is a feature of the API. Check the documentation here: Using REST to Invoke the API | Programmable Search Engine | Google for Developers
You will see nextPage in the metadata values; you can use it to load the next page, and so on.
did you write the code you shared?
So, try doing that, and if you face any problems we'll help you with it. There's a rough sketch below of how your loop could follow nextPage.
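Something like this, for example. It's only an untested sketch adapted from your query_google function (the queries.nextPage and startIndex fields are the response metadata @admin pointed to), so treat it as a starting point rather than a drop-in replacement:

import requests

def query_google(query, api_key, cse_id, start_page=1, num_results=10, date_range=None):
    url = "https://www.googleapis.com/customsearch/v1"
    results = []
    while len(results) < num_results:
        params = {'q': query, 'key': api_key, 'cx': cse_id, 'start': start_page}
        if date_range:
            params['dateRestrict'] = date_range
        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"Error occurred: {response.status_code}, Details: {response.json()}")
            break
        response_data = response.json()
        results.extend(response_data.get("items", []))
        # Let the API tell us where the next page starts instead of adding 10 ourselves.
        next_page = response_data.get('queries', {}).get('nextPage')
        if not next_page:
            break  # no nextPage entry means there are no more pages to load
        start_page = next_page[0]['startIndex']
    return results[:num_results]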
@husein I'm interested in getting better at coding. Do you have a roadmap, method, website, or any advice you can share to help me get better at writing and developing code?
Thanks a lot bro for your reply and support ❤️ ❤️ ❤️
@amr-atef You can check https://freecodecamp.org for some lessons.
Regards,
Earnie Boyd, CEO
Seasoned Solutions Advisor LLC
Schedule 1-on-1 help
Join me on Slack
@amr-atef Soon we will have a new Python course with coding projects to help you practice more.
For now, you can pick a project idea that you'd like to build and start with it, and we are here to help. Building a real-world project is the best way to become better at coding and programming.