Find Research Paper Abstract Similarity with Arxiv API
🔑 ID:
38888
👨💻
Python
🕒
02/04/2024
Free
Description:
This code helps you find academic research paper abstracts that are similar to your own abstract. Before running it, make sure you get your own OpenAI API key and add it to a .env file like this:
OPENAI_API_KEY = "YOUR_API_KEY"
Code:
import json
import re
import xml.etree.ElementTree as ET

import requests
from SimplerLLM.language.llm import LLM, LLMProvider

# Endpoint and Atom namespace used by the arXiv query API.
ARXIV_API_URL = "http://export.arxiv.org/api/query"
ATOM_NS = {'atom': 'http://www.w3.org/2005/Atom'}

# Seconds to wait for arXiv before giving up on a keyword.
REQUEST_TIMEOUT = 30

# Accepts "8", "8.", "8/10" and "8 out of 10"; anything else is rejected.
_SCORE_PATTERN = re.compile(
    r"\s*(\d+)\s*(?:/\s*10|out\s+of\s+10)?\s*\.?\s*", re.IGNORECASE
)

# Matches a reply that is entirely wrapped in a Markdown code fence.
_CODE_FENCE_PATTERN = re.compile(r"\s*```(?:json)?\s*(.*?)\s*```\s*", re.DOTALL)


def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if any."""
    match = _CODE_FENCE_PATTERN.fullmatch(text)
    return match.group(1) if match else text


def _parse_score(response):
    """Return the integer score found in an LLM reply, or None if absent."""
    match = _SCORE_PATTERN.fullmatch(str(response))
    return int(match.group(1)) if match else None


# Get the top keywords present in input abstract
def extract_keywords(abstract):
    """Ask the LLM for the top 5 keywords of *abstract*.

    Returns a JSON string. On success it is the pretty-printed parsed reply
    (the prompt requests a list of {"theme": ...} objects); when the reply is
    not valid JSON it is {"error": "Invalid response from LLM"}.
    """
    # Constructing a prompt for the language model to generate keywords from the abstract
    prompt = f"""
### TASK
You are an expert in text analysis and keyword extraction. Your task is to analyse an abstract I'm going to give you
and extract from it the top 5 keywords that are most representative of its content. Then you're going to generate them in a JSON format in descending order from the most relevant to the least relevant.

### INPUTS
Abstract: {abstract}

### OUTPUT
The output should be in JSON format. Here's how it should look like:
[
    {{"theme": "[theme 1]"}},
    {{"theme": "[theme 2]"}},
    {{"theme": "[theme 3]"}},
    {{"theme": "[theme 4]"}},
    {{"theme": "[theme 5]"}}
]
"""
    # Creating an instance of the language model using SimplerLLM
    llm_instance = LLM.create(provider=LLMProvider.OPENAI, model_name="gpt-4")

    # Generating response from the language model
    response = llm_instance.generate_response(user_prompt=prompt)

    # Attempting to parse the response as JSON. Chat models frequently wrap
    # JSON in a ``` fence, so strip one before parsing.
    try:
        response_data = json.loads(_strip_code_fence(response))
    except (json.JSONDecodeError, TypeError):
        # Returning an error message if the response is not valid JSON
        # (TypeError covers a non-string reply such as None).
        return json.dumps({"error": "Invalid response from LLM"}, indent=2)
    return json.dumps(response_data, indent=2)


# Search for related abstracts according to keywords and get the ID and abstract
def get_abstracts(json_input):
    """Fetch the most recent arXiv abstract for each keyword in *json_input*.

    *json_input* is the JSON string produced by extract_keywords. Returns a
    JSON string holding a list of {"ID", "abstract", "theme"} objects. A
    keyword whose lookup fails is reported on stdout and skipped; if
    *json_input* is not a list (e.g. the error payload) the list is empty.
    """
    input_data = json.loads(json_input)
    if not isinstance(input_data, list):
        print("No keyword list available; skipping arXiv search.")
        return json.dumps([], indent=2)

    max_results = 1  # Number of results to fetch for each keyword
    all_summaries_data = []

    for theme_info in input_data:
        keyword = theme_info.get('theme') if isinstance(theme_info, dict) else None
        if not keyword:
            continue

        # Passing the query through `params` lets requests URL-encode the
        # keyword, so multi-word keywords produce a valid URL.
        params = {
            "search_query": f"all:{keyword}",
            "start": 0,
            "max_results": max_results,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
        }
        try:
            response = requests.get(ARXIV_API_URL, params=params, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data for theme '{keyword}': {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve data for theme '{keyword}'. Status code: {response.status_code}")
            continue

        try:
            root = ET.fromstring(response.text)
        except ET.ParseError as exc:
            print(f"Failed to parse data for theme '{keyword}': {exc}")
            continue

        summaries_data = []
        for entry in root.findall('atom:entry', ATOM_NS):
            # findtext tolerates a missing element; `or ''` covers an empty one.
            entry_id = entry.findtext('atom:id', default='', namespaces=ATOM_NS) or ''
            summary = entry.findtext('atom:summary', default='', namespaces=ATOM_NS) or ''
            if not entry_id or not summary:
                continue
            summaries_data.append({
                "ID": entry_id.split('/')[-1],
                "abstract": summary.strip(),
                "theme": keyword,
            })
        all_summaries_data.extend(summaries_data[:max_results])

    return json.dumps(all_summaries_data, indent=2)


# Get scores for each abstract and if there is a perfect match
def score_abstracts(abstracts, reference_abstract):
    """Score each abstract in *abstracts* against *reference_abstract*.

    *abstracts* is the JSON string produced by get_abstracts. Returns a list
    of {"ID", "theme", "score", "perfect_match"} dicts, where score is the
    LLM's similarity rating and perfect_match is True only for a score of 10.
    A reply that does not contain a recognisable score is recorded as 0.
    """
    new_abstracts = json.loads(abstracts)
    scored_abstracts = []

    # One client is enough for every comparison.
    llm_instance = LLM.create(provider=LLMProvider.OPENAI, model_name="gpt-4")

    for item in new_abstracts:
        prompt = f"""
### TASK
You are an expert in abstract evaluation and English Literature. Your task is to analyze two abstracts
and then check how similar abstract 2 is to abstract 1 in meaning. Then you're gonna generate a score out of 10 for how similar they are. 0 being have nothing in common on different topics, and 10 being exactly the same. Make sure to go over them multiple times to check if your score is correct.

### INPUTS
Abstract 1: {reference_abstract}
Abstract 2: {item["abstract"]}

### OUTPUT
The output should be only the number out of 10, nothing else.
"""
        # Generating the similarity score from the language model
        response = llm_instance.generate_response(user_prompt=prompt)

        # Extracting the score from the response; unparseable replies score 0
        score = _parse_score(response)
        if score is None:
            score = 0

        scored_abstracts.append({
            "ID": item["ID"],
            "theme": item["theme"],
            "score": score,
            "perfect_match": score == 10,
        })

    return scored_abstracts


# MAIN SCRIPT
if __name__ == "__main__":
    reference_abstract = """
YOUR_ABSTRACT
"""

    json_data = extract_keywords(reference_abstract)
    abstracts = get_abstracts(json_data)
    data = json.dumps(score_abstracts(abstracts, reference_abstract), indent=2)
    print(data)
GitHub Link
✖️ Not Available
Download File
✖️ Not Available
If you’re encountering any problems or need further assistance with this code, we’re here to help! Join our community on the forum or Discord for support, tips, and discussion.