Skip to content

Find Research Paper Abstract Similarity with Arxiv API

🔑 ID:

38888

👨‍💻

Python

🕒

02/04/2024
Free

Description:

This code helps you find academic research paper abstracts that are similar to your own abstract. Before running it, make sure you get your own OpenAI API key and store it in a .env file like this:

OPENAI_API_KEY="YOUR_API_KEY"

Code:

import json
import re
import xml.etree.ElementTree as ET

import requests
from SimplerLLM.language.llm import LLM, LLMProvider

# Get the top keywords present in input abstract
def extract_keywords(abstract):
    """Ask the LLM for the five most representative keywords of *abstract*.

    Args:
        abstract: Text of the abstract to analyse.

    Returns:
        A JSON string (2-space indent). On success it is the re-serialised
        LLM reply, which the prompt asks to be a list of
        ``{"theme": <keyword>}`` objects ordered from most to least relevant.
        When the reply cannot be parsed as JSON the string encodes
        ``{"error": "Invalid response from LLM"}`` instead.
    """
    # Constructing a prompt for the language model to generate keywords from the abstract
    prompt = f"""
    ### TASK
    You are an expert in text analysis and keyword extraction. Your task is to analyse an abstract I'm going to give you
    and extract from it the top 5 keywords that are most representative of its content. Then you're going to generate
    them in a JSON format in descending order from the most relevant to the least relevant.

    ### INPUTS
    Abstract: {abstract}

    ### OUTPUT
    The output should be in JSON format. Here's how it should look like:
    [
        {{"theme": "[theme 1]"}},
        {{"theme": "[theme 2]"}},
        {{"theme": "[theme 3]"}},
        {{"theme": "[theme 4]"}},
        {{"theme": "[theme 5]"}}
    ]
    """
    # Creating an instance of the language model using SimplerLLM
    llm_instance = LLM.create(provider=LLMProvider.OPENAI, model_name="gpt-4")

    # Generating response from the language model
    response = llm_instance.generate_response(user_prompt=prompt)

    error_payload = json.dumps({"error": "Invalid response from LLM"}, indent=2)

    # A non-string reply (e.g. None) cannot be parsed; report it the same way
    # as malformed JSON instead of letting json.loads raise TypeError.
    if not isinstance(response, str):
        return error_payload

    # Chat models frequently wrap JSON in a Markdown code fence
    # (```json ... ```); peel the fence off before parsing.
    text = response.strip()
    if text.startswith("```"):
        text = text.strip("`").strip()
        if text[:4].lower() == "json":
            text = text[4:]

    # Attempting to parse the response as JSON
    try:
        response_data = json.loads(text)
    except json.JSONDecodeError:
        # Returning an error message if the response is not valid JSON
        return error_payload
    return json.dumps(response_data, indent=2)

# Search for related abstracts according to keywords and get the ID and abstract
def get_abstracts(json_input, max_results=1):
    """Fetch the most recently submitted arXiv abstracts for each keyword.

    Args:
        json_input: JSON string holding a list of ``{"theme": <keyword>}``
            objects. Anything else (for example the ``{"error": ...}``
            object that ``extract_keywords`` returns on failure) yields an
            empty result instead of raising.
        max_results: Number of arXiv entries to request per keyword.

    Returns:
        A JSON string (2-space indent) encoding a list of
        ``{"ID": ..., "abstract": ..., "theme": ...}`` objects. Keywords
        whose request fails are reported on stdout and skipped.
    """
    input_data = json.loads(json_input)
    if not isinstance(input_data, list):
        # Not a keyword list (e.g. an error object): nothing to search for.
        input_data = []

    api_url = "http://export.arxiv.org/api/query"
    ns = {'atom': 'http://www.w3.org/2005/Atom'}
    all_summaries_data = []

    for theme_info in input_data:
        keyword = theme_info.get('theme') if isinstance(theme_info, dict) else None
        if not keyword:
            continue

        # Let requests build the query string so characters such as '&',
        # '#' or spaces inside the keyword are escaped correctly.
        params = {
            "search_query": f"all:{keyword}",
            "start": 0,
            "max_results": max_results,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
        }
        try:
            # A timeout keeps one stalled connection from hanging the script.
            response = requests.get(api_url, params=params, timeout=30)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data for theme '{keyword}'. Error: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve data for theme '{keyword}'. Status code: {response.status_code}")
            continue

        try:
            root = ET.fromstring(response.text)
        except ET.ParseError as exc:
            print(f"Failed to parse data for theme '{keyword}'. Error: {exc}")
            continue

        for entry in root.findall('atom:entry', ns)[:max_results]:
            # findtext returns "" rather than None for missing or empty
            # elements, so the string methods below are always safe.
            entry_id = entry.findtext('atom:id', default="", namespaces=ns)
            summary = entry.findtext('atom:summary', default="", namespaces=ns).strip()
            if not entry_id or not summary:
                continue
            all_summaries_data.append({
                "ID": entry_id.split('/')[-1],
                "abstract": summary,
                "theme": keyword,
            })

    return json.dumps(all_summaries_data, indent=2)

# Get scores for each abstract and if there is a perfect match
def score_abstracts(abstracts, reference_abstract):
    """Score how similar each fetched abstract is to *reference_abstract*.

    Args:
        abstracts: JSON string encoding a list of objects with the keys
            ``"ID"``, ``"abstract"`` and ``"theme"``.
        reference_abstract: The abstract every entry is compared against.

    Returns:
        A list with one dict per input entry, holding ``"ID"``, ``"theme"``,
        ``"score"`` (int; 0 when the LLM reply contains no usable number)
        and ``"perfect_match"`` (True only when the score is exactly 10).
    """
    new_abstracts = json.loads(abstracts)
    if not new_abstracts:
        # Nothing to score, so do not create an LLM client at all.
        return []

    # One client serves every comparison; it does not depend on the item.
    llm_instance = LLM.create(provider=LLMProvider.OPENAI, model_name="gpt-4")
    scored_abstracts = []

    for item in new_abstracts:
        prompt = f"""
        ### TASK
        You are an expert in abstract evaluation and English Literature. Your task is to analyze two abstracts
        and then check how similar abstract 2 is to abstract 1 in meaning. Then you're gonna generate
        a score out of 10 for how similar they are. 0 being have nothing in common on different topics, and 10
        being exactly the same. Make sure to go over them multiple times to check if your score is correct.

        ### INPUTS
        Abstract 1: {reference_abstract}
        Abstract 2: {item["abstract"]}

        ### OUTPUT
        The output should be only the number out of 10, nothing else.
        """
        # Generating the similarity score from the language model
        response = llm_instance.generate_response(user_prompt=prompt)

        # An unparseable reply counts as "no similarity", as before.
        score = _parse_score(response)
        if score is None:
            score = 0

        scored_abstracts.append({
            "ID": item["ID"],
            "theme": item["theme"],
            "score": score,
            "perfect_match": score == 10
        })

    return scored_abstracts


def _parse_score(response):
    """Return the integer score contained in an LLM reply, or None.

    Accepts a bare integer as well as the common variants ``"8/10"``,
    ``"8 out of 10"`` and decimals such as ``"7.5"`` (fraction truncated).
    Replies that are not essentially just a number yield None.
    """
    try:
        return int(response)
    except (TypeError, ValueError):
        pass
    if not isinstance(response, str):
        return None
    match = re.fullmatch(
        r"\s*(\d+)(?:\.\d+)?\s*(?:(?:/|out of)\s*10)?\s*\.?\s*",
        response,
    )
    return int(match.group(1)) if match else None

# MAIN SCRIPT
# Replace YOUR_ABSTRACT with the abstract you want to compare against arXiv.
reference_abstract = """
YOUR_ABSTRACT
"""

# Pipeline: keywords -> matching arXiv abstracts -> similarity scores.
json_data = extract_keywords(reference_abstract)
abstracts = get_abstracts(json_data)
scored = score_abstracts(abstracts, reference_abstract)

# Pretty-print the scored results as JSON.
data = json.dumps(scored, indent=2)
print(data)

GitHub Link

✖️ Not Available

Download File

✖️ Not Available

If you’re encountering any problems or need further assistance with this code, we’re here to help! Join our community on the forum or Discord for support, tips, and discussion.