Advanced Semantic Interlinking With Parallel Processing
🔑 ID:
57634
👨‍💻
Python
🕒
19/07/2024
Free
Description:
This code is part of a tutorial I posted on my blog.
It helps you find interlinking opportunities between your blog posts by comparing the text of each post with the titles of your other posts.
Code:
"""Find interlinking opportunities between blog posts.

For every ordered pair of distinct URLs (input blog, target blog) the script
splits the input blog into sentence chunks, keeps the chunks whose embedding
is similar enough to the target blog's title, and asks an LLM to pick an
anchor keyword from each kept chunk. The result is printed as JSON.
"""

import json
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from SimplerLLM.language.embeddings import LLM as EmbeddingsLLM, EmbeddingsProvider
from SimplerLLM.language.llm import LLM, LLMProvider
from SimplerLLM.tools import text_chunker as chunker
from SimplerLLM.tools.generic_loader import load_content


def get_embeddings(texts):
    """Return the embeddings of ``texts`` as a NumPy array, one row per text.

    The texts are sent to the embeddings provider in batches of 10. This is a
    best-effort helper: on any error the message is printed and an empty
    array (``np.array([])``) is returned, which is also the result for an
    empty ``texts``. Callers must therefore check the result before using it
    as a 2-D matrix.
    """
    if not texts:
        # Nothing to embed; skip creating a client altogether.
        return np.array([])

    try:
        embeddings_instance = EmbeddingsLLM.create(
            provider=EmbeddingsProvider.OPENAI,
            model_name="text-embedding-3-small",
        )
        batch_size = 10
        embeddings = []
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            response = embeddings_instance.generate_embeddings(batch_texts)
            embeddings.extend(item.embedding for item in response)
        return np.array(embeddings)
    except Exception as e:
        print("An error occurred:", e)
        return np.array([])


def compare_with_threshold(input_chunks, target_title, threshold=0.5):
    """Return the chunks whose cosine similarity to ``target_title`` meets ``threshold``.

    Args:
        input_chunks: List of text chunks to compare.
        target_title: Title the chunks are compared against.
        threshold: Minimum cosine similarity (inclusive) for a chunk to be kept.

    Returns:
        Dict mapping chunk text to its similarity score as a plain ``float``.
        Because the dict is keyed by chunk text, identical chunks collapse
        into a single entry. An empty dict is returned when there are no
        chunks or when either embedding request failed.
    """
    if not input_chunks:
        return {}

    title_embedding = get_embeddings([target_title])
    # get_embeddings signals failure with a 1-D empty array, which
    # cosine_similarity would reject; bail out instead of crashing.
    if title_embedding.ndim != 2:
        return {}

    chunk_embeddings = get_embeddings(input_chunks)
    if chunk_embeddings.ndim != 2 or len(chunk_embeddings) != len(input_chunks):
        return {}

    # Shape (n_chunks, 1): one column because there is a single title.
    similarities = cosine_similarity(chunk_embeddings, title_embedding)

    return {
        chunk: float(score)  # plain float keeps the result JSON-serializable
        for chunk, score in zip(input_chunks, similarities[:, 0])
        if score >= threshold
    }


def choose_the_keyword(target_blog_title, similar_chunks):
    """Ask the LLM for an anchor keyword for each similar chunk.

    Args:
        target_blog_title: Title of the blog post to link to.
        similar_chunks: Iterable of ``(chunk, score)`` pairs; only the chunk
            is used.

    Returns:
        Dict mapping each chunk to the raw response returned by the LLM.
    """
    llm_instance = LLM.create(provider=LLMProvider.OPENAI, model_name="gpt-4")
    results = {}
    for chunk, _ in similar_chunks:
        prompt = (
            " You are an expert in SEO and blog interlinking. "
            "I have a chunk of text from a blog post which is semantically "
            "similar to the title of a target blog post, so I can use a "
            "keyword from the chunk to link to the blog post. "
            f"Chunk: ```{chunk}``` "
            f"Title of target blog post: ```{target_blog_title}``` "
            "The response should be only the keyword and nothing else. \n"
        )
        results[chunk] = llm_instance.generate_response(prompt=prompt)
    return results


def process_blog_pair(input_blog_url, target_blog_url):
    """Find link opportunities from one blog post to another.

    Args:
        input_blog_url: URL of the post that would contain the link.
        target_blog_url: URL of the post that would be linked to.

    Returns:
        List of dicts with the keys ``input_blog``, ``target_blog``, ``chunk``,
        ``keyword`` and ``cosine_similarity``. The list is empty when either
        page fails to load or no chunk is similar enough to the target title.
    """
    try:
        input_blog = load_content(input_blog_url.strip())
        target_blog = load_content(target_blog_url.strip())
    except Exception as e:
        print(f"Failed to load content for URLs {input_blog_url} or {target_blog_url}: {e}")
        return []

    input_blog_chunks = [
        chunk.text
        for chunk in chunker.chunk_by_sentences(input_blog.content).chunk_list
    ]
    target_blog_title = target_blog.title

    similar_chunks = compare_with_threshold(input_blog_chunks, target_blog_title)
    if not similar_chunks:
        return []

    keywords = choose_the_keyword(target_blog_title, similar_chunks.items())
    return [
        {
            "input_blog": input_blog_url,
            "target_blog": target_blog_url,
            "chunk": chunk,
            "keyword": keyword,
            "cosine_similarity": similar_chunks[chunk],
        }
        for chunk, keyword in keywords.items()
    ]


def process_blogs(blog_urls):
    """Process every ordered pair of distinct URLs in a thread pool.

    Args:
        blog_urls: List of blog post URLs.

    Returns:
        JSON string (indented by 4 spaces) with the link opportunities of all
        pairs, in completion order. A pair whose processing raises is
        reported on stdout and skipped so the other pairs are still returned.
    """
    links_data = []
    with ThreadPoolExecutor() as executor:
        future_to_url = {
            executor.submit(process_blog_pair, blog_url, target_url): (blog_url, target_url)
            for blog_url in blog_urls
            for target_url in blog_urls
            if blog_url != target_url
        }
        for future in as_completed(future_to_url):
            blog_url, target_url = future_to_url[future]
            try:
                links_data.extend(future.result())
            except Exception as e:
                # future.result() re-raises whatever the worker raised; one
                # bad pair should not discard the results of all the others.
                print(f"Failed to process blog pair {blog_url} -> {target_url}: {e}")
    return json.dumps(links_data, indent=4)


blog_urls = [
    "https://learnwithhasan.com/ai-paraphraser-tool/",
    "https://learnwithhasan.com/saas-on-wordpress/",
    "https://learnwithhasan.com/no-code-ai-system-topic-research/",
    "https://learnwithhasan.com/create-ai-agents-with-python/",
]

if __name__ == "__main__":
    # Guarded so that importing this module does not trigger network calls.
    result = process_blogs(blog_urls)
    print(result)
GitHub Link
✖️ Not Available
Download File
✖️ Not Available
If you’re encountering any problems or need further assistance with this code, we’re here to help! Join our community on the forum or Discord for support, tips, and discussion.