To run the code below in Google Colab, you first need to obtain API keys from OpenAI and Pinecone.
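A quick tip: instead of pasting the keys directly into the notebook (as the placeholder strings below do), you can store them in Colab's Secrets panel (the key icon in the left sidebar) and read them at runtime. A minimal sketch, assuming you saved the two secrets under these names and granted the notebook access:
from google.colab import userdata

OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')      # names as saved in the Secrets panel
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')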
First, we will generate text embeddings for the test PDF and upsert them into a Pinecone vector index.
Before anything else, install the dependencies:
%pip install pinecone
%pip install openai==0.28
%pip install pypdf
%pip install pdfplumber
%pip install -U langchain-community
%pip install requests==2.32.4
%pip install gradio
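Note: openai is pinned to version 0.28 because the code below uses the legacy openai.Embedding / openai.ChatCompletion interface, which was removed in openai 1.0 and later.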
import os
import re
import pdfplumber
import openai
import pinecone
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import ServerlessSpec # Import ServerlessSpec
# Initialize OpenAI
openai.api_key = "OPENAI_API_KEY"  # replace with your actual OpenAI API key
MODEL = "text-embedding-ada-002"
# Initialize Pinecone using the new client class
pc = pinecone.Pinecone(api_key='PINECONE_API_KEY')  # replace with your actual Pinecone API key
# Define the name of the Pinecone index into which we will upload the PDF
index_name = "vector-pdf"
# Create the index if it does not exist yet
# (text-embedding-ada-002 produces 1536-dimensional vectors; the cloud/region
# below are an assumption -- adjust them to match your Pinecone project)
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
index = pc.Index(index_name)
# Function that cleans up the text extracted from the PDF
def preprocess_text(text):
    # Collapse consecutive spaces, newlines, and tabs into a single space
text = re.sub(r'\s+', ' ', text)
return text
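# Example: preprocess_text("Hello\n\n\tworld") returns "Hello world"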
def process_pdf(file_path):
# create a loader
loader = PyPDFLoader(file_path)
# load your data
data = loader.load()
# Split your data up into smaller documents with Chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(data)
    # Convert Document objects into strings and normalize whitespace
    texts = [preprocess_text(doc.page_content) for doc in documents]
return texts
# Define a function that creates the embeddings
def create_embeddings(texts):
embeddings_list = []
for text in texts:
        res = openai.Embedding.create(input=[text], model=MODEL)
embeddings_list.append(res['data'][0]['embedding'])
return embeddings_list
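# Note: text-embedding-ada-002 returns 1536-dimensional vectors, so the
# Pinecone index dimension must be 1536 for the upserts below to succeed.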
# Define a function that upserts the embeddings created above into Pinecone
def upsert_embeddings_to_pinecone(index, embeddings, texts, ids, batch_size=100):
vectors_to_upsert = []
for i, embedding in enumerate(embeddings):
vectors_to_upsert.append(
{
"id": ids[i],
"values": embedding,
"metadata": {"text": texts[i]}
}
)
# Upsert in batches
if len(vectors_to_upsert) == batch_size:
index.upsert(vectors=vectors_to_upsert)
vectors_to_upsert = []
# Upsert any remaining vectors
if vectors_to_upsert:
index.upsert(vectors=vectors_to_upsert)
# Process the PDF and create the embeddings
file_path = "FISIER.pdf" # Replace with your actual file path
texts = process_pdf(file_path)
embeddings = create_embeddings(texts)
ids = [f"doc_{i}" for i in range(len(texts))] # Generate unique IDs for each chunk
# Upsert the embeddings to Pinecone, passing texts as metadata in batches
upsert_embeddings_to_pinecone(index, embeddings, texts, ids)
The code above processes our test PDF (FISIER.pdf) and upserts it into the Pinecone index.
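As a quick sanity check, you can inspect the index statistics; the reported vector count should match the number of chunks that were upserted:
print(index.describe_index_stats())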
Next we define the code that lets us search the Pinecone database. The search result is then used as context in an OpenAI query.
import os
import openai
from pinecone import Pinecone
class ProprietaryAI:
def __init__(self):
openai.api_key = "OPENAI_API_KEY"
self.pc = Pinecone(api_key='PINECONE_API_KEY')
self.index = self.pc.Index("vector-pdf")
def answer_question(self, question: str) -> str:
# Step 1: "Advanced Semantic Encoding"
embedding = openai.Embedding.create(
input=question,
model="text-embedding-ada-002"
)
# Step 2: "Neural Retrieval System"
results = self.index.query(
vector=embedding.data[0].embedding,
top_k=7,
include_metadata=True
)
# Step 3: "Contextual Synthesis"
context = "\n\n".join([
match.metadata['text']
for match in results.matches
])
print(context)
        # Step 4: "OpenAI Language Model"
response = openai.ChatCompletion.create(
model="gpt-4",
messages=[
{
"role": "system",
"content": f"Use this context: {context}"
},
{
"role": "user",
"content": question
}
]
)
return response.choices[0].message.content
Finally, we run a query against the database.
proprietary_ai_instance = ProprietaryAI()
print(proprietary_ai_instance.answer_question("Example query against the Pinecone database"))
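Note that answer_question also prints the retrieved context (the print(context) call), so in the Colab output you will see the up to seven retrieved chunks (top_k=7) before the final answer.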
If you want a Gradio implementation, you can use the code below:
import gradio as gr
import os
import openai
from pinecone import Pinecone
class ProprietaryAI:
def __init__(self):
openai.api_key = "OPENAI_API_KEY"
self.pc = Pinecone(api_key='PINECONE_API_KEY')
self.index = self.pc.Index("hs-codes")
def answer_question(self, question: str) -> str:
# Step 1: "Advanced Semantic Encoding"
embedding = openai.Embedding.create(
input=question,
model="text-embedding-ada-002"
)
# Step 2: "Neural Retrieval System"
results = self.index.query(
vector=embedding.data[0].embedding,
top_k=7,
include_metadata=True
)
# Step 3: "Contextual Synthesis"
context = "\n\n".join([
match.metadata['text']
for match in results.matches
])
print(context)
# Step 4: "OpenAI Language Model"
response = openai.ChatCompletion.create(
model="gpt-4",
messages=[
{
"role": "system",
"content": f"Use this context: {context}"
},
{
"role": "user",
"content": question
}
]
)
return response.choices[0].message.content
proprietary_ai_instance = ProprietaryAI()
def answer_question_gradio(question):
return proprietary_ai_instance.answer_question(question)
iface = gr.Interface(
fn=answer_question_gradio,
inputs=gr.Textbox(lines=2, placeholder="Enter your question here..."),
outputs="textbox"
)
iface.launch()
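Note: when running in Colab, you can call iface.launch(share=True) instead, which gives you a temporary public URL for the interface.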
We have thus implemented a RAG pipeline at minimal cost: Pinecone is free within its standard usage limits, and the OpenAI queries consume minimal resources.