-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathingest.py
More file actions
95 lines (70 loc) · 2.71 KB
/
ingest.py
File metadata and controls
95 lines (70 loc) · 2.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import sqlite3
from sentence_transformers import SentenceTransformer
import pymupdf
import numpy as np
def extract_text_from_pdf(pdf_path):
doc = pymupdf.open(pdf_path)
text = "\n".join(page.get_text() for page in doc)
return text
def generate_embeddings(text, model_name="sentence-transformers/all-MiniLM-L6-v2"):
model = SentenceTransformer(model_name)
return model.encode([text])[0].tolist()
def chunk_text(text, chunk_size=1000, overlap_size=80):
chunks = []
start = 0
while start < len(text):
end = start + chunk_size
chunk = text[start:end]
chunks.append(chunk)
start = end - overlap_size
return chunks
def create_db():
conn = sqlite3.connect("embeddings.db")
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS embeddings (
doc_id TEXT,
chunk_id INTEGER,
chunk TEXT,
embedding BLOB
)
''')
conn.commit()
return conn
def embed_pdf(pdf_path, doc_id):
text = extract_text_from_pdf(pdf_path)
chunks = chunk_text(text)
conn = create_db()
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
for chunk_id, chunk in enumerate(chunks):
embedding = generate_embeddings(chunk)
embedding_blob = np.array(embedding, dtype=np.float32).tobytes()
cursor = conn.cursor()
cursor.execute('''
INSERT INTO embeddings (doc_id, chunk_id, chunk, embedding)
VALUES (?, ?, ?, ?)
''', (doc_id, chunk_id, chunk, embedding_blob))
conn.commit()
print(f"PDF '{pdf_path}' embedded successfully.")
def search_pdf(query, top_k=3, model_name="sentence-transformers/all-MiniLM-L6-v2"):
conn = sqlite3.connect("embeddings.db")
cursor = conn.cursor()
query_embedding = generate_embeddings(query, model_name)
query_embedding_array = np.array(query_embedding, dtype=np.float32)
cursor.execute('''
SELECT doc_id, chunk_id, chunk, embedding
FROM embeddings
''')
results = []
for doc_id, chunk_id, chunk, embedding_blob in cursor.fetchall():
embedding = np.frombuffer(embedding_blob, dtype=np.float32)
similarity = np.dot(query_embedding_array, embedding) / (np.linalg.norm(query_embedding_array) * np.linalg.norm(embedding))
results.append((similarity, doc_id, chunk_id, chunk))
results.sort(reverse=True, key=lambda x: x[0])
return results[:top_k]
pdf_path = "CBook.pdf"
embed_pdf(pdf_path, doc_id="doc_1")
query = "Find information about fork()"
search_results = search_pdf(query)
for score, doc_id, chunk_id, chunk in search_results:
print(f"Score: {score}\nDoc ID: {doc_id}\nChunk: {chunk}\n")