-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathload_papers.py
131 lines (104 loc) · 3.66 KB
/
load_papers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import TokenTextSplitter
import pinecone
import time
import uuid
from config import OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_ENVIRONMENT, PINECONE_INDEX_NAME, EMBEDDING_MODEL, SPLITTER_CHUNK_SIZE, SPLITTER_CHUNK_OVERLAP, UPLOAD_BATCH_SIZE
# Paper list: PDFs to embed and upload, relative to the working directory.
PAPER_LIST = ["data/paper1.pdf", "data/paper2.pdf", "data/paper3.pdf"]
# Helper functions
def print_match(result):
    """Pretty-print each Pinecone query match: score, source, page, then the chunk text."""
    divider = "=" * 60
    for hit in result['matches']:
        meta = hit['metadata']
        page = int(meta['page'])
        print(divider)
        print(f"Score: {hit['score']:.2f} \t Source: {meta['source']} \t Page: {page}")
        print(divider)
        print(f"{meta['text']}")
        print(divider)
        print()
# Initialize OpenAI embeddings client, used for both document and query vectors.
embedding_model = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
    model=EMBEDDING_MODEL
)
print("="*30)
print("OpenAI initialization: OK")
print("="*30)
print()
# Initialize Pinecone vector storage
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONMENT
)
if PINECONE_INDEX_NAME not in pinecone.list_indexes():
    # we create a new index if it doesn't exist
    pinecone.create_index(
        name=PINECONE_INDEX_NAME,
        metric='cosine',
        # NOTE(review): dimension is hard-coded to the ada-002 size; if
        # EMBEDDING_MODEL is changed in config, this must change too — confirm.
        dimension=1536 # 1536 dim of text-embedding-ada-002
    )
    # wait for index to be initialized
    # NOTE(review): index creation is asynchronous; 1 s may not be enough
    # on all environments — consider polling readiness instead.
    time.sleep(1)
# Handle to the (existing or just-created) index, reused by the upload loop below.
pinecone_index = pinecone.Index(PINECONE_INDEX_NAME)
pinecone_stats = pinecone_index.describe_index_stats()
print("="*30)
print("Pinecone initialization: OK")
print(pinecone_stats)
print("="*30)
print()
# Embed each paper chunk-by-chunk and upsert the vectors into Pinecone.
for file_path in PAPER_LIST:
    loader = PyPDFLoader(file_path)
    pages = loader.load_and_split()
    print(f"Processing [{file_path}]")
    print(f"Pages shape: {len(pages)}")
    if not pages:
        # Robustness: an unreadable/empty PDF would crash on pages[0] below;
        # skip it and keep processing the remaining papers.
        print(f"Skipping [{file_path}]: no pages extracted")
        continue
    text_splitter = TokenTextSplitter(
        chunk_size=SPLITTER_CHUNK_SIZE,
        chunk_overlap=SPLITTER_CHUNK_OVERLAP
    )
    source = pages[0].metadata["source"]
    # Collect every chunk of the paper together with its (1-based) page number.
    total_sentences = []
    page_number_list = []
    for page in pages:
        page_num = page.metadata["page"] + 1  # loader pages are 0-based
        sentences = text_splitter.split_text(page.page_content)
        total_sentences += sentences
        page_number_list += [page_num] * len(sentences)
    # Embed all chunks in a single call to stay within OpenAI rate limits.
    paper_embedding = embedding_model.embed_documents(total_sentences)
    # Reformat vectors into Pinecone's upsert schema.
    to_upsert = [
        {
            "id": str(uuid.uuid4()),
            "values": vector,
            "metadata": {
                "text": total_sentences[i],
                "source": source,
                "page": page_number_list[i],
            },
        }
        for i, vector in enumerate(paper_embedding)
    ]
    # Upload the vectors in batches. Python slicing clamps at the list end,
    # so no special-casing of the final (short) batch is needed.
    n = len(to_upsert)
    print(f"Total number: {n}")
    for i in range(0, n, UPLOAD_BATCH_SIZE):
        batch = to_upsert[i: i + UPLOAD_BATCH_SIZE]
        pinecone_index.upsert(vectors=batch)
        print(f"Uploaded batch [{i} : {min(n, i + UPLOAD_BATCH_SIZE)}]")
# Auto testing: run a few sanity queries against the freshly loaded index.
query_list = [
    "How to treat patient with ACHD?",
    "How to diagnose Resistant Hypertension?",
    "How to reduce the cardiorenal risk?"
]
for i, query in enumerate(query_list):
    print("="*30)
    print(f"Test {i+1}: {query}")
    print("="*30)
    # Bug fix: embed_documents returns a *list* of vectors (one per input
    # document), but Index.query expects a single query vector — take
    # element 0 and pass it via the explicit `vector=` keyword.
    query_embedding = embedding_model.embed_documents([query])[0]
    res = pinecone_index.query(vector=query_embedding, top_k=3, include_metadata=True)
    print_match(res)
print("Upload papers - Done!")