You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
69 lines
2.0 KiB
Python
69 lines
2.0 KiB
Python
import os
|
|
|
|
import click
|
|
from llama_index.core.schema import TextNode
|
|
from llama_index.embeddings.dashscope import (DashScopeEmbedding,
|
|
DashScopeTextEmbeddingModels,
|
|
DashScopeTextEmbeddingType)
|
|
from llama_index.vector_stores.elasticsearch import ElasticsearchStore
|
|
|
|
from magic_pdf.integrations.rag.api import DataReader
|
|
|
|
# Elasticsearch vector store that `cli` writes the embedded nodes into.
# Connection settings come from the ES_URL / ES_USER / ES_PASSWORD environment
# variables, falling back to the literal defaults given here.
es_vec_store = ElasticsearchStore(
    es_password=os.environ.get('ES_PASSWORD', 'llama_index'),
    es_user=os.environ.get('ES_USER', 'elastic'),
    es_url=os.environ.get('ES_URL', 'http://127.0.0.1:9200'),
    index_name='rag_index',
)
|
|
|
|
|
|
# Create embeddings
# text_type=`document` to build index
_document_embedder = None  # built on first use, then reused by every call


def _get_document_embedder():
    """Return the shared document-type DashScope embedder, building it once."""
    global _document_embedder
    if _document_embedder is None:
        # Constructed lazily so that importing this module does not create
        # the embedder; only the first embed_node() call does.
        _document_embedder = DashScopeEmbedding(
            model_name=DashScopeTextEmbeddingModels.TEXT_EMBEDDING_V2,
            text_type=DashScopeTextEmbeddingType.TEXT_TYPE_DOCUMENT,
        )
    return _document_embedder


def embed_node(node):
    """Embed ``node.text`` and store the result on the node.

    The embedder is configured with the TEXT_EMBEDDING_V2 model and the
    ``document`` text type. It is created once and reused, rather than
    rebuilt for every node, because this function is called once per
    extracted element.

    Args:
        node: Object exposing a ``text`` attribute to embed and accepting an
            ``embedding`` attribute assignment.

    Returns:
        The same ``node`` object, with ``node.embedding`` set to the value
        returned by the embedder's ``get_text_embedding``.
    """
    node.embedding = _get_document_embedder().get_text_embedding(node.text)
    return node
|
|
|
|
|
|
# Command-line entry point: parse the PDF(s) at `path`, embed every text
# element and push the resulting nodes into `es_vec_store`.
# NOTE: documented with comments rather than a docstring on purpose -- click
# would surface a docstring as the command's --help text.
@click.command()
@click.option(
    '-p',
    '--path',
    'path',
    type=click.Path(exists=True),
    required=True,
    help='local pdf filepath or directory',
)
def cli(path):
    # Working directory handed to DataReader; created if missing.
    work_dir = '/tmp/magic_pdf/integrations/rag/'
    os.makedirs(work_dir, exist_ok=True)
    reader = DataReader(path, 'ocr', work_dir)

    def element_texts():
        # Walk documents in index order, then pages, then elements, yielding
        # the text of every element that has one.
        for doc_idx in range(reader.get_documents_count()):
            parsed = reader.get_document_result(doc_idx)
            if parsed is None:
                # The reader returned no result for this document; skip it.
                continue
            for page in parsed:
                for element in page:
                    if element.text is not None:
                        yield element.text

    # Build one embedded TextNode per text element, then store them all with
    # a single add() call.
    nodes = [
        embed_node(TextNode(text=text, metadata={'purpose': 'demo'}))
        for text in element_texts()
    ]
    es_vec_store.add(nodes)
|
|
|
|
|
|
# Run the click command only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    cli()
|