You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

69 lines
2.0 KiB
Python

import os
import click
from llama_index.core.schema import TextNode
from llama_index.embeddings.dashscope import (DashScopeEmbedding,
DashScopeTextEmbeddingModels,
DashScopeTextEmbeddingType)
from llama_index.vector_stores.elasticsearch import ElasticsearchStore
from magic_pdf.integrations.rag.api import DataReader
# Module-level vector store targeting the 'rag_index' index; cli() adds the
# embedded nodes to it. Connection settings are read from the ES_URL, ES_USER
# and ES_PASSWORD environment variables, with the fallbacks given below.
es_vec_store = ElasticsearchStore(
    es_url=os.environ.get('ES_URL', 'http://127.0.0.1:9200'),
    es_user=os.environ.get('ES_USER', 'elastic'),
    es_password=os.environ.get('ES_PASSWORD', 'llama_index'),
    index_name='rag_index',
)
# Create embeddings
# Embedding client shared by every embed_node() call. It stays None until the
# first node is embedded, so importing this module does not build a client.
_document_embedder = None


def _get_document_embedder():
    """Return the shared DashScope embedder, creating it on first use."""
    global _document_embedder
    if _document_embedder is None:
        # text_type=`document` to build index
        _document_embedder = DashScopeEmbedding(
            model_name=DashScopeTextEmbeddingModels.TEXT_EMBEDDING_V2,
            text_type=DashScopeTextEmbeddingType.TEXT_TYPE_DOCUMENT,
        )
    return _document_embedder


def embed_node(node):
    """Compute the embedding of ``node.text`` and store it on the node.

    The embedder is configured identically for every call, so a single
    client is created on first use and reused afterwards instead of being
    rebuilt for each node.

    Args:
        node: Object exposing a ``text`` attribute (read) and an
            ``embedding`` attribute (written). The caller in this file
            passes a ``TextNode``.

    Returns:
        The same ``node`` object, with ``embedding`` set to the value
        returned by the embedder's ``get_text_embedding``.
    """
    node.embedding = _get_document_embedder().get_text_embedding(node.text)
    return node
# Command-line entry point: read the pdf(s) found at --path with DataReader,
# embed every element that carries text, and add the resulting nodes to
# `es_vec_store` in one call.
# NOTE: documented with comments rather than a docstring on purpose, because
# click would display a docstring as the command's --help text.
@click.command()
@click.option(
    '-p',
    '--path',
    'path',
    help='local pdf filepath or directory',
    required=True,
    type=click.Path(exists=True),
)
def cli(path):
    # Directory passed to DataReader as its third argument; created up front.
    workdir = '/tmp/magic_pdf/integrations/rag/'
    os.makedirs(workdir, exist_ok=True)
    reader = DataReader(path, 'ocr', workdir)

    # Collect one embedded TextNode per text-bearing element, walking the
    # documents in index order, then pages, then elements.
    embedded = []
    for doc_no in range(reader.get_documents_count()):
        parsed = reader.get_document_result(doc_no)
        if parsed is None:
            # Something went wrong while parsing this pdf; skip it.
            continue
        for page in parsed:
            for element in page:
                if element.text is None:
                    continue
                text_node = TextNode(text=element.text,
                                     metadata={'purpose': 'demo'})
                embedded.append(embed_node(text_node))

    es_vec_store.add(embedded)
# Invoke the click command only when this file is executed as a script.
if __name__ == '__main__':
    cli()