Push project files
parent 9cb4172a6d
commit a58f199554
@@ -1,127 +0,0 @@
import json
import sys
import time

from kb_config import logger
from sentence_transformers import CrossEncoder
from faiss_kb_service import FaissKBService, DocumentWithVectorStoreId
from langchain.docstore.document import Document
from base_kb import KnowledgeFile

class QAService():
    """Thin wrapper around FaissKBService for building and querying a QA knowledge base."""

    def __init__(self, kb_name, device) -> None:
        embed_model_path = 'bge-large-zh-v1.5'
        fkbs = FaissKBService(kb_name, embed_model_path=embed_model_path, device=device)
        fkbs.do_create_kb()
        self.fkbs = fkbs
        self.kb_name = kb_name

    def delete_qa_file(self, qa_file_id):
        kb_file = KnowledgeFile(qa_file_id, self.kb_name)
        self.fkbs.do_delete_doc(kb_file, not_refresh_vs_cache=True)

    def update_qa_doc(self, qa_file_id, doc_list, id_list):
        # Replace any existing docs from this file before re-adding them.
        self.delete_qa_file(qa_file_id)

        doc_infos = self.fkbs.do_add_doc(doc_list, ids=id_list)
        logger.info('faiss add docs: ' + str(len(doc_infos)))

        self.fkbs.save_vector_store()

    def search(self,
               query,
               top_k=3,
               score_threshold=0.75,
               reranked=False):
        # do_search works with FAISS distances, so the similarity threshold is
        # converted via 1 - score_threshold. The reranked flag is currently
        # unused; a possible cross-encoder re-rank step is sketched after this file.
        docs = self.fkbs.do_search(query, top_k, 1 - score_threshold)
        return docs

def create_question_id(intent_code, j, test_question):
    return f"{intent_code}@{j}@{test_question}"

def load_testing_data(file_path):
    test_data_list = []
    question_list = []
    id_list = []

    with open(file_path, encoding='utf-8') as f:
        data = json.load(f)
        for i, item in enumerate(data):
            test_question = item['testQuestion']
            intent_code = item['expectIntentCode']
            test_data_list.append((test_question, intent_code))

            q_list = item['expectIntentQuestionExample']
            for j, q in enumerate(q_list):
                q_id = create_question_id(intent_code, j, test_question)
                question_list.append(q)
                id_list.append(q_id)
    return test_data_list, question_list, id_list

def convert_to_doc_list(question_list, id_list, qa_file_id):
    doc_list = []
    for question, id in zip(question_list, id_list):
        metadata = {
            'source': qa_file_id,
            'id': id
        }
        doc = Document(page_content=question, metadata=metadata)
        doc_list.append(doc)

    return doc_list

def work():
    start_time = time.time()
    kb_name = 'my_kb_test'
    device = None
    qa_service = QAService(kb_name, device)

    test_data_list, question_list, id_list = load_testing_data(r'test_data/testing_data.json')
    print('Loaded data!')

    qa_file_id = 'QA_TEST_2'  # the source of the QA data, used for data cleaning; make sure it is unique

    doc_list = convert_to_doc_list(question_list, id_list, qa_file_id)

    qa_service.update_qa_doc(qa_file_id, doc_list, id_list)

    cnt = 0
    for query, code in test_data_list:
        rst = qa_service.search(query)
        if do_test(query, code, rst):
            cnt += 1

    print(str(cnt) + '/' + str(len(test_data_list)))

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Total elapsed time: {elapsed_time} seconds")

def do_test(query, expected_intent_code, rst):
    if rst is None or len(rst) == 0:
        print('Empty: ' + query)
        return False
    for rst_doc, similarity_score in rst:
        page_content = rst_doc.page_content
        intent_code = rst_doc.metadata['id'].split('@')[0]
        print(
            f"{query} vs {page_content} : {expected_intent_code} vs {intent_code} - Similarity Score: {1 - similarity_score}")
    # Note: intent codes are only printed for inspection; the function returns
    # True whenever the search produced any result at all.
    return True

work()
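
The first script imports CrossEncoder and search() exposes a reranked flag, but no re-ranking step is actually wired in. Below is a minimal sketch of how a cross-encoder re-rank could be added, assuming do_search returns (Document, distance) pairs as do_test above consumes them; the helper name and the 'bge-reranker-large' model path are placeholders, not part of this commit.

from sentence_transformers import CrossEncoder

def rerank_with_cross_encoder(query, scored_docs, model_name='bge-reranker-large', top_k=3):
    # Hypothetical helper: re-score (Document, distance) pairs with a
    # cross-encoder and keep the top_k most relevant ones.
    if not scored_docs:
        return []
    cross_encoder = CrossEncoder(model_name)
    pairs = [(query, doc.page_content) for doc, _ in scored_docs]
    scores = cross_encoder.predict(pairs)  # higher score = more relevant
    ranked = sorted(zip((doc for doc, _ in scored_docs), scores),
                    key=lambda item: item[1], reverse=True)
    return ranked[:top_k]

A search(query, reranked=True) call could then pass its FAISS hits through a helper like this before returning them, trading some latency for more precise intent matching.
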
@@ -1,82 +0,0 @@
import json

def create_answer_id(i):
    return 'A_' + str(i)


def create_question_id(intent_code, j):
    return intent_code + '@' + str(j)

def load_training_data(file_path):
    question_list = []
    id_list = []

    with open(file_path, encoding='utf-8') as f:
        data = json.load(f)
        for i, item in enumerate(data):
            intent_code = item['intentCode']
            q_list = item['questionExample']
            for j, q in enumerate(q_list):
                q_id = create_question_id(intent_code, j)
                question_list.append(q)
                id_list.append(q_id)

    return question_list, id_list

def load_testing_data(file_path):
    test_data_list = []
    question_list = []
    id_list = []

    with open(file_path, encoding='utf-8') as f:
        data = json.load(f)
        for i, item in enumerate(data):
            test_question = item['testQuestion']
            intent_code = item['expectIntentCode']
            test_data_list.append((test_question, intent_code))

            q_list = item['expectIntentQuestionExample']
            for j, q in enumerate(q_list):
                q_id = create_question_id(intent_code, j)
                question_list.append(q)
                id_list.append(q_id)

    return test_data_list, question_list, id_list

def work():
    # question_list, id_list = load_training_data('test_data/training_data.json')
    # print(question_list[0])
    # print(id_list[0])
    # print(question_list[21])
    # print(id_list[21])
    # intent_code, idx = id_list[21].split('@')
    # print(intent_code)
    # print(idx)

    test_data_list, question_list, id_list = load_testing_data('test_data/testing_data.json')
    q_len = len(question_list)
    print(question_list[0])
    print(id_list[0])
    print(question_list[q_len - 1])
    print(id_list[q_len - 1])
    print('#########')
    print(test_data_list[0][0])
    print(test_data_list[0][1])


work()
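
Both scripts build question ids with an intentCode@index convention (the first script appends the test question as a third segment), and do_test in the first script recovers the intent code with split('@')[0]. A minimal round-trip check under that assumed convention:

def parse_intent_code(question_id):
    # The intent code is the segment before the first '@' (assumed id convention).
    return question_id.split('@')[0]

assert parse_intent_code(create_question_id('GREETING', 0)) == 'GREETING'
assert parse_intent_code('GREETING@0@what is the weather') == 'GREETING'

Note that the convention only stays parseable as long as intent codes themselves never contain '@'.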