新增模块
parent
822b766890
commit
f6f9668e41
@ -0,0 +1,24 @@
|
|||||||
|
package com.supervision.knowsub.controller;
|
||||||
|
|
||||||
|
import com.supervision.knowsub.etl.reader.TikaReader;
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.web.bind.annotation.PostMapping;
|
||||||
|
import org.springframework.web.bind.annotation.RequestMapping;
|
||||||
|
import org.springframework.web.bind.annotation.RequestParam;
|
||||||
|
import org.springframework.web.bind.annotation.RestController;
|
||||||
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
@RestController
|
||||||
|
@RequestMapping("etl")
|
||||||
|
public class EtlController {
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private TikaReader tikaReader;
|
||||||
|
|
||||||
|
@PostMapping("testLoadText")
|
||||||
|
public void testLoadText(@RequestParam(name = "file") MultipartFile file) throws IOException {
|
||||||
|
tikaReader.loadAndSplitThenSaveVectorStore(file.getInputStream());
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,42 @@
|
|||||||
|
package com.supervision.knowsub.etl.reader;
|
||||||
|
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.springframework.ai.document.Document;
|
||||||
|
import org.springframework.ai.reader.tika.TikaDocumentReader;
|
||||||
|
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
|
||||||
|
import org.springframework.ai.vectorstore.ElasticsearchVectorStore;
|
||||||
|
import org.springframework.core.io.InputStreamResource;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
@Component
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class TikaReader {
|
||||||
|
|
||||||
|
private final ElasticsearchVectorStore elasticsearchVectorStore;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 参考文档 <a href="https://zhuanlan.zhihu.com/p/703705663"/>
|
||||||
|
*
|
||||||
|
* @param inputStream 输入流
|
||||||
|
*/
|
||||||
|
public void loadAndSplitThenSaveVectorStore(InputStream inputStream) {
|
||||||
|
// 首先使用tika进行文件切分操作
|
||||||
|
log.info("首先进行内容切分");
|
||||||
|
TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(new InputStreamResource(inputStream));
|
||||||
|
List<Document> documents = tikaDocumentReader.read();
|
||||||
|
log.info("切分完成,开始进行chunk分割");
|
||||||
|
// 然后切分为chunk
|
||||||
|
TokenTextSplitter tokenTextSplitter = new TokenTextSplitter();
|
||||||
|
List<Document> apply = tokenTextSplitter.apply(documents);
|
||||||
|
log.info("切分完成,开始进行保存到向量库中");
|
||||||
|
// 保存到向量数据库中
|
||||||
|
elasticsearchVectorStore.accept(apply);
|
||||||
|
log.info("保存完成");
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue