新增模块
parent
822b766890
commit
f6f9668e41
@ -0,0 +1,24 @@
|
||||
package com.supervision.knowsub.controller;
|
||||
|
||||
import com.supervision.knowsub.etl.reader.TikaReader;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RequestParam;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
@RestController
|
||||
@RequestMapping("etl")
|
||||
public class EtlController {
|
||||
|
||||
@Autowired
|
||||
private TikaReader tikaReader;
|
||||
|
||||
@PostMapping("testLoadText")
|
||||
public void testLoadText(@RequestParam(name = "file") MultipartFile file) throws IOException {
|
||||
tikaReader.loadAndSplitThenSaveVectorStore(file.getInputStream());
|
||||
}
|
||||
}
|
@ -0,0 +1,42 @@
|
||||
package com.supervision.knowsub.etl.reader;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.ai.document.Document;
|
||||
import org.springframework.ai.reader.tika.TikaDocumentReader;
|
||||
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
|
||||
import org.springframework.ai.vectorstore.ElasticsearchVectorStore;
|
||||
import org.springframework.core.io.InputStreamResource;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.util.List;
|
||||
|
||||
@Slf4j
|
||||
@Component
|
||||
@RequiredArgsConstructor
|
||||
public class TikaReader {
|
||||
|
||||
private final ElasticsearchVectorStore elasticsearchVectorStore;
|
||||
|
||||
/**
|
||||
* 参考文档 <a href="https://zhuanlan.zhihu.com/p/703705663"/>
|
||||
*
|
||||
* @param inputStream 输入流
|
||||
*/
|
||||
public void loadAndSplitThenSaveVectorStore(InputStream inputStream) {
|
||||
// 首先使用tika进行文件切分操作
|
||||
log.info("首先进行内容切分");
|
||||
TikaDocumentReader tikaDocumentReader = new TikaDocumentReader(new InputStreamResource(inputStream));
|
||||
List<Document> documents = tikaDocumentReader.read();
|
||||
log.info("切分完成,开始进行chunk分割");
|
||||
// 然后切分为chunk
|
||||
TokenTextSplitter tokenTextSplitter = new TokenTextSplitter();
|
||||
List<Document> apply = tokenTextSplitter.apply(documents);
|
||||
log.info("切分完成,开始进行保存到向量库中");
|
||||
// 保存到向量数据库中
|
||||
elasticsearchVectorStore.accept(apply);
|
||||
log.info("保存完成");
|
||||
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue