新增模块

dev_1.0.0^2
liu 10 months ago
parent 822b766890
commit f6f9668e41

@ -19,13 +19,23 @@
<dependencies>
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-tika-document-reader</artifactId>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
<!-- 引入ollama的依赖.版本号来自于 dependencyManagement中 spring-ai-bom中的版本号.-->
<dependency>
<groupId>io.springboot.ai</groupId>
<artifactId>spring-ai-ollama-spring-boot-starter</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-elasticsearch-store</artifactId>
@ -74,6 +84,28 @@
</dependencies>
<repositories>
<repository>
<id>central</id>
<name>aliyun central repo</name>
<url>https://maven.aliyun.com/nexus/content/repositories/central/</url>
<layout>default</layout>
<releases>
<enabled>true</enabled>
<updatePolicy>daily</updatePolicy>
</releases>
<snapshots>
<enabled>false</enabled>
<updatePolicy>never</updatePolicy>
</snapshots>
</repository>
<repository>
<id>spring-snapshots</id>
<name>Spring Snapshots</name>
<url>https://repo.spring.io/snapshot</url>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
<repository>
<id>spring-milestones</id>
<name>Spring Milestones</name>

@ -0,0 +1,24 @@
package com.supervision.knowsub.controller;
import com.supervision.knowsub.etl.reader.TikaReader;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import java.io.IOException;
import java.io.InputStream;
/**
 * REST controller exposing ETL test endpoints under the {@code etl} path.
 */
@RestController
@RequestMapping("etl")
public class EtlController {

    @Autowired
    private TikaReader tikaReader;

    /**
     * Handles {@code POST etl/testLoadText}: passes the content of the uploaded file to
     * {@link TikaReader#loadAndSplitThenSaveVectorStore(InputStream)}.
     *
     * @param file the multipart upload bound to the {@code file} request parameter
     * @throws IOException if the upload's input stream cannot be opened or closed
     */
    @PostMapping("testLoadText")
    public void testLoadText(@RequestParam(name = "file") MultipartFile file) throws IOException {
        // This method opens the stream, so it is responsible for closing it. try-with-resources
        // guarantees that even when the downstream processing throws before reading the stream.
        try (InputStream inputStream = file.getInputStream()) {
            tikaReader.loadAndSplitThenSaveVectorStore(inputStream);
        }
    }
}

@ -0,0 +1,42 @@
package com.supervision.knowsub.etl.reader;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.ai.document.Document;
import org.springframework.ai.reader.tika.TikaDocumentReader;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import org.springframework.ai.vectorstore.ElasticsearchVectorStore;
import org.springframework.core.io.InputStreamResource;
import org.springframework.stereotype.Component;
import java.io.InputStream;
import java.util.List;
/**
 * Reads a document with Tika, splits it into chunks and stores the chunks in the
 * Elasticsearch vector store.
 */
@Slf4j
@Component
@RequiredArgsConstructor
public class TikaReader {

    private final ElasticsearchVectorStore elasticsearchVectorStore;

    /**
     * Runs the three pipeline stages in order: read the stream with a
     * {@link TikaDocumentReader}, split the resulting documents with a default-constructed
     * {@link TokenTextSplitter}, and hand the chunks to the {@link ElasticsearchVectorStore}.
     * A log line is written before each stage and after the last one.
     *
     * <p>This method does not close {@code inputStream} itself.
     *
     * @param inputStream the raw content of the document to ingest
     * @see <a href="https://zhuanlan.zhihu.com/p/703705663">Background article linked by the original author</a>
     */
    public void loadAndSplitThenSaveVectorStore(InputStream inputStream) {
        log.info("首先进行内容切分");
        List<Document> extracted = readWithTika(inputStream);

        log.info("切分完成,开始进行chunk分割");
        List<Document> chunks = splitIntoChunks(extracted);

        log.info("切分完成,开始进行保存到向量库中");
        elasticsearchVectorStore.accept(chunks);

        log.info("保存完成");
    }

    /** Wraps the stream in an {@link InputStreamResource} and reads it with Tika. */
    private List<Document> readWithTika(InputStream source) {
        return new TikaDocumentReader(new InputStreamResource(source)).read();
    }

    /** Splits the given documents using a {@link TokenTextSplitter} with its default settings. */
    private List<Document> splitIntoChunks(List<Document> source) {
        return new TokenTextSplitter().apply(source);
    }
}

@ -44,6 +44,7 @@
<dependencyManagement>
<dependencies>
<dependency>
<groupId>io.springboot.ai</groupId>
<artifactId>spring-ai-bom</artifactId>
@ -121,6 +122,28 @@
</dependencyManagement>
<repositories>
<repository>
<id>central</id>
<name>aliyun central repo</name>
<url>https://maven.aliyun.com/nexus/content/repositories/central/</url>
<layout>default</layout>
<releases>
<enabled>true</enabled>
<updatePolicy>daily</updatePolicy>
</releases>
<snapshots>
<enabled>false</enabled>
<updatePolicy>never</updatePolicy>
</snapshots>
</repository>
<repository>
<id>spring-snapshots</id>
<name>Spring Snapshots</name>
<url>https://repo.spring.io/snapshot</url>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
<repository>
<id>spring-milestones</id>
<name>Spring Milestones</name>

Loading…
Cancel
Save