Fix dependency-related issues via requirements update (#2236)

* Update requirements.txt * Create constraints.txt * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * pyopenjtalk and onnx fix * Update requirements.txt * Update requirements.txt * Update install.sh * update shell install.sh * update docs * Update Install.sh * fix bugs * Update .gitignore * Update .gitignore * Update install.sh * Update install.sh * Update extra-req.txt * Update requirements.txt
4 months ago · 6c468583c5
parent ee4a466f79
commit 6c468583c5
12 changed files with 526 additions and 199 deletions
--- a/.gitignore
+++ b/.gitignore
@ -18,5 +18,183 @@ TEMP
 weight.json
 ffmpeg*
 ffprobe*
 cfg.json
 speakers.json
 ref_audios
 tools/AP_BWE_main/24kto48k/*
 !tools/AP_BWE_main/24kto48k/readme.txt
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .nox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 *.py,cover
 .hypothesis/
 .pytest_cache/
 cover/
 # Translations
 *.mo
 *.pot
 # Django stuff:
 *.log
 local_settings.py
 db.sqlite3
 db.sqlite3-journal
 # Flask stuff:
 instance/
 .webassets-cache
 # Scrapy stuff:
 .scrapy
 # Sphinx documentation
 docs/_build/
 # PyBuilder
 .pybuilder/
 target/
 # Jupyter Notebook
 .ipynb_checkpoints
 # IPython
 profile_default/
 ipython_config.py
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
 # .python-version
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 #   install all needed dependencies.
 #Pipfile.lock
 # UV
 #   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
 #   This is especially recommended for binary packages to ensure reproducibility, and is more
 #   commonly ignored for libraries.
 #uv.lock
 # poetry
 #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 #   This is especially recommended for binary packages to ensure reproducibility, and is more
 #   commonly ignored for libraries.
 #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
 #poetry.lock
 # pdm
 #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
 #pdm.lock
 #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
 #   in version control.
 #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
 .pdm.toml
 .pdm-python
 .pdm-build/
 # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
 __pypackages__/
 # Celery stuff
 celerybeat-schedule
 celerybeat.pid
 # SageMath parsed files
 *.sage.py
 # Environments
 .env
 .venv
 env/
 venv/
 ENV/
 env.bak/
 venv.bak/
 # Spyder project settings
 .spyderproject
 .spyproject
 # Rope project settings
 .ropeproject
 # mkdocs documentation
 /site
 # mypy
 .mypy_cache/
 .dmypy.json
 dmypy.json
 # Pyre type checker
 .pyre/
 # pytype static type analyzer
 .pytype/
 # Cython debug symbols
 cython_debug/
 # PyCharm
 #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 # Ruff stuff:
 .ruff_cache/
 # PyPI configuration file
 .pypirc
--- a/GPT_SoVITS_Inference.ipynb
+++ b/GPT_SoVITS_Inference.ipynb
@ -1,42 +1,37 @@
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
      ],
      "metadata": {
        "id": "himHYZmra7ix"
-      }
+      },
      "source": [
        "# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "e9b7iFV3dm1f"
      },
      "outputs": [],
      "source": [
        "!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
        "%cd GPT-SoVITS\n",
        "!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
        "!pip install -r extra-req.txt --no-deps\n",
        "!pip install -r requirements.txt"
-      ],
+      ]
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "0NgxXg5sjv7z"
      },
      "outputs": [],
      "source": [
        "# @title Download pretrained models 下载预训练模型\n",
        "!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
@ -53,16 +48,16 @@
        "!git clone https://huggingface.co/Delik/uvr5_weights\n",
        "!git config core.sparseCheckout true\n",
        "!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
-      ],
+      ]
      "metadata": {
        "id": "0NgxXg5sjv7z",
        "cellView": "form"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "cPDEH-9czOJF"
      },
      "outputs": [],
      "source": [
        "#@title Create folder models 创建文件夹模型\n",
        "import os\n",
@ -77,16 +72,16 @@
        "    print(f\"The folder '{folder_name}' was created successfully! (文件夹'{folder_name}'已成功创建！)\")\n",
        "\n",
        "print(\"All folders have been created. (所有文件夹均已创建。)\")"
-      ],
+      ]
      "metadata": {
        "cellView": "form",
        "id": "cPDEH-9czOJF"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "vbZY-LnM0tzq"
      },
      "outputs": [],
      "source": [
        "import requests\n",
        "import zipfile\n",
@ -124,29 +119,35 @@
        "        shutil.move(source_path, destination_path)\n",
        "\n",
        "print(f'Model downloaded. (模型已下载。)')"
-      ],
+      ]
      "metadata": {
        "cellView": "form",
        "id": "vbZY-LnM0tzq"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "4oRGUzkrk8C7"
      },
      "outputs": [],
      "source": [
        "# @title launch WebUI 启动WebUI\n",
        "!/usr/local/bin/pip install ipykernel\n",
        "!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
        "%cd /content/GPT-SoVITS/\n",
        "!/usr/local/bin/python  webui.py"
      ]
    }
  ],
  "metadata": {
-        "id": "4oRGUzkrk8C7",
+    "accelerator": "GPU",
-        "cellView": "form"
+    "colab": {
      "provenance": []
    },
-      "execution_count": null,
+    "kernelspec": {
-      "outputs": []
+      "display_name": "Python 3",
      "name": "python3"
    }
-  ]
+  },
  "nbformat": 4,
  "nbformat_minor": 0
 }
--- a/README.md
+++ b/README.md
@ -1,6 +1,5 @@
 <div align="center">
 <h1>GPT-SoVITS-WebUI</h1>
 A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
@ -77,6 +76,7 @@ bash install.sh
 ```bash
 conda create -n GPTSoVits python=3.9
 conda activate GPTSoVits
 pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
 ```
@ -105,6 +105,7 @@ Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWeb
 Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Korean TTS Only)
 ##### MacOS Users
 ```bash
 brew install ffmpeg
 ```
@ -112,6 +113,7 @@ brew install ffmpeg
 #### Install Dependencies
 ```bash
 pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
 ```
@ -200,6 +202,7 @@ if you want to switch to V1,then
 ```bash
 python webui.py v1 <language(optional)>
 ```
 Or manually switch version in WebUI
 ### Finetune
@ -224,11 +227,13 @@ Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1` ,then open the inference
 ```bash
 python GPT_SoVITS/inference_webui.py <language(optional)>
 ```
 OR
 ```bash
 python webui.py
 ```
 then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
 ## V2 Release Notes
@ -243,7 +248,7 @@ New Features:
 4. Improved synthesis quality for low-quality reference audio
-    [more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+   [more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
 Use v2 from v1 environment:
@ -263,7 +268,7 @@ New Features:
 2. GPT model is more stable, with fewer repetitions and omissions, and it is easier to generate speech with richer emotional expression.
-    [more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+   [more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
 Use v3 from v2 environment:
@ -275,7 +280,6 @@ Use v3 from v2 environment:
   additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt)
 ## Todo List
 - [x] **High Priority:**
@ -297,15 +301,20 @@ Use v3 from v2 environment:
  - [ ] model mix
 ## (Additional) Method for running from the command line
 Use the command line to open the WebUI for UVR5
 ```
 python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
 ```
 <!-- If you can't open a browser, follow the format below for UVR processing,This is using mdxnet for audio processing
 ```
 python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
 ``` -->
 This is how the audio segmentation of the dataset is done using the command line
 ```
 python audio_slicer.py \
    --input_path "<path_to_original_audio_file_or_directory>" \
@ -315,16 +324,21 @@ python audio_slicer.py \
    --min_interval <shortest_time_gap_between_adjacent_subclips> \
    --hop_size <step_size_for_computing_volume_curve>
 ```
 This is how dataset ASR processing is done using the command line (Chinese only)
 ```
 python tools/asr/funasr_asr.py -i <input> -o <output>
 ```
 ASR processing is performed through Faster_Whisper (ASR labeling for languages other than Chinese)
 (No progress bars, GPU performance may cause time delays)
 ```
 python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
 ```
 A custom list save path is enabled
 ## Credits
@ -332,6 +346,7 @@ A custom list save path is enabled
 Special thanks to the following projects and contributors:
 ### Theoretical Research
 - [ar-vits](https://github.com/innnky/ar-vits)
 - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
 - [vits](https://github.com/jaywalnut310/vits)
@ -341,17 +356,23 @@ Special thanks to the following projects and contributors:
 - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
 - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
 - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
 ### Pretrained Models
 - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
 - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
 - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
 ### Text Frontend for Inference
 - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
 - [split-lang](https://github.com/DoodleBears/split-lang)
 - [g2pW](https://github.com/GitYCC/g2pW)
 - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
 - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
 ### WebUI Tools
 - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
 - [audio-slicer](https://github.com/openvpi/audio-slicer)
 - [SubFix](https://github.com/cronrpc/SubFix)
--- a/colab_webui.ipynb
+++ b/colab_webui.ipynb
@ -1,23 +1,10 @@
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
-        "id": "view-in-github",
+        "colab_type": "text",
-        "colab_type": "text"
+        "id": "view-in-github"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
@ -25,18 +12,20 @@
    },
    {
      "cell_type": "markdown",
      "source": [
        "环境配置 environment"
      ],
      "metadata": {
        "id": "_o6a8GS2lWQM"
-      }
+      },
      "source": [
        "环境配置 environment"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "e9b7iFV3dm1f"
      },
      "outputs": [],
      "source": [
        "!pip install -q condacolab\n",
        "# Setting up condacolab and installing packages\n",
@ -47,13 +36,17 @@
        "!conda install -y -q -c pytorch -c nvidia cudatoolkit\n",
        "%cd -q /content/GPT-SoVITS\n",
        "!conda install -y -q -c conda-forge gcc gxx ffmpeg cmake -c pytorch -c nvidia\n",
        "!/usr/local/bin/pip install -r extra-req.txt --no-deps\n",
        "!/usr/local/bin/pip install -r requirements.txt"
-      ],
+      ]
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "0NgxXg5sjv7z"
      },
      "outputs": [],
      "source": [
        "# @title Download pretrained models 下载预训练模型\n",
        "!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
@ -71,27 +64,35 @@
        "!git clone https://huggingface.co/Delik/uvr5_weights\n",
        "!git config core.sparseCheckout true\n",
        "!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
-      ],
+      ]
      "metadata": {
        "id": "0NgxXg5sjv7z"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "4oRGUzkrk8C7"
      },
      "outputs": [],
      "source": [
        "# @title launch WebUI 启动WebUI\n",
        "!/usr/local/bin/pip install ipykernel\n",
        "!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
        "%cd /content/GPT-SoVITS/\n",
        "!/usr/local/bin/python  webui.py"
      ]
    }
  ],
  "metadata": {
-        "id": "4oRGUzkrk8C7"
+    "accelerator": "GPU",
    "colab": {
      "include_colab_link": true,
      "provenance": []
    },
-      "execution_count": null,
+    "kernelspec": {
-      "outputs": []
+      "display_name": "Python 3",
      "name": "python3"
    }
-  ]
+  },
  "nbformat": 4,
  "nbformat_minor": 0
 }
--- a/docs/cn/README.md
+++ b/docs/cn/README.md
@ -76,6 +76,7 @@ bash install.sh
 ```bash
 conda create -n GPTSoVits python=3.9
 conda activate GPTSoVits
 pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
 ```
@ -101,9 +102,10 @@ conda install -c conda-forge 'ffmpeg<7'
 下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下。
-安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语TTS)
+安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语 TTS)
 ##### MacOS 用户
 ```bash
 brew install ffmpeg
 ```
@ -111,6 +113,7 @@ brew install ffmpeg
 #### 安装依赖
 ```bash
 pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
 ```
@ -147,7 +150,7 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
 1. 从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型，并将其放置在 `GPT_SoVITS/pretrained_models` 目录中。
-2. 从 [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 下载模型，解压并重命名为 `G2PWModel`，然后将其放置在 `GPT_SoVITS/text` 目录中。（仅限中文TTS）
+2. 从 [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 下载模型，解压并重命名为 `G2PWModel`，然后将其放置在 `GPT_SoVITS/text` 目录中。（仅限中文 TTS）
 3. 对于 UVR5（人声/伴奏分离和混响移除，额外功能），从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型，并将其放置在 `tools/uvr5/uvr5_weights` 目录中。
@ -155,7 +158,6 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
   - 建议在模型名称和配置文件名中**直接指定模型类型**，例如`mel_band_roformer`、`bs_roformer`。如果未指定，将从配置文中比对特征，以确定它是哪种类型的模型。例如，模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对。`kim_mel_band_roformer.ckpt` 和 `kim_mel_band_roformer.yaml` 也是一对。
 4. 对于中文 ASR（额外功能），从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型，并将它们放置在 `tools/asr/models` 目录中。
 5. 对于英语或日语 ASR（额外功能），从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型，并将其放置在 `tools/asr/models` 目录中。此外，[其他模型](https://huggingface.co/Systran) 可能具有类似效果且占用更少的磁盘空间。
@ -184,12 +186,12 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神。
 ## 微调与推理
-### 打开WebUI
+### 打开 WebUI
 #### 整合包用户
 双击`go-webui.bat`或者使用`go-webui.ps1`
-若想使用V1,则双击`go-webui-v1.bat`或者使用`go-webui-v1.ps1`
+若想使用 V1,则双击`go-webui-v1.bat`或者使用`go-webui-v1.ps1`
 #### 其他
@ -197,12 +199,13 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神。
 python webui.py <language(optional)>
 ```
-若想使用V1,则
+若想使用 V1,则
 ```bash
 python webui.py v1 <language(optional)>
 ```
-或者在webUI内动态切换
+
 或者在 webUI 内动态切换
 ### 微调
@ -215,25 +218,27 @@ python webui.py v1 <language(optional)>
    5. 校对标注
    6. 前往下一个窗口,点击训练
-### 打开推理WebUI
+### 打开推理 WebUI
 #### 整合包用户
-双击 `go-webui.bat` 或者使用 `go-webui.ps1` ,然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理webUI
+双击 `go-webui.bat` 或者使用 `go-webui.ps1` ,然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
 #### 其他
 ```bash
 python GPT_SoVITS/inference_webui.py <language(optional)>
 ```
 或者
 ```bash
 python webui.py
 ```
 然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理webUI
-## V2发布说明
+然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
 ## V2 发布说明
 新特性:
@ -241,42 +246,41 @@ python webui.py
 2. 更好的文本前端
-3. 底模由2k小时扩展至5k小时
+3. 底模由 2k 小时扩展至 5k 小时
 4. 对低音质参考音频（尤其是来源于网络的高频严重缺失、听着很闷的音频）合成出来音质更好
-    详见[wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+   详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
-从v1环境迁移至v2
+从 v1 环境迁移至 v2
-1. 需要pip安装requirements.txt更新环境
+1. 需要 pip 安装 requirements.txt 更新环境
-2. 需要克隆github上的最新代码
+2. 需要克隆 github 上的最新代码
-3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到GPT_SoVITS\pretrained_models\gsv-v2final-pretrained下
+3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS\pretrained_models\gsv-v2final-pretrained 下
-    中文额外需要下载[G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)（下载G2PW模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下）
+   中文额外需要下载[G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)（下载 G2PW 模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下）
-## V3更新说明
+## V3 更新说明
 新模型特点:
 1. 音色相似度更像，需要更少训练集来逼近本人（不训练直接使用底模模式下音色相似性提升更大）
-2. GPT合成更稳定，重复漏字更少，也更容易跑出丰富情感
+2. GPT 合成更稳定，重复漏字更少，也更容易跑出丰富情感
-    详见[wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+   详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
-从v2环境迁移至v3
+从 v2 环境迁移至 v3
-1. 需要pip安装requirements.txt更新环境
+1. 需要 pip 安装 requirements.txt 更新环境
-2. 需要克隆github上的最新代码
+2. 需要克隆 github 上的最新代码
-3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些v3新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS\pretrained_models`目录下
+3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些 v3 新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS\pretrained_models`目录下
    如果想用音频超分功能缓解v3模型生成24k音频觉得闷的问题，需要下载额外的模型参数，参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
   如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题，需要下载额外的模型参数，参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
 ## 待办事项清单
@ -299,16 +303,21 @@ python webui.py
  - [ ] 模型混合。
 ## （附加）命令行运行方式
-使用命令行打开UVR5的WebUI
+
-````
+使用命令行打开 UVR5 的 WebUI
 ```
 python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
-````
+```
 <!-- 如果打不开浏览器，请按照下面的格式进行UVR处理，这是使用mdxnet进行音频处理的方式
 ````
 python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
 ```` -->
 这是使用命令行完成数据集的音频切分的方式
-````
+
 ```
 python audio_slicer.py \
    --input_path "<path_to_original_audio_file_or_directory>" \
    --output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
@ -316,17 +325,22 @@ python audio_slicer.py \
    --min_length <minimum_duration_of_each_subclip> \
    --min_interval <shortest_time_gap_between_adjacent_subclips> \
    --hop_size <step_size_for_computing_volume_curve>
-````
+```
-这是使用命令行完成数据集ASR处理的方式（仅限中文）
+
-````
+这是使用命令行完成数据集 ASR 处理的方式（仅限中文）
 ```
 python tools/asr/funasr_asr.py -i <input> -o <output>
-````
+```
-通过Faster_Whisper进行ASR处理（除中文之外的ASR标记）
+
 通过 Faster_Whisper 进行 ASR 处理（除中文之外的 ASR 标记）
 （没有进度条，GPU 性能可能会导致时间延迟）
 （没有进度条，GPU性能可能会导致时间延迟）
 ```
 python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
 ```
 启用自定义列表保存路径
 ## 致谢
@ -334,6 +348,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 特别感谢以下项目和贡献者：
 ### 理论研究
 - [ar-vits](https://github.com/innnky/ar-vits)
 - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
 - [vits](https://github.com/jaywalnut310/vits)
@ -343,17 +358,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
 - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
 - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
 ### 预训练模型
 - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
 - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
 - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
 ### 推理用文本前端
 - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
 - [split-lang](https://github.com/DoodleBears/split-lang)
 - [g2pW](https://github.com/GitYCC/g2pW)
 - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
 - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
 ### WebUI 工具
 - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
 - [audio-slicer](https://github.com/openvpi/audio-slicer)
 - [SubFix](https://github.com/cronrpc/SubFix)
--- a/docs/ja/README.md
+++ b/docs/ja/README.md
@ -20,17 +20,17 @@
 ## 機能:
-1. **Zero-Shot TTS:** たった5秒間の音声サンプルで、即座にテキストからその音声に変換できます。
+1. **Zero-Shot TTS:** たった 5 秒間の音声サンプルで、即座にテキストからその音声に変換できます。
-2. **Few-Shot TTS:** わずか1分間のトレーニングデータでモデルを微調整し、音声のクオリティを向上。
+2. **Few-Shot TTS:** わずか 1 分間のトレーニングデータでモデルを微調整し、音声のクオリティを向上。
 3. **多言語サポート:** 現在、英語、日本語、韓国語、広東語、中国語をサポートしています。
-4. **WebUI ツール:** 統合されたツールは、音声と伴奏（BGM等）の分離、トレーニングセットの自動セグメンテーション、ASR（中国語のみ）、テキストラベリング等を含むため、初心者の方でもトレーニングデータセットの作成やGPT/SoVITSモデルのトレーニング等を非常に簡単に行えます。
+4. **WebUI ツール:** 統合されたツールは、音声と伴奏（BGM 等）の分離、トレーニングセットの自動セグメンテーション、ASR（中国語のみ）、テキストラベリング等を含むため、初心者の方でもトレーニングデータセットの作成や GPT/SoVITS モデルのトレーニング等を非常に簡単に行えます。
 **[デモ動画](https://www.bilibili.com/video/BV12g4y1m7Uw)をチェック！**
-声の事前学習無しかつFew-Shotでトレーニングされたモデルのデモ:
+声の事前学習無しかつ Few-Shot でトレーニングされたモデルのデモ:
 https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
@ -43,7 +43,7 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
 - Python 3.9, PyTorch 2.0.1, CUDA 11
 - Python 3.10.13, PyTorch 2.1.2, CUDA 12.3
 - Python 3.9, PyTorch 2.2.2, macOS 14.4.1 (Apple silicon)
- Python 3.9, PyTorch 2.2.2, CPUデバイス
+- Python 3.9, PyTorch 2.2.2, CPU デバイス
 _注記: numba==0.56.4 は py<3.11 が必要です_
@ -61,22 +61,22 @@ bash install.sh
 ### macOS
-**注：MacでGPUを使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面はCPUを使用して訓練することを強く推奨します。**
+**注：Mac で GPU を使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面は CPU を使用して訓練することを強く推奨します。**
-1. `xcode-select --install` を実行して、Xcodeコマンドラインツールをインストールします。
+1. `xcode-select --install` を実行して、Xcode コマンドラインツールをインストールします。
-2. `brew install ffmpeg` を実行してFFmpegをインストールします。
+2. `brew install ffmpeg` を実行して FFmpeg をインストールします。
 3. 上記の手順を完了した後、以下のコマンドを実行してこのプロジェクトをインストールします。
 ```bash
 conda create -n GPTSoVits python=3.9
 conda activate GPTSoVits
-
+pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
 ```
 ### 手動インストール
-#### FFmpegをインストールします。
+#### FFmpeg をインストールします。
 ##### Conda ユーザー
@ -97,6 +97,7 @@ conda install -c conda-forge 'ffmpeg<7'
 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます。
 ##### MacOS ユーザー
 ```bash
 brew install ffmpeg
 ```
@ -104,6 +105,7 @@ brew install ffmpeg
 #### 依存関係をインストールします
 ```bash
 pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
 ```
@ -138,17 +140,17 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
 1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) から事前訓練済みモデルをダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリに配置してください。
-2. [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください。（中国語TTSのみ）
+2. [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください。（中国語 TTS のみ）
-3. UVR5（ボーカル/伴奏（BGM等）分離 & リバーブ除去の追加機能）の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください。
+3. UVR5（ボーカル/伴奏（BGM 等）分離 & リバーブ除去の追加機能）の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください。
-    - UVR5でbs_roformerまたはmel_band_roformerモデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`tools/UVR5/UVR5_weights`フォルダに配置することができます。**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**。さらに、モデルと設定ファイルの名前には**「roformer」が含まれている必要があります**。これにより、roformerクラスのモデルとして認識されます。
+   - UVR5 で bs_roformer または mel_band_roformer モデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`tools/uvr5/uvr5_weights`フォルダに配置することができます。**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**。さらに、モデルと設定ファイルの名前には**「roformer」が含まれている必要があります**。これにより、roformer クラスのモデルとして認識されます。
   - モデル名と設定ファイル名には、**直接モデルタイプを指定することをお勧めします**。例：mel_band_roformer、bs_roformer。指定しない場合、設定文から特徴を照合して、モデルの種類を特定します。例えば、モデル`bs_roformer_ep_368_sdr_12.9628.ckpt`と対応する設定ファイル`bs_roformer_ep_368_sdr_12.9628.yaml`はペアです。同様に、`kim_mel_band_roformer.ckpt`と`kim_mel_band_roformer.yaml`もペアです。
-4. 中国語ASR（追加機能）の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。
+4. 中国語 ASR（追加機能）の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。
-5. 英語または日本語のASR（追加機能）を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります。
+5. 英語または日本語の ASR（追加機能）を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります。
 ## データセット形式
@ -169,14 +171,15 @@ vocal_path|speaker_name|language|text
 ```
 D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
 ```
 ## 微調整と推論
-### WebUIを開く
+### WebUI を開く
 #### 統合パッケージ利用者
 `go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用します。
-V1に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックするか、`go-webui-v1.ps1`を使用してください。
+V1 に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックするか、`go-webui-v1.ps1`を使用してください。
 #### その他
@ -184,12 +187,13 @@ V1に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックす
 python webui.py <言語(オプション)>
 ```
-V1に切り替えたい場合は
+V1 に切り替えたい場合は
 ```bash
 python webui.py v1 <言語(オプション)>
 ```
-またはWebUIで手動でバージョンを切り替えてください。
+
 または WebUI で手動でバージョンを切り替えてください。
 ### 微調整
@ -202,25 +206,27 @@ python webui.py v1 <言語(オプション)>
    5. ASR転写を校正する
    6. 次のタブに移動し、モデルを微調整する
-### 推論WebUIを開く
+### 推論 WebUI を開く
 #### 統合パッケージ利用者
-`go-webui-v2.bat`をダブルクリックするか、`go-webui-v2.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論webuiを開きます。
+`go-webui-v2.bat`をダブルクリックするか、`go-webui-v2.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます。
 #### その他
 ```bash
 python GPT_SoVITS/inference_webui.py <言語(オプション)>
 ```
 または
 ```bash
 python webui.py
 ```
 その後、`1-GPT-SoVITS-TTS/1C-inference`で推論webuiを開きます。
-## V2リリースノート
+その後、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます。
 ## V2 リリースノート
 新機能:
@ -228,21 +234,21 @@ python webui.py
 2. 最適化されたテキストフロントエンド
-3. 事前学習済みモデルが2千時間から5千時間に拡張
+3. 事前学習済みモデルが 2 千時間から 5 千時間に拡張
 4. 低品質の参照音声に対する合成品質の向上
-    [詳細はこちら](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+   [詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
-V1環境からV2を使用するには:
+V1 環境から V2 を使用するには:
 1. `pip install -r requirements.txt`を使用していくつかのパッケージを更新
-2. 最新のコードをgithubからクローン
+2. 最新のコードを github からクローン
-3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)からV2の事前学習モデルをダウンロードし、それらを`GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`に配置
+3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)から V2 の事前学習モデルをダウンロードし、それらを`GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`に配置
-    中国語V2追加: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)（G2PWモデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します）
+   中国語 V2 追加: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)（G2PW モデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します）
 ## V3 リリースノート
@ -250,17 +256,17 @@ V1環境からV2を使用するには:
 1. 音色の類似性が向上し、ターゲットスピーカーを近似するために必要な学習データが少なくなりました（音色の類似性は、ファインチューニングなしでベースモデルを直接使用することで顕著に改善されます）。
-2. GPTモデルがより安定し、繰り返しや省略が減少し、より豊かな感情表現を持つ音声の生成が容易になりました。
+2. GPT モデルがより安定し、繰り返しや省略が減少し、より豊かな感情表現を持つ音声の生成が容易になりました。
-    [詳細情報はこちら](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+   [詳細情報はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
 v2 環境から v3 を使用する方法:
 1. `pip install -r requirements.txt` を実行して、いくつかのパッケージを更新します。
-2. GitHubから最新のコードをクローンします。
+2. GitHub から最新のコードをクローンします。
-3. v3の事前学習済みモデル（s1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ）を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS\pretrained_models フォルダに配置します。
+3. v3 の事前学習済みモデル（s1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ）を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS\pretrained_models フォルダに配置します。
   追加: 音声超解像モデルについては、[ダウンロード方法](../../tools/AP_BWE_main/24kto48k/readme.txt)を参照してください。
@ -285,15 +291,20 @@ v2 環境から v3 を使用する方法:
  - [ ] モデルミックス
 ## (追加の) コマンドラインから実行する方法
 コマンド ラインを使用して UVR5 の WebUI を開きます
 ```
 python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
 ```
 <!-- ブラウザを開けない場合は、以下の形式に従って UVR 処理を行ってください。これはオーディオ処理に mdxnet を使用しています。
 ```
 python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
 ``` -->
 コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです。
 ```
 python audio_slicer.py \
    --input_path "<path_to_original_audio_file_or_directory>" \
@ -303,16 +314,21 @@ python audio_slicer.py \
    --min_interval <shortest_time_gap_between_adjacent_subclips>
    --hop_size <step_size_for_computing_volume_curve>
 ```
 コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ)
 ```
 python tools/asr/funasr_asr.py -i <input> -o <output>
 ```
-ASR処理はFaster_Whisperを通じて実行されます(中国語を除くASRマーキング)
+
 ASR 処理は Faster_Whisper を通じて実行されます(中国語を除く ASR マーキング)
 (進行状況バーは表示されません。GPU のパフォーマンスにより時間遅延が発生する可能性があります)
 ```
 python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
 ```
 カスタムリストの保存パスが有効になっています
 ## クレジット
@ -320,6 +336,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 特に以下のプロジェクトと貢献者に感謝します：
 ### 理論研究
 - [ar-vits](https://github.com/innnky/ar-vits)
 - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
 - [vits](https://github.com/jaywalnut310/vits)
@ -329,17 +346,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
 - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
 - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
 ### 事前学習モデル
 - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
 - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
 - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
 ### 推論用テキストフロントエンド
 - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
 - [split-lang](https://github.com/DoodleBears/split-lang)
 - [g2pW](https://github.com/GitYCC/g2pW)
 - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
 - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
 ### WebUI ツール
 - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
 - [audio-slicer](https://github.com/openvpi/audio-slicer)
 - [SubFix](https://github.com/cronrpc/SubFix)
--- a/docs/ko/README.md
+++ b/docs/ko/README.md
@ -70,7 +70,7 @@ bash install.sh
 ```bash
 conda create -n GPTSoVits python=3.9
 conda activate GPTSoVits
-
+pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
 ```
@ -99,6 +99,7 @@ conda install -c conda-forge 'ffmpeg<7'
 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치 (Korean TTS 전용)
 ##### MacOS 사용자
 ```bash
 brew install ffmpeg
 ```
@ -106,6 +107,7 @@ brew install ffmpeg
 #### 의존성 설치
 ```bash
 pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
 ```
@ -195,6 +197,7 @@ V1으로 전환하려면,
 ```bash
 python webui.py v1 <언어(옵션)>
 ```
 또는 WebUI에서 수동으로 버전을 전환하십시오.
 ### 미세 조정
@ -219,11 +222,13 @@ python webui.py v1 <언어(옵션)>
 ```bash
 python GPT_SoVITS/inference_webui.py <언어(옵션)>
 ```
 또는
 ```bash
 python webui.py
 ```
 그런 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
 ## V2 릴리스 노트
@ -238,7 +243,7 @@ python webui.py
 4. 저품질 참조 오디오에 대한 합성 품질 향상
-    [자세한 내용](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+   [자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
 V1 환경에서 V2를 사용하려면:
@ -258,7 +263,7 @@ V1 환경에서 V2를 사용하려면:
 2. GPT 모델이 더 안정적이며 반복 및 생략이 적고, 더 풍부한 감정 표현을 가진 음성을 생성하기가 더 쉽습니다.
-    [자세한 내용](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+   [자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
 v2 환경에서 v3 사용하기:
@ -270,7 +275,6 @@ v2 환경에서 v3 사용하기:
   추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요.
 ## 할 일 목록
 - [x] **최우선순위:**
@ -293,15 +297,20 @@ v2 환경에서 v3 사용하기:
  - [ ] 모델 블렌딩.
 ## (추가적인) 명령줄에서 실행하는 방법
 명령줄을 사용하여 UVR5용 WebUI 열기
 ```
 python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
 ```
 <!-- 브라우저를 열 수 없는 경우 UVR 처리를 위해 아래 형식을 따르십시오. 이는 오디오 처리를 위해 mdxnet을 사용하는 것입니다.
 ```
 python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
 ``` -->
 명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다.
 ```
 python audio_slicer.py \
    --input_path "<path_to_original_audio_file_or_directory>" \
@ -311,16 +320,21 @@ python audio_slicer.py \
    --min_interval <shortest_time_gap_between_adjacent_subclips>
    --hop_size <step_size_for_computing_volume_curve>
 ```
 명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당).
 ```
 python tools/asr/funasr_asr.py -i <input> -o <output>
 ```
 ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행됩니다.
 (진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음)
 ```
 python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
 ```
 사용자 정의 목록 저장 경로가 활성화되었습니다.
 ## 감사의 말
@ -328,6 +342,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 다음 프로젝트와 기여자들에게 특별히 감사드립니다:
 ### 이론 연구
 - [ar-vits](https://github.com/innnky/ar-vits)
 - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
 - [vits](https://github.com/jaywalnut310/vits)
@ -337,17 +352,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
 - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
 - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
 ### 사전 학습 모델
 - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
 - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
 - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
 ### 추론용 텍스트 프론트엔드
 - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
 - [split-lang](https://github.com/DoodleBears/split-lang)
 - [g2pW](https://github.com/GitYCC/g2pW)
 - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
 - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
 ### WebUI 도구
 - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
 - [audio-slicer](https://github.com/openvpi/audio-slicer)
 - [SubFix](https://github.com/cronrpc/SubFix)
--- a/docs/tr/README.md
+++ b/docs/tr/README.md
@ -72,7 +72,7 @@ bash install.sh
 ```bash
 conda create -n GPTSoVits python=3.9
 conda activate GPTSoVits
-
+pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
 ```
@ -99,6 +99,7 @@ conda install -c conda-forge 'ffmpeg<7'
 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin.
 ##### MacOS Kullanıcıları
 ```bash
 brew install ffmpeg
 ```
@ -106,6 +107,7 @@ brew install ffmpeg
 #### Bağımlılıkları Yükleme
 ```bash
 pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
 ```
@ -192,6 +194,7 @@ V1'e geçmek istiyorsanız,
 ```bash
 python webui.py v1 <dil(isteğe bağlı)>
 ```
 veya WebUI'de manuel olarak sürüm değiştirin.
 ### İnce Ayar
@ -216,11 +219,13 @@ veya WebUI'de manuel olarak sürüm değiştirin.
 ```bash
 python GPT_SoVITS/inference_webui.py <dil(isteğe bağlı)>
 ```
 VEYA
 ```bash
 python webui.py
 ```
 ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.
 ## V2 Sürüm Notları
@ -235,7 +240,7 @@ Yeni Özellikler:
 4. Düşük kaliteli referans sesler için geliştirilmiş sentez kalitesi
-    [detaylar burada](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+   [detaylar burada](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
 V1 ortamından V2'yi kullanmak için:
@ -255,7 +260,7 @@ V1 ortamından V2'yi kullanmak için:
 2. GPT modeli daha **kararlı** hale geldi, tekrarlar ve atlamalar azaldı ve **daha zengin duygusal ifadeler** ile konuşma üretmek daha kolay hale geldi.
-    [daha fazla detay](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+   [daha fazla detay](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
 ### v2 ortamında v3 kullanımı:
@ -288,15 +293,20 @@ V1 ortamından V2'yi kullanmak için:
  - [ ] model karışımı
 ## (Ekstra) Komut satırından çalıştırma yöntemi
 UVR5 için Web Arayüzünü açmak için komut satırını kullanın
 ```
 python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
 ```
 <!-- Bir tarayıcı açamıyorsanız, UVR işleme için aşağıdaki formatı izleyin. Bu, ses işleme için mdxnet kullanır.
 ```
 python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
 ``` -->
 Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır
 ```
 python audio_slicer.py \
    --input_path "<orijinal_ses_dosyası_veya_dizininin_yolu>" \
@ -306,16 +316,21 @@ python audio_slicer.py \
    --min_interval <bitişik_alt_klipler_arasındaki_en_kısa_zaman_aralığı>
    --hop_size <ses_eğrisini_hesaplamak_için_adım_boyutu>
 ```
 Veri seti ASR işleme komut satırı kullanılarak bu şekilde yapılır (Yalnızca Çince)
 ```
 python tools/asr/funasr_asr.py -i <girdi> -o <çıktı>
 ```
 ASR işleme Faster_Whisper aracılığıyla gerçekleştirilir (Çince dışındaki ASR işaretleme)
 (İlerleme çubukları yok, GPU performansı zaman gecikmelerine neden olabilir)
 ```
 python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
 ```
 Özel bir liste kaydetme yolu etkinleştirildi
 ## Katkı Verenler
@ -323,6 +338,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
 Özellikle aşağıdaki projelere ve katkıda bulunanlara teşekkür ederiz:
 ### Teorik Araştırma
 - [ar-vits](https://github.com/innnky/ar-vits)
 - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
 - [vits](https://github.com/jaywalnut310/vits)
@ -332,17 +348,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
 - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
 - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
 - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
 ### Önceden Eğitilmiş Modeller
 - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
 - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
 - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
 ### Tahmin İçin Metin Ön Ucu
 - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
 - [split-lang](https://github.com/DoodleBears/split-lang)
 - [g2pW](https://github.com/GitYCC/g2pW)
 - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
 - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
 ### WebUI Araçları
 - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
 - [audio-slicer](https://github.com/openvpi/audio-slicer)
 - [SubFix](https://github.com/cronrpc/SubFix)
--- a/extra-req.txt
+++ b/extra-req.txt
@ -0,0 +1 @@
 faster-whisper
--- a/gpt-sovits_kaggle.ipynb
+++ b/gpt-sovits_kaggle.ipynb
@ -27,7 +27,8 @@
    "!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
    "%cd GPT-SoVITS\n",
    "!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
-    "!pip install -r requirements.txt"
+    "!pip install -r requirements.txt\n",
    "!pip install -r extra-req.txt --no-deps"
   ]
  },
  {
--- a/install.sh
+++ b/install.sh
@ -1,15 +1,17 @@
 #!/bin/bash
 set -e
 # 安装构建工具
 # Install build tools
 echo "Installing GCC..."
-conda install -c conda-forge gcc=14
+conda install -c conda-forge gcc=14 -y
 echo "Installing G++..."
-conda install -c conda-forge gxx
+conda install -c conda-forge gxx -y
 echo "Installing ffmpeg and cmake..."
-conda install ffmpeg cmake
+conda install ffmpeg cmake -y
 # 设置编译环境
 # Set up build environment
@ -18,7 +20,7 @@ export CC="$CONDA_PREFIX/bin/gcc"
 export CXX="$CONDA_PREFIX/bin/g++"
 echo "Checking for CUDA installation..."
-if command -v nvidia-smi &> /dev/null; then
+if command -v nvidia-smi &>/dev/null; then
    USE_CUDA=true
    echo "CUDA found."
 else
@ -26,7 +28,6 @@ else
    USE_CUDA=false
 fi
 if [ "$USE_CUDA" = false ]; then
    echo "Checking for ROCm installation..."
    if [ -d "/opt/rocm" ]; then
@ -48,7 +49,7 @@ fi
 if [ "$USE_CUDA" = true ]; then
    echo "Installing PyTorch with CUDA support..."
    conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia
-elif [ "$USE_ROCM" = true ] ; then
+elif [ "$USE_ROCM" = true ]; then
    echo "Installing PyTorch with ROCm support..."
    pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.2
 else
@ -56,21 +57,53 @@ else
    conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 cpuonly -c pytorch
 fi
 echo "Installing Python dependencies from requirements.txt..."
 # 刷新环境
 # Refresh environment
 hash -r
 # pyopenjtalk Installation
 # Downloads the latest pyopenjtalk source tarball listed on PyPI, rewrites the
 # cmake_minimum_required() line of the bundled open_jtalk CMakeLists.txt to
 # "VERSION 3.5...3.31", repacks the tarball and installs it with pip.
 # jq is needed to read the version field from the PyPI JSON response.
 conda install jq -y
 OS_TYPE=$(uname)
 PACKAGE_NAME="pyopenjtalk"
 # Latest released version as reported by the PyPI JSON API.
 VERSION=$(curl -s https://pypi.org/pypi/$PACKAGE_NAME/json | jq -r .info.version)
 wget "https://files.pythonhosted.org/packages/source/${PACKAGE_NAME:0:1}/$PACKAGE_NAME/$PACKAGE_NAME-$VERSION.tar.gz"
 TAR_FILE=$(ls ${PACKAGE_NAME}-*.tar.gz)
 DIR_NAME="${TAR_FILE%.tar.gz}"
 tar -xzf "$TAR_FILE"
 rm "$TAR_FILE"
 CMAKE_FILE="$DIR_NAME/lib/open_jtalk/src/CMakeLists.txt"
 # `uname` prints "Darwin" (capital D) on macOS and [[ == ]] matching is
 # case-sensitive, so the pattern must be "Darwin"*. BSD sed on macOS requires
 # an explicit (here empty) backup suffix after -i; GNU sed does not take one.
 if [[ "$OS_TYPE" == "Darwin"* ]]; then
    sed -i '' -E 's/cmake_minimum_required\(VERSION[^\)]*\)/cmake_minimum_required(VERSION 3.5...3.31)/' "$CMAKE_FILE"
 else
    sed -i -E 's/cmake_minimum_required\(VERSION[^\)]*\)/cmake_minimum_required(VERSION 3.5...3.31)/' "$CMAKE_FILE"
 fi
 # Repack under the original tarball name so pip installs the patched sources,
 # then remove the temporary tarball and extracted directory.
 tar -czf "$TAR_FILE" "$DIR_NAME"
 pip install "$TAR_FILE"
 rm -rf "$TAR_FILE" "$DIR_NAME"
 pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
-if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ] ; then
+if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ]; then
    echo "Update to WSL compatible runtime lib..."
-    location=`pip show torch | grep Location | awk -F ": " '{print $2}'`
+    location=$(pip show torch | grep Location | awk -F ": " '{print $2}')
-    cd ${location}/torch/lib/
+    cd "${location}"/torch/lib/ || exit
    rm libhsa-runtime64.so*
    cp /opt/rocm/lib/libhsa-runtime64.so.1.2 libhsa-runtime64.so
 fi
 echo "Installation completed successfully!"
--- a/requirements.txt
+++ b/requirements.txt
@ -3,7 +3,7 @@ scipy
 tensorboard
 librosa==0.9.2
 numba==0.56.4
-pytorch-lightning
+pytorch-lightning>2.0
 gradio>=4.0,<=4.24.0
 ffmpeg-python
 onnxruntime; sys_platform == 'darwin'
@ -26,7 +26,6 @@ jieba_fast
 jieba
 split-lang
 fast_langdetect>=0.3.0
 Faster_Whisper
 wordsegment
 rotary_embedding_torch
 ToJyutping 
@ -38,4 +37,9 @@ python_mecab_ko; sys_platform != 'win32'
 fastapi<0.112.2
 x_transformers
 torchmetrics<=1.5
-attrdict
+pydantic<=2.10.6
 ctranslate2>=4.0,<5
 huggingface_hub>=0.13
 tokenizers>=0.13,<1
 av>=11
 tqdm