From d5e479dad6342222eb4887df627e69c048d2338c Mon Sep 17 00:00:00 2001
From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com>
Date: Mon, 26 May 2025 05:45:14 +0300
Subject: [PATCH] Introduce Docker and Windows CI Workflow, Pre-commit Formatting, and Language Resource Auto-Download (#2351)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Docker Auto-Build Workflow
* Rename
* Update
* Fix Bugs
* Disable Progress Bar When workflows triggered
* Fix Wget
* Fix Bugs
* Fix Bugs
* Update Wget
* Update Workflows
* Accelerate Docker Image Building
* Fix Install.sh
* Add Skip-Check For Action Runner
* Fix Dockerfile
* .
* .
* .
* .
* Delete File in Runner
* Add Sort
* Delete More Files
* Delete More
* .
* .
* .
* Add Pre-Commit Hook, Update Docker
* Add Code Spell Check
* [pre-commit.ci] trigger
* [pre-commit.ci] trigger
* [pre-commit.ci] trigger
* Fix Bugs
* .
* Disable Progress Bar and Logs while using GitHub Actions
* .
* .
* Fix Bugs
* update conda
* fix bugs
* Fix Bugs
* fix bugs
* .
* .
* Quiet Installation
* fix bugs
* .
* fix bug
* .
* Fix pre-commit.ci and Docker
* fix bugs
* .
* Update Docker & Pre-Commit
* fix bugs
* Update Req
* Update Req
* Update OpenCC
* update precommit
* .
* Update .pre-commit-config.yaml
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Update Docs and fix bugs
* Fix \
* Fix MacOS
* .
* test
* .
* Add Tag Alias
* .
* fix bugs
* fix bugs
* make image smaller
* update pre-commit config
* .
* .
* fix bugs
* use miniconda
* Fix Wrong Path
* .
* debug
* debug
* revert
* Fix Bugs
* Update Docs, Add Dict Auto Download in install.sh
* update docker_build
* Update Docs for Install.sh
* update docker docs about architecture
* Add Xcode-Commandline-Tool Installation
* Update Docs: 1. Add Missing VC17; 2. Modified the Order of FFmpeg Installation and Requirements Installation; 3. Remove Duplicate FFmpeg
* Fix Wrong Cuda Version
* Update TESTED ENV
* Add PYTHONNOUSERSITE(-s)
* Fix Wrapper
* Update install.sh For Robustness
* Ignore .git
* Preload CUDNN For Ctranslate2
* Remove Gradio Warnings
* Update Colab
* Fix OpenCC Problems
* Update Win DLL Strategy
* Fix Onnxruntime-gpu NVRTC Error
* Fix Path Problems
* Add Windows Packages Workflow
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* .
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* WIP
* Fix Path
* Fix Path
* Enable Logging
* Set 7-Zip compression level to maximum (-mx=9)
* Use Multithread in ONNX Session
* Fix Tag Bugs
* Add Time
* Add Time
* Add Time
* Compress More
* Copy DLL to Solve VC Runtime DLL Missing Issues
* Expose FFmpeg Errors, Copy Only Part of Visual C++ Runtime
* Update build_windows_packages.ps1
* Update build_windows_packages.ps1
* Update build_windows_packages.ps1
* Update build_windows_packages.ps1
* WIP
* WIP
* WIP
* Update build_windows_packages.ps1
* Update install.sh
* Update build_windows_packages.ps1
* Update docker-publish.yaml
* Update install.sh
* Update Dockerfile
* Update docker_build.sh
* Update miniconda_install.sh
* Update README.md
* Update README.md
* Update README.md
* Update README.md
* Update README.md
* Update README.md
* Update Colab-WebUI.ipynb
* Update Colab-Inference.ipynb
* Update docker-compose.yaml
* Update build_windows_packages.ps1
* Update install.sh

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .dockerignore | 202 ++++++++++++-
 .github/build_windows_packages.ps1 | 194 ++++++++++++
 .github/workflows/build_windows_packages.yaml | 38 +++
 .github/workflows/docker-publish.yaml | 276 ++++++++++++++++++
 .gitignore | 9 +-
 .pre-commit-config.yaml | 15 +
 Colab-Inference.ipynb | 13 +-
 colab_webui.ipynb => Colab-WebUI.ipynb | 4 +-
 Docker/damo.sha256 | 3 -
 Docker/download.py | 8 -
 Docker/download.sh | 11 -
 Docker/install_wrapper.sh | 33 +++
 Docker/links.sha256 | 12 -
 Docker/links.txt | 34 ---
 Docker/miniconda_install.sh | 70 +++++
 Dockerfile | 80 +++--
 GPT_SoVITS/TTS_infer_pack/TTS.py | 40 ++-
 GPT_SoVITS/inference_webui.py | 140 +++++----
 GPT_SoVITS/inference_webui_fast.py | 20 +-
 GPT_SoVITS/module/data_utils.py | 5 +-
 GPT_SoVITS/module/mel_processing.py | 66 +++--
 GPT_SoVITS/module/models.py | 11 +-
 GPT_SoVITS/process_ckpt.py | 8 +-
 GPT_SoVITS/s2_train_v3_lora.py | 8 +-
 GPT_SoVITS/text/g2pw/onnx_api.py | 15 +-
 GPT_SoVITS/utils.py | 6 +-
 README.md | 130 +++++----
 docker-compose.yaml | 103 +++++--
 docker_build.sh | 82 ++++++
 dockerbuild.sh | 21 --
 docs/cn/README.md | 149 ++++++----
 docs/ja/README.md | 127 ++++----
 docs/ko/README.md | 118 +++++---
 docs/tr/README.md | 123 +++++---
 go-webui.bat | 4 +
 go-webui.ps1 | 5 +-
 install.sh | 255 ++++++++++------
 requirements.txt | 8 +-
 tools/asr/fasterwhisper_asr.py | 3 +
 tools/my_utils.py | 112 ++++++-
 tools/subfix_webui.py | 6 +-
 tools/uvr5/lib/lib_v5/dataset.py | 28 +-
 tools/uvr5/lib/lib_v5/layers.py | 24 +-
 tools/uvr5/lib/lib_v5/layers_123812KB.py | 24 +-
 tools/uvr5/lib/lib_v5/layers_123821KB.py | 24 +-
 tools/uvr5/lib/lib_v5/layers_33966KB.py | 32 +-
 tools/uvr5/lib/lib_v5/layers_537227KB.py | 32 +-
 tools/uvr5/lib/lib_v5/layers_537238KB.py | 32 +-
 tools/uvr5/lib/lib_v5/layers_new.py | 28 +-
 tools/uvr5/lib/lib_v5/model_param_init.py | 7 +-
 tools/uvr5/lib/lib_v5/nets.py | 2 -
 tools/uvr5/lib/lib_v5/nets_537227KB.py | 1 -
 tools/uvr5/lib/lib_v5/nets_537238KB.py | 1 -
 tools/uvr5/lib/lib_v5/nets_new.py | 16 +-
 tools/uvr5/lib/lib_v5/spec_utils.py | 119 +++-----
 tools/uvr5/lib/utils.py | 16 +-
 tools/uvr5/webui.py | 18 +-
 webui.py | 78 +++--
 58 files changed, 2079 insertions(+), 970 deletions(-)
 create mode 100644 .github/build_windows_packages.ps1
 create mode 100644 .github/workflows/build_windows_packages.yaml
 create mode 100644
.github/workflows/docker-publish.yaml create mode 100644 .pre-commit-config.yaml rename colab_webui.ipynb => Colab-WebUI.ipynb (95%) delete mode 100644 Docker/damo.sha256 delete mode 100644 Docker/download.py delete mode 100644 Docker/download.sh create mode 100644 Docker/install_wrapper.sh delete mode 100644 Docker/links.sha256 delete mode 100644 Docker/links.txt create mode 100644 Docker/miniconda_install.sh create mode 100644 docker_build.sh delete mode 100755 dockerbuild.sh diff --git a/.dockerignore b/.dockerignore index 4eca27b..bf36b88 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,8 +1,198 @@ -docs -logs +GPT_SoVITS/pretrained_models/* +tools/asr/models/* +tools/uvr5/uvr5_weights/* + +.git +.DS_Store +.vscode +*.pyc +env +runtime +.idea output -reference -SoVITS_weights -GPT_weights +logs +SoVITS_weights*/ +GPT_weights*/ TEMP -.git +weight.json +ffmpeg* +ffprobe* +cfg.json +speakers.json +ref_audios + +# Byte-compiled / optimized / DLL files +__pycache__/ +**/__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc diff --git a/.github/build_windows_packages.ps1 b/.github/build_windows_packages.ps1 new file mode 100644 index 0000000..2e4acb2 --- /dev/null +++ b/.github/build_windows_packages.ps1 @@ -0,0 +1,194 @@ +$ErrorActionPreference = "Stop" + +Write-Host "Current location: $(Get-Location)" + +$cuda = $env:TORCH_CUDA +if (-not $cuda) { + Write-Error "Missing TORCH_CUDA env (cu124 or cu128)" + exit 1 +} + +$date = $env:DATE_SUFFIX +if ([string]::IsNullOrWhiteSpace($date)) { + $date = Get-Date -Format "MMdd" +} + +$pkgName = "GPT-SoVITS-$date" +$tmpDir = "tmp" +$srcDir = $PWD + +$suffix = $env:PKG_SUFFIX +if (-not [string]::IsNullOrWhiteSpace($suffix)) { + $pkgName = "$pkgName$suffix" +} + +$pkgName = "$pkgName-$cuda" + +$baseHF = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main" +$PRETRAINED_URL = "$baseHF/pretrained_models.zip" +$G2PW_URL = "$baseHF/G2PWModel.zip" +$UVR5_URL = "$baseHF/uvr5_weights.zip" +$NLTK_URL = "$baseHF/nltk_data.zip" +$JTALK_URL = "$baseHF/open_jtalk_dic_utf_8-1.11.tar.gz" + +$PYTHON_VERSION = "3.11.12" +$PY_RELEASE_VERSION = "20250409" + +Write-Host "[INFO] Cleaning .git..." +Remove-Item "$srcDir\.git" -Recurse -Force -ErrorAction SilentlyContinue + +Write-Host "[INFO] Creating tmp dir..." +New-Item -ItemType Directory -Force -Path $tmpDir + +Write-Host "[INFO] System Python version:" +python --version +python -m site + +Write-Host "[INFO] Downloading Python $PYTHON_VERSION..." +$zst = "$tmpDir\python.tar.zst" +Invoke-WebRequest "https://github.com/astral-sh/python-build-standalone/releases/download/$PY_RELEASE_VERSION/cpython-$PYTHON_VERSION+$PY_RELEASE_VERSION-x86_64-pc-windows-msvc-pgo-full.tar.zst" -OutFile $zst +& "C:\Program Files\7-Zip\7z.exe" e $zst -o"$tmpDir" -aoa +$tar = Get-ChildItem "$tmpDir" -Filter "*.tar" | Select-Object -First 1 +& "C:\Program Files\7-Zip\7z.exe" x $tar.FullName -o"$tmpDir\extracted" -aoa +Move-Item "$tmpDir\extracted\python\install" "$srcDir\runtime" + +Write-Host "[INFO] Copying Redistributing Visual C++ Runtime..." +$vswhere = "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" +$vsPath = & $vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath +$redistRoot = Join-Path $vsPath "VC\Redist\MSVC" +$targetVer = Get-ChildItem -Path $redistRoot -Directory | + Where-Object { $_.Name -match "^14\." 
} | + Sort-Object Name -Descending | + Select-Object -First 1 +$x64Path = Join-Path $targetVer.FullName "x64" +Get-ChildItem -Path $x64Path -Directory | Where-Object { + $_.Name -match '^Microsoft\..*\.(CRT|OpenMP)$' +} | ForEach-Object { + Get-ChildItem -Path $_.FullName -Filter "*.dll" | ForEach-Object { + Copy-Item -Path $_.FullName -Destination "$srcDir\runtime" -Force + } +} + +function DownloadAndUnzip($url, $targetRelPath) { + $filename = Split-Path $url -Leaf + $tmpZip = "$tmpDir\$filename" + Invoke-WebRequest $url -OutFile $tmpZip + Expand-Archive -Path $tmpZip -DestinationPath $tmpDir -Force + $subdirName = $filename -replace '\.zip$', '' + $sourcePath = Join-Path $tmpDir $subdirName + $destRoot = Join-Path $srcDir $targetRelPath + $destPath = Join-Path $destRoot $subdirName + if (Test-Path $destPath) { + Remove-Item $destPath -Recurse -Force + } + Move-Item $sourcePath $destRoot + Remove-Item $tmpZip +} + +Write-Host "[INFO] Download pretrained_models..." +DownloadAndUnzip $PRETRAINED_URL "GPT_SoVITS" + +Write-Host "[INFO] Download G2PWModel..." +DownloadAndUnzip $G2PW_URL "GPT_SoVITS\text" + +Write-Host "[INFO] Download UVR5 model..." +DownloadAndUnzip $UVR5_URL "tools\uvr5" + +Write-Host "[INFO] Downloading funasr..." +$funasrUrl = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/funasr.zip" +$funasrZip = "$tmpDir\funasr.zip" +Invoke-WebRequest -Uri $funasrUrl -OutFile $funasrZip +Expand-Archive -Path $funasrZip -DestinationPath "$srcDir\tools\asr\models" -Force +Remove-Item $funasrZip + +Write-Host "[INFO] Download ffmpeg..." +$ffUrl = "https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-essentials.zip" +$ffZip = "$tmpDir\ffmpeg.zip" +Invoke-WebRequest -Uri $ffUrl -OutFile $ffZip +Expand-Archive $ffZip -DestinationPath $tmpDir -Force +$ffDir = Get-ChildItem -Directory "$tmpDir" | Where-Object { $_.Name -like "ffmpeg*" } | Select-Object -First 1 +Move-Item "$($ffDir.FullName)\bin\ffmpeg.exe" "$srcDir\runtime" +Move-Item "$($ffDir.FullName)\bin\ffprobe.exe" "$srcDir\runtime" +Remove-Item $ffZip +Remove-Item $ffDir.FullName -Recurse -Force + +Write-Host "[INFO] Installing PyTorch..." +& ".\runtime\python.exe" -m ensurepip +& ".\runtime\python.exe" -m pip install --upgrade pip --no-warn-script-location +switch ($cuda) { + "cu124" { + & ".\runtime\python.exe" -m pip install torch==2.6 torchaudio --index-url https://download.pytorch.org/whl/cu124 --no-warn-script-location + } + "cu128" { + & ".\runtime\python.exe" -m pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu128 --no-warn-script-location + } + default { + Write-Error "Unsupported CUDA version: $cuda" + exit 1 + } +} + +Write-Host "[INFO] Installing dependencies..." +& ".\runtime\python.exe" -m pip install -r extra-req.txt --no-deps --no-warn-script-location +& ".\runtime\python.exe" -m pip install -r requirements.txt --no-warn-script-location + +Write-Host "[INFO] Downloading NLTK and pyopenjtalk dictionary..." 
+$PYTHON = ".\runtime\python.exe" +$prefix = & $PYTHON -c "import sys; print(sys.prefix)" +$jtalkPath = & $PYTHON -c "import os, pyopenjtalk; print(os.path.dirname(pyopenjtalk.__file__))" +$nltkZip = "$tmpDir\nltk_data.zip" +$jtalkTar = "$tmpDir\open_jtalk_dic_utf_8-1.11.tar.gz" + +Invoke-WebRequest -Uri $NLTK_URL -OutFile $nltkZip +Expand-Archive -Path $nltkZip -DestinationPath $prefix -Force +Remove-Item $nltkZip + +Invoke-WebRequest -Uri $JTALK_URL -OutFile $jtalkTar +& "C:\Program Files\7-Zip\7z.exe" e $jtalkTar -o"$tmpDir" -aoa +$innerTar = Get-ChildItem "$tmpDir" -Filter "*.tar" | Select-Object -First 1 +& "C:\Program Files\7-Zip\7z.exe" x $innerTar.FullName -o"$jtalkPath" -aoa +Remove-Item $jtalkTar +Remove-Item $innerTar.FullName + +Write-Host "[INFO] Preparing final directory $pkgName ..." +$items = @(Get-ChildItem -Filter "*.sh") + + @(Get-ChildItem -Filter "*.ipynb") + + @("$tmpDir", ".github", "Docker", "docs", ".gitignore", ".dockerignore", "README.md") +Remove-Item $items -Force -Recurse -ErrorAction SilentlyContinue +$curr = Get-Location +Set-Location ../ +Get-ChildItem . +Copy-Item -Path $curr -Destination $pkgName -Recurse +$7zPath = "$pkgName.7z" +$start = Get-Date +Write-Host "Compress Starting at $start" +& "C:\Program Files\7-Zip\7z.exe" a -t7z "$7zPath" "$pkgName" -m0=lzma2 -mx=9 -md=1g -ms=1g -mmc=500 -mfb=273 -mlc=0 -mlp=4 -mpb=4 -mc=8g -mmt=on -bsp1 +$end = Get-Date +Write-Host "Elapsed time: $($end - $start)" +Get-ChildItem . + +python -m pip install --upgrade pip +python -m pip install "modelscope" "huggingface_hub[hf_transfer]" --no-warn-script-location + +Write-Host "[INFO] Uploading to ModelScope..." +$msUser = $env:MODELSCOPE_USERNAME +$msToken = $env:MODELSCOPE_TOKEN +if (-not $msUser -or -not $msToken) { + Write-Error "Missing MODELSCOPE_USERNAME or MODELSCOPE_TOKEN" + exit 1 +} +modelscope upload "$msUser/GPT-SoVITS-Packages" "$7zPath" "$7zPath" --repo-type model --token $msToken + +Write-Host "[SUCCESS] Uploaded: $7zPath to ModelScope" + +Write-Host "[INFO] Uploading to HuggingFace..." 
+$hfUser = $env:HUGGINGFACE_USERNAME +$hfToken = $env:HUGGINGFACE_TOKEN +if (-not $hfUser -or -not $hfToken) { + Write-Error "Missing HUGGINGFACE_USERNAME or HUGGINGFACE_TOKEN" + exit 1 +} +$env:HF_HUB_ENABLE_HF_TRANSFER = "1" +huggingface-cli upload "$hfUser/GPT-SoVITS-Packages" "$7zPath" "$7zPath" --repo-type model --token $hfToken + +Write-Host "[SUCCESS] Uploaded: $7zPath to HuggingFace" diff --git a/.github/workflows/build_windows_packages.yaml b/.github/workflows/build_windows_packages.yaml new file mode 100644 index 0000000..3286146 --- /dev/null +++ b/.github/workflows/build_windows_packages.yaml @@ -0,0 +1,38 @@ +name: Build and Upload Windows Package + +on: + workflow_dispatch: + inputs: + date: + description: "Date suffix (optional)" + required: false + default: "" + suffix: + description: "Package name suffix (optional)" + required: false + default: "" + +jobs: + build: + runs-on: windows-latest + strategy: + matrix: + torch_cuda: [cu124, cu128] + env: + TORCH_CUDA: ${{ matrix.torch_cuda }} + MODELSCOPE_USERNAME: ${{ secrets.MODELSCOPE_USERNAME }} + MODELSCOPE_TOKEN: ${{ secrets.MODELSCOPE_TOKEN }} + HUGGINGFACE_USERNAME: ${{ secrets.HUGGINGFACE_USERNAME }} + HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }} + DATE_SUFFIX: ${{ github.event.inputs.date }} + PKG_SUFFIX: ${{ github.event.inputs.suffix }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run Build and Upload Script + shell: pwsh + run: | + Move-Item .github/build_windows_packages.ps1 ../build_windows_packages.ps1 + ../build_windows_packages.ps1 \ No newline at end of file diff --git a/.github/workflows/docker-publish.yaml b/.github/workflows/docker-publish.yaml new file mode 100644 index 0000000..a00a0a7 --- /dev/null +++ b/.github/workflows/docker-publish.yaml @@ -0,0 +1,276 @@ +name: Build and Publish Docker Image + +on: + workflow_dispatch: + +jobs: + generate-meta: + runs-on: ubuntu-22.04 + outputs: + tag: ${{ steps.meta.outputs.tag }} + steps: + - name: Checkout Code + uses: actions/checkout@v4 + + - name: Generate Tag + id: meta + run: | + DATE=$(date +'%Y%m%d') + COMMIT=$(git rev-parse --short=6 HEAD) + echo "tag=${DATE}-${COMMIT}" >> $GITHUB_OUTPUT + build-amd64: + needs: generate-meta + runs-on: ubuntu-22.04 + strategy: + matrix: + include: + - cuda_version: 12.6 + lite: true + torch_base: lite + tag_prefix: cu126-lite + - cuda_version: 12.6 + lite: false + torch_base: full + tag_prefix: cu126 + - cuda_version: 12.8 + lite: true + torch_base: lite + tag_prefix: cu128-lite + - cuda_version: 12.8 + lite: false + torch_base: full + tag_prefix: cu128 + + steps: + - name: Checkout Code + uses: actions/checkout@v4 + + - name: Free up disk space + run: | + echo "Before cleanup:" + df -h + + sudo rm -rf /opt/ghc + sudo rm -rf /opt/hostedtoolcache/CodeQL + sudo rm -rf /opt/hostedtoolcache/PyPy + sudo rm -rf /opt/hostedtoolcache/go + sudo rm -rf /opt/hostedtoolcache/node + sudo rm -rf /opt/hostedtoolcache/Ruby + sudo rm -rf /opt/microsoft + sudo rm -rf /opt/pipx + sudo rm -rf /opt/az + sudo rm -rf /opt/google + + + sudo rm -rf /usr/lib/jvm + sudo rm -rf /usr/lib/google-cloud-sdk + sudo rm -rf /usr/lib/dotnet + + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/local/.ghcup + sudo rm -rf /usr/local/julia1.11.5 + sudo rm -rf /usr/local/share/powershell + sudo rm -rf /usr/local/share/chromium + + sudo rm -rf /usr/share/swift + sudo rm -rf /usr/share/miniconda + sudo rm -rf /usr/share/az_12.1.0 + sudo rm -rf /usr/share/dotnet + + echo "After cleanup:" + df -h + + - name: Set up Docker Buildx + 
uses: docker/setup-buildx-action@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_HUB_USERNAME }} + password: ${{ secrets.DOCKER_HUB_PASSWORD }} + + - name: Build and Push Docker Image (amd64) + uses: docker/build-push-action@v5 + with: + context: . + file: ./Dockerfile + push: true + platforms: linux/amd64 + build-args: | + LITE=${{ matrix.lite }} + TORCH_BASE=${{ matrix.torch_base }} + CUDA_VERSION=${{ matrix.cuda_version }} + WORKFLOW=true + tags: | + xxxxrt666/gpt-sovits:${{ matrix.tag_prefix }}-${{ needs.generate-meta.outputs.tag }}-amd64 + xxxxrt666/gpt-sovits:latest-${{ matrix.tag_prefix }}-amd64 + + build-arm64: + needs: generate-meta + runs-on: ubuntu-22.04-arm + strategy: + matrix: + include: + - cuda_version: 12.6 + lite: true + torch_base: lite + tag_prefix: cu126-lite + - cuda_version: 12.6 + lite: false + torch_base: full + tag_prefix: cu126 + - cuda_version: 12.8 + lite: true + torch_base: lite + tag_prefix: cu128-lite + - cuda_version: 12.8 + lite: false + torch_base: full + tag_prefix: cu128 + + steps: + - name: Checkout Code + uses: actions/checkout@v4 + + - name: Free up disk space + run: | + echo "Before cleanup:" + df -h + + sudo rm -rf /opt/ghc + sudo rm -rf /opt/hostedtoolcache/CodeQL + sudo rm -rf /opt/hostedtoolcache/PyPy + sudo rm -rf /opt/hostedtoolcache/go + sudo rm -rf /opt/hostedtoolcache/node + sudo rm -rf /opt/hostedtoolcache/Ruby + sudo rm -rf /opt/microsoft + sudo rm -rf /opt/pipx + sudo rm -rf /opt/az + sudo rm -rf /opt/google + + + sudo rm -rf /usr/lib/jvm + sudo rm -rf /usr/lib/google-cloud-sdk + sudo rm -rf /usr/lib/dotnet + + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/local/.ghcup + sudo rm -rf /usr/local/julia1.11.5 + sudo rm -rf /usr/local/share/powershell + sudo rm -rf /usr/local/share/chromium + + sudo rm -rf /usr/share/swift + sudo rm -rf /usr/share/miniconda + sudo rm -rf /usr/share/az_12.1.0 + sudo rm -rf /usr/share/dotnet + + echo "After cleanup:" + df -h + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_HUB_USERNAME }} + password: ${{ secrets.DOCKER_HUB_PASSWORD }} + + - name: Build and Push Docker Image (arm64) + uses: docker/build-push-action@v5 + with: + context: . 
+ file: ./Dockerfile + push: true + platforms: linux/arm64 + build-args: | + LITE=${{ matrix.lite }} + TORCH_BASE=${{ matrix.torch_base }} + CUDA_VERSION=${{ matrix.cuda_version }} + WORKFLOW=true + tags: | + xxxxrt666/gpt-sovits:${{ matrix.tag_prefix }}-${{ needs.generate-meta.outputs.tag }}-arm64 + xxxxrt666/gpt-sovits:latest-${{ matrix.tag_prefix }}-arm64 + + + merge-and-clean: + needs: + - build-amd64 + - build-arm64 + - generate-meta + runs-on: ubuntu-latest + strategy: + matrix: + include: + - tag_prefix: cu126-lite + - tag_prefix: cu126 + - tag_prefix: cu128-lite + - tag_prefix: cu128 + + steps: + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_HUB_USERNAME }} + password: ${{ secrets.DOCKER_HUB_PASSWORD }} + + - name: Merge amd64 and arm64 into multi-arch image + run: | + DATE_TAG=${{ needs.generate-meta.outputs.tag }} + TAG_PREFIX=${{ matrix.tag_prefix }} + + docker buildx imagetools create \ + --tag ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:${TAG_PREFIX}-${DATE_TAG} \ + ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:${TAG_PREFIX}-${DATE_TAG}-amd64 \ + ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:${TAG_PREFIX}-${DATE_TAG}-arm64 + + docker buildx imagetools create \ + --tag ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-${TAG_PREFIX} \ + ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-${TAG_PREFIX}-amd64 \ + ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-${TAG_PREFIX}-arm64 + - name: Delete old platform-specific tags via Docker Hub API + env: + DOCKER_HUB_USERNAME: ${{ secrets.DOCKER_HUB_USERNAME }} + DOCKER_HUB_TOKEN: ${{ secrets.DOCKER_HUB_PASSWORD }} + TAG_PREFIX: ${{ matrix.tag_prefix }} + DATE_TAG: ${{ needs.generate-meta.outputs.tag }} + run: | + sudo apt-get update && sudo apt-get install -y jq + + TOKEN=$(curl -s -u $DOCKER_HUB_USERNAME:$DOCKER_HUB_TOKEN \ + "https://auth.docker.io/token?service=registry.docker.io&scope=repository:$DOCKER_HUB_USERNAME/gpt-sovits:pull,push,delete" \ + | jq -r .token) + + for PLATFORM in amd64 arm64; do + SAFE_PLATFORM=$(echo $PLATFORM | sed 's/\//-/g') + TAG="${TAG_PREFIX}-${DATE_TAG}-${SAFE_PLATFORM}" + LATEST_TAG="latest-${TAG_PREFIX}-${SAFE_PLATFORM}" + + for DEL_TAG in "$TAG" "$LATEST_TAG"; do + echo "Deleting tag: $DEL_TAG" + curl -X DELETE -H "Authorization: Bearer $TOKEN" https://registry-1.docker.io/v2/$DOCKER_HUB_USERNAME/gpt-sovits/manifests/$DEL_TAG + done + done + create-default: + runs-on: ubuntu-latest + needs: + - merge-and-clean + steps: + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_HUB_USERNAME }} + password: ${{ secrets.DOCKER_HUB_PASSWORD }} + + - name: Create Default Tag + run: | + docker buildx imagetools create \ + --tag ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest \ + ${{ secrets.DOCKER_HUB_USERNAME }}/gpt-sovits:latest-cu126-lite + \ No newline at end of file diff --git a/.gitignore b/.gitignore index 0bb4e0b..d280e45 100644 --- a/.gitignore +++ b/.gitignore @@ -7,13 +7,8 @@ runtime .idea output logs -reference -GPT_weights -SoVITS_weights -GPT_weights_v2 -SoVITS_weights_v2 -GPT_weights_v3 -SoVITS_weights_v3 +SoVITS_weights*/ +GPT_weights*/ TEMP weight.json ffmpeg* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..2434e74 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,15 @@ +ci: + 
autoupdate_schedule: monthly + +repos: +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.11.7 + hooks: + # Run the linter. + - id: ruff + types_or: [ python, pyi ] + args: [ --fix ] + # Run the formatter. + - id: ruff-format + types_or: [ python, pyi ] + args: [ --line-length, "120", --target-version, "py310" ] diff --git a/Colab-Inference.ipynb b/Colab-Inference.ipynb index 8a31701..b962c9b 100644 --- a/Colab-Inference.ipynb +++ b/Colab-Inference.ipynb @@ -1,5 +1,12 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Open" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -39,9 +46,9 @@ "\n", "cd GPT-SoVITS\n", "\n", - "mkdir GPT_weights\n", + "mkdir -p GPT_weights\n", "\n", - "mkdir SoVITS_weights\n", + "mkdir -p SoVITS_weights\n", "\n", "if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n", " :\n", @@ -53,7 +60,7 @@ "\n", "pip install ipykernel\n", "\n", - "bash install.sh --source HF" + "bash install.sh --device CU126 --source HF" ] }, { diff --git a/colab_webui.ipynb b/Colab-WebUI.ipynb similarity index 95% rename from colab_webui.ipynb rename to Colab-WebUI.ipynb index b410775..b1403f3 100644 --- a/colab_webui.ipynb +++ b/Colab-WebUI.ipynb @@ -7,7 +7,7 @@ "id": "view-in-github" }, "source": [ - "\"Open" + "\"Open" ] }, { @@ -59,7 +59,7 @@ "\n", "pip install ipykernel\n", "\n", - "bash install.sh --source HF --download-uvr5" + "bash install.sh --device CU126 --source HF --download-uvr5" ] }, { diff --git a/Docker/damo.sha256 b/Docker/damo.sha256 deleted file mode 100644 index 6e9804d..0000000 --- a/Docker/damo.sha256 +++ /dev/null @@ -1,3 +0,0 @@ -5bba782a5e9196166233b9ab12ba04cadff9ef9212b4ff6153ed9290ff679025 /workspace/tools/damo_asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/model.pb -b3be75be477f0780277f3bae0fe489f48718f585f3a6e45d7dd1fbb1a4255fc5 /workspace/tools/damo_asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch/model.pb -a5818bb9d933805a916eebe41eb41648f7f9caad30b4bd59d56f3ca135421916 /workspace/tools/damo_asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/model.pb \ No newline at end of file diff --git a/Docker/download.py b/Docker/download.py deleted file mode 100644 index 952423d..0000000 --- a/Docker/download.py +++ /dev/null @@ -1,8 +0,0 @@ -# Download moda ASR related models -from modelscope import snapshot_download - -model_dir = snapshot_download( - "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", revision="v2.0.4" -) -model_dir = snapshot_download("damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", revision="v2.0.4") -model_dir = snapshot_download("damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", revision="v2.0.4") diff --git a/Docker/download.sh b/Docker/download.sh deleted file mode 100644 index 447e018..0000000 --- a/Docker/download.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash - -set -Eeuo pipefail - -echo "Downloading models..." - -aria2c --disable-ipv6 --input-file /workspace/Docker/links.txt --dir /workspace --continue - -echo "Checking SHA256..." - -parallel --will-cite -a /workspace/Docker/links.sha256 "echo -n {} | sha256sum -c" diff --git a/Docker/install_wrapper.sh b/Docker/install_wrapper.sh new file mode 100644 index 0000000..6dd93e5 --- /dev/null +++ b/Docker/install_wrapper.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" + +cd "$SCRIPT_DIR" || exit 1 + +cd .. 
|| exit 1 + +set -e + +source "$HOME/miniconda3/etc/profile.d/conda.sh" + +mkdir -p GPT_SoVITS + +mkdir -p GPT_SoVITS/text + +ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models + +ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel + +bash install.sh --device "CU${CUDA_VERSION//./}" --source HF + +pip cache purge + +pip show torch + +rm -rf /tmp/* /var/tmp/* + +rm -rf "$HOME/miniconda3/pkgs" + +mkdir -p "$HOME/miniconda3/pkgs" + +rm -rf /root/.conda /root/.cache diff --git a/Docker/links.sha256 b/Docker/links.sha256 deleted file mode 100644 index cda6dc1..0000000 --- a/Docker/links.sha256 +++ /dev/null @@ -1,12 +0,0 @@ -b1c1e17e9c99547a89388f72048cd6e1b41b5a18b170e86a46dfde0324d63eb1 /workspace/GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt -fc579c1db3c1e21b721001cf99d7a584214280df19b002e200b630a34fa06eb8 /workspace/GPT_SoVITS/pretrained_models/s2D488k.pth -020a014e1e01e550e510f2f61fae5e5f5b6aab40f15c22f1f12f724df507e835 /workspace/GPT_SoVITS/pretrained_models/s2G488k.pth -24164f129c66499d1346e2aa55f183250c223161ec2770c0da3d3b08cf432d3c /workspace/GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin -e53a693acc59ace251d143d068096ae0d7b79e4b1b503fa84c9dcf576448c1d8 /workspace/GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin -39796caa5db18d7f9382d8ac997ac967bfd85f7761014bb807d2543cc844ef05 /workspace/tools/uvr5/uvr5_weights/HP2_all_vocals.pth -45e6b65199e781b4a6542002699be9f19cd3d1cb7d1558bc2bfbcd84674dfe28 /workspace/tools/uvr5/uvr5_weights/HP3_all_vocals.pth -5908891829634926119720241e8573d97cbeb8277110a7512bdb0bd7563258ee /workspace/tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth -8c8fd1582f9aabc363e47af62ddb88df6cae7e064cae75bbf041a067a5e0aee2 /workspace/tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth -01376dd2a571bf3cb9cced680732726d2d732609d09216a610b0d110f133febe /workspace/tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth -56aba59db3bcdd14a14464e62f3129698ecdea62eee0f003b9360923eb3ac79e /workspace/tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth -233bb5c6aaa365e568659a0a81211746fa881f8f47f82d9e864fce1f7692db80 /workspace/tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx \ No newline at end of file diff --git a/Docker/links.txt b/Docker/links.txt deleted file mode 100644 index e6603db..0000000 --- a/Docker/links.txt +++ /dev/null @@ -1,34 +0,0 @@ -# GPT-SoVITS models -https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s1bert25hz-2kh-longer-epoch%3D68e-step%3D50232.ckpt - out=GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt -https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2D488k.pth - out=GPT_SoVITS/pretrained_models/s2D488k.pth -https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/s2G488k.pth - out=GPT_SoVITS/pretrained_models/s2G488k.pth -https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/config.json - out=GPT_SoVITS/pretrained_models/chinese-hubert-base/config.json -https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/preprocessor_config.json - out=GPT_SoVITS/pretrained_models/chinese-hubert-base/preprocessor_config.json -https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-hubert-base/pytorch_model.bin - out=GPT_SoVITS/pretrained_models/chinese-hubert-base/pytorch_model.bin -https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/config.json - out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/config.json 
-https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/pytorch_model.bin - out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/pytorch_model.bin -https://huggingface.co/lj1995/GPT-SoVITS/resolve/main/chinese-roberta-wwm-ext-large/tokenizer.json - out=GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large/tokenizer.json -# UVR5 -https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2_all_vocals.pth - out=tools/uvr5/uvr5_weights/HP2_all_vocals.pth -https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP3_all_vocals.pth - out=tools/uvr5/uvr5_weights/HP3_all_vocals.pth -https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5_only_main_vocal.pth - out=tools/uvr5/uvr5_weights/HP5_only_main_vocal.pth -https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoAggressive.pth - out=tools/uvr5/uvr5_weights/VR-DeEchoAggressive.pth -https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoDeReverb.pth - out=tools/uvr5/uvr5_weights/VR-DeEchoDeReverb.pth -https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoNormal.pth - out=tools/uvr5/uvr5_weights/VR-DeEchoNormal.pth -https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx - out=tools/uvr5/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx \ No newline at end of file diff --git a/Docker/miniconda_install.sh b/Docker/miniconda_install.sh new file mode 100644 index 0000000..001a2a4 --- /dev/null +++ b/Docker/miniconda_install.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" + +cd "$SCRIPT_DIR" || exit 1 + +cd .. || exit 1 + +if [ -d "$HOME/miniconda3" ]; then + exit 0 +fi + +WORKFLOW=${WORKFLOW:-"false"} +TARGETPLATFORM=${TARGETPLATFORM:-"linux/amd64"} + +if [ "$WORKFLOW" = "true" ]; then + WGET_CMD=(wget -nv --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404) +else + WGET_CMD=(wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404) +fi + +if [ "$TARGETPLATFORM" = "linux/amd64" ]; then + "${WGET_CMD[@]}" -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_25.3.1-1-Linux-x86_64.sh +elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then + "${WGET_CMD[@]}" -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py311_25.3.1-1-Linux-aarch64.sh +else + exit 1 +fi + +LOG_PATH="/tmp/miniconda-install.log" + +bash miniconda.sh -b -p "$HOME/miniconda3" >"$LOG_PATH" 2>&1 + +if [ $? 
-eq 0 ]; then + echo "== Miniconda Installed ==" +else + echo "Failed to Install miniconda" + tail -n 50 "$LOG_PATH" + exit 1 +fi + +rm miniconda.sh + +source "$HOME/miniconda3/etc/profile.d/conda.sh" + +"$HOME/miniconda3/bin/conda" config --add channels conda-forge + +"$HOME/miniconda3/bin/conda" update -q --all -y 1>/dev/null + +"$HOME/miniconda3/bin/conda" install python=3.11 -q -y + +"$HOME/miniconda3/bin/conda" install gcc=14 gxx ffmpeg cmake make unzip -q -y + +if [ "$CUDA_VERSION" = "12.8" ]; then + "$HOME/miniconda3/bin/pip" install torch torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/cu128 +elif [ "$CUDA_VERSION" = "12.6" ]; then + "$HOME/miniconda3/bin/pip" install torch==2.6 torchaudio --no-cache-dir --index-url https://download.pytorch.org/whl/cu126 +fi + +"$HOME/miniconda3/bin/pip" cache purge + +rm $LOG_PATH + +rm -rf "$HOME/miniconda3/pkgs" + +mkdir -p "$HOME/miniconda3/pkgs" + +rm -rf "$HOME/.conda" "$HOME/.cache" diff --git a/Dockerfile b/Dockerfile index 80cd9f3..71bf6fa 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,42 +1,62 @@ -# Base CUDA image -FROM cnstark/pytorch:2.0.1-py3.9.17-cuda11.8.0-ubuntu20.04 +ARG CUDA_VERSION=12.6 +ARG TORCH_BASE=full -LABEL maintainer="breakstring@hotmail.com" -LABEL version="dev-20240209" +FROM xxxxrt666/torch-base:cu${CUDA_VERSION}-${TORCH_BASE} + +LABEL maintainer="XXXXRT" +LABEL version="V4" LABEL description="Docker image for GPT-SoVITS" +ARG CUDA_VERSION=12.6 -# Install 3rd party apps -ENV DEBIAN_FRONTEND=noninteractive -ENV TZ=Etc/UTC -RUN apt-get update && \ - apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && \ - git lfs install && \ - rm -rf /var/lib/apt/lists/* +ENV CUDA_VERSION=${CUDA_VERSION} -# Copy only requirements.txt initially to leverage Docker cache -WORKDIR /workspace -COPY requirements.txt /workspace/ -RUN pip install --no-cache-dir -r requirements.txt +SHELL ["/bin/bash", "-c"] + +WORKDIR /workspace/GPT-SoVITS + +COPY Docker /workspace/GPT-SoVITS/Docker/ + +ARG LITE=false +ENV LITE=${LITE} + +ARG WORKFLOW=false +ENV WORKFLOW=${WORKFLOW} + +ARG TARGETPLATFORM +ENV TARGETPLATFORM=${TARGETPLATFORM} + +RUN bash Docker/miniconda_install.sh -# Define a build-time argument for image type -ARG IMAGE_TYPE=full +COPY extra-req.txt /workspace/GPT-SoVITS/ -# Conditional logic based on the IMAGE_TYPE argument -# Always copy the Docker directory, but only use it if IMAGE_TYPE is not "elite" -COPY ./Docker /workspace/Docker -# elite 类型的镜像里面不包含额外的模型 -RUN if [ "$IMAGE_TYPE" != "elite" ]; then \ - chmod +x /workspace/Docker/download.sh && \ - /workspace/Docker/download.sh && \ - python /workspace/Docker/download.py && \ - python -m nltk.downloader averaged_perceptron_tagger cmudict; \ - fi +COPY requirements.txt /workspace/GPT-SoVITS/ +COPY install.sh /workspace/GPT-SoVITS/ -# Copy the rest of the application -COPY . /workspace +RUN bash Docker/install_wrapper.sh EXPOSE 9871 9872 9873 9874 9880 -CMD ["python", "webui.py"] +ENV PYTHONPATH="/workspace/GPT-SoVITS" + +RUN conda init bash && echo "conda activate base" >> ~/.bashrc + +WORKDIR /workspace + +RUN rm -rf /workspace/GPT-SoVITS + +WORKDIR /workspace/GPT-SoVITS + +COPY . 
/workspace/GPT-SoVITS + +CMD ["/bin/bash", "-c", "\ + rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \ + rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \ + rm -rf /workspace/GPT-SoVITS/tools/asr/models && \ + rm -rf /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \ + ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \ + ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \ + ln -s /workspace/models/asr_models /workspace/GPT-SoVITS/tools/asr/models && \ + ln -s /workspace/models/uvr5_weights /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \ + exec bash"] \ No newline at end of file diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py index d20daee..6ef46eb 100644 --- a/GPT_SoVITS/TTS_infer_pack/TTS.py +++ b/GPT_SoVITS/TTS_infer_pack/TTS.py @@ -108,7 +108,7 @@ resample_transform_dict = {} def resample(audio_tensor, sr0, sr1, device): global resample_transform_dict - key="%s-%s"%(sr0,sr1) + key = "%s-%s" % (sr0, sr1) if key not in resample_transform_dict: resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device) return resample_transform_dict[key](audio_tensor) @@ -252,7 +252,6 @@ class TTS_Config: "cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base", "bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large", }, - } configs: dict = None v1_languages: list = ["auto", "en", "zh", "ja", "all_zh", "all_ja"] @@ -432,7 +431,6 @@ class TTS: "aux_ref_audio_paths": [], } - self.stop_flag: bool = False self.precision: torch.dtype = torch.float16 if self.configs.is_half else torch.float32 @@ -468,7 +466,7 @@ class TTS: path_sovits = self.configs.default_configs[model_version]["vits_weights_path"] if if_lora_v3 == True and os.path.exists(path_sovits) == False: - info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重"%model_version) + info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version) raise FileExistsError(info) # dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False) @@ -507,7 +505,7 @@ class TTS: ) self.configs.use_vocoder = False else: - kwargs["version"]=model_version + kwargs["version"] = model_version vits_model = SynthesizerTrnV3( self.configs.filter_length // 2 + 1, self.configs.segment_size // self.configs.hop_length, @@ -572,7 +570,7 @@ class TTS: self.vocoder.cpu() del self.vocoder self.empty_cache() - + self.vocoder = BigVGAN.from_pretrained( "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,), use_cuda_kernel=False, @@ -595,18 +593,21 @@ class TTS: self.empty_cache() self.vocoder = Generator( - initial_channel=100, - resblock="1", - resblock_kernel_sizes=[3, 7, 11], - resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], - upsample_rates=[10, 6, 2, 2, 2], - upsample_initial_channel=512, - upsample_kernel_sizes=[20, 12, 4, 4, 4], - gin_channels=0, is_bias=True - ) + initial_channel=100, + resblock="1", + resblock_kernel_sizes=[3, 7, 11], + resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], + upsample_rates=[10, 6, 2, 2, 2], + upsample_initial_channel=512, + upsample_kernel_sizes=[20, 12, 4, 4, 4], + gin_channels=0, + is_bias=True, + ) self.vocoder.remove_weight_norm() - state_dict_g = torch.load("%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu") - print("loading vocoder",self.vocoder.load_state_dict(state_dict_g)) + state_dict_g = 
torch.load( + "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu" + ) + print("loading vocoder", self.vocoder.load_state_dict(state_dict_g)) self.vocoder_configs["sr"] = 48000 self.vocoder_configs["T_ref"] = 500 @@ -614,9 +615,6 @@ class TTS: self.vocoder_configs["upsample_rate"] = 480 self.vocoder_configs["overlapped_len"] = 12 - - - self.vocoder = self.vocoder.eval() if self.configs.is_half == True: self.vocoder = self.vocoder.half().to(self.configs.device) @@ -1439,7 +1437,7 @@ class TTS: ref_audio = ref_audio.to(self.configs.device).float() if ref_audio.shape[0] == 2: ref_audio = ref_audio.mean(0).unsqueeze(0) - + # tgt_sr = self.vocoder_configs["sr"] tgt_sr = 24000 if self.configs.version == "v3" else 32000 if ref_sr != tgt_sr: diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index 4bee27c..4682014 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -7,11 +7,17 @@ 全部按日文识别 """ +import json import logging +import os +import re +import sys import traceback import warnings +import torch import torchaudio +from text.LangSegmenter import LangSegmenter logging.getLogger("markdown_it").setLevel(logging.ERROR) logging.getLogger("urllib3").setLevel(logging.ERROR) @@ -23,20 +29,6 @@ logging.getLogger("torchaudio._extension").setLevel(logging.ERROR) logging.getLogger("multipart.multipart").setLevel(logging.ERROR) warnings.simplefilter(action="ignore", category=FutureWarning) -import json -import os -import re -import sys - -import torch -from text.LangSegmenter import LangSegmenter - -try: - import gradio.analytics as analytics - - analytics.version_check = lambda: None -except: - ... version = model_version = os.environ.get("version", "v2") path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth" path_sovits_v4 = "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth" @@ -106,7 +98,7 @@ cnhubert.cnhubert_base_path = cnhubert_base_path import random -from GPT_SoVITS.module.models import SynthesizerTrn, SynthesizerTrnV3,Generator +from GPT_SoVITS.module.models import Generator, SynthesizerTrn, SynthesizerTrnV3 def set_seed(seed): @@ -226,9 +218,9 @@ else: resample_transform_dict = {} -def resample(audio_tensor, sr0,sr1): +def resample(audio_tensor, sr0, sr1): global resample_transform_dict - key="%s-%s"%(sr0,sr1) + key = "%s-%s" % (sr0, sr1) if key not in resample_transform_dict: resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device) return resample_transform_dict[key](audio_tensor) @@ -238,14 +230,18 @@ def resample(audio_tensor, sr0,sr1): # symbol_version-model_version-if_lora_v3 from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new -v3v4set={"v3","v4"} +v3v4set = {"v3", "v4"} + + def change_sovits_weights(sovits_path, prompt_language=None, text_language=None): global vq_model, hps, version, model_version, dict_language, if_lora_v3 version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path) - print(sovits_path,version, model_version, if_lora_v3) - is_exist=is_exist_s2gv3 if model_version=="v3"else is_exist_s2gv4 + print(sovits_path, version, model_version, if_lora_v3) + is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4 if if_lora_v3 == True and is_exist == False: - info = "GPT_SoVITS/pretrained_models/s2Gv3.pth" + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重"%model_version) + info = "GPT_SoVITS/pretrained_models/s2Gv3.pth" + i18n( + "SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version + ) gr.Warning(info) raise 
FileExistsError(info) dict_language = dict_language_v1 if version == "v1" else dict_language_v2 @@ -276,10 +272,15 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None) prompt_language_update, text_update, text_language_update, - {"__type__": "update", "visible": visible_sample_steps, "value": 32 if model_version=="v3"else 8,"choices":[4, 8, 16, 32,64,128]if model_version=="v3"else [4, 8, 16, 32]}, + { + "__type__": "update", + "visible": visible_sample_steps, + "value": 32 if model_version == "v3" else 8, + "choices": [4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32], + }, {"__type__": "update", "visible": visible_inp_refs}, {"__type__": "update", "value": False, "interactive": True if model_version not in v3v4set else False}, - {"__type__": "update", "visible": True if model_version =="v3" else False}, + {"__type__": "update", "visible": True if model_version == "v3" else False}, {"__type__": "update", "value": i18n("模型加载中,请等待"), "interactive": False}, ) @@ -304,7 +305,7 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None) ) model_version = version else: - hps.model.version=model_version + hps.model.version = model_version vq_model = SynthesizerTrnV3( hps.data.filter_length // 2 + 1, hps.train.segment_size // hps.data.hop_length, @@ -326,7 +327,7 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None) else: path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4 print( - "loading sovits_%spretrained_G"%model_version, + "loading sovits_%spretrained_G" % model_version, vq_model.load_state_dict(load_sovits_new(path_sovits)["weight"], strict=False), ) lora_rank = dict_s2["lora_rank"] @@ -337,7 +338,7 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None) init_lora_weights=True, ) vq_model.cfm = get_peft_model(vq_model.cfm, lora_config) - print("loading sovits_%s_lora%s" % (model_version,lora_rank)) + print("loading sovits_%s_lora%s" % (model_version, lora_rank)) vq_model.load_state_dict(dict_s2["weight"], strict=False) vq_model.cfm = vq_model.cfm.merge_and_unload() # torch.save(vq_model.state_dict(),"merge_win.pth") @@ -350,10 +351,15 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None) prompt_language_update, text_update, text_language_update, - {"__type__": "update", "visible": visible_sample_steps, "value":32 if model_version=="v3"else 8,"choices":[4, 8, 16, 32,64,128]if model_version=="v3"else [4, 8, 16, 32]}, + { + "__type__": "update", + "visible": visible_sample_steps, + "value": 32 if model_version == "v3" else 8, + "choices": [4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32], + }, {"__type__": "update", "visible": visible_inp_refs}, {"__type__": "update", "value": False, "interactive": True if model_version not in v3v4set else False}, - {"__type__": "update", "visible": True if model_version =="v3" else False}, + {"__type__": "update", "visible": True if model_version == "v3" else False}, {"__type__": "update", "value": i18n("合成语音"), "interactive": True}, ) with open("./weight.json") as f: @@ -400,7 +406,7 @@ now_dir = os.getcwd() def init_bigvgan(): - global bigvgan_model,hifigan_model + global bigvgan_model, hifigan_model from BigVGAN import bigvgan bigvgan_model = bigvgan.BigVGAN.from_pretrained( @@ -411,17 +417,20 @@ def init_bigvgan(): bigvgan_model.remove_weight_norm() bigvgan_model = bigvgan_model.eval() if hifigan_model: - hifigan_model=hifigan_model.cpu() - 
hifigan_model=None - try:torch.cuda.empty_cache() - except:pass + hifigan_model = hifigan_model.cpu() + hifigan_model = None + try: + torch.cuda.empty_cache() + except: + pass if is_half == True: bigvgan_model = bigvgan_model.half().to(device) else: bigvgan_model = bigvgan_model.to(device) + def init_hifigan(): - global hifigan_model,bigvgan_model + global hifigan_model, bigvgan_model hifigan_model = Generator( initial_channel=100, resblock="1", @@ -430,26 +439,32 @@ def init_hifigan(): upsample_rates=[10, 6, 2, 2, 2], upsample_initial_channel=512, upsample_kernel_sizes=[20, 12, 4, 4, 4], - gin_channels=0, is_bias=True + gin_channels=0, + is_bias=True, ) hifigan_model.eval() hifigan_model.remove_weight_norm() - state_dict_g = torch.load("%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu") - print("loading vocoder",hifigan_model.load_state_dict(state_dict_g)) + state_dict_g = torch.load( + "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu" + ) + print("loading vocoder", hifigan_model.load_state_dict(state_dict_g)) if bigvgan_model: - bigvgan_model=bigvgan_model.cpu() - bigvgan_model=None - try:torch.cuda.empty_cache() - except:pass + bigvgan_model = bigvgan_model.cpu() + bigvgan_model = None + try: + torch.cuda.empty_cache() + except: + pass if is_half == True: hifigan_model = hifigan_model.half().to(device) else: hifigan_model = hifigan_model.to(device) -bigvgan_model=hifigan_model=None -if model_version=="v3": + +bigvgan_model = hifigan_model = None +if model_version == "v3": init_bigvgan() -if model_version=="v4": +if model_version == "v4": init_hifigan() @@ -831,17 +846,17 @@ def get_tts_wav( ref_audio = ref_audio.to(device).float() if ref_audio.shape[0] == 2: ref_audio = ref_audio.mean(0).unsqueeze(0) - tgt_sr=24000 if model_version=="v3"else 32000 + tgt_sr = 24000 if model_version == "v3" else 32000 if sr != tgt_sr: - ref_audio = resample(ref_audio, sr,tgt_sr) + ref_audio = resample(ref_audio, sr, tgt_sr) # print("ref_audio",ref_audio.abs().mean()) - mel2 = mel_fn(ref_audio)if model_version=="v3"else mel_fn_v4(ref_audio) + mel2 = mel_fn(ref_audio) if model_version == "v3" else mel_fn_v4(ref_audio) mel2 = norm_spec(mel2) T_min = min(mel2.shape[2], fea_ref.shape[2]) mel2 = mel2[:, :, :T_min] fea_ref = fea_ref[:, :, :T_min] - Tref=468 if model_version=="v3"else 500 - Tchunk=934 if model_version=="v3"else 1000 + Tref = 468 if model_version == "v3" else 500 + Tchunk = 934 if model_version == "v3" else 1000 if T_min > Tref: mel2 = mel2[:, :, -Tref:] fea_ref = fea_ref[:, :, -Tref:] @@ -866,13 +881,13 @@ def get_tts_wav( cfm_resss.append(cfm_res) cfm_res = torch.cat(cfm_resss, 2) cfm_res = denorm_spec(cfm_res) - if model_version=="v3": + if model_version == "v3": if bigvgan_model == None: init_bigvgan() - else:#v4 + else: # v4 if hifigan_model == None: init_hifigan() - vocoder_model=bigvgan_model if model_version=="v3"else hifigan_model + vocoder_model = bigvgan_model if model_version == "v3" else hifigan_model with torch.inference_mode(): wav_gen = vocoder_model(cfm_res) audio = wav_gen[0][0] # .cpu().detach().numpy() @@ -886,9 +901,12 @@ def get_tts_wav( t1 = ttime() print("%.3f\t%.3f\t%.3f\t%.3f" % (t[0], sum(t[1::3]), sum(t[2::3]), sum(t[3::3]))) audio_opt = torch.cat(audio_opt, 0) # np.concatenate - if model_version in {"v1","v2"}:opt_sr=32000 - elif model_version=="v3":opt_sr=24000 - else:opt_sr=48000#v4 + if model_version in {"v1", "v2"}: + opt_sr = 32000 + elif model_version == "v3": + opt_sr = 
24000 + else: + opt_sr = 48000 # v4 if if_sr == True and opt_sr == 24000: print(i18n("音频超分中")) audio_opt, opt_sr = audio_sr(audio_opt.unsqueeze(0), opt_sr) @@ -1061,7 +1079,7 @@ def html_left(text, label="p"): """ -with gr.Blocks(title="GPT-SoVITS WebUI") as app: +with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False) as app: gr.Markdown( value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + "
" @@ -1131,16 +1149,16 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: sample_steps = ( gr.Radio( label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"), - value=32 if model_version=="v3"else 8, - choices=[4, 8, 16, 32,64,128]if model_version=="v3"else [4, 8, 16, 32], + value=32 if model_version == "v3" else 8, + choices=[4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32], visible=True, ) if model_version in v3v4set else gr.Radio( label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"), - choices=[4, 8, 16, 32,64,128]if model_version=="v3"else [4, 8, 16, 32], + choices=[4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32], visible=False, - value=32 if model_version=="v3"else 8, + value=32 if model_version == "v3" else 8, ) ) if_sr_Checkbox = gr.Checkbox( @@ -1148,7 +1166,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: value=False, interactive=True, show_label=True, - visible=False if model_version !="v3" else True, + visible=False if model_version != "v3" else True, ) gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"), "h3")) with gr.Row(): diff --git a/GPT_SoVITS/inference_webui_fast.py b/GPT_SoVITS/inference_webui_fast.py index 311994b..0b9525e 100644 --- a/GPT_SoVITS/inference_webui_fast.py +++ b/GPT_SoVITS/inference_webui_fast.py @@ -14,6 +14,8 @@ import random import re import sys +import torch + now_dir = os.getcwd() sys.path.append(now_dir) sys.path.append("%s/GPT_SoVITS" % (now_dir)) @@ -25,14 +27,6 @@ logging.getLogger("httpx").setLevel(logging.ERROR) logging.getLogger("asyncio").setLevel(logging.ERROR) logging.getLogger("charset_normalizer").setLevel(logging.ERROR) logging.getLogger("torchaudio._extension").setLevel(logging.ERROR) -import torch - -try: - import gradio.analytics as analytics - - analytics.version_check = lambda: None -except: - ... infer_ttswebui = os.environ.get("infer_ttswebui", 9872) @@ -262,15 +256,17 @@ SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root) from process_ckpt import get_sovits_version_from_path_fast -v3v4set={"v3","v4"} +v3v4set = {"v3", "v4"} + + def change_sovits_weights(sovits_path, prompt_language=None, text_language=None): global version, model_version, dict_language, if_lora_v3 version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path) # print(sovits_path,version, model_version, if_lora_v3) - is_exist=is_exist_s2gv3 if model_version=="v3"else is_exist_s2gv4 + is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4 path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4 if if_lora_v3 == True and is_exist == False: - info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重"%model_version) + info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version) gr.Warning(info) raise FileExistsError(info) dict_language = dict_language_v1 if version == "v1" else dict_language_v2 @@ -328,7 +324,7 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None) f.write(json.dumps(data)) -with gr.Blocks(title="GPT-SoVITS WebUI") as app: +with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False) as app: gr.Markdown( value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + "
" diff --git a/GPT_SoVITS/module/data_utils.py b/GPT_SoVITS/module/data_utils.py index 1bda2b3..11f6b09 100644 --- a/GPT_SoVITS/module/data_utils.py +++ b/GPT_SoVITS/module/data_utils.py @@ -470,6 +470,7 @@ class TextAudioSpeakerCollateV3: # return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, wav_padded, wav_lengths,mel_lengths return ssl_padded, spec_padded, mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, mel_lengths + class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset): """ 1) loads audio, speaker_id, text pairs @@ -596,7 +597,7 @@ class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset): audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False ) spec = torch.squeeze(spec, 0) - spec1 = spectrogram_torch(audio_norm, 1280,32000, 320, 1280,center=False) + spec1 = spectrogram_torch(audio_norm, 1280, 32000, 320, 1280, center=False) mel = spec_to_mel_torch(spec1, 1280, 100, 32000, 0, None) mel = self.norm_spec(torch.squeeze(mel, 0)) return spec, mel @@ -643,7 +644,7 @@ class TextAudioSpeakerCollateV4: mel_lengths = torch.LongTensor(len(batch)) spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len) - mel_padded = torch.FloatTensor(len(batch), batch[0][2].size(0), max_spec_len*2) + mel_padded = torch.FloatTensor(len(batch), batch[0][2].size(0), max_spec_len * 2) ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len) text_padded = torch.LongTensor(len(batch), max_text_len) # wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len) diff --git a/GPT_SoVITS/module/mel_processing.py b/GPT_SoVITS/module/mel_processing.py index 7a17c54..62c7b40 100644 --- a/GPT_SoVITS/module/mel_processing.py +++ b/GPT_SoVITS/module/mel_processing.py @@ -39,24 +39,36 @@ hann_window = {} def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): if torch.min(y) < -1.2: - print('min value is ', torch.min(y)) + print("min value is ", torch.min(y)) if torch.max(y) > 1.2: - print('max value is ', torch.max(y)) + print("max value is ", torch.max(y)) global hann_window - dtype_device = str(y.dtype) + '_' + str(y.device) + dtype_device = str(y.dtype) + "_" + str(y.device) # wnsize_dtype_device = str(win_size) + '_' + dtype_device - key = "%s-%s-%s-%s-%s" %(dtype_device,n_fft, sampling_rate, hop_size, win_size) + key = "%s-%s-%s-%s-%s" % (dtype_device, n_fft, sampling_rate, hop_size, win_size) # if wnsize_dtype_device not in hann_window: if key not in hann_window: # hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) hann_window[key] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) - y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') + y = torch.nn.functional.pad( + y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" + ) y = y.squeeze(1) # spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], - spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[key], - center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) + spec = torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[key], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=False, + ) spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-8) 
return spec @@ -64,9 +76,9 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False) def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): global mel_basis - dtype_device = str(spec.dtype) + '_' + str(spec.device) + dtype_device = str(spec.dtype) + "_" + str(spec.device) # fmax_dtype_device = str(fmax) + '_' + dtype_device - key = "%s-%s-%s-%s-%s-%s"%(dtype_device,n_fft, num_mels, sampling_rate, fmin, fmax) + key = "%s-%s-%s-%s-%s-%s" % (dtype_device, n_fft, num_mels, sampling_rate, fmin, fmax) # if fmax_dtype_device not in mel_basis: if key not in mel_basis: mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) @@ -78,17 +90,25 @@ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): return spec - def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): if torch.min(y) < -1.2: - print('min value is ', torch.min(y)) + print("min value is ", torch.min(y)) if torch.max(y) > 1.2: - print('max value is ', torch.max(y)) + print("max value is ", torch.max(y)) global mel_basis, hann_window - dtype_device = str(y.dtype) + '_' + str(y.device) + dtype_device = str(y.dtype) + "_" + str(y.device) # fmax_dtype_device = str(fmax) + '_' + dtype_device - fmax_dtype_device = "%s-%s-%s-%s-%s-%s-%s-%s"%(dtype_device,n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax) + fmax_dtype_device = "%s-%s-%s-%s-%s-%s-%s-%s" % ( + dtype_device, + n_fft, + num_mels, + sampling_rate, + hop_size, + win_size, + fmin, + fmax, + ) # wnsize_dtype_device = str(win_size) + '_' + dtype_device wnsize_dtype_device = fmax_dtype_device if fmax_dtype_device not in mel_basis: @@ -97,11 +117,23 @@ def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, if wnsize_dtype_device not in hann_window: hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) - y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') + y = torch.nn.functional.pad( + y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" + ) y = y.squeeze(1) - spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], - center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False) + spec = torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=False, + ) spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-8) diff --git a/GPT_SoVITS/module/models.py b/GPT_SoVITS/module/models.py index 21f60d9..3e37f0f 100644 --- a/GPT_SoVITS/module/models.py +++ b/GPT_SoVITS/module/models.py @@ -414,7 +414,8 @@ class Generator(torch.nn.Module): upsample_rates, upsample_initial_channel, upsample_kernel_sizes, - gin_channels=0,is_bias=False, + gin_channels=0, + is_bias=False, ): super(Generator, self).__init__() self.num_kernels = len(resblock_kernel_sizes) @@ -1173,7 +1174,7 @@ class SynthesizerTrnV3(nn.Module): quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge) fea = self.bridge(x) - fea = F.interpolate(fea, scale_factor=(1.875 if self.version=="v3"else 2), mode="nearest") ##BCT + fea = F.interpolate(fea, scale_factor=(1.875 if self.version == "v3" else 2), 
mode="nearest") ##BCT fea, y_mask_ = self.wns1( fea, mel_lengths, ge ) ##If the 1-minute fine-tuning works fine, no need to manually adjust the learning rate. @@ -1196,9 +1197,9 @@ class SynthesizerTrnV3(nn.Module): ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask) y_lengths = torch.LongTensor([int(codes.size(2) * 2)]).to(codes.device) if speed == 1: - sizee = int(codes.size(2) * (3.875 if self.version=="v3"else 4)) + sizee = int(codes.size(2) * (3.875 if self.version == "v3" else 4)) else: - sizee = int(codes.size(2) * (3.875 if self.version=="v3"else 4) / speed) + 1 + sizee = int(codes.size(2) * (3.875 if self.version == "v3" else 4) / speed) + 1 y_lengths1 = torch.LongTensor([sizee]).to(codes.device) text_lengths = torch.LongTensor([text.size(-1)]).to(text.device) @@ -1207,7 +1208,7 @@ class SynthesizerTrnV3(nn.Module): quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge, speed) fea = self.bridge(x) - fea = F.interpolate(fea, scale_factor=(1.875 if self.version=="v3"else 2), mode="nearest") ##BCT + fea = F.interpolate(fea, scale_factor=(1.875 if self.version == "v3" else 2), mode="nearest") ##BCT ####more wn paramter to learn mel fea, y_mask_ = self.wns1(fea, y_lengths1, ge) return fea, ge diff --git a/GPT_SoVITS/process_ckpt.py b/GPT_SoVITS/process_ckpt.py index 4a2a1ba..1c458a4 100644 --- a/GPT_SoVITS/process_ckpt.py +++ b/GPT_SoVITS/process_ckpt.py @@ -28,18 +28,18 @@ def my_save(fea, path): #####fix issue: torch.save doesn't support chinese path from io import BytesIO -def my_save2(fea, path,cfm_version): +def my_save2(fea, path, cfm_version): bio = BytesIO() torch.save(fea, bio) bio.seek(0) data = bio.getvalue() - byte=b"03" if cfm_version=="v3"else b"04" + byte = b"03" if cfm_version == "v3" else b"04" data = byte + data[2:] with open(path, "wb") as f: f.write(data) -def savee(ckpt, name, epoch, steps, hps, cfm_version=None,lora_rank=None): +def savee(ckpt, name, epoch, steps, hps, cfm_version=None, lora_rank=None): try: opt = OrderedDict() opt["weight"] = {} @@ -51,7 +51,7 @@ def savee(ckpt, name, epoch, steps, hps, cfm_version=None,lora_rank=None): opt["info"] = "%sepoch_%siteration" % (epoch, steps) if lora_rank: opt["lora_rank"] = lora_rank - my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name),cfm_version) + my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), cfm_version) else: my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name)) return "Success." 
diff --git a/GPT_SoVITS/s2_train_v3_lora.py b/GPT_SoVITS/s2_train_v3_lora.py index ddeec4f..4d8d23d 100644 --- a/GPT_SoVITS/s2_train_v3_lora.py +++ b/GPT_SoVITS/s2_train_v3_lora.py @@ -31,7 +31,6 @@ from module.data_utils import ( TextAudioSpeakerLoaderV3, TextAudioSpeakerCollateV4, TextAudioSpeakerLoaderV4, - ) from module.models import ( SynthesizerTrnV3 as SynthesizerTrn, @@ -88,8 +87,8 @@ def run(rank, n_gpus, hps): if torch.cuda.is_available(): torch.cuda.set_device(rank) - TextAudioSpeakerLoader=TextAudioSpeakerLoaderV3 if hps.model.version=="v3"else TextAudioSpeakerLoaderV4 - TextAudioSpeakerCollate=TextAudioSpeakerCollateV3 if hps.model.version=="v3"else TextAudioSpeakerCollateV4 + TextAudioSpeakerLoader = TextAudioSpeakerLoaderV3 if hps.model.version == "v3" else TextAudioSpeakerLoaderV4 + TextAudioSpeakerCollate = TextAudioSpeakerCollateV3 if hps.model.version == "v3" else TextAudioSpeakerCollateV4 train_dataset = TextAudioSpeakerLoader(hps.data) ######## train_sampler = DistributedBucketSampler( train_dataset, @@ -365,7 +364,8 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade hps.name + "_e%s_s%s_l%s" % (epoch, global_step, lora_rank), epoch, global_step, - hps,cfm_version=hps.model.version, + hps, + cfm_version=hps.model.version, lora_rank=lora_rank, ), ) diff --git a/GPT_SoVITS/text/g2pw/onnx_api.py b/GPT_SoVITS/text/g2pw/onnx_api.py index bf3109e..9282739 100644 --- a/GPT_SoVITS/text/g2pw/onnx_api.py +++ b/GPT_SoVITS/text/g2pw/onnx_api.py @@ -1,27 +1,28 @@ # This code is modified from https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw # This code is modified from https://github.com/GitYCC/g2pW -import warnings - -warnings.filterwarnings("ignore") import json import os +import warnings import zipfile from typing import Any, Dict, List, Tuple import numpy as np import onnxruntime import requests - -onnxruntime.set_default_logger_severity(3) +import torch from opencc import OpenCC from pypinyin import Style, pinyin -from transformers import AutoTokenizer +from transformers.models.auto.tokenization_auto import AutoTokenizer from ..zh_normalization.char_convert import tranditional_to_simplified from .dataset import get_char_phoneme_labels, get_phoneme_labels, prepare_onnx_input from .utils import load_config +onnxruntime.set_default_logger_severity(3) +onnxruntime.preload_dlls() +warnings.filterwarnings("ignore") + model_version = "1.1" @@ -87,7 +88,7 @@ class G2PWOnnxConverter: sess_options = onnxruntime.SessionOptions() sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL - sess_options.intra_op_num_threads = 2 + sess_options.intra_op_num_threads = 2 if torch.cuda.is_available() else 0 try: self.session_g2pW = onnxruntime.InferenceSession( os.path.join(uncompress_path, "g2pW.onnx"), diff --git a/GPT_SoVITS/utils.py b/GPT_SoVITS/utils.py index 1cc2d97..f6f388a 100644 --- a/GPT_SoVITS/utils.py +++ b/GPT_SoVITS/utils.py @@ -16,7 +16,7 @@ logging.getLogger("matplotlib").setLevel(logging.ERROR) MATPLOTLIB_FLAG = False -logging.basicConfig(stream=sys.stdout, level=logging.ERROR) +logging.basicConfig(stream=sys.stdout, level=logging.INFO) logger = logging @@ -309,13 +309,13 @@ def check_git_hash(model_dir): def get_logger(model_dir, filename="train.log"): global logger logger = logging.getLogger(os.path.basename(model_dir)) - logger.setLevel(logging.ERROR) + logger.setLevel(logging.INFO) formatter = 
logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") if not os.path.exists(model_dir): os.makedirs(model_dir) h = logging.FileHandler(os.path.join(model_dir, filename)) - h.setLevel(logging.ERROR) + h.setLevel(logging.INFO) h.setFormatter(formatter) logger.addHandler(h) return logger diff --git a/README.md b/README.md index 463649a..b32d2fd 100644 --- a/README.md +++ b/README.md @@ -44,15 +44,15 @@ For users in China, you can [click here](https://www.codewithgpu.com/i/RVC-Boss/ ### Tested Environments -| Python Version | PyTorch Version | Device | -|----------------|------------------|-----------------| -| Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 | -| Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 | -| Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 | -| Python 3.9 | PyTorch 2.5.1 | Apple silicon | -| Python 3.11 | PyTorch 2.6.0 | Apple silicon | -| Python 3.9 | PyTorch 2.2.2 | CPU | -| Python 3.9 | PyTorch 2.8.0dev | CUDA12.8(for Nvidia50x0) | +| Python Version | PyTorch Version | Device | +| -------------- | ---------------- | ------------- | +| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 | +| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 | +| Python 3.9 | PyTorch 2.5.1 | Apple silicon | +| Python 3.11 | PyTorch 2.7.0 | Apple silicon | +| Python 3.9 | PyTorch 2.2.2 | CPU | ### Windows @@ -63,31 +63,41 @@ If you are a Windows user (tested with win>=10), you can [download the integrate ### Linux ```bash -conda create -n GPTSoVits python=3.9 +conda create -n GPTSoVits python=3.10 conda activate GPTSoVits -bash install.sh --source [--download-uvr5] +bash install.sh --device --source [--download-uvr5] ``` ### macOS **Note: The models trained with GPUs on Macs result in significantly lower quality compared to those trained on other devices, so we are temporarily using CPUs instead.** -1. Install Xcode command-line tools by running `xcode-select --install`. -2. Install the program by running the following commands: +Install the program by running the following commands: ```bash -conda create -n GPTSoVits python=3.9 +conda create -n GPTSoVits python=3.10 conda activate GPTSoVits -bash install.sh --source [--download-uvr5] +bash install.sh --device --source [--download-uvr5] ``` ### Install Manually +#### Install Dependences + +```bash +conda create -n GPTSoVits python=3.10 +conda activate GPTSoVits + +pip install -r extra-req.txt --no-deps +pip install -r requirements.txt +``` + #### Install FFmpeg ##### Conda Users ```bash +conda activate GPTSoVits conda install ffmpeg ``` @@ -96,14 +106,13 @@ conda install ffmpeg ```bash sudo apt install ffmpeg sudo apt install libsox-dev -conda install -c conda-forge 'ffmpeg<7' ``` ##### Windows Users -Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) and [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) in the GPT-SoVITS root. 
+Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) and [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) in the GPT-SoVITS root -Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Korean TTS Only) +Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) ##### MacOS Users @@ -111,36 +120,53 @@ Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Ko brew install ffmpeg ``` -#### Install Dependences +### Running GPT-SoVITS with Docker -```bash -pip install -r extra-req.txt --no-deps -pip install -r requirements.txt -``` +#### Docker Image Selection -### Using Docker +Due to rapid development in the codebase and a slower Docker image release cycle, please: -#### docker-compose.yaml configuration +- Check [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) for the latest available image tags +- Choose an appropriate image tag for your environment +- `Lite` means the Docker image does not include ASR models and UVR5 models. You can manually download the UVR5 models, while the program will automatically download the ASR models as needed +- The appropriate architecture image (amd64/arm64) will be automatically pulled during Docker Compose +- Optionally, build the image locally using the provided Dockerfile for the most up-to-date changes -0. Regarding image tags: Due to rapid updates in the codebase and the slow process of packaging and testing images, please check [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(outdated) for the currently packaged latest images and select as per your situation, or alternatively, build locally using a Dockerfile according to your own needs. -1. Environment Variables: - - is_half: Controls half-precision/double-precision. This is typically the cause if the content under the directories 4-cnhubert/5-wav32k is not generated correctly during the "SSL extracting" step. Adjust to True or False based on your actual situation. -2. Volumes Configuration, The application's root directory inside the container is set to /workspace. The default docker-compose.yaml lists some practical examples for uploading/downloading content. -3. shm_size: The default available memory for Docker Desktop on Windows is too small, which can cause abnormal operations. Adjust according to your own situation. -4. Under the deploy section, GPU-related settings should be adjusted cautiously according to your system and actual circumstances. +#### Environment Variables -#### Running with docker compose +- `is_half`: Controls whether half-precision (fp16) is enabled. Set to `true` if your GPU supports it to reduce memory usage. -``` -docker compose -f "docker-compose.yaml" up -d +#### Shared Memory Configuration + +On Windows (Docker Desktop), the default shared memory size is small and may cause unexpected behavior. Increase `shm_size` (e.g., to `16g`) in your Docker Compose file based on your available system memory. + +#### Choosing a Service + +The `docker-compose.yaml` defines two services: + +- `GPT-SoVITS-CU126` & `GPT-SoVITS-CU128`: Full version with all features. +- `GPT-SoVITS-CU126-Lite` & `GPT-SoVITS-CU128-Lite`: Lightweight version with reduced dependencies and functionality. 
+ +To run a specific service with Docker Compose, use: + +```bash +docker compose run --service-ports ``` -#### Running with docker command +#### Building the Docker Image Locally -As above, modify the corresponding parameters based on your actual situation, then run the following command: +If you want to build the image yourself, use: +```bash +bash docker_build.sh --cuda <12.6|12.8> [--lite] ``` -docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx + +#### Accessing the Running Container (Bash Shell) + +Once the container is running in the background, you can access it using: + +```bash +docker exec -it bash ``` ## Pretrained Models @@ -168,7 +194,9 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker The TTS annotation .list file format: ``` + vocal_path|speaker_name|language|text + ``` Language dictionary: @@ -182,7 +210,9 @@ Language dictionary: Example: ``` + D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin. + ``` ## Finetune and inference @@ -212,12 +242,12 @@ Or maunally switch version in WebUI #### Path Auto-filling is now supported - 1. Fill in the audio path - 2. Slice the audio into small chunks - 3. Denoise(optinal) - 4. ASR - 5. Proofreading ASR transcriptions - 6. Go to the next Tab, then finetune the model +1. Fill in the audio path +2. Slice the audio into small chunks +3. Denoise(optinal) +4. ASR +5. Proofreading ASR transcriptions +6. Go to the next Tab, then finetune the model ### Open Inference WebUI @@ -259,7 +289,7 @@ Use v2 from v1 environment: 2. Clone the latest codes from github. -3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`. +3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`. Chinese v2 additional: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip)(Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.) @@ -279,7 +309,7 @@ Use v3 from v2 environment: 2. Clone the latest codes from github. -3. Download v3 pretrained models (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS\pretrained_models`. +3. Download v3 pretrained models (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`. additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt) @@ -296,7 +326,7 @@ Use v4 from v1/v2/v3 environment: 2. Clone the latest codes from github. -3. 
Download v4 pretrained models (gsv-v4-pretrained/s2v4.ckpt, and gsv-v4-pretrained/vocoder.pth) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS\pretrained_models`. +3. Download v4 pretrained models (gsv-v4-pretrained/s2v4.ckpt, and gsv-v4-pretrained/vocoder.pth) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`. ## Todo List @@ -322,7 +352,7 @@ Use v4 from v1/v2/v3 environment: Use the command line to open the WebUI for UVR5 -``` +```bash python tools/uvr5/webui.py "" ``` @@ -333,7 +363,7 @@ python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level -- This is how the audio segmentation of the dataset is done using the command line -``` +```bash python audio_slicer.py \ --input_path "" \ --output_root "" \ @@ -345,7 +375,7 @@ python audio_slicer.py \ This is how dataset ASR processing is done using the command line(Only Chinese) -``` +```bash python tools/asr/funasr_asr.py -i -o ``` @@ -353,7 +383,7 @@ ASR processing is performed through Faster_Whisper(ASR marking except Chinese) (No progress bars, GPU performance may cause time delays) -``` +```bash python ./tools/asr/fasterwhisper_asr.py -i -o -l -p ``` diff --git a/docker-compose.yaml b/docker-compose.yaml index aca8ab9..9703d0c 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,32 +1,95 @@ -version: '3.8' +version: "3.8" services: - gpt-sovits: - image: breakstring/gpt-sovits:latest # please change the image name and tag base your environment. If the tag contains the word 'elite', such as "latest-elite", it indicates that the image does not include the necessary models such as GPT-SoVITS, UVR5, Damo ASR, etc. You will need to download them yourself and map them into the container. 
- container_name: gpt-sovits-container - environment: - - is_half=False - - is_share=False + GPT-SoVITS-CU126: + image: xxxxrt666/gpt-sovits:latest-cu126 + container_name: GPT-SoVITS-CU126 + ports: + - "9871:9871" + - "9872:9872" + - "9873:9873" + - "9874:9874" + - "9880:9880" volumes: - - ./output:/workspace/output - - ./logs:/workspace/logs - - ./SoVITS_weights:/workspace/SoVITS_weights - - ./reference:/workspace/reference - working_dir: /workspace + - .:/workspace/GPT-SoVITS + - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models + - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel + - /dev/null:/workspace/GPT-SoVITS/tools/asr/models + - /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights + environment: + - is_half=true + tty: true + stdin_open: true + shm_size: "16g" + restart: unless-stopped + runtime: nvidia + GPT-SoVITS-CU126-Lite: + image: xxxxrt666/gpt-sovits:latest-cu126-lite + container_name: GPT-SoVITS-CU126-Lite ports: + - "9871:9871" + - "9872:9872" + - "9873:9873" + - "9874:9874" - "9880:9880" + volumes: + - .:/workspace/GPT-SoVITS + - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models + - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel + - /dev/null:/workspace/GPT-SoVITS/tools/asr/models + - /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights + - tools/asr/models:/workspace/models/asr_models + - tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights + environment: + - is_half=true + tty: true + stdin_open: true + shm_size: "16g" + restart: unless-stopped + runtime: nvidia + GPT-SoVITS-CU128: + image: xxxxrt666/gpt-sovits:latest-cu128 + container_name: GPT-SoVITS-CU128 + ports: - "9871:9871" - "9872:9872" - "9873:9873" - "9874:9874" - shm_size: 16G - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: "all" - capabilities: [gpu] + - "9880:9880" + volumes: + - .:/workspace/GPT-SoVITS + - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models + - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel + - /dev/null:/workspace/GPT-SoVITS/tools/asr/models + - /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights + environment: + - is_half=true + tty: true stdin_open: true + shm_size: "16g" + restart: unless-stopped + runtime: nvidia + GPT-SoVITS-CU128-Lite: + image: xxxxrt666/gpt-sovits:latest-cu128-lite + container_name: GPT-SoVITS-CU128-Lite + ports: + - "9871:9871" + - "9872:9872" + - "9873:9873" + - "9874:9874" + - "9880:9880" + volumes: + - .:/workspace/GPT-SoVITS + - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models + - /dev/null:/workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel + - /dev/null:/workspace/GPT-SoVITS/tools/asr/models + - /dev/null:/workspace/GPT-SoVITS/tools/uvr5/uvr5_weights + - tools/asr/models:/workspace/models/asr_models + - tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights + environment: + - is_half=true tty: true + stdin_open: true + shm_size: "16g" restart: unless-stopped + runtime: nvidia \ No newline at end of file diff --git a/docker_build.sh b/docker_build.sh new file mode 100644 index 0000000..354599d --- /dev/null +++ b/docker_build.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" + +cd "$SCRIPT_DIR" || exit 1 + +set -e + +if ! 
command -v docker &>/dev/null; then + echo "Docker Not Found" + exit 1 +fi + +trap 'echo "Error Occured at \"$BASH_COMMAND\" with exit code $?"; exit 1' ERR + +LITE=false +CUDA_VERSION=12.6 + +print_help() { + echo "Usage: bash docker_build.sh [OPTIONS]" + echo "" + echo "Options:" + echo " --cuda 12.6|12.8 Specify the CUDA VERSION (REQUIRED)" + echo " --lite Build a Lite Image" + echo " -h, --help Show this help message and exit" + echo "" + echo "Examples:" + echo " bash docker_build.sh --cuda 12.6 --funasr --faster-whisper" +} + +# Show help if no arguments provided +if [[ $# -eq 0 ]]; then + print_help + exit 0 +fi + +# Parse arguments +while [[ $# -gt 0 ]]; do + case "$1" in + --cuda) + case "$2" in + 12.6) + CUDA_VERSION=12.6 + ;; + 12.8) + CUDA_VERSION=12.8 + ;; + *) + echo "Error: Invalid CUDA_VERSION: $2" + echo "Choose From: [12.6, 12.8]" + exit 1 + ;; + esac + shift 2 + ;; + --lite) + LITE=true + shift + ;; + *) + echo "Unknown Argument: $1" + echo "Use -h or --help to see available options." + exit 1 + ;; + esac +done + +TARGETPLATFORM=$(uname -m | grep -q 'x86' && echo "linux/amd64" || echo "linux/arm64") + +if [ $LITE = true ]; then + TORCH_BASE="lite" +else + TORCH_BASE="full" +fi + +docker build \ + --build-arg CUDA_VERSION=$CUDA_VERSION \ + --build-arg LITE=$LITE \ + --build-arg TARGETPLATFORM="$TARGETPLATFORM" \ + --build-arg TORCH_BASE=$TORCH_BASE \ + -t "${USER}/gpt-sovits:local" \ + . diff --git a/dockerbuild.sh b/dockerbuild.sh deleted file mode 100755 index 3a4a1e1..0000000 --- a/dockerbuild.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -# 获取当前日期,格式为 YYYYMMDD -DATE=$(date +%Y%m%d) -# 获取最新的 Git commit 哈希值的前 7 位 -COMMIT_HASH=$(git rev-parse HEAD | cut -c 1-7) - -# 构建 full 版本的镜像 -docker build --build-arg IMAGE_TYPE=full -t breakstring/gpt-sovits:latest . -# 为同一个镜像添加带日期的标签 -docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$DATE -# 为同一个镜像添加带当前代码库Commit哈希值的标签 -docker tag breakstring/gpt-sovits:latest breakstring/gpt-sovits:dev-$COMMIT_HASH - - -# 构建 elite 版本的镜像(无模型下载步骤,需手工将模型下载安装进容器) -docker build --build-arg IMAGE_TYPE=elite -t breakstring/gpt-sovits:latest-elite . -# 为同一个镜像添加带日期的标签 -docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$DATE-elite -# 为同一个镜像添加带当前代码库Commit哈希值的标签 -docker tag breakstring/gpt-sovits:latest-elite breakstring/gpt-sovits:dev-$COMMIT_HASH-elite diff --git a/docs/cn/README.md b/docs/cn/README.md index cc72b89..832e75f 100644 --- a/docs/cn/README.md +++ b/docs/cn/README.md @@ -20,21 +20,21 @@ --- -## 功能: +## 功能 -1. **零样本文本到语音 (TTS): ** 输入 5 秒的声音样本, 即刻体验文本到语音转换. +1. **零样本文本到语音 (TTS):** 输入 5 秒的声音样本, 即刻体验文本到语音转换. -2. **少样本 TTS: ** 仅需 1 分钟的训练数据即可微调模型, 提升声音相似度和真实感. +2. **少样本 TTS:** 仅需 1 分钟的训练数据即可微调模型, 提升声音相似度和真实感. -3. **跨语言支持: ** 支持与训练数据集不同语言的推理, 目前支持英语、日语、韩语、粤语和中文. +3. **跨语言支持:** 支持与训练数据集不同语言的推理, 目前支持英语、日语、韩语、粤语和中文. -4. **WebUI 工具: ** 集成工具包括声音伴奏分离、自动训练集分割、中文自动语音识别(ASR)和文本标注, 协助初学者创建训练数据集和 GPT/SoVITS 模型. +4. **WebUI 工具:** 集成工具包括声音伴奏分离、自动训练集分割、中文自动语音识别(ASR)和文本标注, 协助初学者创建训练数据集和 GPT/SoVITS 模型. 
**查看我们的介绍视频 [demo video](https://www.bilibili.com/video/BV12g4y1m7Uw)** -未见过的说话者 few-shot 微调演示: +未见过的说话者 few-shot 微调演示: -https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb + **用户手册: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)** @@ -44,14 +44,15 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350- ### 测试通过的环境 -| Python Version | PyTorch Version | Device | -|----------------|------------------|-----------------| -| Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 | -| Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 | -| Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 | -| Python 3.9 | PyTorch 2.5.1 | Apple silicon | -| Python 3.11 | PyTorch 2.6.0 | Apple silicon | -| Python 3.9 | PyTorch 2.2.2 | CPU | +| Python Version | PyTorch Version | Device | +| -------------- | ---------------- | ------------- | +| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 | +| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 | +| Python 3.9 | PyTorch 2.5.1 | Apple silicon | +| Python 3.11 | PyTorch 2.7.0 | Apple silicon | +| Python 3.9 | PyTorch 2.2.2 | CPU | ### Windows @@ -62,31 +63,41 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350- ### Linux ```bash -conda create -n GPTSoVits python=3.9 +conda create -n GPTSoVits python=3.10 conda activate GPTSoVits -bash install.sh --source [--download-uvr5] +bash install.sh --device --source [--download-uvr5] ``` ### macOS **注: 在 Mac 上使用 GPU 训练的模型效果显著低于其他设备训练的模型, 所以我们暂时使用 CPU 进行训练.** -1. 运行 `xcode-select --install` 安装 Xcode command-line tools. -2. 运行以下的命令来安装本项目: +运行以下的命令来安装本项目: ```bash -conda create -n GPTSoVits python=3.9 +conda create -n GPTSoVits python=3.10 conda activate GPTSoVits -bash install.sh --source [--download-uvr5] +bash install.sh --device --source [--download-uvr5] ``` ### 手动安装 +#### 安装依赖 + +```bash +conda create -n GPTSoVits python=3.10 +conda activate GPTSoVits + +pip install -r extra-req.txt --no-deps +pip install -r requirements.txt +``` + #### 安装 FFmpeg ##### Conda 用户 ```bash +conda activate GPTSoVits conda install ffmpeg ``` @@ -95,14 +106,13 @@ conda install ffmpeg ```bash sudo apt install ffmpeg sudo apt install libsox-dev -conda install -c conda-forge 'ffmpeg<7' ``` ##### Windows 用户 -下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下. +下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下 -安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语 TTS) +安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境 ##### MacOS 用户 @@ -110,38 +120,53 @@ conda install -c conda-forge 'ffmpeg<7' brew install ffmpeg ``` -#### 安装依赖 +### 运行 GPT-SoVITS (使用 Docker) -```bash -pip install -r extra-req.txt --no-deps -pip install -r requirements.txt -``` +#### Docker 镜像选择 -### 在 Docker 中使用 +由于代码库更新频繁, 而 Docker 镜像的发布周期相对较慢, 请注意: -#### docker-compose.yaml 设置 +- 前往 [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) 查看最新可用的镜像标签(tags) +- 根据你的运行环境选择合适的镜像标签 +- `Lite` Docker 镜像不包含 ASR 模型和 UVR5 模型. 
你可以自行下载 UVR5 模型, ASR 模型则会在需要时由程序自动下载 +- 在使用 Docker Compose 时, 会自动拉取适配的架构镜像 (amd64 或 arm64) +- 可选:为了获得最新的更改, 你可以使用提供的 Dockerfile 在本地构建镜像 -0. image 的标签: 由于代码库更新很快, 镜像的打包和测试又很慢, 所以请自行在 [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(旧版本) 查看当前打包好的最新的镜像并根据自己的情况选用, 或者在本地根据您自己的需求通过 Dockerfile 进行构建. -1. 环境变量: +#### 环境变量 -- is_half: 半精度/双精度控制.在进行 "SSL extracting" 步骤时如果无法正确生成 4-cnhubert/5-wav32k 目录下的内容时, 一般都是它引起的, 可以根据实际情况来调整为 True 或者 False. +- `is_half`:控制是否启用半精度(fp16). 如果你的 GPU 支持, 设置为 `true` 可以减少显存占用 -2. Volume 设置, 容器内的应用根目录设置为 /workspace. 默认的 docker-compose.yaml 中列出了一些实际的例子, 便于上传/下载内容. -3. shm_size: Windows 下的 Docker Desktop 默认可用内存过小, 会导致运行异常, 根据自己情况酌情设置. -4. deploy 小节下的 gpu 相关内容, 请根据您的系统和实际情况酌情设置. +#### 共享内存配置 -#### 通过 docker compose 运行 +在 Windows (Docker Desktop) 中, 默认共享内存大小较小, 可能导致运行异常. 请在 Docker Compose 文件中根据系统内存情况, 增大 `shm_size` (例如设置为 `16g`) -``` -docker compose -f "docker-compose.yaml" up -d +#### 选择服务 + +`docker-compose.yaml` 文件定义了两个主要服务类型: + +- `GPT-SoVITS-CU126` 与 `GPT-SoVITS-CU128`:完整版, 包含所有功能 +- `GPT-SoVITS-CU126-Lite` 与 `GPT-SoVITS-CU128-Lite`:轻量版, 依赖更少, 功能略有删减 + +如需使用 Docker Compose 运行指定服务, 请执行: + +```bash +docker compose run --service-ports ``` -#### 通过 docker 命令运行 +#### 本地构建 Docker 镜像 -同上, 根据您自己的实际情况修改对应的参数, 然后运行如下命令: +如果你希望自行构建镜像, 请使用以下命令: +```bash +bash docker_build.sh --cuda <12.6|12.8> [--lite] ``` -docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx + +#### 访问运行中的容器 (Bash Shell) + +当容器在后台运行时, 你可以通过以下命令进入容器: + +```bash +docker exec -it bash ``` ## 预训练模型 @@ -166,13 +191,13 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker ## 数据集格式 -文本到语音 (TTS) 注释 .list 文件格式: +文本到语音 (TTS) 注释 .list 文件格式: ``` vocal_path|speaker_name|language|text ``` -语言字典: +语言字典: - 'zh': 中文 - 'ja': 日语 @@ -180,7 +205,7 @@ vocal_path|speaker_name|language|text - 'ko': 韩语 - 'yue': 粤语 -示例: +示例: ``` D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神. @@ -213,12 +238,12 @@ python webui.py v1 #### 现已支持自动填充路径 - 1. 填入训练音频路径 - 2. 切割音频 - 3. 进行降噪(可选) - 4. 进行ASR - 5. 校对标注 - 6. 前往下一个窗口,点击训练 +1. 填入训练音频路径 +2. 切割音频 +3. 进行降噪(可选) +4. 进行 ASR +5. 校对标注 +6. 前往下一个窗口,点击训练 ### 打开推理 WebUI @@ -260,7 +285,7 @@ python webui.py 2. 需要克隆 github 上的最新代码 -3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS\pretrained_models\gsv-v2final-pretrained 下 +3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS/pretrained_models/gsv-v2final-pretrained 下 中文额外需要下载[G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (下载 G2PW 模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下) @@ -280,13 +305,13 @@ python webui.py 2. 需要克隆 github 上的最新代码 -3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些 v3 新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS\pretrained_models`目录下 +3. 
从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些 v3 新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS/pretrained_models`目录下 如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题, 需要下载额外的模型参数, 参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt) ## 待办事项清单 -- [x] **高优先级: ** +- [x] **高优先级:** - [x] 日语和英语的本地化. - [x] 用户指南. @@ -304,11 +329,11 @@ python webui.py - [x] 更好的 sovits 基础模型 (增强的音频质量). - [ ] 模型混合. -## (附加) 命令行运行方式 +## (附加) 命令行运行方式 使用命令行打开 UVR5 的 WebUI -``` +```bash python tools/uvr5/webui.py "" ``` @@ -319,7 +344,7 @@ python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level -- 这是使用命令行完成数据集的音频切分的方式 -``` +```bash python audio_slicer.py \ --input_path "" \ --output_root "" \ @@ -331,15 +356,15 @@ python audio_slicer.py \ 这是使用命令行完成数据集 ASR 处理的方式 (仅限中文) -``` +```bash python tools/asr/funasr_asr.py -i -o ``` 通过 Faster_Whisper 进行 ASR 处理 (除中文之外的 ASR 标记) - (没有进度条, GPU 性能可能会导致时间延迟) +(没有进度条, GPU 性能可能会导致时间延迟) -``` +```bash python ./tools/asr/fasterwhisper_asr.py -i -o -l -p ``` @@ -347,7 +372,7 @@ python ./tools/asr/fasterwhisper_asr.py -i -o -l -p ## 致谢 -特别感谢以下项目和贡献者: +特别感谢以下项目和贡献者: ### 理论研究 diff --git a/docs/ja/README.md b/docs/ja/README.md index 145de21..8cedb0d 100644 --- a/docs/ja/README.md +++ b/docs/ja/README.md @@ -40,14 +40,15 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350- ### テスト済みの環境 -| Python Version | PyTorch Version | Device | -|----------------|------------------|-----------------| -| Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 | -| Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 | -| Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 | -| Python 3.9 | PyTorch 2.5.1 | Apple silicon | -| Python 3.11 | PyTorch 2.6.0 | Apple silicon | -| Python 3.9 | PyTorch 2.2.2 | CPU | +| Python Version | PyTorch Version | Device | +| -------------- | ---------------- | ------------- | +| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 | +| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 | +| Python 3.9 | PyTorch 2.5.1 | Apple silicon | +| Python 3.11 | PyTorch 2.7.0 | Apple silicon | +| Python 3.9 | PyTorch 2.2.2 | CPU | ### Windows @@ -56,31 +57,41 @@ Windows ユーザー: (Windows 10 以降でテスト済み)、[統合パッケ ### Linux ```bash -conda create -n GPTSoVits python=3.9 +conda create -n GPTSoVits python=3.10 conda activate GPTSoVits -bash install.sh --source [--download-uvr5] +bash install.sh --device --source [--download-uvr5] ``` ### macOS **注: Mac で GPU を使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面は CPU を使用して訓練することを強く推奨します.** -1. `xcode-select --install` を実行して、Xcode コマンドラインツールをインストールします. -2. 以下のコマンドを実行してこのプロジェクトをインストールします. +以下のコマンドを実行してこのプロジェクトをインストールします: ```bash -conda create -n GPTSoVits python=3.9 +conda create -n GPTSoVits python=3.10 conda activate GPTSoVits -bash install.sh --source [--download-uvr5] +bash install.sh --device --source [--download-uvr5] ``` ### 手動インストール -#### FFmpeg をインストールします. 
+#### 依存関係をインストールします + +```bash +conda create -n GPTSoVits python=3.10 +conda activate GPTSoVits + +pip install -r extra-req.txt --no-deps +pip install -r requirements.txt +``` + +#### FFmpeg をインストールします ##### Conda ユーザー ```bash +conda activate GPTSoVits conda install ffmpeg ``` @@ -89,12 +100,13 @@ conda install ffmpeg ```bash sudo apt install ffmpeg sudo apt install libsox-dev -conda install -c conda-forge 'ffmpeg<7' ``` ##### Windows ユーザー -[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます. +[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます + +[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 環境をインストールしてください ##### MacOS ユーザー @@ -102,38 +114,53 @@ conda install -c conda-forge 'ffmpeg<7' brew install ffmpeg ``` -#### 依存関係をインストールします +### GPT-SoVITS の実行 (Docker 使用) -```bash -pip install -r extra-req.txt --no-deps -pip install -r requirementx.txt -``` +#### Docker イメージの選択 + +コードベースの更新が頻繁である一方、Docker イメージのリリースは比較的遅いため、以下を確認してください: + +- [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) で最新のイメージタグを確認してください +- 環境に合った適切なイメージタグを選択してください +- `Lite` とは、Docker イメージに ASR モデルおよび UVR5 モデルが含まれていないことを意味します. UVR5 モデルは手動でダウンロードし、ASR モデルは必要に応じてプログラムが自動的にダウンロードします +- Docker Compose 実行時に、対応するアーキテクチャ (amd64 または arm64) のイメージが自動的に取得されます +- オプション:最新の変更を反映させるため、提供されている Dockerfile を使ってローカルでイメージをビルドすることも可能です + +#### 環境変数 -### Docker の使用 +- `is_half`:半精度 (fp16) を使用するかどうかを制御します. GPU が対応している場合、`true` に設定することでメモリ使用量を削減できます -#### docker-compose.yaml の設定 +#### 共有メモリの設定 -0. イメージのタグについて: コードベースの更新が速い割に、イメージのパッケージングとテストが遅いため、[Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(古いバージョン) で現在パッケージされている最新のイメージをご覧になり、ご自身の状況に応じて選択するか、またはご自身のニーズに応じて Dockerfile を使用してローカルでビルドしてください. -1. 環境変数: +Windows (Docker Desktop) では、デフォルトの共有メモリサイズが小さいため、予期しない動作が発生する可能性があります. Docker Compose ファイル内の `shm_size` を (例:`16g`) に増やすことをおすすめします - - `is_half`: 半精度/倍精度の制御."SSL 抽出"ステップ中に`4-cnhubert/5-wav32k`ディレクトリ内の内容が正しく生成されない場合、通常これが原因です.実際の状況に応じて True または False に調整してください. +#### サービスの選択 -2. ボリューム設定: コンテナ内のアプリケーションのルートディレクトリは`/workspace`に設定されます.デフォルトの`docker-compose.yaml`には、アップロード/ダウンロードの内容の実例がいくつか記載されています. -3. `shm_size`: Windows の Docker Desktop のデフォルトの利用可能メモリは小さすぎるため、うまく動作しない可能性があります.状況に応じて適宜設定してください. -4. `deploy`セクションの GPU に関連する内容は、システムと実際の状況に応じて慎重に設定してください. 
+`docker-compose.yaml` ファイルには次の 2 種類のサービスが定義されています: -#### docker compose で実行する +- `GPT-SoVITS-CU126` および `GPT-SoVITS-CU128`:すべての機能を含むフルバージョン +- `GPT-SoVITS-CU126-Lite` および `GPT-SoVITS-CU128-Lite`:依存関係を削減した軽量バージョン -```markdown -docker compose -f "docker-compose.yaml" up -d +特定のサービスを Docker Compose で実行するには、以下のコマンドを使用します: + +```bash +docker compose run --service-ports +``` + +#### Docker イメージのローカルビルド + +自分でイメージをビルドするには、以下のコマンドを使ってください: + +```bash +bash docker_build.sh --cuda <12.6|12.8> [--lite] ``` -#### docker コマンドで実行する +#### 実行中のコンテナへアクセス (Bash Shell) -上記と同様に、実際の状況に基づいて対応するパラメータを変更し、次のコマンドを実行します: +コンテナがバックグラウンドで実行されている場合、以下のコマンドでシェルにアクセスできます: -```markdown -docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx +```bash +docker exec -it bash ``` ## 事前訓練済みモデル @@ -201,12 +228,12 @@ python webui.py v1 <言語(オプション)> #### パス自動補完のサポート - 1. 音声パスを入力する - 2. 音声を小さなチャンクに分割する - 3. ノイズ除去 (オプション) - 4. ASR - 5. ASR転写を校正する - 6. 次のタブに移動し、モデルを微調整する +1. 音声パスを入力する +2. 音声を小さなチャンクに分割する +3. ノイズ除去 (オプション) +4. ASR +5. ASR 転写を校正する +6. 次のタブに移動し、モデルを微調整する ### 推論 WebUI を開く @@ -248,7 +275,7 @@ V1 環境から V2 を使用するには: 2. 最新のコードを github からクローン -3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)から V2 の事前学習モデルをダウンロードし、それらを`GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`に配置 +3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)から V2 の事前学習モデルをダウンロードし、それらを`GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`に配置 中国語 V2 追加: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW モデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します) @@ -268,7 +295,7 @@ v2 環境から v3 を使用する方法: 2. GitHub から最新のコードをクローンします. -3. v3 の事前学習済みモデル (s1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ) を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS\pretrained_models フォルダに配置します. +3. v3 の事前学習済みモデル (s1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ) を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS/pretrained_models フォルダに配置します. 追加: 音声超解像モデルについては、[ダウンロード方法](../../tools/AP_BWE_main/24kto48k/readme.txt)を参照してください. @@ -296,7 +323,7 @@ v2 環境から v3 を使用する方法: コマンド ラインを使用して UVR5 の WebUI を開きます -``` +```bash python tools/uvr5/webui.py "" ``` @@ -307,7 +334,7 @@ python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level -- コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです. 
-``` +```bash python audio_slicer.py \ --input_path "" \ --output_root "" \ @@ -319,7 +346,7 @@ python audio_slicer.py \ コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ) -``` +```bash python tools/asr/funasr_asr.py -i -o ``` @@ -327,7 +354,7 @@ ASR 処理は Faster_Whisper を通じて実行されます(中国語を除く A (進行状況バーは表示されません.GPU のパフォーマンスにより時間遅延が発生する可能性があります) -``` +```bash python ./tools/asr/fasterwhisper_asr.py -i -o -l -p ``` @@ -335,7 +362,7 @@ python ./tools/asr/fasterwhisper_asr.py -i -o -l -p ## クレジット -特に以下のプロジェクトと貢献者に感謝します: +特に以下のプロジェクトと貢献者に感謝します: ### 理論研究 diff --git a/docs/ko/README.md b/docs/ko/README.md index e9fee8f..e6419a1 100644 --- a/docs/ko/README.md +++ b/docs/ko/README.md @@ -40,14 +40,15 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350- ### 테스트 통과 환경 -| Python Version | PyTorch Version | Device | -|----------------|------------------|-----------------| -| Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 | -| Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 | -| Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 | -| Python 3.9 | PyTorch 2.5.1 | Apple silicon | -| Python 3.11 | PyTorch 2.6.0 | Apple silicon | -| Python 3.9 | PyTorch 2.2.2 | CPU | +| Python Version | PyTorch Version | Device | +| -------------- | ---------------- | ------------- | +| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 | +| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 | +| Python 3.9 | PyTorch 2.5.1 | Apple silicon | +| Python 3.11 | PyTorch 2.7.0 | Apple silicon | +| Python 3.9 | PyTorch 2.2.2 | CPU | ### Windows @@ -56,31 +57,41 @@ Windows 사용자라면 (win>=10에서 테스트됨), [통합 패키지를 다 ### Linux ```bash -conda create -n GPTSoVits python=3.9 +conda create -n GPTSoVits python=3.10 conda activate GPTSoVits -bash install.sh --source [--download-uvr5] +bash install.sh --device --source [--download-uvr5] ``` ### macOS **주의: Mac에서 GPU로 훈련된 모델은 다른 OS에서 훈련된 모델에 비해 품질이 낮습니다. 해당 문제를 해결하기 전까지 MacOS에선 CPU를 사용하여 훈련을 진행합니다.** -1. `xcode-select --install`을 실행하여 Xcode 커맨드라인 도구를 설치하세요. -2. 다음 명령어를 실행하여 이 프로젝트를 설치하세요. +다음 명령어를 실행하여 이 프로젝트를 설치하세요 ```bash -conda create -n GPTSoVits python=3.9 +conda create -n GPTSoVits python=3.10 conda activate GPTSoVits -bash install.sh --source [--download-uvr5] +bash install.sh --device --source [--download-uvr5] ``` ### 수동 설치 +#### 의존성 설치 + +```bash +conda create -n GPTSoVits python=3.10 +conda activate GPTSoVits + +pip install -r extra-req.txt --no-deps +pip install -r requirements.txt +``` + #### FFmpeg 설치 ##### Conda 사용자 ```bash +conda activate GPTSoVits conda install ffmpeg ``` @@ -89,14 +100,13 @@ conda install ffmpeg ```bash sudo apt install ffmpeg sudo apt install libsox-dev -conda install -c conda-forge 'ffmpeg<7' ``` ##### Windows 사용자 -[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe)와 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe)를 GPT-SoVITS root 디렉토리에 넣습니다. 
+[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe)와 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe)를 GPT-SoVITS root 디렉토리에 넣습니다 -[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치 (Korean TTS 전용) +[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치 ##### MacOS 사용자 @@ -104,41 +114,53 @@ conda install -c conda-forge 'ffmpeg<7' brew install ffmpeg ``` -#### 의존성 설치 +### GPT-SoVITS 실행하기 (Docker 사용) -```bash -pip install -r extra-req.txt --no-deps -pip install -r requirements.txt -``` +#### Docker 이미지 선택 -### Docker에서 사용 +코드베이스가 빠르게 업데이트되는 반면 Docker 이미지 릴리스 주기는 느리기 때문에 다음을 참고하세요: -#### docker-compose.yaml 설정 +- [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits)에서 최신 이미지 태그를 확인하세요 +- 환경에 맞는 적절한 이미지 태그를 선택하세요 +- `Lite` 는 Docker 이미지에 ASR 모델과 UVR5 모델이 포함되어 있지 않음을 의미합니다. UVR5 모델은 사용자가 직접 다운로드해야 하며, ASR 모델은 필요 시 프로그램이 자동으로 다운로드합니다 +- Docker Compose 실행 시, 해당 아키텍처에 맞는 이미지(amd64 또는 arm64)가 자동으로 다운로드됩니다 +- 선택 사항: 최신 변경사항을 반영하려면 제공된 Dockerfile을 사용하여 로컬에서 직접 이미지를 빌드할 수 있습니다 -0. 이미지 태그: 코드 저장소가 빠르게 업데이트되고 패키지가 느리게 빌드되고 테스트되므로, 현재 빌드된 최신 도커 이미지를 [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(오래된 버전) 에서 확인하고 필요에 따라 Dockerfile을 사용하여 로컬에서 빌드할 수 있습니다. +#### 환경 변수 -1. 환경 변수: +- `is_half`: 반정밀도(fp16) 사용 여부를 제어합니다. GPU가 지원하는 경우 `true`로 설정하면 메모리 사용량을 줄일 수 있습니다 -- is_half: 반정밀/배정밀 제어. "SSL 추출" 단계에서 4-cnhubert/5-wav32k 디렉토리의 내용을 올바르게 생성할 수 없는 경우, 일반적으로 이것 때문입니다. 실제 상황에 따라 True 또는 False로 조정할 수 있습니다. +#### 공유 메모리 설정 -2. 볼륨 설정, 컨테이너 내의 애플리케이션 루트 디렉토리를 /workspace로 설정합니다. 기본 docker-compose.yaml에는 실제 예제가 나열되어 있으므로 업로드/다운로드를 쉽게 할 수 있습니다. +Windows(Docker Desktop)에서는 기본 공유 메모리 크기가 작아 예기치 않은 동작이 발생할 수 있습니다. 시스템 메모리 상황에 따라 Docker Compose 파일에서 `shm_size`를 (예: `16g`)로 증가시키는 것이 좋습니다 -3. shm_size: Windows의 Docker Desktop의 기본 사용 가능한 메모리가 너무 작아 오류가 발생할 수 있으므로 실제 상황에 따라 조정합니다. +#### 서비스 선택 -4. deploy 섹션의 gpu 관련 내용은 시스템 및 실제 상황에 따라 조정합니다. +`docker-compose.yaml` 파일에는 두 가지 서비스 유형이 정의되어 있습니다: -#### docker compose로 실행 +- `GPT-SoVITS-CU126` 및 `GPT-SoVITS-CU128`: 전체 기능을 포함한 풀 버전 +- `GPT-SoVITS-CU126-Lite` 및 `GPT-SoVITS-CU128-Lite`: 의존성이 줄어든 경량 버전 -``` -docker compose -f "docker-compose.yaml" up -d +특정 서비스를 Docker Compose로 실행하려면 다음 명령을 사용하세요: + +```bash +docker compose run --service-ports ``` -#### docker 명령으로 실행 +#### Docker 이미지 직접 빌드하기 -위와 동일하게 실제 상황에 맞게 매개변수를 수정한 다음 다음 명령을 실행합니다: +직접 이미지를 빌드하려면 다음 명령어를 사용하세요: +```bash +bash docker_build.sh --cuda <12.6|12.8> [--lite] ``` -docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx + +#### 실행 중인 컨테이너 접속하기 (Bash Shell) + +컨테이너가 백그라운드에서 실행 중일 때 다음 명령어로 셸에 접속할 수 있습니다: + +```bash +docker exec -it bash ``` ## 사전 학습된 모델 @@ -206,12 +228,12 @@ python webui.py v1 <언어(옵션)> #### 경로 자동 채우기가 지원됩니다 - 1. 오디오 경로를 입력하십시오. - 2. 오디오를 작은 청크로 분할하십시오. - 3. 노이즈 제거(옵션) - 4. ASR 수행 - 5. ASR 전사를 교정하십시오. - 6. 다음 탭으로 이동하여 모델을 미세 조정하십시오. +1. 오디오 경로를 입력하십시오. +2. 오디오를 작은 청크로 분할하십시오. +3. 노이즈 제거(옵션) +4. ASR 수행 +5. ASR 전사를 교정하십시오. +6. 다음 탭으로 이동하여 모델을 미세 조정하십시오. ### 추론 WebUI 열기 @@ -253,7 +275,7 @@ V1 환경에서 V2를 사용하려면: 2. github에서 최신 코드를 클론하십시오. -3. 
[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`에 넣으십시오. +3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`에 넣으십시오. 중국어 V2 추가: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.) @@ -273,7 +295,7 @@ v2 환경에서 v3 사용하기: 2. 최신 코드를 github 에서 클론합니다. -3. v3 사전 훈련된 모델(s1v3.ckpt, s2Gv3.pth, 그리고 models--nvidia--bigvgan_v2_24khz_100band_256x 폴더)을 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 다운로드하여 `GPT_SoVITS\pretrained_models` 폴더에 넣습니다. +3. v3 사전 훈련된 모델(s1v3.ckpt, s2Gv3.pth, 그리고 models--nvidia--bigvgan_v2_24khz_100band_256x 폴더)을 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 다운로드하여 `GPT_SoVITS/pretrained_models` 폴더에 넣습니다. 추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요. @@ -302,7 +324,7 @@ v2 환경에서 v3 사용하기: 명령줄을 사용하여 UVR5용 WebUI 열기 -``` +```bash python tools/uvr5/webui.py "" ``` @@ -313,7 +335,7 @@ python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level -- 명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다. -``` +```bash python audio_slicer.py \ --input_path "" \ --output_root "" \ @@ -325,7 +347,7 @@ python audio_slicer.py \ 명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당). -``` +```bash python tools/asr/funasr_asr.py -i -o ``` @@ -333,7 +355,7 @@ ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행 (진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음) -``` +```bash python ./tools/asr/fasterwhisper_asr.py -i -o -l -p ``` diff --git a/docs/tr/README.md b/docs/tr/README.md index d59f66b..0a8ee4a 100644 --- a/docs/tr/README.md +++ b/docs/tr/README.md @@ -42,14 +42,15 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350- ### Test Edilmiş Ortamlar -| Python Version | PyTorch Version | Device | -|----------------|------------------|-----------------| -| Python 3.9 | PyTorch 2.0.1 | CUDA 11.8 | -| Python 3.10.13 | PyTorch 2.1.2 | CUDA 12.3 | -| Python 3.10.17 | PyTorch 2.5.1 | CUDA 12.4 | -| Python 3.9 | PyTorch 2.5.1 | Apple silicon | -| Python 3.11 | PyTorch 2.6.0 | Apple silicon | -| Python 3.9 | PyTorch 2.2.2 | CPU | +| Python Version | PyTorch Version | Device | +| -------------- | ---------------- | ------------- | +| Python 3.10 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.11 | PyTorch 2.5.1 | CUDA 12.4 | +| Python 3.11 | PyTorch 2.7.0 | CUDA 12.8 | +| Python 3.9 | PyTorch 2.8.0dev | CUDA 12.8 | +| Python 3.9 | PyTorch 2.5.1 | Apple silicon | +| Python 3.11 | PyTorch 2.7.0 | Apple silicon | +| Python 3.9 | PyTorch 2.2.2 | CPU | ### Windows @@ -58,31 +59,41 @@ Eğer bir Windows kullanıcısıysanız (win>=10 ile test edilmiştir), [entegre ### Linux ```bash -conda create -n GPTSoVits python=3.9 +conda create -n GPTSoVits python=3.10 conda activate GPTSoVits -bash install.sh --source [--download-uvr5] +bash install.sh --device --source [--download-uvr5] ``` ### macOS **Not: Mac'lerde GPU'larla eğitilen modeller, diğer cihazlarda eğitilenlere göre önemli ölçüde daha düşük kalitede sonuç verir, bu nedenle geçici olarak CPU'lar kullanıyoruz.** -1. `xcode-select --install` komutunu çalıştırarak Xcode komut satırı araçlarını yükleyin. -2. 
Aşağıdaki komutları çalıştırarak programı yükleyin: +Aşağıdaki komutları çalıştırarak programı yükleyin: ```bash -conda create -n GPTSoVits python=3.9 +conda create -n GPTSoVits python=3.10 conda activate GPTSoVits -bash install.sh --source [--download-uvr5] +bash install.sh --device --source [--download-uvr5] ``` ### El ile Yükleme +#### Bağımlılıkları Yükleme + +```bash +conda create -n GPTSoVits python=3.10 +conda activate GPTSoVits + +pip install -r extra-req.txt --no-deps +pip install -r requirements.txt +``` + #### FFmpeg'i Yükleme ##### Conda Kullanıcıları ```bash +conda activate GPTSoVits conda install ffmpeg ``` @@ -91,12 +102,13 @@ conda install ffmpeg ```bash sudo apt install ffmpeg sudo apt install libsox-dev -conda install -c conda-forge 'ffmpeg<7' ``` ##### Windows Kullanıcıları -[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin. +[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin + +[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) ortamını yükleyin ##### MacOS Kullanıcıları @@ -104,36 +116,53 @@ conda install -c conda-forge 'ffmpeg<7' brew install ffmpeg ``` -#### Bağımlılıkları Yükleme +### GPT-SoVITS Çalıştırma (Docker Kullanarak) -```bash -pip install -r extra-req.txt --no-deps -pip install -r requirements.txt -``` +#### Docker İmajı Seçimi -### Docker Kullanarak +Kod tabanı hızla geliştiği halde Docker imajları daha yavaş yayınlandığı için lütfen şu adımları izleyin: -#### docker-compose.yaml yapılandırması +- En güncel kullanılabilir imaj etiketlerini görmek için [Docker Hub](https://hub.docker.com/r/xxxxrt666/gpt-sovits) adresini kontrol edin +- Ortamınıza uygun bir imaj etiketi seçin +- `Lite`, Docker imajında ASR modelleri ve UVR5 modellerinin bulunmadığı anlamına gelir. UVR5 modellerini manuel olarak indirebilirsiniz; ASR modelleri ise gerektiğinde program tarafından otomatik olarak indirilir +- Docker Compose sırasında, uygun mimariye (amd64 veya arm64) ait imaj otomatik olarak indirilir +- Opsiyonel: En güncel değişiklikleri almak için, sağlanan Dockerfile ile yerel olarak imajı kendiniz oluşturabilirsiniz -0. Görüntü etiketleri hakkında: Kod tabanındaki hızlı güncellemeler ve görüntüleri paketleme ve test etme işleminin yavaş olması nedeniyle, lütfen şu anda paketlenmiş en son görüntüleri kontrol etmek için [Docker Hub](https://hub.docker.com/r/breakstring/gpt-sovits)(eski sürüm) adresini kontrol edin ve durumunuza göre seçim yapın veya alternatif olarak, kendi ihtiyaçlarınıza göre bir Dockerfile kullanarak yerel olarak oluşturun. -1. Ortam Değişkenleri: - - is_half: Yarım hassasiyet/çift hassasiyeti kontrol eder. Bu genellikle "SSL çıkarma" adımı sırasında 4-cnhubert/5-wav32k dizinleri altındaki içeriğin doğru şekilde oluşturulmamasının nedenidir. Gerçek durumunuza göre True veya False olarak ayarlayın. -2. Birim Yapılandırması, Kapsayıcı içindeki uygulamanın kök dizini /workspace olarak ayarlanmıştır. Varsayılan docker-compose.yaml, içerik yükleme/indirme için bazı pratik örnekler listeler. -3. shm_size: Windows üzerinde Docker Desktop için varsayılan kullanılabilir bellek çok küçüktür, bu da anormal işlemlere neden olabilir. Kendi durumunuza göre ayarlayın. -4. 
Dağıtım bölümü altında, GPU ile ilgili ayarlar sisteminize ve gerçek koşullara göre dikkatlice ayarlanmalıdır. +#### Ortam Değişkenleri -#### docker compose ile çalıştırma +- `is_half`: Yarı hassasiyet (fp16) kullanımını kontrol eder. GPU’nuz destekliyorsa, belleği azaltmak için `true` olarak ayarlayın. -``` -docker compose -f "docker-compose.yaml" up -d +#### Paylaşılan Bellek Yapılandırması + +Windows (Docker Desktop) ortamında, varsayılan paylaşılan bellek boyutu düşüktür ve bu beklenmedik hatalara neden olabilir. Sistem belleğinize göre Docker Compose dosyasındaki `shm_size` değerini (örneğin `16g`) artırmanız önerilir. + +#### Servis Seçimi + +`docker-compose.yaml` dosyasında iki tür servis tanımlanmıştır: + +- `GPT-SoVITS-CU126` ve `GPT-SoVITS-CU128`: Tüm özellikleri içeren tam sürüm. +- `GPT-SoVITS-CU126-Lite` ve `GPT-SoVITS-CU128-Lite`: Daha az bağımlılığa ve sınırlı işlevselliğe sahip hafif sürüm. + +Belirli bir servisi Docker Compose ile çalıştırmak için şu komutu kullanın: + +```bash +docker compose run --service-ports ``` -#### docker komutu ile çalıştırma +#### Docker İmajını Yerel Olarak Oluşturma -Yukarıdaki gibi, ilgili parametreleri gerçek durumunuza göre değiştirin, ardından aşağıdaki komutu çalıştırın: +Docker imajını kendiniz oluşturmak isterseniz şu komutu kullanın: +```bash +bash docker_build.sh --cuda <12.6|12.8> [--lite] ``` -docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-DockerTest\output:/workspace/output --volume=G:\GPT-SoVITS-DockerTest\logs:/workspace/logs --volume=G:\GPT-SoVITS-DockerTest\SoVITS_weights:/workspace/SoVITS_weights --workdir=/workspace -p 9880:9880 -p 9871:9871 -p 9872:9872 -p 9873:9873 -p 9874:9874 --shm-size="16G" -d breakstring/gpt-sovits:xxxxx + +#### Çalışan Konteynere Erişim (Bash Shell) + +Konteyner arka planda çalışırken, aşağıdaki komutla içine girebilirsiniz: + +```bash +docker exec -it bash ``` ## Önceden Eğitilmiş Modeller @@ -203,12 +232,12 @@ veya WebUI'de manuel olarak sürüm değiştirin. #### Yol Otomatik Doldurma artık destekleniyor - 1. Ses yolunu doldurun - 2. Sesi küçük parçalara ayırın - 3. Gürültü azaltma (isteğe bağlı) - 4. ASR - 5. ASR transkripsiyonlarını düzeltin - 6. Bir sonraki sekmeye geçin ve modeli ince ayar yapın +1. Ses yolunu doldurun +2. Sesi küçük parçalara ayırın +3. Gürültü azaltma (isteğe bağlı) +4. ASR +5. ASR transkripsiyonlarını düzeltin +6. Bir sonraki sekmeye geçin ve modeli ince ayar yapın ### Çıkarım WebUI'sini Açın @@ -250,7 +279,7 @@ V1 ortamından V2'yi kullanmak için: 2. github'dan en son kodları klonlayın. -3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained` dizinine yerleştirin. +3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained` dizinine yerleştirin. Ek olarak Çince V2: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.) @@ -270,7 +299,7 @@ V1 ortamından V2'yi kullanmak için: 2. GitHub'dan en son kodları klonlayın. -3. 
[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden v3 önceden eğitilmiş modellerini (s1v3.ckpt, s2Gv3.pth ve models--nvidia--bigvgan_v2_24khz_100band_256x klasörünü) indirin ve `GPT_SoVITS\pretrained_models` dizinine yerleştirin. +3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden v3 önceden eğitilmiş modellerini (s1v3.ckpt, s2Gv3.pth ve models--nvidia--bigvgan_v2_24khz_100band_256x klasörünü) indirin ve `GPT_SoVITS/pretrained_models` dizinine yerleştirin. ek: Ses Süper Çözünürlük modeli için [nasıl indirileceği](../../tools/AP_BWE_main/24kto48k/readme.txt) hakkında bilgi alabilirsiniz. @@ -298,7 +327,7 @@ V1 ortamından V2'yi kullanmak için: UVR5 için Web Arayüzünü açmak için komut satırını kullanın -``` +```bash python tools/uvr5/webui.py "" ``` @@ -309,7 +338,7 @@ python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level -- Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır -``` +```bash python audio_slicer.py \ --input_path "" \ --output_root "" \ @@ -321,7 +350,7 @@ python audio_slicer.py \ Veri seti ASR işleme komut satırı kullanılarak bu şekilde yapılır (Yalnızca Çince) -``` +```bash python tools/asr/funasr_asr.py -i -o <çıktı> ``` @@ -329,7 +358,7 @@ ASR işleme Faster_Whisper aracılığıyla gerçekleştirilir (Çince dışınd (İlerleme çubukları yok, GPU performansı zaman gecikmelerine neden olabilir) -``` +```bash python ./tools/asr/fasterwhisper_asr.py -i -o <çıktı> -l ``` diff --git a/go-webui.bat b/go-webui.bat index a2dfff6..c1c8108 100644 --- a/go-webui.bat +++ b/go-webui.bat @@ -1,2 +1,6 @@ +set "SCRIPT_DIR=%~dp0" +set "SCRIPT_DIR=%SCRIPT_DIR:~0,-1%" +cd /d "%SCRIPT_DIR%" +set "PATH=%SCRIPT_DIR%\runtime;%PATH%" runtime\python.exe -I webui.py zh_CN pause diff --git a/go-webui.ps1 b/go-webui.ps1 index f942726..0910342 100644 --- a/go-webui.ps1 +++ b/go-webui.ps1 @@ -1,4 +1,7 @@ $ErrorActionPreference = "SilentlyContinue" chcp 65001 -& "$PSScriptRoot\runtime\python.exe" -I "$PSScriptRoot\webui.py" zh_CN +Set-Location $PSScriptRoot +$runtimePath = Join-Path $PSScriptRoot "runtime" +$env:PATH = "$runtimePath;$env:PATH" +& "$runtimePath\python.exe" -I "$PSScriptRoot\webui.py" zh_CN pause diff --git a/install.sh b/install.sh index e45b91b..eba1868 100644 --- a/install.sh +++ b/install.sh @@ -14,18 +14,24 @@ fi trap 'echo "Error Occured at \"$BASH_COMMAND\" with exit code $?"; exit 1' ERR -is_HF=false -is_HF_MIRROR=false -is_MODELSCOPE=false +USE_CUDA=false +USE_ROCM=false +USE_CPU=false +WORKFLOW=${WORKFLOW:-"false"} + +USE_HF=false +USE_HF_MIRROR=false +USE_MODELSCOPE=false DOWNLOAD_UVR5=false print_help() { echo "Usage: bash install.sh [OPTIONS]" echo "" echo "Options:" - echo " --source HF|HF-Mirror|ModelScope Specify the model source (REQUIRED)" - echo " --download-uvr5 Enable downloading the UVR5 model" - echo " -h, --help Show this help message and exit" + echo " --device CU126|CU128|ROCM|MPS|CPU Specify the Device (REQUIRED)" + echo " --source HF|HF-Mirror|ModelScope Specify the model source (REQUIRED)" + echo " --download-uvr5 Enable downloading the UVR5 model" + echo " -h, --help Show this help message and exit" echo "" echo "Examples:" echo " bash install.sh --source HF --download-uvr5" @@ -41,132 +47,192 @@ fi # Parse arguments while [[ $# -gt 0 ]]; do case "$1" in - --source) - case "$2" in - HF) - is_HF=true - ;; - HF-Mirror) - is_HF_MIRROR=true - ;; - ModelScope) - is_MODELSCOPE=true - ;; - *) - echo "Error: Invalid Download Source: $2" - echo "Choose From: [HF, HF-Mirror, 
ModelScope]" - exit 1 - ;; - esac - shift 2 + --source) + case "$2" in + HF) + USE_HF=true ;; - --download-uvr5) - DOWNLOAD_UVR5=true - shift + HF-Mirror) + USE_HF_MIRROR=true ;; - -h|--help) - print_help - exit 0 + ModelScope) + USE_MODELSCOPE=true ;; *) - echo "Unknown Argument: $1" - echo "Use -h or --help to see available options." + echo "Error: Invalid Download Source: $2" + echo "Choose From: [HF, HF-Mirror, ModelScope]" exit 1 ;; + esac + shift 2 + ;; + --device) + case "$2" in + CU126) + CUDA=126 + USE_CUDA=true + ;; + CU128) + CUDA=128 + USE_CUDA=true + ;; + ROCM) + USE_ROCM=true + ;; + MPS) + USE_CPU=true + ;; + CPU) + USE_CPU=true + ;; + *) + echo "Error: Invalid Device: $2" + echo "Choose From: [CU126, CU128, ROCM, MPS, CPU]" + exit 1 + ;; + esac + shift 2 + ;; + --download-uvr5) + DOWNLOAD_UVR5=true + shift + ;; + -h | --help) + print_help + exit 0 + ;; + *) + echo "Unknown Argument: $1" + echo "Use -h or --help to see available options." + exit 1 + ;; esac done -if ! $is_HF && ! $is_HF_MIRROR && ! $is_MODELSCOPE; then +if ! $USE_CUDA && ! $USE_ROCM && ! $USE_CPU; then + echo "Error: Device is REQUIRED" + echo "" + print_help + exit 1 +fi + +if ! $USE_HF && ! $USE_HF_MIRROR && ! $USE_MODELSCOPE; then echo "Error: Download Source is REQUIRED" echo "" print_help exit 1 fi -if [ "$is_HF" = "true" ]; then +# 安装构建工具 +# Install build tools +if [ "$(uname)" != "Darwin" ]; then + gcc_major_version=$(command -v gcc >/dev/null 2>&1 && gcc -dumpversion | cut -d. -f1 || echo 0) + if [ "$gcc_major_version" -lt 11 ]; then + echo "Installing GCC & G++..." + conda install -c conda-forge gcc=11 gxx=11 -q -y + else + echo "GCC >=11" + fi +else + if ! xcode-select -p &>/dev/null; then + echo "Installing Xcode Command Line Tools..." + xcode-select --install + fi + echo "Waiting For Xcode Command Line Tools Installation Complete..." + while true; do + sleep 20 + + if xcode-select -p &>/dev/null; then + echo "Xcode Command Line Tools Installed" + break + else + echo "Installing,Please Wait..." + fi + done + conda install -c conda-forge -q -y +fi + +echo "Installing ffmpeg and cmake..." +conda install ffmpeg cmake make -q -y + +echo "Installing unzip..." 
+conda install unzip -y --quiet + +if [ "$USE_HF" = "true" ]; then echo "Download Model From HuggingFace" PRETRINED_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip" G2PW_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip" UVR5_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip" -elif [ "$is_HF_MIRROR" = "true" ]; then + NLTK_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/nltk_data.zip" + PYOPENJTALK_URL="https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/open_jtalk_dic_utf_8-1.11.tar.gz" +elif [ "$USE_HF_MIRROR" = "true" ]; then echo "Download Model From HuggingFace-Mirror" PRETRINED_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/pretrained_models.zip" G2PW_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip" UVR5_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/uvr5_weights.zip" -elif [ "$is_MODELSCOPE" = "true" ]; then + NLTK_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/nltk_data.zip" + PYOPENJTALK_URL="https://hf-mirror.com/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/open_jtalk_dic_utf_8-1.11.tar.gz" +elif [ "$USE_MODELSCOPE" = "true" ]; then echo "Download Model From ModelScope" PRETRINED_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/pretrained_models.zip" G2PW_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip" UVR5_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/uvr5_weights.zip" + NLTK_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/nltk_data.zip" + PYOPENJTALK_URL="https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/open_jtalk_dic_utf_8-1.11.tar.gz" +fi + +if [ "$WORKFLOW" = "true" ]; then + WGET_CMD=(wget -nv --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404) +else + WGET_CMD=(wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404) fi -if find "GPT_SoVITS/pretrained_models" -mindepth 1 ! -name '.gitignore' | grep -q .; then +if find -L "GPT_SoVITS/pretrained_models" -mindepth 1 ! -name '.gitignore' | grep -q .; then echo "Pretrained Model Exists" else echo "Download Pretrained Models" - wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404 "$PRETRINED_URL" + "${WGET_CMD[@]}" "$PRETRINED_URL" - unzip pretrained_models.zip + unzip -q -o pretrained_models.zip -d GPT_SoVITS rm -rf pretrained_models.zip - mv pretrained_models/* GPT_SoVITS/pretrained_models - rm -rf pretrained_models fi if [ ! -d "GPT_SoVITS/text/G2PWModel" ]; then echo "Download G2PWModel" - wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404 "$G2PW_URL" + "${WGET_CMD[@]}" "$G2PW_URL" - unzip G2PWModel.zip + unzip -q -o G2PWModel.zip -d GPT_SoVITS/text rm -rf G2PWModel.zip - mv G2PWModel GPT_SoVITS/text/G2PWModel else echo "G2PWModel Exists" fi -if [ "$DOWNLOAD_UVR5" = "true" ];then - if find "tools/uvr5/uvr5_weights" -mindepth 1 ! -name '.gitignore' | grep -q .; then +if [ "$DOWNLOAD_UVR5" = "true" ]; then + if find -L "tools/uvr5/uvr5_weights" -mindepth 1 ! 
-name '.gitignore' | grep -q .; then echo "UVR5 Model Exists" else echo "Download UVR5 Model" - wget --tries=25 --wait=5 --read-timeout=40 --retry-on-http-error=404 "$UVR5_URL" + "${WGET_CMD[@]}" "$UVR5_URL" - unzip uvr5_weights.zip + unzip -q -o uvr5_weights.zip -d tools/uvr5 rm -rf uvr5_weights.zip - mv uvr5_weights/* tools/uvr5/uvr5_weights - rm -rf uvr5_weights fi fi -# 安装构建工具 -# Install build tools -echo "Installing GCC..." -conda install -c conda-forge gcc=14 -y - -echo "Installing G++..." -conda install -c conda-forge gxx -y - -echo "Installing ffmpeg and cmake..." -conda install ffmpeg cmake -y - -echo "Installing git-lfs and zip..." -conda install git-lfs -y -conda install zip -y - -git-lfs install - -echo "Checking for CUDA installation..." -if command -v nvidia-smi &>/dev/null; then - USE_CUDA=true - echo "CUDA found." -else - echo "CUDA not found." - USE_CUDA=false +if [ "$USE_CUDA" = true ] && [ "$WORKFLOW" = false ]; then + echo "Checking for CUDA installation..." + if command -v nvidia-smi &>/dev/null; then + echo "CUDA found." + else + USE_CUDA=false + USE_CPU=true + echo "CUDA not found." + fi fi -if [ "$USE_CUDA" = false ]; then +if [ "$USE_ROCM" = true ] && [ "$WORKFLOW" = false ]; then echo "Checking for ROCm installation..." if [ -d "/opt/rocm" ]; then - USE_ROCM=true echo "ROCm found." if grep -qi "microsoft" /proc/version; then echo "You are running WSL." @@ -176,20 +242,28 @@ if [ "$USE_CUDA" = false ]; then IS_WSL=false fi else - echo "ROCm not found." USE_ROCM=false + USE_CPU=true + echo "ROCm not found." fi fi -if [ "$USE_CUDA" = true ]; then +if [ "$USE_CUDA" = true ] && [ "$WORKFLOW" = false ]; then echo "Installing PyTorch with CUDA support..." - pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124 -elif [ "$USE_ROCM" = true ]; then + if [ "$CUDA" = 128 ]; then + pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu128 + elif [ "$CUDA" = 126 ]; then + pip install torch==2.6 torchaudio --index-url https://download.pytorch.org/whl/cu126 + fi +elif [ "$USE_ROCM" = true ] && [ "$WORKFLOW" = false ]; then echo "Installing PyTorch with ROCm support..." - pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.2 -else + pip install torch==2.6 torchaudio --index-url https://download.pytorch.org/whl/rocm6.2 +elif [ "$USE_CPU" = true ] && [ "$WORKFLOW" = false ]; then echo "Installing PyTorch for CPU..." - pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cpu + pip install torch==2.6 torchaudio --index-url https://download.pytorch.org/whl/cpu +elif [ "$WORKFLOW" = false ]; then + echo "Unknown Err" + exit 1 fi echo "Installing Python dependencies from requirements.txt..." @@ -198,11 +272,20 @@ echo "Installing Python dependencies from requirements.txt..." 
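For quick reference, here is a hedged sketch of how the reworked installer interface introduced above might be invoked; the flag names and values (`--device`, `--source`, `--download-uvr5`, `WORKFLOW`) come from the script itself, while the specific combinations shown are illustrative only and assume the repository root as the working directory:

```bash
# Illustrative invocations of the new install.sh interface (not exhaustive).
# --device and --source are both required; --download-uvr5 is optional.
bash install.sh --device CU128 --source HF --download-uvr5   # CUDA 12.8 wheels + UVR5 weights
bash install.sh --device MPS   --source HF-Mirror            # Apple silicon; PyTorch is installed via the CPU path
bash install.sh --device CPU   --source ModelScope           # CPU-only, models fetched from ModelScope

# In CI, setting WORKFLOW=true switches wget to non-verbose output, per the WGET_CMD array above.
WORKFLOW=true bash install.sh --device CU126 --source HF
```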
# Refresh environment hash -r -pip install -r extra-req.txt --no-deps +pip install -r extra-req.txt --no-deps --quiet + +pip install -r requirements.txt --quiet + +PY_PREFIX=$(python -c "import sys; print(sys.prefix)") +PYOPENJTALK_PREFIX=$(python -c "import os, pyopenjtalk; print(os.path.dirname(pyopenjtalk.__file__))") -pip install -r requirements.txt +"${WGET_CMD[@]}" "$NLTK_URL" -O nltk_data.zip +unzip -q -o nltk_data -d "$PY_PREFIX" +rm -rf nltk_data.zip -python -c "import nltk; nltk.download(['averaged_perceptron_tagger','averaged_perceptron_tagger_eng','cmudict'])" +"${WGET_CMD[@]}" "$PYOPENJTALK_URL" -O open_jtalk_dic_utf_8-1.11.tar.gz +tar -xvzf open_jtalk_dic_utf_8-1.11.tar.gz -C "$PYOPENJTALK_PREFIX" +rm -rf open_jtalk_dic_utf_8-1.11.tar.gz if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ]; then echo "Update to WSL compatible runtime lib..." diff --git a/requirements.txt b/requirements.txt index 9703b25..07431a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +--no-binary=opencc numpy<2.0 scipy tensorboard @@ -6,8 +7,8 @@ numba pytorch-lightning>=2.4 gradio<5 ffmpeg-python -onnxruntime; sys_platform == 'darwin' -onnxruntime-gpu; sys_platform != 'darwin' +onnxruntime; platform_machine == "aarch64" or platform_machine == "arm64" +onnxruntime-gpu; platform_machine == "x86_64" or platform_machine == "AMD64" tqdm funasr==1.0.27 cn2an @@ -31,8 +32,7 @@ rotary_embedding_torch ToJyutping g2pk2 ko_pron -opencc; sys_platform != 'linux' -opencc==1.1.1; sys_platform == 'linux' +opencc python_mecab_ko; sys_platform != 'win32' fastapi[standard]>=0.115.2 x_transformers diff --git a/tools/asr/fasterwhisper_asr.py b/tools/asr/fasterwhisper_asr.py index e570f17..27cabbc 100644 --- a/tools/asr/fasterwhisper_asr.py +++ b/tools/asr/fasterwhisper_asr.py @@ -10,6 +10,7 @@ from faster_whisper import WhisperModel from tqdm import tqdm from tools.asr.config import check_fw_local_models +from tools.my_utils import load_cudnn # fmt: off language_code_list = [ @@ -93,6 +94,8 @@ def execute_asr(input_folder, output_folder, model_size, language, precision): return output_file_path +load_cudnn() + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( diff --git a/tools/my_utils.py b/tools/my_utils.py index 44d326e..59a7cd3 100644 --- a/tools/my_utils.py +++ b/tools/my_utils.py @@ -1,11 +1,15 @@ +import ctypes import os -import traceback +import sys +from pathlib import Path + import ffmpeg -import numpy as np import gradio as gr -from tools.i18n.i18n import I18nAuto +import numpy as np import pandas as pd +from tools.i18n.i18n import I18nAuto + i18n = I18nAuto(language=os.environ.get("language", "Auto")) @@ -15,7 +19,7 @@ def load_audio(file, sr): # This launches a subprocess to decode audio while down-mixing and resampling as necessary. # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
file = clean_path(file) # 防止小白拷路径头尾带了空格和"和回车 - if os.path.exists(file) == False: + if os.path.exists(file) is False: raise RuntimeError("You input a wrong audio path that does not exists, please fix it!") out, _ = ( ffmpeg.input(file, threads=0) @@ -23,7 +27,11 @@ def load_audio(file, sr): .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) ) except Exception: - traceback.print_exc() + out, _ = ( + ffmpeg.input(file, threads=0) + .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) + .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True) + ) # Expose the Error raise RuntimeError(i18n("音频加载失败")) return np.frombuffer(out, np.float32).flatten() @@ -127,3 +135,97 @@ def check_details(path_list=None, is_train=False, is_dataset_processing=False): ... else: gr.Warning(i18n("缺少语义数据集")) + + +def load_cudnn(): + import torch + + if not torch.cuda.is_available(): + print("[INFO] CUDA is not available, skipping cuDNN setup.") + return + + if sys.platform == "win32": + torch_lib_dir = Path(torch.__file__).parent / "lib" + if torch_lib_dir.exists(): + os.add_dll_directory(str(torch_lib_dir)) + print(f"[INFO] Added DLL directory: {torch_lib_dir}") + matching_files = sorted(torch_lib_dir.glob("cudnn_cnn*.dll")) + if not matching_files: + print(f"[ERROR] No cudnn_cnn*.dll found in {torch_lib_dir}") + return + for dll_path in matching_files: + dll_name = os.path.basename(dll_path) + try: + ctypes.CDLL(dll_name) + print(f"[INFO] Loaded: {dll_name}") + except OSError as e: + print(f"[WARNING] Failed to load {dll_name}: {e}") + else: + print(f"[WARNING] Torch lib directory not found: {torch_lib_dir}") + + elif sys.platform == "linux": + site_packages = Path(torch.__file__).resolve().parents[1] + cudnn_dir = site_packages / "nvidia" / "cudnn" / "lib" + + if not cudnn_dir.exists(): + print(f"[ERROR] cudnn dir not found: {cudnn_dir}") + return + + matching_files = sorted(cudnn_dir.glob("libcudnn_cnn*.so*")) + if not matching_files: + print(f"[ERROR] No libcudnn_cnn*.so* found in {cudnn_dir}") + return + + for so_path in matching_files: + try: + ctypes.CDLL(so_path, mode=ctypes.RTLD_GLOBAL) # type: ignore + print(f"[INFO] Loaded: {so_path}") + except OSError as e: + print(f"[WARNING] Failed to load {so_path}: {e}") + + +def load_nvrtc(): + import torch + + if not torch.cuda.is_available(): + print("[INFO] CUDA is not available, skipping nvrtc setup.") + return + + if sys.platform == "win32": + torch_lib_dir = Path(torch.__file__).parent / "lib" + if torch_lib_dir.exists(): + os.add_dll_directory(str(torch_lib_dir)) + print(f"[INFO] Added DLL directory: {torch_lib_dir}") + matching_files = sorted(torch_lib_dir.glob("nvrtc*.dll")) + if not matching_files: + print(f"[ERROR] No nvrtc*.dll found in {torch_lib_dir}") + return + for dll_path in matching_files: + dll_name = os.path.basename(dll_path) + try: + ctypes.CDLL(dll_name) + print(f"[INFO] Loaded: {dll_name}") + except OSError as e: + print(f"[WARNING] Failed to load {dll_name}: {e}") + else: + print(f"[WARNING] Torch lib directory not found: {torch_lib_dir}") + + elif sys.platform == "linux": + site_packages = Path(torch.__file__).resolve().parents[1] + nvrtc_dir = site_packages / "nvidia" / "cuda_nvrtc" / "lib" + + if not nvrtc_dir.exists(): + print(f"[ERROR] nvrtc dir not found: {nvrtc_dir}") + return + + matching_files = sorted(nvrtc_dir.glob("libnvrtc*.so*")) + if not matching_files: + print(f"[ERROR] No libnvrtc*.so* found in {nvrtc_dir}") + return + + for so_path in matching_files: + try: + ctypes.CDLL(so_path, 
mode=ctypes.RTLD_GLOBAL) # type: ignore + print(f"[INFO] Loaded: {so_path}") + except OSError as e: + print(f"[WARNING] Failed to load {so_path}: {e}") diff --git a/tools/subfix_webui.py b/tools/subfix_webui.py index eae1c98..4244449 100644 --- a/tools/subfix_webui.py +++ b/tools/subfix_webui.py @@ -1,7 +1,7 @@ import argparse -import os import copy import json +import os import uuid try: @@ -11,8 +11,8 @@ try: except: ... -import librosa import gradio as gr +import librosa import numpy as np import soundfile @@ -303,7 +303,7 @@ if __name__ == "__main__": set_global(args.load_json, args.load_list, args.json_key_text, args.json_key_path, args.g_batch) - with gr.Blocks() as demo: + with gr.Blocks(analytics_enabled=False) as demo: with gr.Row(): btn_change_index = gr.Button("Change Index") btn_submit_change = gr.Button("Submit Text") diff --git a/tools/uvr5/lib/lib_v5/dataset.py b/tools/uvr5/lib/lib_v5/dataset.py index cfd01a1..1a30eec 100644 --- a/tools/uvr5/lib/lib_v5/dataset.py +++ b/tools/uvr5/lib/lib_v5/dataset.py @@ -32,18 +32,10 @@ def make_pair(mix_dir, inst_dir): input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"] X_list = sorted( - [ - os.path.join(mix_dir, fname) - for fname in os.listdir(mix_dir) - if os.path.splitext(fname)[1] in input_exts - ] + [os.path.join(mix_dir, fname) for fname in os.listdir(mix_dir) if os.path.splitext(fname)[1] in input_exts] ) y_list = sorted( - [ - os.path.join(inst_dir, fname) - for fname in os.listdir(inst_dir) - if os.path.splitext(fname)[1] in input_exts - ] + [os.path.join(inst_dir, fname) for fname in os.listdir(inst_dir) if os.path.splitext(fname)[1] in input_exts] ) filelist = list(zip(X_list, y_list)) @@ -65,14 +57,10 @@ def train_val_split(dataset_dir, split_mode, val_rate, val_filelist): train_filelist = filelist[:-val_size] val_filelist = filelist[-val_size:] else: - train_filelist = [ - pair for pair in filelist if list(pair) not in val_filelist - ] + train_filelist = [pair for pair in filelist if list(pair) not in val_filelist] elif split_mode == "subdirs": if len(val_filelist) != 0: - raise ValueError( - "The `val_filelist` option is not available in `subdirs` mode" - ) + raise ValueError("The `val_filelist` option is not available in `subdirs` mode") train_filelist = make_pair( os.path.join(dataset_dir, "training/mixtures"), @@ -91,9 +79,7 @@ def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha): perm = np.random.permutation(len(X)) for i, idx in enumerate(tqdm(perm)): if np.random.uniform() < reduction_rate: - y[idx] = spec_utils.reduce_vocal_aggressively( - X[idx], y[idx], reduction_mask - ) + y[idx] = spec_utils.reduce_vocal_aggressively(X[idx], y[idx], reduction_mask) if np.random.uniform() < 0.5: # swap channel @@ -152,9 +138,7 @@ def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset): patch_list = [] - patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format( - cropsize, sr, hop_length, n_fft, offset - ) + patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(cropsize, sr, hop_length, n_fft, offset) os.makedirs(patch_dir, exist_ok=True) for i, (X_path, y_path) in enumerate(tqdm(filelist)): diff --git a/tools/uvr5/lib/lib_v5/layers.py b/tools/uvr5/lib/lib_v5/layers.py index 4fc1b5c..2b9101e 100644 --- a/tools/uvr5/lib/lib_v5/layers.py +++ b/tools/uvr5/lib/lib_v5/layers.py @@ -63,9 +63,7 @@ class Encoder(nn.Module): class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, 
dropout=False - ): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): super(Decoder, self).__init__() self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None @@ -91,24 +89,14 @@ class ASPPModule(nn.Module): Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) + self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) + self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) + self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) def forward(self, x): _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) + feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) feat2 = self.conv2(x) feat3 = self.conv3(x) feat4 = self.conv4(x) diff --git a/tools/uvr5/lib/lib_v5/layers_123812KB.py b/tools/uvr5/lib/lib_v5/layers_123812KB.py index 4fc1b5c..2b9101e 100644 --- a/tools/uvr5/lib/lib_v5/layers_123812KB.py +++ b/tools/uvr5/lib/lib_v5/layers_123812KB.py @@ -63,9 +63,7 @@ class Encoder(nn.Module): class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): super(Decoder, self).__init__() self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None @@ -91,24 +89,14 @@ class ASPPModule(nn.Module): Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) + self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) + self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) + self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) def forward(self, x): _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) + feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) feat2 = self.conv2(x) feat3 = self.conv3(x) feat4 = self.conv4(x) diff --git a/tools/uvr5/lib/lib_v5/layers_123821KB.py b/tools/uvr5/lib/lib_v5/layers_123821KB.py index 4fc1b5c..2b9101e 100644 --- 
a/tools/uvr5/lib/lib_v5/layers_123821KB.py +++ b/tools/uvr5/lib/lib_v5/layers_123821KB.py @@ -63,9 +63,7 @@ class Encoder(nn.Module): class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): super(Decoder, self).__init__() self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None @@ -91,24 +89,14 @@ class ASPPModule(nn.Module): Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) + self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) + self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) + self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) def forward(self, x): _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) + feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) feat2 = self.conv2(x) feat3 = self.conv3(x) feat4 = self.conv4(x) diff --git a/tools/uvr5/lib/lib_v5/layers_33966KB.py b/tools/uvr5/lib/lib_v5/layers_33966KB.py index 9b127bc..4397777 100644 --- a/tools/uvr5/lib/lib_v5/layers_33966KB.py +++ b/tools/uvr5/lib/lib_v5/layers_33966KB.py @@ -63,9 +63,7 @@ class Encoder(nn.Module): class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): super(Decoder, self).__init__() self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None @@ -91,30 +89,16 @@ class ASPPModule(nn.Module): Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv6 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv7 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) + self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) + self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) + self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.conv6 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.conv7 = 
SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) def forward(self, x): _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) + feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) feat2 = self.conv2(x) feat3 = self.conv3(x) feat4 = self.conv4(x) diff --git a/tools/uvr5/lib/lib_v5/layers_537227KB.py b/tools/uvr5/lib/lib_v5/layers_537227KB.py index 9b127bc..4397777 100644 --- a/tools/uvr5/lib/lib_v5/layers_537227KB.py +++ b/tools/uvr5/lib/lib_v5/layers_537227KB.py @@ -63,9 +63,7 @@ class Encoder(nn.Module): class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): super(Decoder, self).__init__() self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None @@ -91,30 +89,16 @@ class ASPPModule(nn.Module): Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv6 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv7 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) + self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) + self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) + self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.conv6 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.conv7 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) def forward(self, x): _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) + feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) feat2 = self.conv2(x) feat3 = self.conv3(x) feat4 = self.conv4(x) diff --git a/tools/uvr5/lib/lib_v5/layers_537238KB.py b/tools/uvr5/lib/lib_v5/layers_537238KB.py index 9b127bc..4397777 100644 --- a/tools/uvr5/lib/lib_v5/layers_537238KB.py +++ b/tools/uvr5/lib/lib_v5/layers_537238KB.py @@ -63,9 +63,7 @@ class Encoder(nn.Module): class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): super(Decoder, self).__init__() self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None @@ -91,30 +89,16 @@ class ASPPModule(nn.Module): Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), ) self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = 
SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv6 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv7 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) + self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ) + self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ) + self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.conv6 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.conv7 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ) + self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)) def forward(self, x): _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) + feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) feat2 = self.conv2(x) feat3 = self.conv3(x) feat4 = self.conv4(x) diff --git a/tools/uvr5/lib/lib_v5/layers_new.py b/tools/uvr5/lib/lib_v5/layers_new.py index 44153b6..7d7005c 100644 --- a/tools/uvr5/lib/lib_v5/layers_new.py +++ b/tools/uvr5/lib/lib_v5/layers_new.py @@ -40,9 +40,7 @@ class Encoder(nn.Module): class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False): super(Decoder, self).__init__() self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) @@ -72,23 +70,15 @@ class ASPPModule(nn.Module): Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ), ) self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) - self.conv3 = Conv2DBNActiv( - nin, nout, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = Conv2DBNActiv( - nin, nout, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = Conv2DBNActiv( - nin, nout, 3, 1, dilations[2], dilations[2], activ=activ - ) + self.conv3 = Conv2DBNActiv(nin, nout, 3, 1, dilations[0], dilations[0], activ=activ) + self.conv4 = Conv2DBNActiv(nin, nout, 3, 1, dilations[1], dilations[1], activ=activ) + self.conv5 = Conv2DBNActiv(nin, nout, 3, 1, dilations[2], dilations[2], activ=activ) self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) self.dropout = nn.Dropout2d(0.1) if dropout else None def forward(self, x): _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) + feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True) feat2 = self.conv2(x) feat3 = self.conv3(x) feat4 = self.conv4(x) @@ -106,12 +96,8 @@ class LSTMModule(nn.Module): def __init__(self, nin_conv, nin_lstm, nout_lstm): super(LSTMModule, self).__init__() self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) - self.lstm = nn.LSTM( - input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True - ) - self.dense = nn.Sequential( - nn.Linear(nout_lstm, 
nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU() - ) + self.lstm = nn.LSTM(input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True) + self.dense = nn.Sequential(nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU()) def forward(self, x): N, _, nbins, nframes = x.size() diff --git a/tools/uvr5/lib/lib_v5/model_param_init.py b/tools/uvr5/lib/lib_v5/model_param_init.py index b995c0b..79b94d3 100644 --- a/tools/uvr5/lib/lib_v5/model_param_init.py +++ b/tools/uvr5/lib/lib_v5/model_param_init.py @@ -1,5 +1,4 @@ import json -import os import pathlib default_param = {} @@ -48,9 +47,7 @@ class ModelParameters(object): import zipfile with zipfile.ZipFile(config_path, "r") as zip: - self.param = json.loads( - zip.read("param.json"), object_pairs_hook=int_keys - ) + self.param = json.loads(zip.read("param.json"), object_pairs_hook=int_keys) elif ".json" == pathlib.Path(config_path).suffix: with open(config_path, "r") as f: self.param = json.loads(f.read(), object_pairs_hook=int_keys) @@ -65,5 +62,5 @@ class ModelParameters(object): "stereo_n", "reverse", ]: - if not k in self.param: + if k not in self.param: self.param[k] = False diff --git a/tools/uvr5/lib/lib_v5/nets.py b/tools/uvr5/lib/lib_v5/nets.py index 5da3948..42d7807 100644 --- a/tools/uvr5/lib/lib_v5/nets.py +++ b/tools/uvr5/lib/lib_v5/nets.py @@ -3,8 +3,6 @@ import torch import torch.nn.functional as F from torch import nn -from . import spec_utils - class BaseASPPNet(nn.Module): def __init__(self, nin, ch, dilations=(4, 8, 16)): diff --git a/tools/uvr5/lib/lib_v5/nets_537227KB.py b/tools/uvr5/lib/lib_v5/nets_537227KB.py index 823b44f..9bb1df1 100644 --- a/tools/uvr5/lib/lib_v5/nets_537227KB.py +++ b/tools/uvr5/lib/lib_v5/nets_537227KB.py @@ -1,4 +1,3 @@ -import numpy as np import torch import torch.nn.functional as F from torch import nn diff --git a/tools/uvr5/lib/lib_v5/nets_537238KB.py b/tools/uvr5/lib/lib_v5/nets_537238KB.py index 823b44f..9bb1df1 100644 --- a/tools/uvr5/lib/lib_v5/nets_537238KB.py +++ b/tools/uvr5/lib/lib_v5/nets_537238KB.py @@ -1,4 +1,3 @@ -import numpy as np import torch import torch.nn.functional as F from torch import nn diff --git a/tools/uvr5/lib/lib_v5/nets_new.py b/tools/uvr5/lib/lib_v5/nets_new.py index 1c0f4fa..ba1a559 100644 --- a/tools/uvr5/lib/lib_v5/nets_new.py +++ b/tools/uvr5/lib/lib_v5/nets_new.py @@ -6,9 +6,7 @@ from . 
import layers_new class BaseNet(nn.Module): - def __init__( - self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6)) - ): + def __init__(self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))): super(BaseNet, self).__init__() self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1) self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1) @@ -56,21 +54,15 @@ class CascadedNet(nn.Module): layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0), ) - self.stg1_high_band_net = BaseNet( - 2, nout // 4, self.nin_lstm // 2, nout_lstm // 2 - ) + self.stg1_high_band_net = BaseNet(2, nout // 4, self.nin_lstm // 2, nout_lstm // 2) self.stg2_low_band_net = nn.Sequential( BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm), layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0), ) - self.stg2_high_band_net = BaseNet( - nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2 - ) + self.stg2_high_band_net = BaseNet(nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2) - self.stg3_full_band_net = BaseNet( - 3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm - ) + self.stg3_full_band_net = BaseNet(3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm) self.out = nn.Conv2d(nout, 2, 1, bias=False) self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False) diff --git a/tools/uvr5/lib/lib_v5/spec_utils.py b/tools/uvr5/lib/lib_v5/spec_utils.py index da072e4..4d987cd 100644 --- a/tools/uvr5/lib/lib_v5/spec_utils.py +++ b/tools/uvr5/lib/lib_v5/spec_utils.py @@ -27,9 +27,7 @@ def crop_center(h1, h2): return h1 -def wave_to_spectrogram( - wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False -): +def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False): if reverse: wave_left = np.flip(np.asfortranarray(wave[0])) wave_right = np.flip(np.asfortranarray(wave[1])) @@ -43,7 +41,7 @@ def wave_to_spectrogram( wave_left = np.asfortranarray(wave[0]) wave_right = np.asfortranarray(wave[1]) - spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length) + spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length) spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length) spec = np.asfortranarray([spec_left, spec_right]) @@ -51,9 +49,7 @@ def wave_to_spectrogram( return spec -def wave_to_spectrogram_mt( - wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False -): +def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False): import threading if reverse: @@ -103,21 +99,13 @@ def combine_spectrograms(specs, mp): raise ValueError("Too much bins") # lowpass fiter - if ( - mp.param["pre_filter_start"] > 0 - ): # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']: + if mp.param["pre_filter_start"] > 0: # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']: if bands_n == 1: - spec_c = fft_lp_filter( - spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"] - ) + spec_c = fft_lp_filter(spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"]) else: gp = 1 - for b in range( - mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"] - ): - g = math.pow( - 10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0 - ) + for b in range(mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]): + g = math.pow(10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0) gp = g spec_c[:, b, :] *= g @@ -189,9 +177,7 @@ def mask_silence(mag, ref, thres=0.2, min_range=64, 
fade_size=32): else: e += fade_size - mag[:, :, s + fade_size : e - fade_size] += ref[ - :, :, s + fade_size : e - fade_size - ] + mag[:, :, s + fade_size : e - fade_size] += ref[:, :, s + fade_size : e - fade_size] old_e = e return mag @@ -207,9 +193,7 @@ def cache_or_load(mix_path, inst_path, mp): mix_basename = os.path.splitext(os.path.basename(mix_path))[0] inst_basename = os.path.splitext(os.path.basename(inst_path))[0] - cache_dir = "mph{}".format( - hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest() - ) + cache_dir = "mph{}".format(hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest()) mix_cache_dir = os.path.join("cache", cache_dir) inst_cache_dir = os.path.join("cache", cache_dir) @@ -230,31 +214,27 @@ def cache_or_load(mix_path, inst_path, mp): if d == len(mp.param["band"]): # high-end band X_wave[d], _ = librosa.load( - mix_path, - sr = bp["sr"], - mono = False, - dtype = np.float32, - res_type = bp["res_type"] + mix_path, sr=bp["sr"], mono=False, dtype=np.float32, res_type=bp["res_type"] ) y_wave[d], _ = librosa.load( inst_path, - sr = bp["sr"], - mono = False, - dtype = np.float32, - res_type = bp["res_type"], + sr=bp["sr"], + mono=False, + dtype=np.float32, + res_type=bp["res_type"], ) else: # lower bands X_wave[d] = librosa.resample( X_wave[d + 1], - orig_sr = mp.param["band"][d + 1]["sr"], - target_sr = bp["sr"], - res_type = bp["res_type"], + orig_sr=mp.param["band"][d + 1]["sr"], + target_sr=bp["sr"], + res_type=bp["res_type"], ) y_wave[d] = librosa.resample( y_wave[d + 1], - orig_sr = mp.param["band"][d + 1]["sr"], - target_sr = bp["sr"], - res_type = bp["res_type"], + orig_sr=mp.param["band"][d + 1]["sr"], + target_sr=bp["sr"], + res_type=bp["res_type"], ) X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d]) @@ -302,9 +282,7 @@ def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse): if reverse: return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) elif mid_side: - return np.asfortranarray( - [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)] - ) + return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]) elif mid_side_b2: return np.asfortranarray( [ @@ -326,9 +304,7 @@ def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2): global wave_left wave_left = librosa.istft(**kwargs) - thread = threading.Thread( - target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length} - ) + thread = threading.Thread(target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length}) thread.start() wave_right = librosa.istft(spec_right, hop_length=hop_length) thread.join() @@ -336,9 +312,7 @@ def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2): if reverse: return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) elif mid_side: - return np.asfortranarray( - [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)] - ) + return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]) elif mid_side_b2: return np.asfortranarray( [ @@ -357,21 +331,15 @@ def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None): for d in range(1, bands_n + 1): bp = mp.param["band"][d] - spec_s = np.ndarray( - shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex - ) + spec_s = np.ndarray(shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex) h = bp["crop_stop"] - 
bp["crop_start"] - spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[ - :, offset : offset + h, : - ] + spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[:, offset : offset + h, :] offset += h if d == bands_n: # higher if extra_bins_h: # if --high_end_process bypass max_bin = bp["n_fft"] // 2 - spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[ - :, :extra_bins_h, : - ] + spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[:, :extra_bins_h, :] if bp["hpf_start"] > 0: spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) if bands_n == 1: @@ -405,9 +373,9 @@ def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None): mp.param["mid_side_b2"], mp.param["reverse"], ), - orig_sr = bp["sr"], - target_sr = sr, - res_type = "sinc_fastest", + orig_sr=bp["sr"], + target_sr=sr, + res_type="sinc_fastest", ) else: # mid spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) @@ -456,10 +424,7 @@ def mirroring(a, spec_m, input_high_end, mp): np.abs( spec_m[ :, - mp.param["pre_filter_start"] - - 10 - - input_high_end.shape[1] : mp.param["pre_filter_start"] - - 10, + mp.param["pre_filter_start"] - 10 - input_high_end.shape[1] : mp.param["pre_filter_start"] - 10, :, ] ), @@ -467,19 +432,14 @@ def mirroring(a, spec_m, input_high_end, mp): ) mirror = mirror * np.exp(1.0j * np.angle(input_high_end)) - return np.where( - np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror - ) + return np.where(np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror) if "mirroring2" == a: mirror = np.flip( np.abs( spec_m[ :, - mp.param["pre_filter_start"] - - 10 - - input_high_end.shape[1] : mp.param["pre_filter_start"] - - 10, + mp.param["pre_filter_start"] - 10 - input_high_end.shape[1] : mp.param["pre_filter_start"] - 10, :, ] ), @@ -528,7 +488,6 @@ def istft(spec, hl): if __name__ == "__main__": import argparse - import sys import time import cv2 @@ -573,10 +532,10 @@ if __name__ == "__main__": if d == len(mp.param["band"]): # high-end band wave[d], _ = librosa.load( args.input[i], - sr = bp["sr"], - mono = False, - dtype = np.float32, - res_type = bp["res_type"], + sr=bp["sr"], + mono=False, + dtype=np.float32, + res_type=bp["res_type"], ) if len(wave[d].shape) == 1: # mono to stereo @@ -584,9 +543,9 @@ if __name__ == "__main__": else: # lower bands wave[d] = librosa.resample( wave[d + 1], - orig_sr = mp.param["band"][d + 1]["sr"], - target_sr = bp["sr"], - res_type = bp["res_type"], + orig_sr=mp.param["band"][d + 1]["sr"], + target_sr=bp["sr"], + res_type=bp["res_type"], ) spec[d] = wave_to_spectrogram( diff --git a/tools/uvr5/lib/utils.py b/tools/uvr5/lib/utils.py index 5e8cd22..0166d52 100644 --- a/tools/uvr5/lib/utils.py +++ b/tools/uvr5/lib/utils.py @@ -27,9 +27,7 @@ def inference(X_spec, device, model, aggressiveness, data): data : dic configs """ - def _execute( - X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True - ): + def _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True): model.eval() with torch.no_grad(): preds = [] @@ -39,9 +37,7 @@ def inference(X_spec, device, model, aggressiveness, data): total_iterations = sum(iterations) for i in tqdm(range(n_window)): start = i * roi_size - X_mag_window = X_mag_pad[ - None, :, :, start : start + data["window_size"] - ] + X_mag_window = X_mag_pad[None, :, :, start : start + data["window_size"]] X_mag_window = torch.from_numpy(X_mag_window) if is_half: X_mag_window = X_mag_window.half() @@ -76,9 +72,7 @@ def 
inference(X_spec, device, model, aggressiveness, data): is_half = True else: is_half = False - pred = _execute( - X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half - ) + pred = _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half) pred = pred[:, :, :n_frame] if data["tta"]: @@ -88,9 +82,7 @@ def inference(X_spec, device, model, aggressiveness, data): X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") - pred_tta = _execute( - X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half - ) + pred_tta = _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half) pred_tta = pred_tta[:, :, roi_size // 2 :] pred_tta = pred_tta[:, :, :n_frame] diff --git a/tools/uvr5/webui.py b/tools/uvr5/webui.py index a3d7fe4..f5f8d3f 100644 --- a/tools/uvr5/webui.py +++ b/tools/uvr5/webui.py @@ -1,26 +1,22 @@ +import logging import os import traceback + import gradio as gr -import logging + from tools.i18n.i18n import I18nAuto from tools.my_utils import clean_path i18n = I18nAuto() logger = logging.getLogger(__name__) +import sys + import ffmpeg import torch -import sys +from bsroformer import Roformer_Loader from mdxnet import MDXNetDereverb from vr import AudioPre, AudioPreDeEcho -from bsroformer import Roformer_Loader - -try: - import gradio.analytics as analytics - - analytics.version_check = lambda: None -except: - ... weight_uvr5_root = "tools/uvr5/uvr5_weights" uvr5_names = [] @@ -129,7 +125,7 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format yield "\n".join(infos) -with gr.Blocks(title="UVR5 WebUI") as app: +with gr.Blocks(title="UVR5 WebUI", analytics_enabled=False) as app: gr.Markdown( value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + "
" diff --git a/webui.py b/webui.py index cddbb02..43f3324 100644 --- a/webui.py +++ b/webui.py @@ -147,7 +147,9 @@ if torch.cuda.is_available() or ngpu != 0: # mem.append(psutil.virtual_memory().total/ 1024 / 1024 / 1024) # 实测使用系统内存作为显存不会爆显存 -v3v4set={"v3","v4"} +v3v4set = {"v3", "v4"} + + def set_default(): global \ default_batch_size, \ @@ -382,7 +384,7 @@ def change_label(path_list): if p_label is None: check_for_existance([path_list]) path_list = my_utils.clean_path(path_list) - cmd = '"%s" tools/subfix_webui.py --load_list "%s" --webui_port %s --is_share %s' % ( + cmd = '"%s" -s tools/subfix_webui.py --load_list "%s" --webui_port %s --is_share %s' % ( python_exec, path_list, webui_port_subfix, @@ -411,7 +413,13 @@ process_name_uvr5 = i18n("人声分离WebUI") def change_uvr5(): global p_uvr5 if p_uvr5 is None: - cmd = '"%s" tools/uvr5/webui.py "%s" %s %s %s' % (python_exec, infer_device, is_half, webui_port_uvr5, is_share) + cmd = '"%s" -s tools/uvr5/webui.py "%s" %s %s %s' % ( + python_exec, + infer_device, + is_half, + webui_port_uvr5, + is_share, + ) yield ( process_info(process_name_uvr5, "opened"), {"__type__": "update", "visible": False}, @@ -435,9 +443,9 @@ process_name_tts = i18n("TTS推理WebUI") def change_tts_inference(bert_path, cnhubert_base_path, gpu_number, gpt_path, sovits_path, batched_infer_enabled): global p_tts_inference if batched_infer_enabled: - cmd = '"%s" GPT_SoVITS/inference_webui_fast.py "%s"' % (python_exec, language) + cmd = '"%s" -s GPT_SoVITS/inference_webui_fast.py "%s"' % (python_exec, language) else: - cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"' % (python_exec, language) + cmd = '"%s" -s GPT_SoVITS/inference_webui.py "%s"' % (python_exec, language) # #####v3暂不支持加速推理 # if version=="v3": # cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language) @@ -478,7 +486,7 @@ def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_lang, asr_ asr_inp_dir = my_utils.clean_path(asr_inp_dir) asr_opt_dir = my_utils.clean_path(asr_opt_dir) check_for_existance([asr_inp_dir]) - cmd = f'"{python_exec}" tools/asr/{asr_dict[asr_model]["path"]}' + cmd = f'"{python_exec}" -s tools/asr/{asr_dict[asr_model]["path"]}' cmd += f' -i "{asr_inp_dir}"' cmd += f' -o "{asr_opt_dir}"' cmd += f" -s {asr_model_size}" @@ -539,7 +547,7 @@ def open_denoise(denoise_inp_dir, denoise_opt_dir): denoise_inp_dir = my_utils.clean_path(denoise_inp_dir) denoise_opt_dir = my_utils.clean_path(denoise_opt_dir) check_for_existance([denoise_inp_dir]) - cmd = '"%s" tools/cmd-denoise.py -i "%s" -o "%s" -p %s' % ( + cmd = '"%s" -s tools/cmd-denoise.py -i "%s" -o "%s" -p %s' % ( python_exec, denoise_inp_dir, denoise_opt_dir, @@ -589,6 +597,7 @@ def close_denoise(): p_train_SoVITS = None process_name_sovits = i18n("SoVITS训练") + def open1Ba( batch_size, total_epoch, @@ -635,13 +644,15 @@ def open1Ba( with open(tmp_config_path, "w") as f: f.write(json.dumps(data)) if version in ["v1", "v2"]: - cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"' % (python_exec, tmp_config_path) + cmd = '"%s" -s GPT_SoVITS/s2_train.py --config "%s"' % (python_exec, tmp_config_path) else: - cmd = '"%s" GPT_SoVITS/s2_train_v3_lora.py --config "%s"' % (python_exec, tmp_config_path) + cmd = '"%s" -s GPT_SoVITS/s2_train_v3_lora.py --config "%s"' % (python_exec, tmp_config_path) yield ( process_info(process_name_sovits, "opened"), {"__type__": "update", "visible": False}, - {"__type__": "update", "visible": True},{"__type__": "update"},{"__type__": "update"} + {"__type__": "update", "visible": True}, + {"__type__": 
"update"}, + {"__type__": "update"}, ) print(cmd) p_train_SoVITS = Popen(cmd, shell=True) @@ -651,13 +662,17 @@ def open1Ba( yield ( process_info(process_name_sovits, "finish"), {"__type__": "update", "visible": True}, - {"__type__": "update", "visible": False},SoVITS_dropdown_update,GPT_dropdown_update + {"__type__": "update", "visible": False}, + SoVITS_dropdown_update, + GPT_dropdown_update, ) else: yield ( process_info(process_name_sovits, "occupy"), {"__type__": "update", "visible": False}, - {"__type__": "update", "visible": True},{"__type__": "update"},{"__type__": "update"} + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, ) @@ -722,11 +737,13 @@ def open1Bb( with open(tmp_config_path, "w") as f: f.write(yaml.dump(data, default_flow_style=False)) # cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" --train_semantic_path "%s/6-name2semantic.tsv" --train_phoneme_path "%s/2-name2text.txt" --output_dir "%s/logs_s1"'%(python_exec,tmp_config_path,s1_dir,s1_dir,s1_dir) - cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" ' % (python_exec, tmp_config_path) + cmd = '"%s" -s GPT_SoVITS/s1_train.py --config_file "%s" ' % (python_exec, tmp_config_path) yield ( process_info(process_name_gpt, "opened"), {"__type__": "update", "visible": False}, - {"__type__": "update", "visible": True},{"__type__": "update"},{"__type__": "update"} + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, ) print(cmd) p_train_GPT = Popen(cmd, shell=True) @@ -736,13 +753,17 @@ def open1Bb( yield ( process_info(process_name_gpt, "finish"), {"__type__": "update", "visible": True}, - {"__type__": "update", "visible": False},SoVITS_dropdown_update,GPT_dropdown_update + {"__type__": "update", "visible": False}, + SoVITS_dropdown_update, + GPT_dropdown_update, ) else: yield ( process_info(process_name_gpt, "occupy"), {"__type__": "update", "visible": False}, - {"__type__": "update", "visible": True},{"__type__": "update"},{"__type__": "update"} + {"__type__": "update", "visible": True}, + {"__type__": "update"}, + {"__type__": "update"}, ) @@ -793,7 +814,7 @@ def open_slice(inp, opt_root, threshold, min_length, min_interval, hop_size, max return if ps_slice == []: for i_part in range(n_parts): - cmd = '"%s" tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s' % ( + cmd = '"%s" -s tools/slice_audio.py "%s" "%s" %s %s %s %s %s %s %s %s %s' % ( python_exec, inp, opt_root, @@ -887,7 +908,7 @@ def open1a(inp_text, inp_wav_dir, exp_name, gpu_numbers, bert_pretrained_dir): } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/1-get-text.py' % python_exec + cmd = '"%s" -s GPT_SoVITS/prepare_datasets/1-get-text.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1a.append(p) @@ -974,7 +995,7 @@ def open1b(inp_text, inp_wav_dir, exp_name, gpu_numbers, ssl_pretrained_dir): } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py' % python_exec + cmd = '"%s" -s GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1b.append(p) @@ -1045,7 +1066,7 @@ def open1c(inp_text, exp_name, gpu_numbers, pretrained_s2G_path): } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py' % python_exec + cmd = '"%s" -s GPT_SoVITS/prepare_datasets/3-get-semantic.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1c.append(p) @@ -1143,7 +1164,7 @@ def open1abc( } ) os.environ.update(config) - cmd = '"%s" 
GPT_SoVITS/prepare_datasets/1-get-text.py' % python_exec + cmd = '"%s" -s GPT_SoVITS/prepare_datasets/1-get-text.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1abc.append(p) @@ -1189,7 +1210,7 @@ def open1abc( } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py' % python_exec + cmd = '"%s" -s GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1abc.append(p) @@ -1229,7 +1250,7 @@ def open1abc( } ) os.environ.update(config) - cmd = '"%s" GPT_SoVITS/prepare_datasets/3-get-semantic.py' % python_exec + cmd = '"%s" -s GPT_SoVITS/prepare_datasets/3-get-semantic.py' % python_exec print(cmd) p = Popen(cmd, shell=True) ps1abc.append(p) @@ -1291,6 +1312,7 @@ def close1abc(): {"__type__": "update", "visible": False}, ) + def switch_version(version_): os.environ["version"] = version_ global version @@ -1323,7 +1345,7 @@ def switch_version(version_): if os.path.exists("GPT_SoVITS/text/G2PWModel"): ... else: - cmd = '"%s" GPT_SoVITS/download.py' % python_exec + cmd = '"%s" -s GPT_SoVITS/download.py' % python_exec p = Popen(cmd, shell=True) p.wait() @@ -1332,7 +1354,7 @@ def sync(text): return {"__type__": "update", "value": text} -with gr.Blocks(title="GPT-SoVITS WebUI") as app: +with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False) as app: gr.Markdown( value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.") + "
" @@ -1492,7 +1514,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: with gr.Row(): exp_name = gr.Textbox(label=i18n("*实验/模型名"), value="xxx", interactive=True) gpu_info = gr.Textbox(label=i18n("显卡信息"), value=gpu_info, visible=True, interactive=False) - version_checkbox = gr.Radio(label=i18n("版本"), value=version, choices=["v1", "v2", "v4"])#, "v3" + version_checkbox = gr.Radio(label=i18n("版本"), value=version, choices=["v1", "v2", "v4"]) # , "v3" with gr.Row(): pretrained_s2G = gr.Textbox( label=i18n("预训练SoVITS-G模型路径"), @@ -1915,7 +1937,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: if_grad_ckpt, lora_rank, ], - [info1Ba, button1Ba_open, button1Ba_close,SoVITS_dropdown,GPT_dropdown], + [info1Ba, button1Ba_open, button1Ba_close, SoVITS_dropdown, GPT_dropdown], ) button1Bb_open.click( open1Bb, @@ -1930,7 +1952,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: gpu_numbers1Bb, pretrained_s1, ], - [info1Bb, button1Bb_open, button1Bb_close,SoVITS_dropdown,GPT_dropdown], + [info1Bb, button1Bb_open, button1Bb_close, SoVITS_dropdown, GPT_dropdown], ) version_checkbox.change( switch_version,