|
12 | 12 | # See the License for the specific language governing permissions and
|
13 | 13 | # limitations under the License.
|
14 | 14 | import math
|
| 15 | +from typing import Callable |
| 16 | +from typing import List |
| 17 | +from typing import Optional |
15 | 18 | from typing import Tuple
|
| 19 | +from typing import Union |
16 | 20 |
|
17 | 21 | import librosa
|
18 | 22 | import numpy as np
|
|
23 | 27 | from scipy.stats import betabinom
|
24 | 28 | from typeguard import typechecked
|
25 | 29 |
|
| 30 | +from paddlespeech.audiotools.core.audio_signal import AudioSignal |
| 31 | +from paddlespeech.audiotools.core.audio_signal import STFTParams |
26 | 32 | from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
|
27 | 33 | from paddlespeech.t2s.modules.predictor.duration_predictor import (
|
28 | 34 | DurationPredictorLoss, # noqa: H301
|
@@ -1326,3 +1332,276 @@ def _generate_prior(self, text_lengths, feats_lengths,
|
1326 | 1332 | bb_prior[bidx, :T, :N] = prob
|
1327 | 1333 |
|
1328 | 1334 | return bb_prior
|
| 1335 | + |
| 1336 | + |
| 1337 | +class MultiScaleSTFTLoss(nn.Layer): |
| 1338 | + """Computes the multi-scale STFT loss from [1]. |
| 1339 | +
|
| 1340 | + References |
| 1341 | + ---------- |
| 1342 | +
|
| 1343 | + 1. Engel, Jesse, Chenjie Gu, and Adam Roberts. |
| 1344 | + "DDSP: Differentiable Digital Signal Processing." |
| 1345 | + International Conference on Learning Representations. 2020. |
| 1346 | +
|
| 1347 | + Implementation copied from: https://github.com/descriptinc/audiotools/blob/master/audiotools/metrics/spectral.py |
| 1348 | + """ |
| 1349 | + |
| 1350 | + def __init__( |
| 1351 | + self, |
| 1352 | + window_lengths: List[int]=[2048, 512], |
| 1353 | + loss_fn: Callable=nn.L1Loss(), |
| 1354 | + clamp_eps: float=1e-5, |
| 1355 | + mag_weight: float=1.0, |
| 1356 | + log_weight: float=1.0, |
| 1357 | + pow: float=2.0, |
| 1358 | + weight: float=1.0, |
| 1359 | + match_stride: bool=False, |
| 1360 | + window_type: Optional[str]=None, ): |
| 1361 | + """ |
| 1362 | + Args: |
| 1363 | + window_lengths : List[int], optional |
| 1364 | + Length of each window of each STFT, by default [2048, 512] |
| 1365 | + loss_fn : typing.Callable, optional |
| 1366 | + How to compare each loss, by default nn.L1Loss() |
| 1367 | + clamp_eps : float, optional |
| 1368 | + Lower bound applied to the magnitude before taking the log, by default 1e-5 |
| 1369 | + mag_weight : float, optional |
| 1370 | + Weight of raw magnitude portion of loss, by default 1.0 |
| 1371 | + log_weight : float, optional |
| 1372 | + Weight of log magnitude portion of loss, by default 1.0 |
| 1373 | + pow : float, optional |
| 1374 | + Power to raise magnitude to before taking log, by default 2.0 |
| 1375 | + weight : float, optional |
| 1376 | + Weight of this loss, by default 1.0 |
| 1377 | + match_stride : bool, optional |
| 1378 | + Whether to match the stride of convolutional layers, by default False |
| 1379 | + window_type : str, optional |
| 1380 | + Type of window to use, by default None. |
| 1381 | + """ |
| 1382 | + super().__init__() |
| 1383 | + |
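| | + # one set of STFT parameters per resolution; the hop length is fixed |
| | + # at one quarter of each window length |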
| 1384 | + self.stft_params = [ |
| 1385 | + STFTParams( |
| 1386 | + window_length=w, |
| 1387 | + hop_length=w // 4, |
| 1388 | + match_stride=match_stride, |
| 1389 | + window_type=window_type, ) for w in window_lengths |
| 1390 | + ] |
| 1391 | + self.loss_fn = loss_fn |
| 1392 | + self.log_weight = log_weight |
| 1393 | + self.mag_weight = mag_weight |
| 1394 | + self.clamp_eps = clamp_eps |
| 1395 | + self.weight = weight |
| 1396 | + self.pow = pow |
| 1397 | + |
| 1398 | + def forward(self, x: AudioSignal, y: AudioSignal): |
| 1399 | + """Computes the multi-scale STFT loss between an estimate and a reference |
| 1400 | + signal. |
| 1401 | +
|
| 1402 | + Args: |
| 1403 | + x : AudioSignal |
| 1404 | + Estimate signal |
| 1405 | + y : AudioSignal |
| 1406 | + Reference signal |
| 1407 | +
|
| 1408 | + Returns: |
| 1409 | + paddle.Tensor |
| 1410 | + Multi-scale STFT loss. |
| 1411 | + |
| 1412 | + Example: |
| 1413 | + >>> from paddlespeech.audiotools.core.audio_signal import AudioSignal |
| 1414 | + >>> import paddle |
| 1415 | +
|
| 1416 | + >>> x = AudioSignal("https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav", 2_05) |
| 1417 | + >>> y = x * 0.01 |
| 1418 | + >>> loss = MultiScaleSTFTLoss() |
| 1419 | + >>> loss(x, y).numpy() |
| 1420 | + 7.562150 |
| 1421 | + """ |
| | + loss = 0.0 |
| 1422 | + for s in self.stft_params: |
| 1423 | + x.stft(s.window_length, s.hop_length, s.window_type) |
| 1424 | + y.stft(s.window_length, s.hop_length, s.window_type) |
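| | + # compare log-compressed and raw magnitude spectrograms at this resolution |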
| 1425 | + loss += self.log_weight * self.loss_fn( |
| 1426 | + x.magnitude.clip(self.clamp_eps).pow(self.pow).log10(), |
| 1427 | + y.magnitude.clip(self.clamp_eps).pow(self.pow).log10(), ) |
| 1428 | + loss += self.mag_weight * self.loss_fn(x.magnitude, y.magnitude) |
| 1429 | + return loss |
| 1430 | + |
| 1431 | + |
| 1432 | +class GANLoss(nn.Layer): |
| 1433 | + """ |
| 1434 | + Computes adversarial losses given a discriminator applied to |
| 1435 | + generated waveforms/spectrograms and to the corresponding ground-truth |
| 1436 | + waveforms/spectrograms. The discriminator and generator losses are |
| 1437 | + computed by separate methods. |
| 1438 | +
|
| 1439 | + Example: |
| 1440 | + >>> from paddlespeech.audiotools.core.audio_signal import AudioSignal |
| 1441 | + >>> import paddle |
| 1442 | +
|
| 1443 | + >>> x = AudioSignal("https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav", 2_05) |
| 1444 | + >>> y = x * 0.01 |
| 1445 | + >>> class My_discriminator0: |
| 1446 | + >>> def __call__(self, x): |
| 1447 | + >>> return x.sum() |
| 1448 | + >>> loss = GANLoss(My_discriminator0()) |
| 1449 | + >>> [loss(x, y)[0].numpy(), loss(x, y)[1].numpy()] |
| 1450 | + [-0.102722, -0.001027] |
| 1451 | +
|
| 1452 | + >>> class My_discriminator1: |
| 1453 | + >>> def __call__(self, x): |
| 1454 | + >>> return x.sum() |
| 1455 | + >>> loss = GANLoss(My_discriminator1()) |
| 1456 | + >>> [loss.generator_loss(x, y)[0].numpy(), loss.generator_loss(x, y)[1].numpy()] |
| 1457 | + [1.00019, 0] |
| 1458 | +
|
| 1459 | + >>> loss.discriminator_loss(x, y) |
| 1460 | + 1.000200 |
| 1461 | + """ |
| 1462 | + |
| 1463 | + def __init__(self, discriminator): |
| 1464 | + """ |
| 1465 | + Args: |
| 1466 | + discriminator : paddle.nn.Layer |
| 1467 | + Discriminator model |
| 1468 | + """ |
| 1469 | + super().__init__() |
| 1470 | + self.discriminator = discriminator |
| 1471 | + |
| 1472 | + def forward(self, |
| 1473 | + fake: Union[AudioSignal, paddle.Tensor], |
| 1474 | + real: Union[AudioSignal, paddle.Tensor]): |
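| | + # run the discriminator on the generated and the ground-truth input |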
| 1475 | + if isinstance(fake, AudioSignal): |
| 1476 | + d_fake = self.discriminator(fake.audio_data) |
| 1477 | + else: |
| 1478 | + d_fake = self.discriminator(fake) |
| 1479 | + |
| 1480 | + if isinstance(real, AudioSignal): |
| 1481 | + d_real = self.discriminator(real.audio_data) |
| 1482 | + else: |
| 1483 | + d_real = self.discriminator(real) |
| 1484 | + return d_fake, d_real |
| 1485 | + |
| 1486 | + def discriminator_loss(self, fake, real): |
| 1487 | + d_fake, d_real = self.forward(fake, real) |
| 1488 | + |
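| | + # least-squares GAN objective: real outputs are pushed toward 1, |
| | + # fake outputs toward 0; only the final output of each discriminator branch is scored |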
| 1489 | + loss_d = 0 |
| 1490 | + for x_fake, x_real in zip(d_fake, d_real): |
| 1491 | + loss_d += paddle.mean(x_fake[-1]**2) |
| 1492 | + loss_d += paddle.mean((1 - x_real[-1])**2) |
| 1493 | + return loss_d |
| 1494 | + |
| 1495 | + def generator_loss(self, fake, real): |
| 1496 | + d_fake, d_real = self.forward(fake, real) |
| 1497 | + |
| 1498 | + loss_g = 0 |
| 1499 | + for x_fake in d_fake: |
| 1500 | + loss_g += paddle.mean((1 - x_fake[-1])**2) |
| 1501 | + |
| 1502 | + loss_feature = 0 |
| 1503 | + |
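| | + # feature-matching loss: L1 distance between intermediate discriminator |
| | + # features of the generated and ground-truth signals |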
| 1504 | + for i in range(len(d_fake)): |
| 1505 | + for j in range(len(d_fake[i]) - 1): |
| 1506 | + loss_feature += F.l1_loss(d_fake[i][j], d_real[i][j].detach()) |
| 1507 | + return loss_g, loss_feature |
| 1508 | + |
| 1509 | + |
| 1510 | +class SISDRLoss(nn.Layer): |
| 1511 | + """ |
| 1512 | + Computes the Scale-Invariant Source-to-Distortion Ratio between a batch |
| 1513 | + of estimated and reference audio signals or aligned features. |
| 1514 | +
|
| 1515 | + Implementation copied from: https://github.com/descriptinc/audiotools/blob/master/audiotools/metrics/distance.py |
| 1516 | +
|
| 1517 | + Example: |
| 1518 | + >>> from paddlespeech.audiotools.core.audio_signal import AudioSignal |
| 1519 | + >>> import paddle |
| 1520 | +
|
| 1521 | + >>> x = AudioSignal("https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav", 2_05) |
| 1522 | + >>> y = x * 0.01 |
| 1523 | + >>> sisdr = SISDRLoss() |
| 1524 | + >>> sisdr(x, y).numpy() |
| 1525 | + -145.377640 |
| 1526 | + """ |
| 1527 | + |
| 1528 | + def __init__( |
| 1529 | + self, |
| 1530 | + scaling: bool=True, |
| 1531 | + reduction: str="mean", |
| 1532 | + zero_mean: bool=True, |
| 1533 | + clip_min: Optional[int]=None, |
| 1534 | + weight: float=1.0, ): |
| 1535 | + """ |
| 1536 | + Args: |
| 1537 | + scaling : bool, optional |
| 1538 | + Whether to compute the scale-invariant SDR (True) or the |
| 1539 | + plain signal-to-noise ratio (False), by default True |
| 1540 | + reduction : str, optional |
| 1541 | + How to reduce across the batch (either 'mean', |
| 1542 | + 'sum', or 'none'), by default 'mean' |
| 1543 | + zero_mean : bool, optional |
| 1544 | + Zero mean the references and estimates before |
| 1545 | + computing the loss, by default True |
| 1546 | + clip_min : int, optional |
| 1547 | + The minimum possible loss value. Helps keep the network from |
| 1548 | + focusing on examples that are already good, by default None |
| 1549 | + weight : float, optional |
| 1550 | + Weight of this loss, defaults to 1.0. |
| 1551 | + """ |
| 1552 | + self.scaling = scaling |
| 1553 | + self.reduction = reduction |
| 1554 | + self.zero_mean = zero_mean |
| 1555 | + self.clip_min = clip_min |
| 1556 | + self.weight = weight |
| 1557 | + super().__init__() |
| 1558 | + |
| 1559 | + def forward(self, |
| 1560 | + x: Union[AudioSignal, paddle.Tensor], |
| 1561 | + y: Union[AudioSignal, paddle.Tensor]): |
| 1562 | + eps = 1e-8 |
| 1563 | + # B, C, T |
| 1564 | + if isinstance(x, AudioSignal): |
| 1565 | + references = x.audio_data |
| 1566 | + estimates = y.audio_data |
| 1567 | + else: |
| 1568 | + references = x |
| 1569 | + estimates = y |
| 1570 | + |
| 1571 | + nb = references.shape[0] |
| 1572 | + references = references.reshape([nb, 1, -1]).transpose([0, 2, 1]) |
| 1573 | + estimates = estimates.reshape([nb, 1, -1]).transpose([0, 2, 1]) |
| 1574 | + |
| 1575 | + # samples now on axis 1 |
| 1576 | + if self.zero_mean: |
| 1577 | + mean_reference = references.mean(axis=1, keepdim=True) |
| 1578 | + mean_estimate = estimates.mean(axis=1, keepdim=True) |
| 1579 | + else: |
| 1580 | + mean_reference = 0 |
| 1581 | + mean_estimate = 0 |
| 1582 | + |
| 1583 | + _references = references - mean_reference |
| 1584 | + _estimates = estimates - mean_estimate |
| 1585 | + |
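| | + # optimal projection scale: <estimates, references> / ||references||^2 |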
| 1586 | + references_projection = (_references**2).sum(axis=-2) + eps |
| 1587 | + references_on_estimates = (_estimates * _references).sum(axis=-2) + eps |
| 1588 | + |
| 1589 | + scale = ( |
| 1590 | + (references_on_estimates / references_projection).unsqueeze(axis=1) |
| 1591 | + if self.scaling else 1) |
| 1592 | + |
| 1593 | + e_true = scale * _references |
| 1594 | + e_res = _estimates - e_true |
| 1595 | + |
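| | + # SI-SDR in dB, negated so that minimizing the loss maximizes the ratio |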
| 1596 | + signal = (e_true**2).sum(axis=1) |
| 1597 | + noise = (e_res**2).sum(axis=1) |
| 1598 | + sdr = -10 * paddle.log10(signal / noise + eps) |
| 1599 | + |
| 1600 | + if self.clip_min is not None: |
| 1601 | + sdr = paddle.clip(sdr, min=self.clip_min) |
| 1602 | + |
| 1603 | + if self.reduction == "mean": |
| 1604 | + sdr = sdr.mean() |
| 1605 | + elif self.reduction == "sum": |
| 1606 | + sdr = sdr.sum() |
| 1607 | + return sdr |
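
For orientation, these three losses are typically combined into reconstruction plus adversarial terms when training a waveform generator. Below is a minimal sketch, assuming an existing discriminator layer and two AudioSignal objects fake_sig (generator output) and real_sig (ground truth); these names and the 2.0 weight are illustrative assumptions, not values from this PR. Note the argument order: MultiScaleSTFTLoss takes (estimate, reference), while SISDRLoss takes (reference, estimate).

    stft_loss = MultiScaleSTFTLoss()
    sisdr_loss = SISDRLoss()
    gan_loss = GANLoss(discriminator)

    # discriminator objective: the generator output is detached so only the
    # discriminator receives gradients
    loss_d = gan_loss.discriminator_loss(fake_sig.audio_data.detach(),
                                         real_sig.audio_data)

    # generator objective: adversarial + feature matching + reconstruction
    adv_g, feat_g = gan_loss.generator_loss(fake_sig.audio_data,
                                            real_sig.audio_data)
    recon = stft_loss(fake_sig, real_sig) + sisdr_loss(real_sig, fake_sig)
    loss_g = adv_g + 2.0 * feat_g + recon  # 2.0 is an illustrative weight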