File size: 2,102 Bytes
1832e16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import librosa
import numpy as np
import pyloudnorm as pyln
import torch

from config import SILENCE_RATIO, SR
from utils import hungarian, safe_corr_np
import warnings
# Suppress the "Possible clipped samples in output." warning module-wide.
# NOTE(review): presumably emitted by pyloudnorm's loudness normalization —
# confirm. loudness_normalize() in this file rescales and clips its own output,
# so the warning carries no extra information here.
warnings.filterwarnings("ignore", message="Possible clipped samples in output.")


def loudness_normalize(wav, sr=SR, target_lufs=-23.0):
    """Normalize ``wav`` to a target integrated loudness, bounded to [-1, 1].

    The integrated loudness is measured with ``pyln.Meter(sr)`` and the
    corresponding gain is applied with ``pyln.normalize.loudness``. If the
    result peaks above 1.0 it is rescaled so that the peak equals 1.0, and the
    output is finally clipped to [-1, 1].

    Args:
        wav: Audio samples as a NumPy array.
        sr: Sample rate handed to the loudness meter.
        target_lufs: Desired integrated loudness.

    Returns:
        The normalized samples, clipped to [-1, 1]. If the measured loudness
        is not finite, the input is returned clipped to [-1, 1] without any
        gain applied.

    Note:
        Exceptions raised by the meter itself (for example on input it
        considers too short to measure) propagate unchanged.
    """
    meter = pyln.Meter(sr)
    loudness = meter.integrated_loudness(wav)
    if not np.isfinite(loudness):
        # A non-finite measurement (e.g. -inf for digital silence) would make
        # the normalization gain infinite and fill the output with NaN/inf,
        # which neither the peak check nor np.clip below would remove.
        return np.clip(wav, -1.0, 1.0)
    normalized_wav = pyln.normalize.loudness(wav, loudness, target_lufs)
    peak = np.max(np.abs(normalized_wav))
    if peak > 1.0:
        # peak is strictly positive here, so dividing by it directly is safe.
        normalized_wav = normalized_wav / peak
    return np.clip(normalized_wav, -1.0, 1.0)


def frame_rms_torch(sig, win, hop):
    """Return the per-frame RMS of a 1-D tensor.

    The signal is cut into frames of ``win`` samples taken every ``hop``
    samples. When the final frame ends exactly on the last sample of ``sig``
    that frame is discarded. A small constant (1e-12) is added under the
    square root.

    Args:
        sig: 1-D tensor holding the signal.
        win: Frame length in samples.
        hop: Step between consecutive frame starts, in samples.

    Returns:
        1-D tensor with one RMS value per retained frame, on ``sig``'s device.
    """
    windows = sig.unfold(0, win, hop)
    n_frames = windows.size(0)
    # True when the last window finishes exactly at the end of the signal.
    ends_flush = (n_frames - 1) * hop == sig.numel() - win
    if n_frames and ends_flush:
        windows = windows[:-1]
    mean_energy = windows.pow(2).mean(dim=1)
    return torch.sqrt(mean_energy + 1e-12).to(sig.device)


def make_union_voiced_mask(refs_tensors, win, hop):
    """Build a boolean frame mask that excludes frames flagged as silent.

    Each reference is framed with ``frame_rms_torch``. A frame counts as
    silent for a reference when its RMS is at or below ``SILENCE_RATIO`` times
    that reference's overall RMS. A frame in the result is ``False`` if any
    reference flagged it silent and ``True`` otherwise; frames past the end of
    a shorter reference are not affected by that reference.

    Args:
        refs_tensors: Sequence of reference signal tensors (must be non-empty).
        win: Frame length in samples.
        hop: Step between frames in samples.

    Returns:
        Boolean tensor whose length is the largest frame count among the
        references, allocated on the first reference's device.
    """
    device = refs_tensors[0].device
    per_ref_rms = [frame_rms_torch(ref, win, hop) for ref in refs_tensors]
    longest = max([vec.numel() for vec in per_ref_rms])
    flagged_silent = torch.zeros(longest, dtype=torch.bool, device=device)
    for ref, frame_rms in zip(refs_tensors, per_ref_rms):
        # Threshold is relative to this reference's own global RMS level.
        threshold = SILENCE_RATIO * torch.sqrt((ref ** 2).mean())
        n_frames = frame_rms.numel()
        flagged_silent[:n_frames] |= frame_rms <= threshold
    return ~flagged_silent


def assign_outputs_to_refs_by_corr(ref_paths, out_paths):
    """Match each reference file to an output file using waveform correlation.

    Every file is loaded with ``librosa.load`` at ``SR`` and passed through
    ``loudness_normalize``. The value ``r`` returned by ``safe_corr_np`` for a
    (reference, output) pair is turned into the cost ``1 - (r + 1) / 2``
    (lower is better), placed in a square matrix padded with cost 1.0, and
    the matrix is handed to ``hungarian``.

    Args:
        ref_paths: Paths of the reference audio files.
        out_paths: Paths of the output audio files.

    Returns:
        A list with one entry per reference: the index into ``out_paths`` of
        the assigned output, or ``None`` when the reference was paired with a
        padding column (or when ``out_paths`` is empty).
    """
    if not out_paths:
        return [None] * len(ref_paths)

    def _load_normalized(path):
        # librosa.load returns (samples, sample_rate); only samples are used.
        samples = librosa.load(str(path), sr=SR)[0]
        return loudness_normalize(samples)

    refs = [_load_normalized(p) for p in ref_paths]
    outs = [_load_normalized(p) for p in out_paths]
    n_refs = len(refs)
    n_outs = len(outs)

    # Square cost matrix; cells outside the real n_refs x n_outs area keep 1.0.
    side = max(n_refs, n_outs)
    cost = np.ones((side, side), dtype=np.float64)
    for row, ref in enumerate(refs):
        for col, out in enumerate(outs):
            corr = safe_corr_np(ref, out)
            similarity = (corr + 1.0) * 0.5
            cost[row, col] = 1.0 - similarity  # lower = better

    rows, cols = hungarian(cost)
    mapping = [None] * n_refs
    for row, col in zip(rows, cols):
        if row >= n_refs or col >= n_outs:
            # Pairing involves a padding row/column: no real assignment.
            continue
        mapping[row] = int(col)
    return mapping