Skip to content

Why is there such a big difference between the results of PitchYinProbabilistic and PitchYin? #1473

@xjtusjtu

Description

@xjtusjtu

Dear developers,

I've found that the results given by three of the algorithms — librosa.yin, librosa.pyin, and essentia's PitchYin — are numerically very close.

However, the result given by PitchYinProbabilistic is quite different from the previous three. I'm wondering why.

Additionally, the expected situation is that the results returned by all of these APIs should be numerically very close, right? The only differences should be that the pyin algorithm additionally returns voicing probabilities, and that the pitch values of unvoiced frames are reported as negative.

Any help would be greatly appreciated!

The following is my code:

import numpy as np
import librosa
import essentia
import essentia.standard as es


def extract_f0_librosa_yin(y, sr, frame_length=1024, hop_length=512, f0_min=65, f0_max=2093):
    """Extract the fundamental frequency track with librosa's YIN algorithm.

    https://librosa.org/doc/latest/generated/librosa.yin.html

    Args:
        y (np.ndarray): input audio signal
        sr (int): sample rate
        frame_length (int): frame length in samples
        hop_length (int): hop size in samples
        f0_min (float): minimum fundamental frequency (Hz)
        f0_max (float): maximum fundamental frequency (Hz)

    Returns:
        tuple: (f0s, voiced_flags) — the f0 sequence and the voicing
        decisions (always None here; see note below)
    """
    f0_track = librosa.yin(
        y,
        fmin=f0_min,
        fmax=f0_max,
        sr=sr,
        frame_length=frame_length,
        hop_length=hop_length,
    )

    # Plain YIN produces a pitch estimate for every frame; it gives no
    # voiced/unvoiced decision, so no flag array can be derived from f0 alone.
    return f0_track, None


def extract_f0_librosa_pyin(y, sr, frame_length=1024, hop_length=512, f0_min=65, f0_max=2093):
    """Extract the fundamental frequency track with librosa's probabilistic YIN.

    https://librosa.org/doc/latest/generated/librosa.pyin.html

    Args:
        y (np.ndarray): input audio signal
        sr (int): sample rate
        frame_length (int): frame length in samples
        hop_length (int): hop size in samples
        f0_min (float): minimum fundamental frequency (Hz)
        f0_max (float): maximum fundamental frequency (Hz)

    Returns:
        tuple: (f0s, voiced_flags) — the f0 sequence and per-frame
        voiced/unvoiced decisions
    """
    # pyin also returns per-frame voicing probabilities; this wrapper only
    # exposes the boolean voicing flags, so the third value is discarded.
    f0_track, voiced, _ = librosa.pyin(
        y,
        fmin=f0_min,
        fmax=f0_max,
        sr=sr,
        frame_length=frame_length,
        hop_length=hop_length,
    )

    return f0_track, voiced


def extract_f0_essentia_yin(y, sr, frame_length=1024, hop_length=512, f0_min=65, f0_max=2093):
    """Extract the fundamental frequency track with essentia's YIN algorithm.

    https://essentia.upf.edu/reference/std_PitchYin.html#pitchyin

    Args:
        y (np.ndarray): input audio signal
        sr (int): sample rate
        frame_length (int): frame length in samples
        hop_length (int): hop size in samples
        f0_min (float): minimum fundamental frequency (Hz)
        f0_max (float): maximum fundamental frequency (Hz)

    Returns:
        tuple: (f0s, voiced_flags) — the f0 sequence and per-frame
        voicing decisions derived from the confidence values
    """
    # Essentia algorithms operate on float32 buffers.
    signal = np.array(y, dtype=np.float32)

    extractor = es.PitchYin(
        frameSize=frame_length,
        sampleRate=sr,
        minFrequency=f0_min,
        maxFrequency=f0_max,
    )

    # Run the extractor frame by frame; each call yields (pitch, confidence).
    frames = es.FrameGenerator(signal, frameSize=frame_length, hopSize=hop_length)
    estimates = [extractor(frame) for frame in frames]

    f0_track = np.array([pitch for pitch, _ in estimates])
    # Threshold the confidence at 0.5 to decide voiced vs. unvoiced.
    voiced = np.array([conf > 0.5 for _, conf in estimates])

    return f0_track, voiced


def extract_f0_essentia_pyin(y, sr, frame_length=1024, hop_length=512, f0_min=65, f0_max=2093):
    """Extract the fundamental frequency track with essentia's probabilistic YIN.

    https://essentia.upf.edu/reference/streaming_PitchYinProbabilistic.html

    Note: f0_min / f0_max are accepted for signature parity with the other
    extractors but are NOT passed to PitchYinProbabilistic — the algorithm
    exposes no frequency-range parameters in this configuration, which may
    contribute to results diverging from the range-limited YIN variants.

    Args:
        y (np.ndarray): input audio signal
        sr (int): sample rate
        frame_length (int): frame length in samples
        hop_length (int): hop size in samples
        f0_min (float): minimum fundamental frequency (Hz) — unused, see note
        f0_max (float): maximum fundamental frequency (Hz) — unused, see note

    Returns:
        tuple: (f0s, voiced_probs) — the f0 sequence (negative values mark
        unvoiced frames, per outputUnvoiced='negative') and per-frame
        voicing probabilities
    """
    # Essentia algorithms operate on float32 buffers.
    y = np.array(y, dtype=np.float32)

    # Configure the probabilistic YIN extractor. outputUnvoiced='negative'
    # reports unvoiced frames as negated pitch values instead of zeros.
    pyin = es.PitchYinProbabilistic(
        frameSize=frame_length,
        hopSize=hop_length,
        lowRMSThreshold=0.1,
        outputUnvoiced='negative',
        preciseTime=True,
        sampleRate=sr
    )

    # The standard-mode wrapper consumes the whole signal in one call.
    f0s, voiced_probs = pyin(y)

    return f0s, voiced_probs



# NOTE(review): `y` (audio signal) and `sr` (sample rate) are used below but
# never defined in this snippet — presumably loaded earlier, e.g. via
# `y, sr = librosa.load(path, sr=None)`. Confirm against the full script.

# Dispatch table mapping method names to extractor functions; built once,
# outside the loop, rather than being rebuilt on every iteration.
f0_extractors = {
    'librosa_yin': extract_f0_librosa_yin,
    'librosa_pyin': extract_f0_librosa_pyin,
    'essentia_yin': extract_f0_essentia_yin,
    'essentia_pyin': extract_f0_essentia_pyin
}

methods = ['librosa_yin', 'librosa_pyin', 'essentia_yin', 'essentia_pyin']

# Run every extractor with identical framing parameters and print the
# resulting f0 tracks for side-by-side comparison.
for method in methods:
    print(method)

    f0s, voiced_flags = f0_extractors[method](
        y, sr, frame_length=1024, hop_length=512
    )
    print(f0s.shape, f0s)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions