Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -168,4 +168,6 @@ cython_debug/
#.idea/

# PyPI configuration file
.pypirc
.pypirc

ssh/
31 changes: 26 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,37 @@ requires-python = ">=3.9, <3.13"
license = { text = "BSD-2-Clause" }

dependencies = [
"ctranslate2<4.5.0",
"ctranslate2>=4.5.0",
"faster-whisper>=1.1.1",
"nltk>=3.9.1",
"numpy>=2.0.2",
"onnxruntime>=1.19",
"pandas>=2.2.3",
"pyannote-audio>=3.3.2",
"torch>=2.5.1",
"torchaudio>=2.5.1",
"transformers>=4.48.0",
"pyannote-audio>=4.0.0",
"omegaconf>=2.3.0",
# pyannote (I think) will require this, but v2.6.0 uses torch in a way that
# will raise exceptions
"lightning<2.6.0",
# NOTE: some torchcodec versions are incompatible with some torch versions
# and if you happen to use an incompatible version, you will currently see
# a very obscure bad_alloc exception when those libs are loaded, which tells
# you nothing at all and makes you waste hours and hours hunting down the
# culprit.
# https://github.com/meta-pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec
# contains a table showing which versions are compatible with each other.
#
# torchvision, by the way, also requires a compatible version
"torch~=2.8.0",
"torchaudio~=2.8.0",
"torchcodec~=0.7.0",
"torchvision~=0.23.0",
# apparently they changed some default arguments values after transformers v4.51.0
# I believe num_beams is the culprit. It used to be 1, now it is set to 5. See
# https://github.com/huggingface/transformers/issues/40682
# So, according to them, if you pass num_beams=1 to the pipeline, versions >4.51.0
# will be as fast as before. But, since I am not exactly sure where to put that, I'll
# just lock the version for now.
"transformers>=4.48.0, <=4.51.0",
]


Expand Down
2 changes: 1 addition & 1 deletion whisperx/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def cli():
parser.add_argument("--diarize", action="store_true", help="Apply diarization to assign speaker labels to each segment/word")
parser.add_argument("--min_speakers", default=None, type=int, help="Minimum number of speakers to in audio file")
parser.add_argument("--max_speakers", default=None, type=int, help="Maximum number of speakers to in audio file")
parser.add_argument("--diarize_model", default="pyannote/speaker-diarization-3.1", type=str, help="Name of the speaker diarization model to use")
parser.add_argument("--diarize_model", default="pyannote/speaker-diarization-community-1", type=str, help="Name of the speaker diarization model to use")
parser.add_argument("--speaker_embeddings", action="store_true", help="Include speaker embeddings in JSON output (only works with --diarize)")

parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
Expand Down
107 changes: 105 additions & 2 deletions whisperx/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@

from whisperx.utils import exact_div

import gc;
import tempfile;

# hard-coded audio hyperparameters
SAMPLE_RATE = 16000
N_FFT = 400
Expand All @@ -22,7 +25,7 @@
TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN) # 20ms per audio token


def load_audio(file: str, sr: int = SAMPLE_RATE) -> np.ndarray:
def load_audio_ram(file: str, sr: int = SAMPLE_RATE) -> np.ndarray:
"""
Open an audio file and read as mono waveform, resampling as necessary

Expand Down Expand Up @@ -62,7 +65,107 @@ def load_audio(file: str, sr: int = SAMPLE_RATE) -> np.ndarray:
except subprocess.CalledProcessError as e:
raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
# doing this in parts, instead of as a "one-liner", allows us to free the memory
# used by {out}, which can be quite a lot in the case of very long audio files.
# This should help minimize crashes caused by running out of RAM.
# Another alternative would be using temporary files and/or checking if the audio
# file is already in the required format and loading it directly with np.fromfile()
r = np.frombuffer(out, np.int16);
del out;
gc.collect();

r = r.flatten();
r = r.astype(np.float32) / 32768.0

return r;

def load_audio_tmpfile(file: str, sr: int = SAMPLE_RATE) -> np.ndarray:
    """
    Open an audio file and read as mono waveform, resampling as necessary.

    Unlike the in-RAM variant, ffmpeg decodes into a temporary file on disk,
    which is then loaded with np.fromfile(). This keeps peak RAM usage lower
    for very long audio files, since the raw decoded PCM never has to live in
    a Python bytes object alongside the NumPy array built from it.

    Parameters
    ----------
    file: str
        The audio file to open

    sr: int
        The sample rate to resample the audio if necessary

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.

    Raises
    ------
    RuntimeError
        If the ffmpeg subprocess fails (e.g. unreadable/invalid input file).
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        resampled_path = os.path.join(tmpdir, "resampledAudio")
        # Launch a subprocess to decode audio while down-mixing and resampling
        # as necessary, writing raw 16-bit little-endian mono PCM to a file.
        # Requires the ffmpeg CLI to be installed.
        cmd = [
            "ffmpeg",
            "-nostdin",
            "-threads",
            "0",
            "-i",
            file,
            "-f",
            "s16le",
            "-ac",
            "1",
            "-acodec",
            "pcm_s16le",
            "-ar",
            str(sr),
            resampled_path,
        ]
        try:
            # ffmpeg writes to {resampled_path}; stdout is not used, so the
            # previous `out = ....stdout` capture was dead code and is gone.
            subprocess.run(cmd, capture_output=True, check=True)
        except subprocess.CalledProcessError as e:
            raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

        # NOTE: the decoded file must be read *inside* the `with` block —
        # TemporaryDirectory() deletes the directory (and the file) on exit.
        # np.fromfile() of s16le data yields a 1-D int16 array already, so no
        # flatten() is needed.
        samples = np.fromfile(resampled_path, np.int16)

    # Normalize 16-bit PCM to float32 in [-1.0, 1.0).
    return samples.astype(np.float32) / 32768.0

def load_audio(file: str, sr: int = SAMPLE_RATE, useTmpFile: bool = False) -> np.ndarray:
    """
    Open an audio file and read as mono waveform, resampling as necessary.

    Dispatches to one of two loading strategies: fully in RAM (the default),
    or via a temporary file on disk, which lowers peak memory usage.

    Parameters
    ----------
    file: str
        The audio file to open

    sr: int
        The sample rate to resample the audio if necessary

    useTmpFile: bool
        If true, resample the audio using a temporary file.
        Might be the only way to load very long audio files
        without running out of RAM

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    """
    # NOTE(review): an automatic try-RAM-then-fall-back-to-tmpfile scheme was
    # attempted and did not work (an out-of-memory condition is not reliably
    # catchable here), so the caller must opt in explicitly via useTmpFile.
    if useTmpFile:
        return load_audio_tmpfile(file, sr)
    return load_audio_ram(file, sr)


def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
Expand Down
29 changes: 11 additions & 18 deletions whisperx/diarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ def __init__(
):
if isinstance(device, str):
device = torch.device(device)
model_config = model_name or "pyannote/speaker-diarization-3.1"
self.model = Pipeline.from_pretrained(model_config, use_auth_token=use_auth_token).to(device)
model_config = model_name or "pyannote/speaker-diarization-community-1"
self.model = Pipeline.from_pretrained(model_config, token=use_auth_token).to(device)

def __call__(
self,
Expand Down Expand Up @@ -51,22 +51,15 @@ def __call__(
'sample_rate': SAMPLE_RATE
}

if return_embeddings:
diarization, embeddings = self.model(
audio_data,
num_speakers=num_speakers,
min_speakers=min_speakers,
max_speakers=max_speakers,
return_embeddings=True,
)
else:
diarization = self.model(
audio_data,
num_speakers=num_speakers,
min_speakers=min_speakers,
max_speakers=max_speakers,
)
embeddings = None
output = self.model(
audio_data,
num_speakers=num_speakers,
min_speakers=min_speakers,
max_speakers=max_speakers,
)

diarization = output.speaker_diarization
embeddings = output.speaker_embeddings

diarize_df = pd.DataFrame(diarization.itertracks(yield_label=True), columns=['segment', 'label', 'speaker'])
diarize_df['start'] = diarize_df['segment'].apply(lambda x: x.start)
Expand Down
2 changes: 1 addition & 1 deletion whisperx/vads/pyannote.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def __init__(
**inference_kwargs,
):

super().__init__(segmentation=segmentation, fscore=fscore, use_auth_token=use_auth_token, **inference_kwargs)
super().__init__(segmentation=segmentation, fscore=fscore, token=use_auth_token, **inference_kwargs)

def apply(self, file: AudioFile, hook: Optional[Callable] = None) -> Annotation:
"""Apply voice activity detection
Expand Down