Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -168,4 +168,6 @@ cython_debug/
#.idea/

# PyPI configuration file
.pypirc
.pypirc

ssh/
31 changes: 26 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,37 @@ requires-python = ">=3.9, <3.13"
license = { text = "BSD-2-Clause" }

dependencies = [
"ctranslate2<4.5.0",
"ctranslate2>=4.5.0",
"faster-whisper>=1.1.1",
"nltk>=3.9.1",
"numpy>=2.0.2",
"onnxruntime>=1.19",
"pandas>=2.2.3",
"pyannote-audio>=3.3.2",
"torch>=2.5.1",
"torchaudio>=2.5.1",
"transformers>=4.48.0",
"pyannote-audio>=4.0.0",
"omegaconf>=2.3.0",
# pyannote (I think) will require this, but v2.6.0 uses torch in a way that
# will raise exceptions
"lightning<2.6.0",
# NOTE: some torchcodec versions are incompatible with some torch versions
# and if you happen to use an incompatible version, you will currently see
# a very obscure bad_alloc exception when those libs are loaded, which tells
# you nothing at all and makes you waste hours and hours hunting down the
# culprit.
# https://github.com/meta-pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec
# contains a table showing which versions are compatible with each other.
#
# torchvision, by the way, also requires a compatible version
"torch~=2.8.0",
"torchaudio~=2.8.0",
"torchcodec~=0.7.0",
"torchvision~=0.23.0",
# apparently they changed some default arguments values after transformers v4.51.0
# I believe num_beams is the culprit. It used to be 1, now it is set to 5. See
# https://github.com/huggingface/transformers/issues/40682
# So, according to them, if you pass num_beams=1 to the pipeline, versions >4.51.0
# will be as fast as before. But, since I am not exactly sure where to put that, I'll
# just lock the version for now.
"transformers>=4.48.0, <=4.51.0",
]


Expand Down
2 changes: 1 addition & 1 deletion whisperx/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def cli():
parser.add_argument("--diarize", action="store_true", help="Apply diarization to assign speaker labels to each segment/word")
parser.add_argument("--min_speakers", default=None, type=int, help="Minimum number of speakers to in audio file")
parser.add_argument("--max_speakers", default=None, type=int, help="Maximum number of speakers to in audio file")
parser.add_argument("--diarize_model", default="pyannote/speaker-diarization-3.1", type=str, help="Name of the speaker diarization model to use")
parser.add_argument("--diarize_model", default="pyannote/speaker-diarization-community-1", type=str, help="Name of the speaker diarization model to use")
parser.add_argument("--speaker_embeddings", action="store_true", help="Include speaker embeddings in JSON output (only works with --diarize)")

parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
Expand Down
107 changes: 105 additions & 2 deletions whisperx/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@

from whisperx.utils import exact_div

import gc;
import tempfile;

# hard-coded audio hyperparameters
SAMPLE_RATE = 16000
N_FFT = 400
Expand All @@ -22,7 +25,7 @@
TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN) # 20ms per audio token


def load_audio(file: str, sr: int = SAMPLE_RATE) -> np.ndarray:
def load_audio_ram(file: str, sr: int = SAMPLE_RATE) -> np.ndarray:
"""
Open an audio file and read as mono waveform, resampling as necessary

Expand Down Expand Up @@ -62,7 +65,107 @@ def load_audio(file: str, sr: int = SAMPLE_RATE) -> np.ndarray:
except subprocess.CalledProcessError as e:
raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
# doing this in parts, instead of as a "one-liner", allows us to free the memory
# used by {out}, which can be quite a lot in the case of very long audio files.
# This should help minimize crashes caused by running out of RAM.
# Another alternative would be using temporary files and/or checking if the audio
# file is already in the required format and loading it directly with np.fromfile()
r = np.frombuffer(out, np.int16);
del out;
gc.collect();

r = r.flatten();
r = r.astype(np.float32) / 32768.0

return r;

def load_audio_tmpfile(file: str, sr: int = SAMPLE_RATE) -> np.ndarray:
    """
    Open an audio file and read as mono waveform, resampling as necessary.

    Unlike the in-RAM variant, ffmpeg decodes into a temporary file on disk,
    which is then loaded with np.fromfile(). This keeps peak RAM usage lower
    for very long audio files, since the raw decoded PCM never has to live in
    a Python bytes object alongside the NumPy array built from it.

    Parameters
    ----------
    file: str
        The audio file to open

    sr: int
        The sample rate to resample the audio if necessary

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.

    Raises
    ------
    RuntimeError
        If the ffmpeg subprocess fails (e.g. unreadable/invalid input file).
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        resampled_path = os.path.join(tmpdir, "resampledAudio")
        # Launch a subprocess to decode audio while down-mixing and resampling
        # as necessary, writing raw 16-bit little-endian mono PCM to a file.
        # Requires the ffmpeg CLI to be installed.
        cmd = [
            "ffmpeg",
            "-nostdin",
            "-threads",
            "0",
            "-i",
            file,
            "-f",
            "s16le",
            "-ac",
            "1",
            "-acodec",
            "pcm_s16le",
            "-ar",
            str(sr),
            resampled_path,
        ]
        try:
            # ffmpeg writes to {resampled_path}; stdout is not used, so the
            # previous `out = ....stdout` capture was dead code and is gone.
            subprocess.run(cmd, capture_output=True, check=True)
        except subprocess.CalledProcessError as e:
            raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

        # NOTE: the decoded file must be read *inside* the `with` block —
        # TemporaryDirectory() deletes the directory (and the file) on exit.
        # np.fromfile() of s16le data yields a 1-D int16 array already, so no
        # flatten() is needed.
        samples = np.fromfile(resampled_path, np.int16)

    # Normalize 16-bit PCM to float32 in [-1.0, 1.0).
    return samples.astype(np.float32) / 32768.0

def load_audio(file: str, sr: int = SAMPLE_RATE, useTmpFile: bool = False) -> np.ndarray:
    """
    Open an audio file and read as mono waveform, resampling as necessary.

    Dispatches to one of two loading strategies: fully in RAM (the default),
    or via a temporary file on disk, which lowers peak memory usage.

    Parameters
    ----------
    file: str
        The audio file to open

    sr: int
        The sample rate to resample the audio if necessary

    useTmpFile: bool
        If true, resample the audio using a temporary file.
        Might be the only way to load very long audio files
        without running out of RAM

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    """
    # NOTE(review): an automatic try-RAM-then-fall-back-to-tmpfile scheme was
    # attempted and did not work (an out-of-memory condition is not reliably
    # catchable here), so the caller must opt in explicitly via useTmpFile.
    if useTmpFile:
        return load_audio_tmpfile(file, sr)
    return load_audio_ram(file, sr)


def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
Expand Down
29 changes: 11 additions & 18 deletions whisperx/diarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ def __init__(
):
if isinstance(device, str):
device = torch.device(device)
model_config = model_name or "pyannote/speaker-diarization-3.1"
self.model = Pipeline.from_pretrained(model_config, use_auth_token=use_auth_token).to(device)
model_config = model_name or "pyannote/speaker-diarization-community-1"
self.model = Pipeline.from_pretrained(model_config, token=use_auth_token).to(device)

def __call__(
self,
Expand Down Expand Up @@ -51,22 +51,15 @@ def __call__(
'sample_rate': SAMPLE_RATE
}

if return_embeddings:
diarization, embeddings = self.model(
audio_data,
num_speakers=num_speakers,
min_speakers=min_speakers,
max_speakers=max_speakers,
return_embeddings=True,
)
else:
diarization = self.model(
audio_data,
num_speakers=num_speakers,
min_speakers=min_speakers,
max_speakers=max_speakers,
)
embeddings = None
output = self.model(
audio_data,
num_speakers=num_speakers,
min_speakers=min_speakers,
max_speakers=max_speakers,
)

diarization = output.speaker_diarization
embeddings = output.speaker_embeddings

diarize_df = pd.DataFrame(diarization.itertracks(yield_label=True), columns=['segment', 'label', 'speaker'])
diarize_df['start'] = diarize_df['segment'].apply(lambda x: x.start)
Expand Down
2 changes: 1 addition & 1 deletion whisperx/vads/pyannote.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def __init__(
**inference_kwargs,
):

super().__init__(segmentation=segmentation, fscore=fscore, use_auth_token=use_auth_token, **inference_kwargs)
super().__init__(segmentation=segmentation, fscore=fscore, token=use_auth_token, **inference_kwargs)

def apply(self, file: AudioFile, hook: Optional[Callable] = None) -> Annotation:
"""Apply voice activity detection
Expand Down