scarif/pdf-to-audiobook/pdf_to_audiobook.py

#!/usr/bin/env python3
"""
pdf_to_audiobook.py
1) Always convert PDF → Markdown via marker + your OpenAI-compatible endpoint.
2) If you pass --voice-name, use Coqui XTTS (voice cloning);
otherwise use Kokoro TTS with the “Daniel” speaker.
3) Manage your 6-second voice snippets in the voices/ directory.
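
Example usage (file names are illustrative):
    python pdf_to_audiobook.py book.pdf                      # Kokoro “Daniel”
    python pdf_to_audiobook.py book.pdf --voice-name alice   # clone voices/alice.wav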
"""
import os
import argparse
import tempfile
import numpy as np
import torch
import soundfile as sf
from pathlib import Path
from pydub import AudioSegment

# Allow Coqui's XttsConfig class to be unpickled
# ─── Whitelist Coqui classes for safe CPU-only loading ──────────────────────
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig
from TTS.config.shared_configs import BaseDatasetConfig
from torch.serialization import add_safe_globals
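# torch.load defaults to weights_only=True since PyTorch 2.6 and refuses to
# unpickle classes that are not explicitly allow-listed: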
add_safe_globals([XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs])
# ─── Coqui XTTS API ─────────────────────────────────────────────────────────
from TTS.api import TTS
# ─── Kokoro TTS API ─────────────────────────────────────────────────────────
from kokoro import KPipeline
# ─── marker PDF→Markdown API ────────────────────────────────────────────────
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser
from marker.output import text_from_rendered

VOICE_DIR = Path("voices")


def list_available_voices():
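    """Return the stem of each .wav snippet in voices/, sorted by name."""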
    return sorted(p.stem for p in VOICE_DIR.glob("*.wav"))


def convert_pdf_to_markdown(pdf_path: str) -> str:
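    """Convert a PDF to Markdown with marker's LLM-assisted pipeline.

    The OpenAI-compatible endpoint is configured through the OPENAI_API_KEY,
    OPENAI_MODEL and OPENAI_BASE_URL environment variables.
    """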
    key = os.getenv("OPENAI_API_KEY")
    model = os.getenv("OPENAI_MODEL")
    url = os.getenv("OPENAI_BASE_URL")
    if not key:
        raise RuntimeError("Please set OPENAI_API_KEY in your environment")
    cfg = {
        "output_format": "markdown",
        "use_llm": True,
        "llm_service": "marker.services.openai.OpenAIService",
        "openai_api_key": key,
    }
    # Only override marker's defaults when the env vars are actually set
    if model:
        cfg["openai_model"] = model
    if url:
        cfg["openai_base_url"] = url
    parser = ConfigParser(cfg)
    converter = PdfConverter(
        config=parser.generate_config_dict(),
        artifact_dict=create_model_dict(),
        processor_list=parser.get_processors(),
        renderer=parser.get_renderer(),
        llm_service=parser.get_llm_service(),
    )
    rendered = converter(pdf_path)
    markdown, _, _ = text_from_rendered(rendered)
    return markdown


def synthesize_with_coqui(
    text: str,
    out_mp3: str,
    voice: str,
    speaker: str | None,
    lang: str,
):
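    """Clone the voice in the `voice` .wav sample with Coqui XTTS v2 and
    write the narration to `out_mp3`."""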
    use_gpu = torch.cuda.is_available()
    tts = TTS(model_name="tts_models/multilingual/xtts_v2", gpu=use_gpu)
    spk_wav = [voice]
    spk_name = speaker or tts.speakers[0]
    print(f"⚙️ Using Coqui voice sample “{voice}” → speaker “{spk_name}”")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp_wav = tmp.name
    tts.tts_to_file(
        text=text,
        file_path=tmp_wav,
        speaker_wav=spk_wav,
        speaker=spk_name,
        language=lang,
        split_sentences=True,
    )
    AudioSegment.from_wav(tmp_wav).export(out_mp3, format="mp3")
    os.remove(tmp_wav)
    print(f"✅ Coqui MP3 saved → {out_mp3}")


def synthesize_with_kokoro(
    text: str,
    out_mp3: str,
    lang: str,
):
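    """Narrate `text` with Kokoro's “Daniel” voice and write it to `out_mp3`."""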
    # Kokoro uses one-letter language codes ('a' = American English,
    # 'b' = British English); map the CLI default "en" to 'b' so it
    # matches the British "Daniel" voice.
    kokoro_lang = {"en": "b"}.get(lang, lang)
    pipeline = KPipeline(lang_code=kokoro_lang)
    chunks = []
    # Daniel is the chosen voice; it ships as "bm_daniel" in Kokoro v1.0
    for _, _, audio in pipeline(text, voice="bm_daniel"):
        chunks.append(audio)
    full = np.concatenate(chunks, axis=0)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp_wav = tmp.name
    sf.write(tmp_wav, full, 24000)  # Kokoro synthesizes at 24 kHz
    AudioSegment.from_wav(tmp_wav).export(out_mp3, format="mp3")
    os.remove(tmp_wav)
    print(f"✅ Kokoro MP3 saved → {out_mp3}")


def main():
    # ensure voices/ exists
    VOICE_DIR.mkdir(exist_ok=True)
    ap = argparse.ArgumentParser(
        description="PDF→Markdown→Audiobook (Coqui or Kokoro/Daniel)"
    )
    # pdf is optional at parse time so --list-voices can run without one
    ap.add_argument("pdf", nargs="?", help="Input PDF file")
    ap.add_argument("--list-voices", action="store_true",
                    help="Show available voice snippets and exit")
    ap.add_argument("--voice-name", help="Name of a .wav in voices/ to clone")
    ap.add_argument("--speaker",
                    help="Coqui built-in speaker name (defaults to the model's first)")
    ap.add_argument("--lang", default="en", help="Language code")
    ap.add_argument("--out", default="audiobook.mp3",
                    help="Output MP3 path")
    args = ap.parse_args()
    if args.list_voices:
        print("Available voices:")
        for name in list_available_voices():
            print("  ", name)
        return
    if not args.pdf:
        ap.error("the pdf argument is required unless --list-voices is given")

    # 1) PDF → Markdown
    print("🔄 Converting PDF → Markdown…")
    text = convert_pdf_to_markdown(args.pdf)

    # 2) Choose the TTS engine
    if args.voice_name:
        wav_path = VOICE_DIR / f"{args.voice_name}.wav"
        if not wav_path.is_file():
            raise FileNotFoundError(f"voices/{args.voice_name}.wav not found.")
        synthesize_with_coqui(text, args.out, str(wav_path),
                              args.speaker, args.lang)
    else:
        synthesize_with_kokoro(text, args.out, args.lang)


if __name__ == "__main__":
    main()