#!/usr/bin/env python3 """ pdf_to_audiobook.py 1) Always convert PDF → Markdown via marker + your OpenAI-compatible endpoint. 2) If you pass --voice-name, use Coqui XTTS (voice cloning); otherwise use Kokoro TTS with the “Daniel” speaker. 3) Manage your 6-sec snippets in voices/ directory. """ import os import argparse import tempfile import numpy as np import torch import soundfile as sf from pathlib import Path from pdfminer.high_level import extract_text from pydub import AudioSegment# Allow Coqui’s XttsConfig class to be unpickled # ─── Whitelist Coqui classes for safe CPU‐only loading ──────────────────────── from TTS.tts.configs.xtts_config import XttsConfig from TTS.tts.models.xtts import XttsAudioConfig from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.models.xtts import XttsArgs from torch.serialization import add_safe_globals add_safe_globals([XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs]) # ─── Coqui XTTS API ───────────────────────────────────────────────────────── from TTS.api import TTS # ─── Kokoro TTS API ───────────────────────────────────────────────────────── from kokoro import KPipeline # ─── marker PDF→Markdown API ──────────────────────────────────────────────── from marker.converters.pdf import PdfConverter from marker.models import create_model_dict from marker.config.parser import ConfigParser from marker.output import text_from_rendered VOICE_DIR = Path("voices") def list_available_voices(): return sorted(p.stem for p in VOICE_DIR.glob("*.wav")) def convert_pdf_to_markdown(pdf_path: str) -> str: key = os.getenv("OPENAI_API_KEY") model = os.getenv("OPENAI_MODEL") url = os.getenv("OPENAI_BASE_URL") if not key: raise RuntimeError("Please set OPENAI_API_KEY in your environment") cfg = { "output_format": "markdown", "use_llm": True, "llm_service": "marker.services.openai.OpenAIService", "openai_api_key": key, "openai_model": model, "openai_base_url": url, } parser = ConfigParser(cfg) converter = PdfConverter( config = parser.generate_config_dict(), artifact_dict = create_model_dict(), processor_list = parser.get_processors(), renderer = parser.get_renderer(), llm_service = parser.get_llm_service(), ) rendered = converter(pdf_path) markdown, _, _ = text_from_rendered(rendered) return markdown def synthesize_with_coqui( text: str, out_mp3: str, voice: str, speaker: str | None, lang: str, ): use_gpu = torch.cuda.is_available() tts = TTS(model_name="tts_models/multilingual/xtts_v2", gpu=use_gpu) spk_wav = [voice] spk_name = speaker or tts.speakers[0] print(f"⚙️ Using Coqui voice sample “{voice}” → speaker “{spk_name}”") tmp_wav = tempfile.mktemp(suffix=".wav") tts.tts_to_file( text = text, file_path = tmp_wav, speaker_wav = spk_wav, speaker = spk_name, language = lang, split_sentences = True, ) AudioSegment.from_wav(tmp_wav).export(out_mp3, format="mp3") print(f"✅ Coqui MP3 saved → {out_mp3}") def synthesize_with_kokoro( text: str, out_mp3:str, lang: str, ): pipeline = KPipeline(lang_code=lang) chunks = [] # Daniel is the chosen voice for _, _, audio in pipeline(text, voice="Daniel"): chunks.append(audio) full = np.concatenate(chunks, axis=0) tmp_wav = tempfile.mktemp(suffix=".wav") sf.write(tmp_wav, full, 24000) AudioSegment.from_wav(tmp_wav).export(out_mp3, format="mp3") print(f"✅ Kokoro MP3 saved → {out_mp3}") def main(): # ensure voices/ exists VOICE_DIR.mkdir(exist_ok=True) ap = argparse.ArgumentParser( description="PDF→Markdown→Audiobook (Coqui or Kokoro/Daniel)" ) ap.add_argument("pdf", help="Input PDF file") ap.add_argument("--list-voices", action="store_true", help="Show available voice snippets and exit") ap.add_argument("--voice-name", help="Name of a .mp3 in voices/ to clone") ap.add_argument("--speaker", help="Coqui built-in speaker (default if none)") ap.add_argument("--lang", default="en", help="Language code") ap.add_argument("--out", default="audiobook.mp3", help="Output MP3 path") args = ap.parse_args() if args.list_voices: print("Available voices:") for name in list_available_voices(): print(" ", name) return # 1) PDF→Markdown print("🔄 Converting PDF → Markdown…") text = convert_pdf_to_markdown(args.pdf) # 2) Choose engine if args.voice_name: wav_path = VOICE_DIR / f"{args.voice_name}.wav" if not wav_path.is_file(): raise FileNotFoundError(f"voices/{args.voice_name}.wav not found.") synthesize_with_coqui(text, args.out, str(wav_path), args.speaker, args.lang) else: synthesize_with_kokoro(text, args.out, args.lang) if __name__ == "__main__": main()