#!/usr/bin/env python3
"""
pdf_to_audiobook.py

1) Always convert PDF → Markdown via marker + your OpenAI-compatible endpoint.
2) If you pass --voice-name, use Coqui XTTS (voice cloning);
   otherwise use Kokoro TTS with the “Daniel” speaker.
3) Keep your ~6-second voice snippets (.wav) in the voices/ directory.
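
Example usage (file names are placeholders):

    pdf_to_audiobook.py book.pdf                       # Kokoro “Daniel” narration
    pdf_to_audiobook.py book.pdf --voice-name myvoice  # clone voices/myvoice.wav
    pdf_to_audiobook.py --list-voices                  # show available snippets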
"""

import os
import argparse
import tempfile

import numpy as np
import torch
import soundfile as sf
from pathlib import Path
from pydub import AudioSegment

# ─── Whitelist Coqui classes for safe CPU-only loading ───────────────────────
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import XttsAudioConfig, XttsArgs
from TTS.config.shared_configs import BaseDatasetConfig
from torch.serialization import add_safe_globals

# Allow Coqui’s XttsConfig class to be unpickled
add_safe_globals([XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs])
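# PyTorch ≥ 2.6 defaults torch.load(weights_only=True), which refuses to
# unpickle classes that aren't explicitly allow-listed; without the call above,
# loading the XTTS checkpoint can fail with an UnpicklingError.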

# ─── Coqui XTTS API ─────────────────────────────────────────────────────────
from TTS.api import TTS

# ─── Kokoro TTS API ─────────────────────────────────────────────────────────
from kokoro import KPipeline

# ─── marker PDF→Markdown API ────────────────────────────────────────────────
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser
from marker.output import text_from_rendered

VOICE_DIR = Path("voices")
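# Voice snippets are short (~6 s) reference clips stored as voices/<name>.wav.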


def list_available_voices():
    return sorted(p.stem for p in VOICE_DIR.glob("*.wav"))


def convert_pdf_to_markdown(pdf_path: str) -> str:
    key = os.getenv("OPENAI_API_KEY")
    model = os.getenv("OPENAI_MODEL")
    url = os.getenv("OPENAI_BASE_URL")
    if not key:
        raise RuntimeError("Please set OPENAI_API_KEY in your environment")
    cfg = {
        "output_format": "markdown",
        "use_llm": True,
        "llm_service": "marker.services.openai.OpenAIService",
        "openai_api_key": key,
        "openai_model": model,
        "openai_base_url": url,
    }
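    # OPENAI_MODEL and OPENAI_BASE_URL let marker talk to any OpenAI-compatible
    # endpoint; only OPENAI_API_KEY is validated here.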
    parser = ConfigParser(cfg)
    converter = PdfConverter(
        config=parser.generate_config_dict(),
        artifact_dict=create_model_dict(),
        processor_list=parser.get_processors(),
        renderer=parser.get_renderer(),
        llm_service=parser.get_llm_service(),
    )
    rendered = converter(pdf_path)
    markdown, _, _ = text_from_rendered(rendered)
    return markdown


def synthesize_with_coqui(
    text: str,
    out_mp3: str,
    voice: str,
    speaker: str | None,
    lang: str,
):
    use_gpu = torch.cuda.is_available()
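    # First use downloads the XTTS v2 checkpoint into Coqui's local model cache.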
    tts = TTS(model_name="tts_models/multilingual/xtts_v2", gpu=use_gpu)

    spk_wav = [voice]
    spk_name = speaker or tts.speakers[0]
    print(f"⚙️ Using Coqui voice sample “{voice}” → speaker “{spk_name}”")

    # mkstemp instead of the deprecated (and race-prone) tempfile.mktemp
    fd, tmp_wav = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    tts.tts_to_file(
        text=text,
        file_path=tmp_wav,
        speaker_wav=spk_wav,
        speaker=spk_name,
        language=lang,
        split_sentences=True,
    )
    AudioSegment.from_wav(tmp_wav).export(out_mp3, format="mp3")
    os.remove(tmp_wav)
    print(f"✅ Coqui MP3 saved → {out_mp3}")


def synthesize_with_kokoro(
    text: str,
    out_mp3: str,
    lang: str,
):
    # Kokoro uses one-letter language codes; plain "en" is mapped to "b"
    # (British English) to match the Daniel voice. Other codes pass through.
    lang_code = "b" if lang == "en" else lang
    pipeline = KPipeline(lang_code=lang_code)
    chunks = []
    # Daniel is the chosen voice; Kokoro ships it as "bm_daniel"
    for _, _, audio in pipeline(text, voice="bm_daniel"):
        chunks.append(audio)
    full = np.concatenate(chunks, axis=0)
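
    # Kokoro generates audio at 24 kHz, hence the fixed sample rate below.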
    fd, tmp_wav = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    sf.write(tmp_wav, full, 24000)
    AudioSegment.from_wav(tmp_wav).export(out_mp3, format="mp3")
    os.remove(tmp_wav)
    print(f"✅ Kokoro MP3 saved → {out_mp3}")


def main():
    # ensure voices/ exists
    VOICE_DIR.mkdir(exist_ok=True)

    ap = argparse.ArgumentParser(
        description="PDF→Markdown→Audiobook (Coqui or Kokoro/Daniel)"
    )
    # pdf is optional at parse time so that --list-voices works on its own
    ap.add_argument("pdf", nargs="?", help="Input PDF file")
    ap.add_argument("--list-voices", action="store_true",
                    help="Show available voice snippets and exit")
    ap.add_argument("--voice-name", help="Name of a .wav in voices/ to clone")
    ap.add_argument("--speaker", help="Coqui built-in speaker (default if none)")
    ap.add_argument("--lang", default="en", help="Language code")
    ap.add_argument("--out", default="audiobook.mp3",
                    help="Output MP3 path")
    args = ap.parse_args()

    if args.list_voices:
        print("Available voices:")
        for name in list_available_voices():
            print("  ", name)
        return

    if not args.pdf:
        ap.error("the pdf argument is required unless --list-voices is given")

    # 1) PDF→Markdown
    print("🔄 Converting PDF → Markdown…")
    text = convert_pdf_to_markdown(args.pdf)

    # 2) Choose engine
    if args.voice_name:
        wav_path = VOICE_DIR / f"{args.voice_name}.wav"
        if not wav_path.is_file():
            raise FileNotFoundError(f"voices/{args.voice_name}.wav not found.")
        synthesize_with_coqui(text, args.out, str(wav_path),
                              args.speaker, args.lang)
    else:
        synthesize_with_kokoro(text, args.out, args.lang)


if __name__ == "__main__":
    main()