#!/usr/bin/env python3
"""
pdf_to_audiobook.py

1) Always converts PDF → Markdown via marker and your OpenAI-compatible endpoint.
2) If you pass --voice-name, clones that voice with Coqui XTTS;
   otherwise narrates with Kokoro TTS using the “Daniel” speaker.
3) Expects your ~6-second voice snippets as .wav files in the voices/ directory.
"""

import os
import argparse
import tempfile

import numpy as np
import torch
import soundfile as sf
from pathlib import Path
from pydub import AudioSegment

# ─── Whitelist Coqui classes for safe CPU-only loading ────────────────────────
# Allow Coqui's config classes to be unpickled: recent PyTorch versions load
# checkpoints with weights_only=True by default, so these must be allow-listed.
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig
from TTS.config.shared_configs import BaseDatasetConfig
from torch.serialization import add_safe_globals

add_safe_globals([XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs])

# ─── Coqui XTTS API ─────────────────────────────────────────────────────────
from TTS.api import TTS

# ─── Kokoro TTS API ─────────────────────────────────────────────────────────
from kokoro import KPipeline

# ─── marker PDF→Markdown API ────────────────────────────────────────────────
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser
from marker.output import text_from_rendered

VOICE_DIR = Path("voices")


def list_available_voices():
    return sorted(p.stem for p in VOICE_DIR.glob("*.wav"))


def convert_pdf_to_markdown(pdf_path: str) -> str:
    key = os.getenv("OPENAI_API_KEY")
    model = os.getenv("OPENAI_MODEL")
    url = os.getenv("OPENAI_BASE_URL")
    if not key:
        raise RuntimeError("Please set OPENAI_API_KEY in your environment")
    cfg = {
        "output_format": "markdown",
        "use_llm": True,
        "llm_service": "marker.services.openai.OpenAIService",
        "openai_api_key": key,
        "openai_model": model,
        "openai_base_url": url,
    }
    parser = ConfigParser(cfg)
    converter = PdfConverter(
        config=parser.generate_config_dict(),
        artifact_dict=create_model_dict(),
        processor_list=parser.get_processors(),
        renderer=parser.get_renderer(),
        llm_service=parser.get_llm_service(),
    )
    rendered = converter(pdf_path)
    markdown, _, _ = text_from_rendered(rendered)
    return markdown
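
# Example environment setup for the marker → LLM step (values are
# illustrative; any OpenAI-compatible endpoint should work):
#   export OPENAI_API_KEY="sk-..."
#   export OPENAI_MODEL="gpt-4o-mini"
#   export OPENAI_BASE_URL="https://api.openai.com/v1"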


def synthesize_with_coqui(
    text: str,
    out_mp3: str,
    voice: str,
    speaker: str | None,
    lang: str,
):
    use_gpu = torch.cuda.is_available()
    # Full Coqui model id includes the "multi-dataset" segment.
    tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=use_gpu)

    spk_wav = [voice]
    spk_name = speaker or tts.speakers[0]
    print(f"⚙️ Using Coqui voice sample “{voice}” → speaker “{spk_name}”")

    tmp_wav = tempfile.mktemp(suffix=".wav")
    tts.tts_to_file(
        text=text,
        file_path=tmp_wav,
        speaker_wav=spk_wav,
        speaker=spk_name,
        language=lang,
        split_sentences=True,
    )
    AudioSegment.from_wav(tmp_wav).export(out_mp3, format="mp3")
    print(f"✅ Coqui MP3 saved → {out_mp3}")


def synthesize_with_kokoro(
    text: str,
    out_mp3: str,
    lang: str,
):
    # Kokoro uses single-letter language codes ('a' = American English,
    # 'b' = British English); map the common "en" to British English,
    # which matches the Daniel voice.
    lang_code = {"en": "b"}.get(lang, lang)
    pipeline = KPipeline(lang_code=lang_code)
    chunks = []
    # Daniel is the chosen voice; Kokoro's id for it is "bm_daniel".
    for _, _, audio in pipeline(text, voice="bm_daniel"):
        chunks.append(audio)
    full = np.concatenate(chunks, axis=0)

    tmp_wav = tempfile.mktemp(suffix=".wav")
    sf.write(tmp_wav, full, 24000)  # Kokoro outputs 24 kHz audio
    AudioSegment.from_wav(tmp_wav).export(out_mp3, format="mp3")
    print(f"✅ Kokoro MP3 saved → {out_mp3}")


def main():
    # Ensure voices/ exists.
    VOICE_DIR.mkdir(exist_ok=True)

    ap = argparse.ArgumentParser(
        description="PDF→Markdown→Audiobook (Coqui or Kokoro/Daniel)"
    )
    ap.add_argument("pdf", help="Input PDF file")
    ap.add_argument("--list-voices", action="store_true",
                    help="Show available voice snippets and exit")
    ap.add_argument("--voice-name", help="Name of a .wav in voices/ to clone")
    ap.add_argument("--speaker", help="Coqui built-in speaker (defaults to the model's first)")
    ap.add_argument("--lang", default="en", help="Language code")
    ap.add_argument("--out", default="audiobook.mp3",
                    help="Output MP3 path")
    args = ap.parse_args()

    if args.list_voices:
        print("Available voices:")
        for name in list_available_voices():
            print(" ", name)
        return

    # 1) PDF → Markdown
    print("🔄 Converting PDF → Markdown…")
    text = convert_pdf_to_markdown(args.pdf)

    # 2) Choose engine
    if args.voice_name:
        wav_path = VOICE_DIR / f"{args.voice_name}.wav"
        if not wav_path.is_file():
            raise FileNotFoundError(f"voices/{args.voice_name}.wav not found.")
        synthesize_with_coqui(text, args.out, str(wav_path),
                              args.speaker, args.lang)
    else:
        synthesize_with_kokoro(text, args.out, args.lang)


if __name__ == "__main__":
    main()