Add script to convert PDF to audiobook

2025-07-14 11:09:44 +01:00
parent a06db3333b
commit 650bb88bc0
6 changed files with 203 additions and 0 deletions

Dockerfile

@@ -0,0 +1,23 @@
# Dockerfile
FROM python:3.10-slim
# 1) System deps for audio processing
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
build-essential \
ffmpeg \
libsndfile1 \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
# 2) Python deps
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# 3) Copy your script + code
COPY . .
# 4) Default entrypoint is just the script; pass args via docker-compose or CLI
ENTRYPOINT ["python", "pdf_to_audiobook.py"]

pdf_to_audiobook.py

@@ -0,0 +1,156 @@
#!/usr/bin/env python3
"""
pdf_to_audiobook.py
1) Always convert PDF → Markdown via marker + your OpenAI-compatible endpoint.
2) If you pass --voice-name, use Coqui XTTS (voice cloning);
   otherwise use Kokoro TTS with its “Daniel” (bm_daniel) voice.
3) Keep your ~6-second voice snippets in the voices/ directory.
"""
import os
import argparse
import tempfile
import numpy as np
import torch
import soundfile as sf
from pathlib import Path
from pydub import AudioSegment

# ─── Whitelist Coqui classes for safe CPU-only loading ──────────────────────
# (PyTorch ≥ 2.6 defaults torch.load to weights_only=True, so XTTS's config
#  and model classes must be allow-listed before the checkpoint can load)
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import XttsAudioConfig, XttsArgs
from TTS.config.shared_configs import BaseDatasetConfig
from torch.serialization import add_safe_globals
add_safe_globals([XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs])
# ─── Coqui XTTS API ─────────────────────────────────────────────────────────
from TTS.api import TTS
# ─── Kokoro TTS API ─────────────────────────────────────────────────────────
from kokoro import KPipeline
# ─── marker PDF→Markdown API ────────────────────────────────────────────────
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser
from marker.output import text_from_rendered
VOICE_DIR = Path("voices")
def list_available_voices():
return sorted(p.stem for p in VOICE_DIR.glob("*.wav"))
def convert_pdf_to_markdown(pdf_path: str) -> str:
key = os.getenv("OPENAI_API_KEY")
model = os.getenv("OPENAI_MODEL")
url = os.getenv("OPENAI_BASE_URL")
if not key:
raise RuntimeError("Please set OPENAI_API_KEY in your environment")
cfg = {
"output_format": "markdown",
"use_llm": True,
"llm_service": "marker.services.openai.OpenAIService",
"openai_api_key": key,
"openai_model": model,
"openai_base_url": url,
}
parser = ConfigParser(cfg)
converter = PdfConverter(
config = parser.generate_config_dict(),
artifact_dict = create_model_dict(),
processor_list = parser.get_processors(),
renderer = parser.get_renderer(),
llm_service = parser.get_llm_service(),
)
rendered = converter(pdf_path)
markdown, _, _ = text_from_rendered(rendered)
return markdown
def synthesize_with_coqui(
text: str,
out_mp3: str,
voice: str,
speaker: str | None,
lang: str,
):
use_gpu = torch.cuda.is_available()
    tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=use_gpu)
spk_wav = [voice]
spk_name = speaker or tts.speakers[0]
    print(f"⚙️ Using Coqui voice sample “{voice}” → speaker “{spk_name}”")
tmp_wav = tempfile.mktemp(suffix=".wav")
tts.tts_to_file(
text = text,
file_path = tmp_wav,
speaker_wav = spk_wav,
speaker = spk_name,
language = lang,
split_sentences = True,
)
AudioSegment.from_wav(tmp_wav).export(out_mp3, format="mp3")
print(f"✅ Coqui MP3 saved → {out_mp3}")
def synthesize_with_kokoro(
    text: str,
    out_mp3: str,
    lang: str,
):
    # Kokoro uses one-letter language codes ("a" = American English,
    # "b" = British English), so map the common two-letter codes first
    lang_code = {"en": "b", "en-us": "a", "en-gb": "b"}.get(lang.lower(), lang)
    pipeline = KPipeline(lang_code=lang_code)
    chunks = []
    # "bm_daniel" is Kokoro's British-English male “Daniel” voice
    for _, _, audio in pipeline(text, voice="bm_daniel"):
        chunks.append(audio)
full = np.concatenate(chunks, axis=0)
tmp_wav = tempfile.mktemp(suffix=".wav")
sf.write(tmp_wav, full, 24000)
AudioSegment.from_wav(tmp_wav).export(out_mp3, format="mp3")
print(f"✅ Kokoro MP3 saved → {out_mp3}")
def main():
# ensure voices/ exists
VOICE_DIR.mkdir(exist_ok=True)
ap = argparse.ArgumentParser(
description="PDF→Markdown→Audiobook (Coqui or Kokoro/Daniel)"
)
ap.add_argument("pdf", help="Input PDF file")
ap.add_argument("--list-voices", action="store_true",
help="Show available voice snippets and exit")
ap.add_argument("--voice-name", help="Name of a .mp3 in voices/ to clone")
ap.add_argument("--speaker", help="Coqui built-in speaker (default if none)")
ap.add_argument("--lang", default="en", help="Language code")
ap.add_argument("--out", default="audiobook.mp3",
help="Output MP3 path")
args = ap.parse_args()
if args.list_voices:
print("Available voices:")
for name in list_available_voices():
print(" ", name)
return
# 1) PDF→Markdown
print("🔄 Converting PDF → Markdown…")
text = convert_pdf_to_markdown(args.pdf)
# 2) Choose engine
if args.voice_name:
wav_path = VOICE_DIR / f"{args.voice_name}.wav"
if not wav_path.is_file():
raise FileNotFoundError(f"voices/{args.voice_name}.wav not found.")
synthesize_with_coqui(text, args.out, str(wav_path),
args.speaker, args.lang)
else:
synthesize_with_kokoro(text, args.out, args.lang)
if __name__ == "__main__":
main()
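
For reference, typical invocations of the script (paper.pdf and daniel_clone are hypothetical names; convert_pdf_to_markdown requires OPENAI_API_KEY either way):

export OPENAI_API_KEY=...   # plus OPENAI_MODEL / OPENAI_BASE_URL for non-default endpoints

# show the cloning snippets found in voices/
python pdf_to_audiobook.py paper.pdf --list-voices

# default engine: Kokoro's Daniel voice, written to audiobook.mp3
python pdf_to_audiobook.py paper.pdf

# clone voices/daniel_clone.wav with Coqui XTTS
python pdf_to_audiobook.py paper.pdf --voice-name daniel_clone --out paper.mp3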

View File

@@ -0,0 +1,9 @@
marker-pdf
kokoro>=0.9.4
soundfile
TTS
pdfminer.six
pydub
torch
"transformers<4.50.0"

Binary file not shown.

Binary file not shown.