Add script to convert pdf to audiobook
This commit is contained in:
@@ -267,6 +267,21 @@ services:
|
||||
networks:
|
||||
- nginx
|
||||
|
||||
pdf2audiobook:
  # PDF → audiobook converter built from ./pdf-to-audiobook.
  <<: *logging
  build: ./pdf-to-audiobook
  image: pdf2audiobook:latest
  container_name: pdf2audiobook
  working_dir: /app
  volumes:
    # Source tree is bind-mounted over /app.
    - ./pdf-to-audiobook:/app
    # Input library is mounted read-only; generated audio goes to /audio.
    - /mnt/tower/stardust/chris/files/Library:/books:ro
    - /mnt/tower/stardust/chris/files/Audiobooks:/audio
  environment:
    - OPENAI_API_KEY=${OPENAI_API_KEY}
    - OPENAI_API_BASE=${OPENAI_API_BASE:-https://aihubmix.com/v1}
    # pdf_to_audiobook.py reads OPENAI_BASE_URL, so export the same endpoint
    # under that name as well; OPENAI_API_BASE is kept for compatibility.
    - OPENAI_BASE_URL=${OPENAI_API_BASE:-https://aihubmix.com/v1}
    - OPENAI_MODEL=${OPENAI_MODEL:-aihubmix-Llama-3-3-70B-Instruct}
|
||||
|
||||
jitsi:
|
||||
<<: *logging
|
||||
image: jitsi/web:stable
|
||||
|
||||
23
pdf-to-audiobook/Dockerfile
Normal file
23
pdf-to-audiobook/Dockerfile
Normal file
@@ -0,0 +1,23 @@
|
||||
# Image for the pdf-to-audiobook converter script.
FROM python:3.10-slim

# System packages required for audio processing. The apt package lists are
# removed in the same layer to keep the image small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ffmpeg \
        libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies before copying the sources so this layer is
# reused as long as requirements.txt is unchanged.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Application code (script plus everything else in the build context).
COPY . .

# The container runs the script directly; arguments are supplied through
# docker-compose or the `docker run` command line.
ENTRYPOINT ["python", "pdf_to_audiobook.py"]
|
||||
|
||||
156
pdf-to-audiobook/pdf_to_audiobook.py
Normal file
156
pdf-to-audiobook/pdf_to_audiobook.py
Normal file
@@ -0,0 +1,156 @@
|
||||
|
||||
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
pdf_to_audiobook.py
|
||||
|
||||
1) Always convert PDF → Markdown via marker + your OpenAI-compatible endpoint.
|
||||
2) If you pass --voice-name, use Coqui XTTS (voice cloning);
|
||||
otherwise use Kokoro TTS with the “Daniel” speaker.
|
||||
3) Manage your 6-sec snippets in voices/ directory.
|
||||
"""
|
||||
|
||||
import os
|
||||
import argparse
|
||||
import tempfile
|
||||
import numpy as np
|
||||
import torch
|
||||
import soundfile as sf
|
||||
from pathlib import Path
|
||||
from pdfminer.high_level import extract_text
|
||||
from pydub import AudioSegment# Allow Coqui’s XttsConfig class to be unpickled
|
||||
|
||||
# ─── Whitelist Coqui classes for safe CPU‐only loading ────────────────────────
|
||||
from TTS.tts.configs.xtts_config import XttsConfig
|
||||
from TTS.tts.models.xtts import XttsAudioConfig
|
||||
from TTS.config.shared_configs import BaseDatasetConfig
|
||||
from TTS.tts.models.xtts import XttsArgs
|
||||
from torch.serialization import add_safe_globals
|
||||
add_safe_globals([XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs])
|
||||
|
||||
# ─── Coqui XTTS API ─────────────────────────────────────────────────────────
|
||||
from TTS.api import TTS
|
||||
|
||||
# ─── Kokoro TTS API ─────────────────────────────────────────────────────────
|
||||
from kokoro import KPipeline
|
||||
|
||||
# ─── marker PDF→Markdown API ────────────────────────────────────────────────
|
||||
from marker.converters.pdf import PdfConverter
|
||||
from marker.models import create_model_dict
|
||||
from marker.config.parser import ConfigParser
|
||||
from marker.output import text_from_rendered
|
||||
|
||||
# Directory holding the voice snippets used for cloning. The path is relative,
# so it is resolved against the current working directory at run time.
VOICE_DIR = Path("voices")
|
||||
|
||||
def list_available_voices():
    """Return the sorted names of the voice snippets in ``VOICE_DIR``.

    A voice name is the file stem of a ``*.wav`` file located directly
    inside ``VOICE_DIR``.
    """
    names = [snippet.stem for snippet in VOICE_DIR.glob("*.wav")]
    names.sort()
    return names
|
||||
|
||||
def convert_pdf_to_markdown(pdf_path: str) -> str:
    """Convert a PDF to Markdown with marker and an OpenAI-compatible LLM.

    Settings are read from the environment:

    * ``OPENAI_API_KEY`` (required)
    * ``OPENAI_MODEL`` (optional)
    * ``OPENAI_BASE_URL`` (optional); ``OPENAI_API_BASE`` is accepted as a
      fallback because that is the variable name the docker-compose service
      for this script sets.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        The Markdown text produced by ``text_from_rendered``.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is unset or empty.
    """
    key = os.getenv("OPENAI_API_KEY")
    if not key:
        # Fail before any model is loaded: the LLM service cannot work
        # without credentials.
        raise RuntimeError("Please set OPENAI_API_KEY in your environment")

    model = os.getenv("OPENAI_MODEL")
    # Prefer the name this script has always read, then fall back to the
    # name exported by the deployment so a custom endpoint is not ignored.
    url = os.getenv("OPENAI_BASE_URL") or os.getenv("OPENAI_API_BASE")

    cfg = {
        "output_format": "markdown",
        "use_llm": True,
        "llm_service": "marker.services.openai.OpenAIService",
        "openai_api_key": key,
        "openai_model": model,
        "openai_base_url": url,
    }
    parser = ConfigParser(cfg)
    converter = PdfConverter(
        config=parser.generate_config_dict(),
        artifact_dict=create_model_dict(),
        processor_list=parser.get_processors(),
        renderer=parser.get_renderer(),
        llm_service=parser.get_llm_service(),
    )
    rendered = converter(pdf_path)
    # text_from_rendered returns a 3-tuple; only the text is needed here.
    markdown, _, _ = text_from_rendered(rendered)
    return markdown
|
||||
|
||||
def synthesize_with_coqui(
    text: str,
    out_mp3: str,
    voice: str,
    speaker: str | None,
    lang: str,
) -> None:
    """Synthesize ``text`` with Coqui XTTS v2 and save the result as MP3.

    The model runs on the GPU when ``torch.cuda.is_available()`` reports
    one, otherwise on the CPU. Audio is first rendered to a WAV file inside
    a private temporary directory, then transcoded to MP3 with pydub; the
    temporary directory is removed even if synthesis or export fails.

    Args:
        text: Text to read aloud.
        out_mp3: Destination path of the MP3 file.
        voice: Path to the reference audio sample to clone.
        speaker: Built-in speaker name; when falsy, the first entry of
            ``tts.speakers`` is used.
        lang: Language code handed to the model.
    """
    use_gpu = torch.cuda.is_available()
    tts = TTS(model_name="tts_models/multilingual/xtts_v2", gpu=use_gpu)

    spk_wav = [voice]
    # NOTE(review): both a cloning sample and a built-in speaker name are
    # passed below — confirm which one XTTS gives precedence to.
    spk_name = speaker or tts.speakers[0]
    print(f"⚙️ Using Coqui voice sample “{voice}” → speaker “{spk_name}”")

    # tempfile.mktemp() is deprecated and race-prone, and the file it named
    # was never deleted; a TemporaryDirectory fixes both.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_wav = os.path.join(tmp_dir, "coqui.wav")
        tts.tts_to_file(
            text=text,
            file_path=tmp_wav,
            speaker_wav=spk_wav,
            speaker=spk_name,
            language=lang,
            split_sentences=True,
        )
        # export() returns the opened output file; close it so the handle
        # is not left dangling.
        AudioSegment.from_wav(tmp_wav).export(out_mp3, format="mp3").close()
    print(f"✅ Coqui MP3 saved → {out_mp3}")
|
||||
|
||||
def synthesize_with_kokoro(
    text: str,
    out_mp3: str,
    lang: str,
) -> None:
    """Synthesize ``text`` with Kokoro's "Daniel" voice and save it as MP3.

    Args:
        text: Text to read aloud.
        out_mp3: Destination path of the MP3 file.
        lang: Language code. The generic ``"en"`` (this script's CLI
            default) is mapped to Kokoro's British English code ``"b"``;
            any other value is passed to ``KPipeline`` unchanged.

    Raises:
        ValueError: If Kokoro yields no audio (for example, empty text).
    """
    # Kokoro uses its own language codes and does not accept plain "en";
    # "Daniel" is a British voice, so map "en" to British English.
    lang_code = "b" if lang.lower() == "en" else lang
    pipeline = KPipeline(lang_code=lang_code)

    # Kokoro publishes the "Daniel" voice under the id "bm_daniel".
    chunks = [
        audio
        for _, _, audio in pipeline(text, voice="bm_daniel")
        if audio is not None
    ]
    if not chunks:
        # np.concatenate would raise a less helpful ValueError here.
        raise ValueError("Kokoro produced no audio; is the input text empty?")
    full = np.concatenate(chunks, axis=0)

    sample_rate = 24000  # sample rate the WAV file is written with
    # tempfile.mktemp() is deprecated and race-prone, and the file it named
    # was never deleted; a TemporaryDirectory fixes both.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_wav = os.path.join(tmp_dir, "kokoro.wav")
        sf.write(tmp_wav, full, sample_rate)
        # export() returns the opened output file; close it explicitly.
        AudioSegment.from_wav(tmp_wav).export(out_mp3, format="mp3").close()
    print(f"✅ Kokoro MP3 saved → {out_mp3}")
|
||||
|
||||
def main():
    """Command-line entry point: PDF → Markdown → MP3 audiobook.

    With ``--list-voices`` the available voice snippets are printed and the
    program returns without needing a PDF. Otherwise the PDF is converted to
    Markdown and read aloud: with Coqui XTTS when ``--voice-name`` is given,
    with Kokoro otherwise.

    Raises:
        FileNotFoundError: If ``--voice-name`` names a snippet that does not
            exist as ``voices/<name>.wav``.
    """
    # Make sure the voices/ directory exists.
    VOICE_DIR.mkdir(exist_ok=True)

    ap = argparse.ArgumentParser(
        description="PDF→Markdown→Audiobook (Coqui or Kokoro/Daniel)"
    )
    # Optional at parse time so that --list-voices works on its own; the
    # presence check happens below for the conversion path.
    ap.add_argument("pdf", nargs="?", help="Input PDF file")
    ap.add_argument(
        "--list-voices",
        action="store_true",
        help="Show available voice snippets and exit",
    )
    # Only .wav snippets are looked up, so the help text says .wav.
    ap.add_argument("--voice-name", help="Name of a .wav in voices/ to clone")
    ap.add_argument("--speaker", help="Coqui built-in speaker (default if none)")
    ap.add_argument("--lang", default="en", help="Language code")
    ap.add_argument("--out", default="audiobook.mp3", help="Output MP3 path")
    args = ap.parse_args()

    if args.list_voices:
        print("Available voices:")
        for name in list_available_voices():
            print(" ", name)
        return

    if args.pdf is None:
        # Same usage error and exit status argparse gives for a missing
        # required positional.
        ap.error("the following arguments are required: pdf")

    # 1) PDF → Markdown
    print("🔄 Converting PDF → Markdown…")
    text = convert_pdf_to_markdown(args.pdf)

    # 2) Choose the TTS engine
    if args.voice_name:
        wav_path = VOICE_DIR / f"{args.voice_name}.wav"
        if not wav_path.is_file():
            raise FileNotFoundError(f"voices/{args.voice_name}.wav not found.")
        synthesize_with_coqui(
            text, args.out, str(wav_path), args.speaker, args.lang
        )
    else:
        synthesize_with_kokoro(text, args.out, args.lang)


if __name__ == "__main__":
    main()
|
||||
|
||||
9
pdf-to-audiobook/requirements.txt
Normal file
9
pdf-to-audiobook/requirements.txt
Normal file
@@ -0,0 +1,9 @@
|
||||
marker-pdf
kokoro>=0.9.4
soundfile
TTS
pdfminer.six
pydub
torch
# Requirement lines are not shell-processed, so the pin must not be quoted.
transformers<4.50.0
|
||||
|
||||
BIN
pdf-to-audiobook/voices/andy_serkiss.mp3
Normal file
BIN
pdf-to-audiobook/voices/andy_serkiss.mp3
Normal file
Binary file not shown.
BIN
pdf-to-audiobook/voices/john_lee.mp3
Normal file
BIN
pdf-to-audiobook/voices/john_lee.mp3
Normal file
Binary file not shown.
Reference in New Issue
Block a user