Add script to convert pdf to audiobook
This commit is contained in:
@@ -267,6 +267,21 @@ services:
|
|||||||
networks:
|
networks:
|
||||||
- nginx
|
- nginx
|
||||||
|
|
||||||
|
pdf2audiobook:
|
||||||
|
<<: *logging
|
||||||
|
build: ./pdf-to-audiobook
|
||||||
|
image: pdf2audiobook:latest
|
||||||
|
container_name: pdf2audiobook
|
||||||
|
working_dir: /app
|
||||||
|
volumes:
|
||||||
|
- ./pdf-to-audiobook:/app
|
||||||
|
- /mnt/tower/stardust/chris/files/Library:/books:ro
|
||||||
|
- /mnt/tower/stardust/chris/files/Audiobooks:/audio
|
||||||
|
environment:
|
||||||
|
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
||||||
|
- OPENAI_API_BASE=${OPENAI_API_BASE:-https://aihubmix.com/v1}
|
||||||
|
- OPENAI_MODEL=${OPENAI_MODEL:-aihubmix-Llama-3-3-70B-Instruct}
|
||||||
|
|
||||||
jitsi:
|
jitsi:
|
||||||
<<: *logging
|
<<: *logging
|
||||||
image: jitsi/web:stable
|
image: jitsi/web:stable
|
||||||
|
|||||||
23
pdf-to-audiobook/Dockerfile
Normal file
23
pdf-to-audiobook/Dockerfile
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
# Dockerfile
|
||||||
|
FROM python:3.10-slim
|
||||||
|
|
||||||
|
# 1) System deps for audio processing
|
||||||
|
RUN apt-get update \
|
||||||
|
&& apt-get install -y --no-install-recommends \
|
||||||
|
build-essential \
|
||||||
|
ffmpeg \
|
||||||
|
libsndfile1 \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# 2) Python deps
|
||||||
|
COPY requirements.txt .
|
||||||
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
|
# 3) Copy your script + code
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
# 4) Default entrypoint is just the script; pass args via docker-compose or CLI
|
||||||
|
ENTRYPOINT ["python", "pdf_to_audiobook.py"]
|
||||||
|
|
||||||
156
pdf-to-audiobook/pdf_to_audiobook.py
Normal file
156
pdf-to-audiobook/pdf_to_audiobook.py
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
|
||||||
|
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
pdf_to_audiobook.py
|
||||||
|
|
||||||
|
1) Always convert PDF → Markdown via marker + your OpenAI-compatible endpoint.
|
||||||
|
2) If you pass --voice-name, use Coqui XTTS (voice cloning);
|
||||||
|
otherwise use Kokoro TTS with the “Daniel” speaker.
|
||||||
|
3) Manage your 6-sec snippets in voices/ directory.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
import tempfile
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import soundfile as sf
|
||||||
|
from pathlib import Path
|
||||||
|
from pdfminer.high_level import extract_text
|
||||||
|
from pydub import AudioSegment# Allow Coqui’s XttsConfig class to be unpickled
|
||||||
|
|
||||||
|
# ─── Whitelist Coqui classes for safe CPU‐only loading ────────────────────────
|
||||||
|
from TTS.tts.configs.xtts_config import XttsConfig
|
||||||
|
from TTS.tts.models.xtts import XttsAudioConfig
|
||||||
|
from TTS.config.shared_configs import BaseDatasetConfig
|
||||||
|
from TTS.tts.models.xtts import XttsArgs
|
||||||
|
from torch.serialization import add_safe_globals
|
||||||
|
add_safe_globals([XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs])
|
||||||
|
|
||||||
|
# ─── Coqui XTTS API ─────────────────────────────────────────────────────────
|
||||||
|
from TTS.api import TTS
|
||||||
|
|
||||||
|
# ─── Kokoro TTS API ─────────────────────────────────────────────────────────
|
||||||
|
from kokoro import KPipeline
|
||||||
|
|
||||||
|
# ─── marker PDF→Markdown API ────────────────────────────────────────────────
|
||||||
|
from marker.converters.pdf import PdfConverter
|
||||||
|
from marker.models import create_model_dict
|
||||||
|
from marker.config.parser import ConfigParser
|
||||||
|
from marker.output import text_from_rendered
|
||||||
|
|
||||||
|
VOICE_DIR = Path("voices")


def list_available_voices():
    """Return the names of the voice samples stored in ``VOICE_DIR``.

    Only files matching ``*.wav`` directly inside ``VOICE_DIR`` are
    considered.

    Returns:
        A list of file stems (file names without the ``.wav`` suffix),
        sorted in ascending order.
    """
    stems = [wav_file.stem for wav_file in VOICE_DIR.glob("*.wav")]
    stems.sort()
    return stems
|
||||||
|
|
||||||
|
def convert_pdf_to_markdown(pdf_path: str) -> str:
    """Convert a PDF to Markdown with marker and an OpenAI-compatible LLM.

    Connection settings are read from the environment:

    * ``OPENAI_API_KEY`` (required)
    * ``OPENAI_MODEL`` (optional)
    * ``OPENAI_BASE_URL``, falling back to ``OPENAI_API_BASE`` (optional)

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        The Markdown text produced by marker for the document.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is unset or empty.
    """
    key = os.getenv("OPENAI_API_KEY")
    if not key:
        raise RuntimeError("Please set OPENAI_API_KEY in your environment")

    model = os.getenv("OPENAI_MODEL")
    # The compose service for this tool exports the endpoint as
    # OPENAI_API_BASE, while this function historically read only
    # OPENAI_BASE_URL. Accept both so the configured endpoint is not
    # silently dropped; OPENAI_BASE_URL wins when both are set.
    url = os.getenv("OPENAI_BASE_URL") or os.getenv("OPENAI_API_BASE")

    cfg = {
        "output_format": "markdown",
        "use_llm": True,
        "llm_service": "marker.services.openai.OpenAIService",
        "openai_api_key": key,
        "openai_model": model,
        "openai_base_url": url,
    }
    parser = ConfigParser(cfg)
    converter = PdfConverter(
        config=parser.generate_config_dict(),
        artifact_dict=create_model_dict(),
        processor_list=parser.get_processors(),
        renderer=parser.get_renderer(),
        llm_service=parser.get_llm_service(),
    )
    rendered = converter(pdf_path)
    # text_from_rendered returns a 3-tuple; only the first item (the text)
    # is needed here.
    markdown, _, _ = text_from_rendered(rendered)
    return markdown
|
||||||
|
|
||||||
|
def synthesize_with_coqui(
    text: str,
    out_mp3: str,
    voice: str,
    speaker: str | None,
    lang: str,
):
    """Synthesize ``text`` with Coqui XTTS v2 and save the result as MP3.

    The model first renders a WAV file into a private temporary directory;
    that WAV is then transcoded to MP3 with pydub and the temporary
    directory is removed, even if synthesis or export fails.

    Args:
        text: Text to speak.
        out_mp3: Destination path of the MP3 file.
        voice: Path to the reference voice sample passed as ``speaker_wav``.
        speaker: Speaker name to pass to the model; when falsy, the first
            entry of ``tts.speakers`` is used.
        lang: Language code forwarded to the model.
    """
    use_gpu = torch.cuda.is_available()
    tts = TTS(model_name="tts_models/multilingual/xtts_v2", gpu=use_gpu)

    spk_wav = [voice]
    # NOTE(review): both a cloning sample (speaker_wav) and a speaker name
    # are always passed. Confirm which one XTTS gives precedence to; if the
    # named speaker wins, the voice sample would be ignored. TODO verify.
    spk_name = speaker or tts.speakers[0]
    print(f"⚙️ Using Coqui voice sample “{voice}” → speaker “{spk_name}”")

    # tempfile.mktemp() is deprecated (the name can be claimed by another
    # process before we create it) and the file was never removed. A
    # TemporaryDirectory gives a private location and guaranteed cleanup.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_wav = os.path.join(tmp_dir, "coqui.wav")
        tts.tts_to_file(
            text=text,
            file_path=tmp_wav,
            speaker_wav=spk_wav,
            speaker=spk_name,
            language=lang,
            split_sentences=True,
        )
        AudioSegment.from_wav(tmp_wav).export(out_mp3, format="mp3")
    print(f"✅ Coqui MP3 saved → {out_mp3}")
|
||||||
|
|
||||||
|
def synthesize_with_kokoro(
    text: str,
    out_mp3: str,
    lang: str,
):
    """Synthesize ``text`` with Kokoro TTS and save the result as MP3.

    The pipeline is iterated to collect audio chunks, which are joined,
    written as a 24 kHz WAV into a private temporary directory, and then
    transcoded to MP3 with pydub. The temporary directory is removed
    afterwards, even if writing or export fails.

    Args:
        text: Text to speak.
        out_mp3: Destination path of the MP3 file.
        lang: Language code forwarded to ``KPipeline`` as ``lang_code``.

    Raises:
        ValueError: If the pipeline yields no audio for ``text``.
    """
    # NOTE(review): `lang` (default "en" from the CLI) and the voice id
    # "Daniel" are passed through unchanged. Confirm both are identifiers
    # Kokoro actually accepts. TODO verify against the Kokoro voice list.
    pipeline = KPipeline(lang_code=lang)

    # Each pipeline result is a 3-tuple; only the audio item is kept.
    chunks = [audio for _, _, audio in pipeline(text, voice="Daniel")]
    if not chunks:
        # np.concatenate([]) would raise a ValueError with an unhelpful
        # message; fail with the same exception type but a clear reason.
        raise ValueError("No audio was generated: the input text is empty "
                         "or produced no speech chunks")
    full = np.concatenate(chunks, axis=0)

    # tempfile.mktemp() is deprecated and the WAV was never deleted; use a
    # private directory that is cleaned up automatically.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_wav = os.path.join(tmp_dir, "kokoro.wav")
        sf.write(tmp_wav, full, 24000)
        AudioSegment.from_wav(tmp_wav).export(out_mp3, format="mp3")
    print(f"✅ Kokoro MP3 saved → {out_mp3}")
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point: PDF → Markdown → MP3 audiobook.

    With ``--list-voices`` the available voice samples are printed and the
    program exits without needing a PDF. Otherwise the PDF is converted to
    Markdown and spoken with Coqui XTTS when ``--voice-name`` is given, or
    with Kokoro when it is not.

    Raises:
        FileNotFoundError: If ``--voice-name`` does not match a ``.wav``
            file in the voices directory.
    """
    # ensure voices/ exists
    VOICE_DIR.mkdir(exist_ok=True)

    ap = argparse.ArgumentParser(
        description="PDF→Markdown→Audiobook (Coqui or Kokoro/Daniel)"
    )
    # Optional at the parser level so that --list-voices works on its own;
    # it is still enforced below for every other invocation.
    ap.add_argument("pdf", nargs="?", help="Input PDF file")
    ap.add_argument("--list-voices", action="store_true",
                    help="Show available voice snippets and exit")
    # The lookup below uses the .wav suffix, so the help text says .wav.
    ap.add_argument("--voice-name", help="Name of a .wav in voices/ to clone")
    ap.add_argument("--speaker", help="Coqui built-in speaker (default if none)")
    ap.add_argument("--lang", default="en", help="Language code")
    ap.add_argument("--out", default="audiobook.mp3",
                    help="Output MP3 path")
    args = ap.parse_args()

    if args.list_voices:
        print("Available voices:")
        for name in list_available_voices():
            print("  ", name)
        return

    if not args.pdf:
        # Same message and exit status argparse uses for a missing positional.
        ap.error("the following arguments are required: pdf")

    # Validate the voice sample before the (slow, LLM-backed) conversion so
    # a typo in --voice-name fails immediately.
    wav_path = None
    if args.voice_name:
        wav_path = VOICE_DIR / f"{args.voice_name}.wav"
        if not wav_path.is_file():
            raise FileNotFoundError(f"voices/{args.voice_name}.wav not found.")

    # 1) PDF→Markdown
    print("🔄 Converting PDF → Markdown…")
    text = convert_pdf_to_markdown(args.pdf)

    # 2) Choose engine
    if wav_path is not None:
        synthesize_with_coqui(text, args.out, str(wav_path),
                              args.speaker, args.lang)
    else:
        synthesize_with_kokoro(text, args.out, args.lang)


if __name__ == "__main__":
    main()
|
||||||
|
|
||||||
9
pdf-to-audiobook/requirements.txt
Normal file
9
pdf-to-audiobook/requirements.txt
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
marker-pdf
|
||||||
|
kokoro>=0.9.4
|
||||||
|
soundfile
|
||||||
|
TTS
|
||||||
|
pdfminer.six
|
||||||
|
pydub
|
||||||
|
torch
|
||||||
|
transformers<4.50.0
|
||||||
|
|
||||||
BIN
pdf-to-audiobook/voices/andy_serkiss.mp3
Normal file
BIN
pdf-to-audiobook/voices/andy_serkiss.mp3
Normal file
Binary file not shown.
BIN
pdf-to-audiobook/voices/john_lee.mp3
Normal file
BIN
pdf-to-audiobook/voices/john_lee.mp3
Normal file
Binary file not shown.
Reference in New Issue
Block a user