diff --git a/docker-compose.yml b/docker-compose.yml index ab292de..587b3a8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -267,6 +267,21 @@ services: networks: - nginx + pdf2audiobook: + <<: *logging + build: ./pdf-to-audiobook + image: pdf2audiobook:latest + container_name: pdf2audiobook + working_dir: /app + volumes: + - ./pdf-to-audiobook:/app + - /mnt/tower/stardust/chris/files/Library:/books:ro + - /mnt/tower/stardust/chris/files/Audiobooks:/audio + environment: + - OPENAI_API_KEY=${OPENAI_API_KEY} + - OPENAI_BASE_URL=${OPENAI_API_BASE:-https://aihubmix.com/v1} + - OPENAI_MODEL=${OPENAI_MODEL:-aihubmix-Llama-3-3-70B-Instruct} + jitsi: <<: *logging image: jitsi/web:stable diff --git a/pdf-to-audiobook/Dockerfile b/pdf-to-audiobook/Dockerfile new file mode 100644 index 0000000..52b4f82 --- /dev/null +++ b/pdf-to-audiobook/Dockerfile @@ -0,0 +1,23 @@ +# Dockerfile +FROM python:3.10-slim + +# 1) System deps for audio processing +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + build-essential \ + ffmpeg \ + libsndfile1 \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# 2) Python deps +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# 3) Copy your script + code +COPY . . + +# 4) Default entrypoint is just the script; pass args via docker-compose or CLI +ENTRYPOINT ["python", "pdf_to_audiobook.py"] + diff --git a/pdf-to-audiobook/pdf_to_audiobook.py b/pdf-to-audiobook/pdf_to_audiobook.py new file mode 100644 index 0000000..9aecc7f --- /dev/null +++ b/pdf-to-audiobook/pdf_to_audiobook.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +""" +pdf_to_audiobook.py + +1) Always convert PDF → Markdown via marker + your OpenAI-compatible endpoint. +2) If you pass --voice-name, use Coqui XTTS (voice cloning); + otherwise use Kokoro TTS with the “Daniel” speaker. +3) Manage your 6-sec snippets in voices/ directory. 
+""" + +import os +import argparse +import tempfile +import numpy as np +import torch +import soundfile as sf +from pathlib import Path +from pdfminer.high_level import extract_text  # NOTE(review): unused — marker does the extraction; consider removing +from pydub import AudioSegment  # Allow Coqui’s XttsConfig class to be unpickled + +# ─── Whitelist Coqui classes for safe CPU‐only loading ──────────────────────── +from TTS.tts.configs.xtts_config import XttsConfig +from TTS.tts.models.xtts import XttsAudioConfig +from TTS.config.shared_configs import BaseDatasetConfig +from TTS.tts.models.xtts import XttsArgs +from torch.serialization import add_safe_globals +add_safe_globals([XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs]) + +# ─── Coqui XTTS API ───────────────────────────────────────────────────────── +from TTS.api import TTS + +# ─── Kokoro TTS API ───────────────────────────────────────────────────────── +from kokoro import KPipeline + +# ─── marker PDF→Markdown API ──────────────────────────────────────────────── +from marker.converters.pdf import PdfConverter +from marker.models import create_model_dict +from marker.config.parser import ConfigParser +from marker.output import text_from_rendered + +VOICE_DIR = Path("voices") + +def list_available_voices(): + return sorted(p.stem for ext in ("*.wav", "*.mp3") for p in VOICE_DIR.glob(ext))  # repo ships .mp3 snippets, so list both formats + +def convert_pdf_to_markdown(pdf_path: str) -> str: + key = os.getenv("OPENAI_API_KEY") + model = os.getenv("OPENAI_MODEL") + url = os.getenv("OPENAI_BASE_URL") or os.getenv("OPENAI_API_BASE")  # docker-compose exports OPENAI_API_BASE; accept both + if not key: + raise RuntimeError("Please set OPENAI_API_KEY in your environment") + cfg = { + "output_format": "markdown", + "use_llm": True, + "llm_service": "marker.services.openai.OpenAIService", + "openai_api_key": key, + "openai_model": model, + "openai_base_url": url, + } + parser = ConfigParser(cfg) + converter = PdfConverter( + config = parser.generate_config_dict(), + artifact_dict = create_model_dict(), + processor_list = parser.get_processors(), + renderer = parser.get_renderer(), + llm_service = parser.get_llm_service(), + ) + rendered 
= converter(pdf_path) + markdown, _, _ = text_from_rendered(rendered) + return markdown + +def synthesize_with_coqui( + text: str, + out_mp3: str, + voice: str, + speaker: str | None, + lang: str, +): + use_gpu = torch.cuda.is_available() + tts = TTS(model_name="tts_models/multilingual/xtts_v2", gpu=use_gpu) + + spk_wav = [voice] + spk_name = speaker or tts.speakers[0] + print(f"⚙️ Using Coqui voice sample “{voice}” → speaker “{spk_name}”") + + tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name  # mktemp() is deprecated and race-prone + tts.tts_to_file( + text = text, + file_path = tmp_wav, + speaker_wav = spk_wav, + speaker = spk_name, + language = lang, + split_sentences = True, + ) + AudioSegment.from_wav(tmp_wav).export(out_mp3, format="mp3") + print(f"✅ Coqui MP3 saved → {out_mp3}") + +def synthesize_with_kokoro( + text: str, + out_mp3: str, + lang: str, +): + pipeline = KPipeline(lang_code=lang) + chunks = [] + # NOTE(review): Kokoro voice ids usually look like "bm_daniel" — confirm "Daniel" resolves + for _, _, audio in pipeline(text, voice="Daniel"): + chunks.append(audio) + full = np.concatenate(chunks, axis=0) + + tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name  # mktemp() is deprecated and race-prone + sf.write(tmp_wav, full, 24000) + AudioSegment.from_wav(tmp_wav).export(out_mp3, format="mp3") + print(f"✅ Kokoro MP3 saved → {out_mp3}") + +def main(): + # ensure voices/ exists + VOICE_DIR.mkdir(exist_ok=True) + + ap = argparse.ArgumentParser( + description="PDF→Markdown→Audiobook (Coqui or Kokoro/Daniel)" + ) + ap.add_argument("pdf", help="Input PDF file") + ap.add_argument("--list-voices", action="store_true", + help="Show available voice snippets and exit") + ap.add_argument("--voice-name", help="Name of a .wav/.mp3 snippet in voices/ to clone") + ap.add_argument("--speaker", help="Coqui built-in speaker (default if none)") + ap.add_argument("--lang", default="en", help="Language code") + ap.add_argument("--out", default="audiobook.mp3", + help="Output MP3 path") + args = ap.parse_args() + + if args.list_voices: + print("Available voices:") + for name in list_available_voices(): + print(" ", name) + return + + # 1) 
PDF→Markdown + print("🔄 Converting PDF → Markdown…") + text = convert_pdf_to_markdown(args.pdf) + + # 2) Choose engine + if args.voice_name: + wav_path = next((p for ext in (".wav", ".mp3") if (p := VOICE_DIR / f"{args.voice_name}{ext}").is_file()), None)  # shipped snippets are .mp3 — accept either format + if wav_path is None: + raise FileNotFoundError(f"voices/{args.voice_name}.wav/.mp3 not found.") + synthesize_with_coqui(text, args.out, str(wav_path), + args.speaker, args.lang) + else: + synthesize_with_kokoro(text, args.out, args.lang) + +if __name__ == "__main__": + main() + diff --git a/pdf-to-audiobook/requirements.txt b/pdf-to-audiobook/requirements.txt new file mode 100644 index 0000000..165c191 --- /dev/null +++ b/pdf-to-audiobook/requirements.txt @@ -0,0 +1,9 @@ +marker-pdf +kokoro>=0.9.4 +soundfile +TTS +pdfminer.six +pydub +torch +transformers<4.50.0 + diff --git a/pdf-to-audiobook/voices/andy_serkiss.mp3 b/pdf-to-audiobook/voices/andy_serkiss.mp3 new file mode 100644 index 0000000..00ae5af Binary files /dev/null and b/pdf-to-audiobook/voices/andy_serkiss.mp3 differ diff --git a/pdf-to-audiobook/voices/john_lee.mp3 b/pdf-to-audiobook/voices/john_lee.mp3 new file mode 100644 index 0000000..e94898e Binary files /dev/null and b/pdf-to-audiobook/voices/john_lee.mp3 differ