# pixelated-training-code / utils / subtitle_processor.py
# oneblackmage's picture
# Upload folder using huggingface_hub
# 7849935 verified
import re
from pathlib import Path
class SubtitleProcessor:
    """Utility for cleaning and formatting YouTube VTT subtitles."""

    # Lines starting with one of these prefixes are treated as metadata and
    # dropped wherever they occur in the file.
    _METADATA_PREFIXES = ("Kind:", "Language:", "align:", "position:")

    # WebVTT line terminators: CRLF, lone CR, or LF.
    _LINE_BREAK_RE = re.compile(r"\r\n|\r|\n")

    # Cue timing line, e.g. "00:00:01.000 --> 00:00:04.000 align:start".
    # The hours field is optional in WebVTT ("00:01.000 --> 00:04.000").
    _TIMESTAMP_RE = re.compile(
        r"(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}"
        r"[ \t]+-->[ \t]+"
        r"(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}"
    )

    # Inline markup such as <00:00:00.000>, <c>, </c>.
    _TAG_RE = re.compile(r"<[^>]+>")

    # Whitespace that follows sentence-ending punctuation.
    _SENTENCE_BOUNDARY_RE = re.compile(r"(?<=[.!?])\s+")

    @staticmethod
    def clean_vtt(vtt_content: str) -> str:
        """Return the caption text of a VTT document as one line of prose.

        The ``WEBVTT`` header line, metadata lines, cue timing lines and
        inline tags are removed. YouTube automatic captions repeat the
        previous line inside the next cue, so every distinct line of text
        is kept only the first time it appears.

        Only exact duplicate lines are dropped; partially overlapping
        segments (e.g. "Hello world" followed by "world there") are not
        merged.

        Args:
            vtt_content: Raw contents of a ``.vtt`` file.

        Returns:
            The unique caption lines, in order of first appearance, joined
            with single spaces. Empty string if no caption text is found.
        """
        # A leading byte-order mark would hide the "WEBVTT" signature.
        if vtt_content.startswith("\ufeff"):
            vtt_content = vtt_content[1:]

        lines = SubtitleProcessor._LINE_BREAK_RE.split(vtt_content)
        if lines and lines[0].startswith("WEBVTT"):
            lines = lines[1:]

        seen = set()
        kept = []
        for raw_line in lines:
            # Metadata prefixes are tested against the unstripped line.
            if raw_line.startswith(SubtitleProcessor._METADATA_PREFIXES):
                continue

            stripped = raw_line.strip()
            if not stripped or SubtitleProcessor._TIMESTAMP_RE.match(stripped):
                continue

            text = SubtitleProcessor._TAG_RE.sub("", stripped).strip()
            # NOTE: de-duplication is global, so a line that legitimately
            # recurs later in the video is also kept only once.
            if not text or text in seen:
                continue

            seen.add(text)
            kept.append(text)

        return " ".join(kept)

    @staticmethod
    def format_as_markdown(
        text: str, metadata: dict, *, sentences_per_paragraph: int = 6
    ) -> str:
        """Format the cleaned text as a structured Markdown document.

        Args:
            text: Transcript text, e.g. the output of ``clean_vtt``.
            metadata: Optional keys ``"title"``, ``"channel"``, ``"url"``
                and ``"date"``. Missing title/channel fall back to
                ``"Unknown Title"`` / ``"Unknown Channel"``; missing url and
                date fall back to an empty string.
            sentences_per_paragraph: Number of sentences grouped into each
                paragraph of the transcript section. Must be at least 1.

        Returns:
            Markdown with a title heading, the metadata fields, and a
            ``## Transcript`` section whose paragraphs are separated by
            blank lines. The result ends with a newline.

        Raises:
            ValueError: If ``sentences_per_paragraph`` is less than 1.
        """
        if sentences_per_paragraph < 1:
            raise ValueError("sentences_per_paragraph must be at least 1")

        title = metadata.get("title", "Unknown Title")
        channel = metadata.get("channel", "Unknown Channel")
        video_url = metadata.get("url", "")
        date = metadata.get("date", "")

        header = (
            f"# {title}\n\n"
            f"**Channel:** {channel}\n"
            f"**Source:** {video_url}\n"
            f"**Date:** {date}\n\n"
            "## Transcript\n\n"
        )

        # A sentence ends at '.', '!' or '?' followed by whitespace.
        sentences = SubtitleProcessor._SENTENCE_BOUNDARY_RE.split(text)
        paragraphs = [
            " ".join(sentences[start : start + sentences_per_paragraph])
            for start in range(0, len(sentences), sentences_per_paragraph)
        ]

        return header + "\n\n".join(paragraphs) + "\n"