pixelated-training-code / utils / subtitle_processor.py

Upload folder using huggingface_hub

7849935 verified 7 days ago

3.01 kB

	import re
	from pathlib import Path


	class SubtitleProcessor:
	"""Utility for cleaning and formatting YouTube VTT subtitles."""

	@staticmethod
	def clean_vtt(vtt_content: str) -> str:
	"""
	Clean VTT content by removing timestamps, tags, and duplicates.
	YouTube automatic captions often repeat lines with incremental words.
	"""
	# Remove header
	lines = vtt_content.split("\n")
	if lines and lines[0].startswith("WEBVTT"):
	lines = lines[1:]

	# Remove metadata lines (Kind:, Language:, etc)
	lines = [
	line
	for line in lines
	if not any(
	line.startswith(prefix)
	for prefix in ["Kind:", "Language:", "align:", "position:"]
	)
	]

	# Remove timestamp lines and tags
	# Pattern for 00:00:00.000 --> 00:00:00.000
	timestamp_pattern = re.compile(
	r"\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}.*"
	)
	# Pattern for <00:00:00.000><c> etc
	tag_pattern = re.compile(r"<[^>]+>")

	cleaned_paragraphs = []
	current_text = []

	seen_lines = set()

	for line in lines:
	line = line.strip()
	if not line:
	continue

	if timestamp_pattern.match(line):
	continue

	# Clean tags
	cleaned_line = tag_pattern.sub("", line).strip()

	if not cleaned_line:
	continue

	# YouTube auto-subs repeat text heavily.
	# We want to keep unique sentences/segments.
	if cleaned_line in seen_lines:
	continue

	seen_lines.add(cleaned_line)
	current_text.append(cleaned_line)

	# Merge lines and remove redundant parts of sentences
	full_text = " ".join(current_text)

	# Simple cleanup of redundant repeated segments (YouTube specific)
	# e.g. "Hello world Hello world there" -> "Hello world there"
	# This is a bit complex to do perfectly without NLP, but we can do some basics.

	return full_text

	@staticmethod
	def format_as_markdown(text: str, metadata: dict) -> str:
	"""Format the cleaned text as a structured Markdown file."""
	title = metadata.get("title", "Unknown Title")
	channel = metadata.get("channel", "Unknown Channel")
	video_url = metadata.get("url", "")
	date = metadata.get("date", "")

	md = f"# {title}\n\n"
	md += f"Channel: {channel}\n"
	md += f"Source: {video_url}\n"
	md += f"Date: {date}\n\n"
	md += "## Transcript\n\n"

	# Split into paragraphs of roughly 5-7 sentences
	sentences = re.split(r"(?<=[.!?])\s+", text)
	paragraphs = []
	for i in range(0, len(sentences), 6):
	paragraphs.append(" ".join(sentences[i : i + 6]))

	md += "\n\n".join(paragraphs)
	md += "\n"

	return md