import re
from pathlib import Path


class SubtitleProcessor:
    """Utility for cleaning and formatting YouTube VTT subtitles."""

    # Lines starting with one of these prefixes are treated as VTT metadata
    # (e.g. the "Kind: captions" / "Language: en" header lines) and dropped.
    _METADATA_PREFIXES = ("Kind:", "Language:", "align:", "position:")

    # Cue timing line, e.g. "00:00:01.000 --> 00:00:03.500 align:start".
    # WebVTT allows the hours component to be omitted ("00:01.000") or to have
    # more than two digits, and allows spaces or tabs around the arrow.
    _TIMESTAMP_RE = re.compile(
        r"(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}[ \t]+-->[ \t]+"
        r"(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}.*"
    )

    # Inline cue markup such as <00:00:00.500>, <c>, </c>.
    _TAG_RE = re.compile(r"<[^>]+>")

    # Whitespace that directly follows sentence-ending punctuation.
    _SENTENCE_BOUNDARY_RE = re.compile(r"(?<=[.!?])\s+")

    @staticmethod
    def clean_vtt(vtt_content: str) -> str:
        """Return the caption text of a VTT document as one plain string.

        The "WEBVTT" header line (optionally preceded by a byte-order mark),
        metadata lines, cue timing lines and inline tags are removed. YouTube
        automatic captions repeat each line across consecutive cues, so only
        the first occurrence of every distinct line is kept.

        Note that de-duplication is by exact line text over the whole
        document: a line that is legitimately spoken twice is kept only once.
        Partially overlapping lines are not merged.

        Args:
            vtt_content: Raw contents of a ``.vtt`` file.

        Returns:
            The unique caption lines, in order, joined by single spaces.
            An input without caption text yields an empty string.
        """
        # A UTF-8 BOM would otherwise hide the "WEBVTT" header from the
        # startswith() check and leak it into the transcript.
        lines = vtt_content.lstrip("\ufeff").split("\n")
        if lines[0].startswith("WEBVTT"):
            lines = lines[1:]

        unique_lines = []
        seen = set()
        for raw_line in lines:
            if raw_line.startswith(SubtitleProcessor._METADATA_PREFIXES):
                continue

            # strip() also removes the "\r" left behind by CRLF line endings.
            line = raw_line.strip()
            if not line or SubtitleProcessor._TIMESTAMP_RE.match(line):
                continue

            text = SubtitleProcessor._TAG_RE.sub("", line).strip()
            if not text or text in seen:
                continue

            seen.add(text)
            unique_lines.append(text)

        return " ".join(unique_lines)

    @staticmethod
    def format_as_markdown(
        text: str,
        metadata: dict,
        *,
        sentences_per_paragraph: int = 6,
    ) -> str:
        """Format cleaned transcript text as a structured Markdown document.

        Args:
            text: Transcript text, e.g. the result of :meth:`clean_vtt`.
            metadata: Video details. The keys ``"title"``, ``"channel"``,
                ``"url"`` and ``"date"`` are read; missing keys fall back to
                a placeholder (title, channel) or an empty string (url, date).
            sentences_per_paragraph: Number of sentences grouped into each
                transcript paragraph. Must be at least 1.

        Returns:
            Markdown consisting of a title heading, a metadata block and a
            "Transcript" section, terminated by a newline.

        Raises:
            ValueError: If ``sentences_per_paragraph`` is less than 1.
        """
        if sentences_per_paragraph < 1:
            raise ValueError("sentences_per_paragraph must be at least 1")

        title = metadata.get("title", "Unknown Title")
        channel = metadata.get("channel", "Unknown Channel")
        video_url = metadata.get("url", "")
        date = metadata.get("date", "")

        header = (
            f"# {title}\n\n"
            f"**Channel:** {channel}\n"
            f"**Source:** {video_url}\n"
            f"**Date:** {date}\n\n"
            "## Transcript\n\n"
        )

        # Text without sentence punctuation (common for automatic captions)
        # cannot be split and therefore ends up as a single paragraph.
        sentences = SubtitleProcessor._SENTENCE_BOUNDARY_RE.split(text)
        paragraphs = [
            " ".join(sentences[start : start + sentences_per_paragraph])
            for start in range(0, len(sentences), sentences_per_paragraph)
        ]

        return header + "\n\n".join(paragraphs) + "\n"