| | import re |
| | from pathlib import Path |
| |
|
| |
|
class SubtitleProcessor:
    """Utility for cleaning and formatting YouTube VTT subtitles."""

    # Cue timing line, e.g. "00:00:01.000 --> 00:00:03.500 align:start".
    # WebVTT allows the hours field to be omitted ("00:01.000") or to have
    # more than two digits, and allows spaces or tabs around the arrow.
    _TIMESTAMP_RE = re.compile(
        r"(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}[ \t]+-->[ \t]+"
        r"(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}"
    )
    # Inline markup such as <c>, </c>, <v Speaker> and <00:00:01.200>.
    _TAG_RE = re.compile(r"<[^>]+>")
    # Header / cue-setting lines that carry no caption text.
    _METADATA_PREFIXES = ("Kind:", "Language:", "align:", "position:")
    # Sentence boundary: whitespace that follows ".", "!" or "?".
    _SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")

    @staticmethod
    def clean_vtt(vtt_content: str) -> str:
        """Reduce raw WebVTT content to a single line of plain caption text.

        The header line, ``Kind:``/``Language:`` metadata, cue timing lines
        and inline tags are removed. YouTube automatic captions repeat each
        line across neighbouring cues (a "rolling" display), so a line is
        dropped when it already appeared in the current or the previous cue.
        Lines that recur later in the video are kept.

        Args:
            vtt_content: Full text of a ``.vtt`` file. LF, CRLF and CR line
                endings and a leading byte-order mark are all accepted.

        Returns:
            The remaining caption lines joined by single spaces, or an empty
            string when no caption text is found.
        """
        # A BOM would otherwise hide the "WEBVTT" signature from startswith().
        lines = vtt_content.lstrip("\ufeff").splitlines()
        if lines and lines[0].startswith("WEBVTT"):
            lines = lines[1:]

        kept_lines = []
        previous_cue = set()
        current_cue = set()

        for raw_line in lines:
            if raw_line.startswith(SubtitleProcessor._METADATA_PREFIXES):
                continue

            line = raw_line.strip()
            if not line:
                continue

            if SubtitleProcessor._TIMESTAMP_RE.match(line):
                # A timing line opens a new cue. Cues that produced no text
                # do not reset the comparison window, so a repeat separated
                # only by an empty cue is still recognised.
                if current_cue:
                    previous_cue, current_cue = current_cue, set()
                continue

            cleaned_line = SubtitleProcessor._TAG_RE.sub("", line).strip()
            if not cleaned_line:
                continue

            is_repeat = (
                cleaned_line in previous_cue or cleaned_line in current_cue
            )
            # Record repeats too, so the line stays in the window for the
            # following cue while it keeps rolling.
            current_cue.add(cleaned_line)
            if not is_repeat:
                kept_lines.append(cleaned_line)

        return " ".join(kept_lines)

    @staticmethod
    def format_as_markdown(
        text: str, metadata: dict, sentences_per_paragraph: int = 6
    ) -> str:
        """Format the cleaned text as a structured Markdown document.

        Args:
            text: Transcript text, typically the output of ``clean_vtt``.
            metadata: Mapping that may contain ``title``, ``channel``,
                ``url`` and ``date``. Missing, ``None`` or empty values fall
                back to a placeholder (title, channel) or an empty string
                (url, date).
            sentences_per_paragraph: Number of sentences grouped into each
                transcript paragraph. Must be at least 1.

        Returns:
            Markdown with a title heading, a metadata block and a
            ``## Transcript`` section split into paragraphs.

        Raises:
            ValueError: If ``sentences_per_paragraph`` is less than 1.
        """
        if sentences_per_paragraph < 1:
            raise ValueError("sentences_per_paragraph must be at least 1")

        # "or" instead of a .get() default so explicit None values do not
        # render as the literal string "None".
        title = metadata.get("title") or "Unknown Title"
        channel = metadata.get("channel") or "Unknown Channel"
        video_url = metadata.get("url") or ""
        date = metadata.get("date") or ""

        header = (
            f"# {title}\n\n"
            f"**Channel:** {channel}\n"
            f"**Source:** {video_url}\n"
            f"**Date:** {date}\n\n"
            "## Transcript\n\n"
        )

        # Strip first so trailing whitespace does not yield an empty final
        # "sentence" and a dangling space at the end of the last paragraph.
        body = text.strip()
        sentences = (
            SubtitleProcessor._SENTENCE_SPLIT_RE.split(body) if body else []
        )
        paragraphs = [
            " ".join(sentences[start : start + sentences_per_paragraph])
            for start in range(0, len(sentences), sentences_per_paragraph)
        ]

        return header + "\n\n".join(paragraphs) + "\n"
| |
|