From 60d0756a15b6c7fb739c6bbf06ef7f966822bbdc Mon Sep 17 00:00:00 2001
From: Arjun Satarkar
Date: Sat, 21 Dec 2024 15:13:42 +0530
Subject: Initial commit

---
 .gitignore       |  2 ++
 parse_srt.py     | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 rebreak_lines.py | 71 ++++++++++++++++++++++++++++++++++++++++
 requirements.txt |  6 ++++
 4 files changed, 178 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 parse_srt.py
 create mode 100755 rebreak_lines.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1c43c45
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/venv/
+__pycache__/
diff --git a/parse_srt.py b/parse_srt.py
new file mode 100644
index 0000000..acd72c0
--- /dev/null
+++ b/parse_srt.py
@@ -0,0 +1,99 @@
+from __future__ import annotations
+import dataclasses
+import enum
+import itertools
+import re
+from typing import List
+
+
+@dataclasses.dataclass
+class Event:
+    """A subtitle event; timestamps are kept as "HH:MM:SS,mmm" strings."""
+
+    start: str | None = None
+    end: str | None = None
+    content: str | None = None
+
+
+class SRT:
+    """An ordered list of subtitle events in the SubRip (SRT) format."""
+
+    def __init__(self):
+        self.events: List[Event] = []
+
+    @staticmethod
+    def from_str(text: str) -> SRT:
+        """Parse the contents of an SRT file.
+
+        Events must be numbered consecutively from 1 and be separated by
+        blank lines. A leading byte order mark and CRLF line endings are
+        tolerated, and the last event need not be followed by a blank line.
+
+        Raises ParseError, naming the offending line, on malformed input.
+        """
+
+        class ParseState(enum.Enum):
+            COUNTER = enum.auto()
+            TIMING = enum.auto()
+            CONTENT = enum.auto()
+
+        PARSE_STATES = itertools.cycle(iter(ParseState))
+        TIMESTAMP_CAPTURE = r"(\d\d:\d\d:\d\d,\d\d\d)"
+        TIMING_REGEX = rf"{TIMESTAMP_CAPTURE} --> {TIMESTAMP_CAPTURE}"
+
+        srt = SRT()
+        lines = text.removeprefix("\ufeff").split("\n")
+        counter = 1
+        state = next(PARSE_STATES)
+        event = Event()
+        for line_num, line in enumerate(lines, 1):
+            line = line.removesuffix("\r")
+            if not line:
+                match state:
+                    case ParseState.CONTENT:
+                        srt.events.append(event)
+                        event = Event()
+                        state = next(PARSE_STATES)
+                    case ParseState.COUNTER:
+                        pass
+                    case _:
+                        raise ParseError(f"Unexpected blank line (line {line_num})")
+                continue
+            match state:
+                case ParseState.COUNTER:
+                    try:
+                        counter_ok = int(line) == counter
+                    except ValueError:
+                        # Not a number at all; report it like a wrong number
+                        counter_ok = False
+                    if not counter_ok:
+                        raise ParseError(
+                            f"Invalid counter, expected {counter} (line {line_num})"
+                        )
+                    counter += 1
+                    state = next(PARSE_STATES)
+                case ParseState.TIMING:
+                    match = re.fullmatch(TIMING_REGEX, line)
+                    if match is None:
+                        raise ParseError(f"Invalid timing info (line {line_num})")
+                    event.start, event.end = match[1], match[2]
+                    state = next(PARSE_STATES)
+                case ParseState.CONTENT:
+                    event.content = (
+                        event.content if event.content is not None else ""
+                    ) + f"{line}\n"
+        # Without a trailing blank line the last event is still pending here
+        if state is ParseState.CONTENT:
+            srt.events.append(event)
+        return srt
+
+    def __str__(self):
+        """Serialize back to SRT text, renumbering the events from 1."""
+        return "".join(
+            f"{counter}\n{event.start} --> {event.end}\n{event.content or ''}\n"
+            for counter, event in enumerate(self.events, 1)
+        )
+
+
+class ParseError(Exception):
+    """Raised by SRT.from_str when the input is not well-formed SRT."""
diff --git a/rebreak_lines.py b/rebreak_lines.py
new file mode 100755
index 0000000..41594a3
--- /dev/null
+++ b/rebreak_lines.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+"""
+Given an SRT file, remove existing line breaks and reinsert them automatically
+at word boundaries so as not to exceed 42 characters per line, trying to keep
+lines within a single subtitle event at roughly the same length.
+
+NOTE: Using this script is *generally a bad idea*; like many aspects of
+subtitling, placing line breaks benefits from contextual judgement. However,
+if an existing subtitle file has no line breaks or far too many, as is the case
+sometimes, this is an easy way to improve readability.
+"""
+import click
+import parse_srt
+import math
+import sys
+from typing import List
+
+# May still be exceeded if there are no word boundaries to wrap at
+MAX_LINE_LENGTH = 42
+
+
+@click.command()
+@click.argument("in_file_path")
+def main(in_file_path: str):
+    """Rebreak the lines of the SRT file IN_FILE_PATH and print the result."""
+    with open(in_file_path) as f:
+        text = f.read()
+    srt = parse_srt.SRT.from_str(text)
+
+    for event in srt.events:
+        # Events without any text have nothing to rebreak
+        if event.content is not None:
+            event.content = rebreak(event.content)
+
+    sys.stdout.write(str(srt))
+
+
+def rebreak(text: str) -> str:
+    """Rewrap text into balanced lines of at most MAX_LINE_LENGTH characters.
+
+    Existing line breaks and runs of whitespace are collapsed into single
+    spaces first. Returns the new lines, each terminated by a newline (a
+    lone newline for empty text). A word longer than MAX_LINE_LENGTH is kept
+    whole on a line of its own.
+    """
+    remaining = " ".join(text.split())
+
+    lines: List[str] = []
+    while len(remaining) > MAX_LINE_LENGTH:
+        # Aim for lines of equal length rather than filling each one up
+        target_line_num = math.ceil(len(remaining) / MAX_LINE_LENGTH)
+        ideal_length = round(len(remaining) / target_line_num)
+
+        # Break at the last word boundary that keeps the line within the
+        # ideal length, or failing that at the first one beyond it
+        break_at = remaining.rfind(" ", 0, ideal_length + 1)
+        if break_at == -1:
+            break_at = remaining.find(" ", ideal_length)
+        if break_at == -1:
+            break
+
+        lines.append(remaining[:break_at])
+        remaining = remaining[break_at + 1 :]
+
+    if remaining:
+        lines.append(remaining)
+    return "\n".join(lines) + "\n"
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..befb51e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+black==24.10.0
+click==8.1.7
+mypy-extensions==1.0.0
+packaging==24.2
+pathspec==0.12.1
+platformdirs==4.3.6
-- 
cgit v1.2.3-57-g22cb