From df5a7db84432f4f60698b2ddb97a9f8955dd97fb Mon Sep 17 00:00:00 2001
From: Arjun Satarkar
Date: Sat, 21 Dec 2024 19:05:04 +0530
Subject: Refactor for modularity, package

---
NOTE(review): rebreak_lines.py, parse.py and srtfilter_cli.py were revised in
review; their "index" blob ids below are carried over unchanged from the
original patch and do not describe the revised contents.

 src/srtfilter/__init__.py              |  0
 src/srtfilter/__main__.py              |  4 +++
 src/srtfilter/filters/rebreak_lines.py | 70 ++++++++++++++++++++++++++++++++++++++++++++++
 src/srtfilter/parse.py                 | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/srtfilter/srtfilter_cli.py         | 38 +++++++++++++++++++++++++
 5 files changed, 200 insertions(+)
 create mode 100644 src/srtfilter/__init__.py
 create mode 100644 src/srtfilter/__main__.py
 create mode 100644 src/srtfilter/filters/rebreak_lines.py
 create mode 100644 src/srtfilter/parse.py
 create mode 100755 src/srtfilter/srtfilter_cli.py

(limited to 'src')

diff --git a/src/srtfilter/__init__.py b/src/srtfilter/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/srtfilter/__main__.py b/src/srtfilter/__main__.py
new file mode 100644
index 0000000..16b5050
--- /dev/null
+++ b/src/srtfilter/__main__.py
@@ -0,0 +1,4 @@
+if __name__ == "__main__":
+    from .srtfilter_cli import main
+
+    main()
diff --git a/src/srtfilter/filters/rebreak_lines.py b/src/srtfilter/filters/rebreak_lines.py
new file mode 100644
index 0000000..5c94aa6
--- /dev/null
+++ b/src/srtfilter/filters/rebreak_lines.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+"""
+Given an SRT event, remove existing line breaks and insert new ones. All
+insertion will be at word boundaries, lines will usually not exceed 42
+characters, and lines within the event will if possible be roughly the same
+length.
+
+NOTE: Using this filter is *generally a bad idea*; like many aspects of
+subtitling, placing line breaks benefits from contextual judgement. However,
+if an existing subtitle file has no line breaks or far too many, as is the case
+sometimes, this is an easy way to improve readability.
+"""
+import math
+import typing
+from .. import parse
+
+# May still be exceeded if there are no word boundaries to wrap at
+MAX_LINE_LENGTH = 42
+
+
+def filter(event: parse.Event) -> parse.Event:
+    """Re-break the content of `event` in place and return the same event."""
+    event.content = rebreak(event.content)
+    return event
+
+
+def rebreak(text: str) -> str:
+    """
+    Return `text` with its line breaks replaced by newly computed ones.
+
+    Existing newlines are turned into spaces and surrounding whitespace is
+    dropped. The text is then split at whitespace into lines of roughly equal
+    length that fit within MAX_LINE_LENGTH wherever a word boundary allows it.
+    The result always ends with exactly one newline.
+    """
+    remaining = " ".join(text.split("\n")).strip()
+
+    lines: list[str] = []
+    # Loop until the text is used up rather than for a precomputed number of
+    # lines: breaking early at word boundaries can require more lines than
+    # first estimated, and one long word can use up the text sooner.
+    while remaining:
+        if len(remaining) <= MAX_LINE_LENGTH:
+            lines.append(remaining)
+            break
+
+        # Aim for equal-length lines rather than filling each one to the max
+        target_line_num = math.ceil(len(remaining) / MAX_LINE_LENGTH)
+        ideal_end = round(len(remaining) / target_line_num) - 1
+
+        # Move to a word boundary. Index 0 is never whitespace here, so the
+        # backward search stops before it.
+        partition_at = _find_space(remaining, range(ideal_end, 0, -1))
+        if partition_at is None:
+            # Moving the partition backward would give us an empty line, so
+            # move forward instead to ensure we always make progress.
+            partition_at = _find_space(remaining, range(ideal_end + 1, len(remaining)))
+        if partition_at is None:
+            # No word boundary at all: the rest has to stay on one line
+            partition_at = len(remaining) - 1
+
+        lines.append(remaining[: partition_at + 1].rstrip())
+        remaining = remaining[partition_at + 1 :].lstrip()
+
+    return "\n".join(lines) + "\n"
+
+
+def _find_space(text: str, indices: typing.Iterable[int]) -> int | None:
+    """Return the first of `indices` at which `text` is whitespace, if any."""
+    return next((i for i in indices if text[i].isspace()), None)
diff --git a/src/srtfilter/parse.py b/src/srtfilter/parse.py
new file mode 100644
index 0000000..2e710ea
--- /dev/null
+++ b/src/srtfilter/parse.py
@@ -0,0 +1,88 @@
+from __future__ import annotations
+import dataclasses
+import re
+
+_TIMESTAMP_CAPTURE = r"(\d\d:\d\d:\d\d,\d\d\d)"
+_TIMING_REGEX = re.compile(rf"{_TIMESTAMP_CAPTURE} --> {_TIMESTAMP_CAPTURE}")
+
+
+@dataclasses.dataclass
+class Event:
+    """A single subtitle event: when it is shown, and its text."""
+
+    # Timestamps; SRT.from_str() stores them as written ("HH:MM:SS,mmm")
+    start: str
+    end: str
+    # Text of the event; SRT.from_str() ends every line of it with a newline
+    content: str
+
+
+class SRT:
+    """A list of subtitle events that converts to and from SRT text."""
+
+    def __init__(self) -> None:
+        self.events: list[Event] = []
+
+    @staticmethod
+    def from_str(text: str) -> SRT:
+        """
+        Parse the text of an SRT file.
+
+        Events are separated by blank lines. Each one consists of a counter
+        line (counting up from 1), a timing line ("start --> end") and any
+        number of content lines. A leading byte order mark is ignored.
+
+        Raises ParseError if an event has no timing line, if its counter is
+        not the expected number, or if its timing line is malformed.
+        """
+        srt = SRT()
+        counter = 1
+        for chunk in text.removeprefix("\ufeff").split("\n\n"):
+            # Newlines left at the edges (from the end of the file, or from
+            # several blank lines in a row) are not part of the event
+            event_str = chunk.strip("\n")
+            if not event_str.strip():
+                continue
+
+            lines = event_str.split("\n")
+            if len(lines) < 2:
+                raise ParseError("Missing timing info", event_str)
+            counter_str, timing_str, content_lines = lines[0], lines[1], lines[2:]
+
+            # A counter that isn't a number at all is reported the same way
+            # as one that is out of sequence
+            try:
+                counter_ok = int(counter_str) == counter
+            except ValueError:
+                counter_ok = False
+            if not counter_ok:
+                raise ParseError(
+                    f"Invalid counter '{counter_str}'; expected {counter}", event_str
+                )
+            counter += 1
+
+            match = _TIMING_REGEX.fullmatch(timing_str)
+            if match is None:
+                raise ParseError(f"Invalid timing info '{timing_str}'", event_str)
+
+            content = "\n".join(content_lines + [""])
+
+            srt.events.append(Event(match[1], match[2], content))
+
+        return srt
+
+    def __str__(self) -> str:
+        """Return the events as SRT text, numbered consecutively from 1."""
+        return "".join(
+            f"{counter}\n{event.start} --> {event.end}\n{event.content}\n"
+            for counter, event in enumerate(self.events, 1)
+        )
+
+
+class ParseError(Exception):
+    """Raised by SRT.from_str(); also carries the text of the bad event."""
+
+    def __init__(self, reason: str, event_str: str):
+        super().__init__(f"{reason}\nwhile parsing event:\n{event_str}")
+        self.reason = reason
+        self.event_str = event_str
diff --git a/src/srtfilter/srtfilter_cli.py b/src/srtfilter/srtfilter_cli.py
new file mode 100755
index 0000000..bc44409
--- /dev/null
+++ b/src/srtfilter/srtfilter_cli.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+import click
+import sys
+from . import parse
+from .filters import rebreak_lines
+
+
+@click.command()
+@click.argument("in_file_path")
+@click.option("--filter", "filter_arg", default="")
+def main(in_file_path: str, filter_arg: str):
+    """
+    Parse the SRT file IN_FILE_PATH, apply filters, and print the result.
+
+    --filter takes a whitespace-separated list of filter names, which are
+    applied in the order given.
+    """
+    with open(in_file_path) as f:
+        text = f.read()
+    srt = parse.SRT.from_str(text)
+
+    for filter_name in filter_arg.split():
+        match filter_name:
+            case "rebreak_lines":
+                filter_module = rebreak_lines
+            case unknown:
+                raise InvalidFilterError(unknown)
+        srt.events = [filter_module.filter(event) for event in srt.events]
+
+    sys.stdout.write(str(srt))
+
+
+class InvalidFilterError(Exception):
+    """Raised by main() for a filter name it does not know."""
+
+
+if __name__ == "__main__":
+    main()
-- 
cgit v1.2.3-57-g22cb