diff options
author | Arjun Satarkar <me@arjunsatarkar.net> | 2024-12-21 13:35:04 +0000 |
---|---|---|
committer | Arjun Satarkar <me@arjunsatarkar.net> | 2024-12-21 13:35:04 +0000 |
commit | df5a7db84432f4f60698b2ddb97a9f8955dd97fb (patch) | |
tree | c49ac71095e3eca5055ae518a3cee096afe19f8e /src | |
parent | 2df2e62042206cc04b82fca2a208d8c66d96a3b3 (diff) | |
download | srtfilter-df5a7db84432f4f60698b2ddb97a9f8955dd97fb.tar srtfilter-df5a7db84432f4f60698b2ddb97a9f8955dd97fb.tar.gz srtfilter-df5a7db84432f4f60698b2ddb97a9f8955dd97fb.zip |
Refactor for modularity, package
Diffstat (limited to 'src')
-rw-r--r-- | src/srtfilter/__init__.py | 0 | ||||
-rw-r--r-- | src/srtfilter/__main__.py | 4 | ||||
-rw-r--r-- | src/srtfilter/filters/rebreak_lines.py | 57 | ||||
-rw-r--r-- | src/srtfilter/parse.py | 58 | ||||
-rwxr-xr-x | src/srtfilter/srtfilter_cli.py | 32 |
5 files changed, 151 insertions, 0 deletions
```diff
diff --git a/src/srtfilter/__init__.py b/src/srtfilter/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/srtfilter/__init__.py
diff --git a/src/srtfilter/__main__.py b/src/srtfilter/__main__.py
new file mode 100644
index 0000000..16b5050
--- /dev/null
+++ b/src/srtfilter/__main__.py
@@ -0,0 +1,4 @@
+if __name__ == "__main__":
+    from .srtfilter_cli import main
+
+    main()
diff --git a/src/srtfilter/filters/rebreak_lines.py b/src/srtfilter/filters/rebreak_lines.py
new file mode 100644
index 0000000..5c94aa6
--- /dev/null
+++ b/src/srtfilter/filters/rebreak_lines.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+"""
+Given an SRT event, remove existing line breaks and insert new ones. All
+insertion will be at word boundaries, lines will usually not exceed 42
+characters, and lines within the event will if possible be roughly the same
+length.
+
+NOTE: Using this filter is *generally a bad idea*; like many aspects of
+subtitling, placing line breaks benefits from contextual judgement. However,
+if an existing subtitle file has no line breaks or far too many, as is the case
+sometimes, this is an easy way to improve readability.
+"""
+import math
+import typing
+from .. import parse
+
+# May still be exceeded if there are no word boundaries to wrap at
+MAX_LINE_LENGTH = 42
+
+
+def filter(event: parse.Event) -> parse.Event:
+    event.content = rebreak(event.content)
+    return event
+
+
+def rebreak(text: str) -> str:
+    get_target_line_num: typing.Callable[[int], int] = lambda length: math.ceil(
+        length / MAX_LINE_LENGTH
+    )
+    text = " ".join(text.split("\n"))
+    target_line_num = get_target_line_num(len(text))
+
+    lines: list[str] = []
+    for _ in range(target_line_num):
+        partition_at = round(len(text) / target_line_num) - 1
+
+        # Move to a word boundary
+        steps_backward = 0
+        for steps_backward, c in enumerate(text[partition_at::-1]):
+            if c.isspace():
+                break
+        if partition_at - steps_backward != 0:
+            partition_at -= steps_backward
+        else:
+            # Moving the partition backward would give us an empty line, so
+            # move forward instead to ensure we always make progress.
+            steps_forward = 0
+            for steps_forward, c in enumerate(text[partition_at:]):
+                if c.isspace():
+                    break
+            partition_at += steps_forward
+
+        lines.append(text[: partition_at + 1].strip())
+        text = text[partition_at + 1 :]
+        target_line_num = get_target_line_num(len(text))
+
+    return ("\n".join(lines) if lines else text) + "\n"
diff --git a/src/srtfilter/parse.py b/src/srtfilter/parse.py
new file mode 100644
index 0000000..2e710ea
--- /dev/null
+++ b/src/srtfilter/parse.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+import dataclasses
+import re
+
+
+@dataclasses.dataclass
+class Event:
+    start: str
+    end: str
+    content: str
+
+
+class SRT:
+    def __init__(self):
+        self.events: list[Event] = []
+
+    @staticmethod
+    def from_str(text: str) -> SRT:
+        TIMESTAMP_CAPTURE = r"(\d\d:\d\d:\d\d,\d\d\d)"
+        TIMING_REGEX = rf"{TIMESTAMP_CAPTURE} --> {TIMESTAMP_CAPTURE}"
+
+        srt = SRT()
+        counter = 1
+        events = [event for event in text.split("\n\n") if event.strip()]
+        for event_str in events:
+            lines = event_str.split("\n")
+            counter_str, timing_str, content_lines = lines[0], lines[1], lines[2:]
+
+            if int(counter_str) != counter:
+                raise ParseError(
+                    f"Invalid counter '{counter_str}'; expected {counter}", event_str
+                )
+            counter += 1
+
+            match = re.fullmatch(TIMING_REGEX, timing_str)
+            if match is None:
+                raise ParseError(f"Invalid timing info '{timing_str}'", event_str)
+
+            content = "\n".join(content_lines + [""])
+
+            srt.events.append(Event(match[1], match[2], content))
+
+        return srt
+
+    def __str__(self):
+        result = ""
+        for counter, event in enumerate(self.events, 1):
+            result += f"{counter}\n"
+            result += f"{event.start} --> {event.end}\n"
+            result += f"{event.content}\n"
+        return result
+
+
+class ParseError(Exception):
+    def __init__(self, reason: str, event_str: str):
+        super().__init__(f"{reason}\nwhile parsing event:\n{event_str}")
+        self.reason = reason
+        self.event_str = event_str
diff --git a/src/srtfilter/srtfilter_cli.py b/src/srtfilter/srtfilter_cli.py
new file mode 100755
index 0000000..bc44409
--- /dev/null
+++ b/src/srtfilter/srtfilter_cli.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+import click
+import sys
+from . import parse
+from .filters import rebreak_lines
+
+
+@click.command()
+@click.argument("in_file_path")
+@click.option("--filter", "filter_arg", default="")
+def main(in_file_path: str, filter_arg: str):
+    with open(in_file_path) as f:
+        text = f.read()
+    srt = parse.SRT.from_str(text)
+
+    for filter_name in filter_arg.split():
+        match filter_name:
+            case "rebreak_lines":
+                filter_module = rebreak_lines
+            case unknown:
+                raise InvalidFilterError(unknown)
+        srt.events = [filter_module.filter(event) for event in srt.events]
+
+    sys.stdout.write(str(srt))
+
+
+class InvalidFilterError(Exception):
+    pass
+
+
+if __name__ == "__main__":
+    main()
```