Refactor for modularity, package

author: Arjun Satarkar <me@arjunsatarkar.net> 2024-12-21 13:35:04 +0000
committer: Arjun Satarkar <me@arjunsatarkar.net> 2024-12-21 13:35:04 +0000
commit: df5a7db84432f4f60698b2ddb97a9f8955dd97fb (patch)
tree: c49ac71095e3eca5055ae518a3cee096afe19f8e /src
parent: 2df2e62042206cc04b82fca2a208d8c66d96a3b3 (diff)
download: srtfilter-df5a7db84432f4f60698b2ddb97a9f8955dd97fb.tar
srtfilter-df5a7db84432f4f60698b2ddb97a9f8955dd97fb.tar.gz
srtfilter-df5a7db84432f4f60698b2ddb97a9f8955dd97fb.zip
5 files changed, 151 insertions, 0 deletions
diff --git a/src/srtfilter/__init__.py b/src/srtfilter/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/srtfilter/__init__.py
diff --git a/src/srtfilter/__main__.py b/src/srtfilter/__main__.py
new file mode 100644
index 0000000..16b5050
--- /dev/null
+++ b/src/srtfilter/__main__.py
@@ -0,0 +1,5 @@
+# Lets the package be run as a program via `python -m srtfilter`.
+if __name__ == "__main__":
+    from .srtfilter_cli import main as run_cli
+
+    run_cli()
diff --git a/src/srtfilter/filters/rebreak_lines.py b/src/srtfilter/filters/rebreak_lines.py
new file mode 100644
index 0000000..5c94aa6
--- /dev/null
+++ b/src/srtfilter/filters/rebreak_lines.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+"""
+Given an SRT event, remove existing line breaks and insert new ones. All
+insertion will be at word boundaries, lines will usually not exceed 42
+characters, and lines within the event will if possible be roughly the same
+length.
+
+NOTE: Using this filter is *generally a bad idea*; like many aspects of
+subtitling, placing line breaks benefits from contextual judgement. However,
+if an existing subtitle file has no line breaks or far too many, as is the case
+sometimes, this is an easy way to improve readability.
+"""
+import math
+import typing
+from .. import parse
+
+# May still be exceeded if there are no word boundaries to wrap at
+MAX_LINE_LENGTH = 42
+
+
+def filter(event: parse.Event) -> parse.Event:
+    """Re-break the content of *event* in place and return the same event."""
+    event.content = rebreak(event.content)
+    return event
+
+
+def rebreak(text: str, max_line_length: typing.Optional[int] = None) -> str:
+    """Re-flow *text* into lines of similar length, broken at whitespace.
+
+    Existing newlines are replaced by spaces before new breaks are chosen.
+    Breaks are only placed at whitespace, so a line consisting of one long
+    word may exceed the limit.
+
+    Args:
+        text: The text to re-break.
+        max_line_length: Target maximum line length; MAX_LINE_LENGTH is
+            used when this is None.
+
+    Returns:
+        The new lines joined by newlines, followed by a trailing newline.
+        Blank input yields just a newline.
+
+    Raises:
+        ValueError: If the effective limit is less than 1.
+    """
+    limit = MAX_LINE_LENGTH if max_line_length is None else max_line_length
+    if limit < 1:
+        raise ValueError(f"max_line_length must be at least 1, got {limit}")
+
+    text = " ".join(text.split("\n"))
+    # Breaks are only taken at whitespace, so make sure the text ends with
+    # some; otherwise the final word could never share a line with the words
+    # before it.
+    if text and not text[-1].isspace():
+        text += " "
+
+    lines: list[str] = []
+    # Loop until every word is placed rather than a precomputed number of
+    # times: breaking early at a word boundary can leave more text than
+    # planned (a fixed count would silently drop it), and one overlong word
+    # can use the text up early (a fixed count would then divide by zero).
+    while text.strip():
+        target_line_num = math.ceil(len(text) / limit)
+        partition_at = round(len(text) / target_line_num) - 1
+
+        # Move to a word boundary
+        steps_backward = 0
+        for steps_backward, c in enumerate(text[partition_at::-1]):
+            if c.isspace():
+                break
+        if partition_at - steps_backward != 0:
+            partition_at -= steps_backward
+        else:
+            # Moving the partition backward would give us an empty line, so
+            # move forward instead to ensure we always make progress.
+            steps_forward = 0
+            for steps_forward, c in enumerate(text[partition_at:]):
+                if c.isspace():
+                    break
+            partition_at += steps_forward
+
+        line = text[: partition_at + 1].strip()
+        # A chunk of pure whitespace would become an empty line; skip it.
+        if line:
+            lines.append(line)
+        text = text[partition_at + 1 :]
+
+    return "\n".join(lines) + "\n"
diff --git a/src/srtfilter/parse.py b/src/srtfilter/parse.py
new file mode 100644
index 0000000..2e710ea
--- /dev/null
+++ b/src/srtfilter/parse.py
@@ -0,0 +1,91 @@
+from __future__ import annotations
+import dataclasses
+import re
+
+# One timestamp, e.g. "00:01:02,345", captured as a group.
+_TIMESTAMP_CAPTURE = r"(\d\d:\d\d:\d\d,\d\d\d)"
+# The timing line of an event: "<start> --> <end>".
+_TIMING_REGEX = re.compile(rf"{_TIMESTAMP_CAPTURE} --> {_TIMESTAMP_CAPTURE}")
+
+
+@dataclasses.dataclass
+class Event:
+    """One subtitle event: when it is shown and what it says."""
+
+    # Timestamps exactly as written in the file ("HH:MM:SS,mmm").
+    start: str
+    end: str
+    # As produced by SRT.from_str: the text lines, each followed by "\n".
+    content: str
+
+
+class SRT:
+    """A parsed SRT subtitle file, held as an ordered list of events."""
+
+    def __init__(self):
+        self.events: list[Event] = []
+
+    @staticmethod
+    def from_str(text: str) -> SRT:
+        """Parse the full text of an SRT file.
+
+        Events are separated by blank lines; each one is a counter line
+        (1, 2, 3, ... in order), a timing line, and any number of content
+        lines. A leading byte order mark and Windows line endings are
+        tolerated.
+
+        Raises:
+            ParseError: If an event has no timing line, its counter is not
+                the expected number, or its timing line is malformed.
+        """
+        text = text.removeprefix("\ufeff").replace("\r\n", "\n")
+
+        srt = SRT()
+        chunks = (chunk for chunk in text.split("\n\n") if chunk.strip())
+        for counter, event_str in enumerate(chunks, 1):
+            # Stray newlines around a chunk (extra blank lines between
+            # events, or the newline that ends the file) are not content.
+            lines = event_str.strip("\n").split("\n")
+            if len(lines) < 2:
+                raise ParseError("Missing timing line", event_str)
+            counter_str, timing_str, *content_lines = lines
+
+            # A counter that is not a number at all is reported the same
+            # way as a wrong number instead of leaking a ValueError.
+            try:
+                counter_ok = int(counter_str) == counter
+            except ValueError:
+                counter_ok = False
+            if not counter_ok:
+                raise ParseError(
+                    f"Invalid counter '{counter_str}'; expected {counter}", event_str
+                )
+
+            match = _TIMING_REGEX.fullmatch(timing_str)
+            if match is None:
+                raise ParseError(f"Invalid timing info '{timing_str}'", event_str)
+
+            content = "".join(f"{line}\n" for line in content_lines)
+            srt.events.append(Event(match[1], match[2], content))
+
+        return srt
+
+    def __str__(self):
+        """Serialize back to SRT text, renumbering the events from 1."""
+        return "".join(
+            f"{counter}\n{event.start} --> {event.end}\n{event.content}\n"
+            for counter, event in enumerate(self.events, 1)
+        )
+
+
+class ParseError(Exception):
+    """Raised when SRT text cannot be parsed.
+
+    ``reason`` describes the problem and ``event_str`` is the text of the
+    event that was being parsed.
+    """
+
+    def __init__(self, reason: str, event_str: str):
+        super().__init__(f"{reason}\nwhile parsing event:\n{event_str}")
+        self.reason = reason
+        self.event_str = event_str
diff --git a/src/srtfilter/srtfilter_cli.py b/src/srtfilter/srtfilter_cli.py
new file mode 100755
index 0000000..bc44409
--- /dev/null
+++ b/src/srtfilter/srtfilter_cli.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+import click
+import sys
+from . import parse
+from .filters import rebreak_lines
+
+
+# Command line entry point: parse the SRT file at IN_FILE_PATH, run the
+# filters named in --filter (whitespace-separated, applied in the order
+# given) over every event, and write the resulting SRT text to stdout.
+# Raises InvalidFilterError for an unrecognized filter name.
+# (Kept as comments: a docstring here would be picked up by click as the
+# command's --help text.)
+@click.command()
+@click.argument("in_file_path")
+@click.option("--filter", "filter_arg", default="")
+def main(in_file_path: str, filter_arg: str):
+    with open(in_file_path) as in_file:
+        raw_text = in_file.read()
+    srt = parse.SRT.from_str(raw_text)
+
+    for requested in filter_arg.split():
+        if requested == "rebreak_lines":
+            chosen = rebreak_lines
+        else:
+            raise InvalidFilterError(requested)
+        srt.events = list(map(chosen.filter, srt.events))
+
+    sys.stdout.write(str(srt))
+
+
+class InvalidFilterError(Exception):
+    # Raised by main() with the offending filter name as its only argument.
+    pass
+
+
+if __name__ == "__main__":
+    main()
author	Arjun Satarkar <me@arjunsatarkar.net>	2024-12-21 13:35:04 +0000
committer	Arjun Satarkar <me@arjunsatarkar.net>	2024-12-21 13:35:04 +0000
commit	df5a7db84432f4f60698b2ddb97a9f8955dd97fb (patch)
tree	c49ac71095e3eca5055ae518a3cee096afe19f8e /src
parent	2df2e62042206cc04b82fca2a208d8c66d96a3b3 (diff)
download	srtfilter-df5a7db84432f4f60698b2ddb97a9f8955dd97fb.tar srtfilter-df5a7db84432f4f60698b2ddb97a9f8955dd97fb.tar.gz srtfilter-df5a7db84432f4f60698b2ddb97a9f8955dd97fb.zip