From df5a7db84432f4f60698b2ddb97a9f8955dd97fb Mon Sep 17 00:00:00 2001 From: Arjun Satarkar Date: Sat, 21 Dec 2024 19:05:04 +0530 Subject: Refactor for modularity, package --- .github/workflows/check.yml | 5 +-- .gitignore | 1 + Justfile | 9 +++-- README.md | 3 ++ parse_srt.py | 58 ---------------------------- pyproject.toml | 27 +++++++++++++ pyrightconfig.json | 5 ++- rebreak_lines.py | 70 ---------------------------------- src/srtfilter/__init__.py | 0 src/srtfilter/__main__.py | 4 ++ src/srtfilter/filters/rebreak_lines.py | 57 +++++++++++++++++++++++++++ src/srtfilter/parse.py | 58 ++++++++++++++++++++++++++++ src/srtfilter/srtfilter_cli.py | 32 ++++++++++++++++ 13 files changed, 193 insertions(+), 136 deletions(-) create mode 100644 README.md delete mode 100644 parse_srt.py create mode 100644 pyproject.toml delete mode 100755 rebreak_lines.py create mode 100644 src/srtfilter/__init__.py create mode 100644 src/srtfilter/__main__.py create mode 100644 src/srtfilter/filters/rebreak_lines.py create mode 100644 src/srtfilter/parse.py create mode 100755 src/srtfilter/srtfilter_cli.py diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index 3b19e6b..7d40006 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -8,6 +8,5 @@ jobs: with: python-version: "3.12" - uses: taiki-e/install-action@just - - run: | - pip install -r requirements.txt - just check + - run: pip install -r requirements.txt + - run: just check diff --git a/.gitignore b/.gitignore index 1c43c45..9ba1833 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ /venv/ __pycache__/ +/dist/ diff --git a/Justfile b/Justfile index 5c62e93..76bebd4 100644 --- a/Justfile +++ b/Justfile @@ -1,7 +1,10 @@ -check: typecheck lint +check: typecheck check_style typecheck: pyright -lint: - black --check *.py +check_style: + black --check src + +format: + black src diff --git a/README.md b/README.md new file mode 100644 index 0000000..a9c5262 --- /dev/null +++ b/README.md @@ 
# One SRT timestamp, e.g. "00:01:02,345", captured as a group.
_TIMESTAMP_CAPTURE = r"(\d\d:\d\d:\d\d,\d\d\d)"
# The timing line of an event: "<start> --> <end>". Compiled once, not per call.
_TIMING_REGEX = re.compile(rf"{_TIMESTAMP_CAPTURE} --> {_TIMESTAMP_CAPTURE}")


@dataclasses.dataclass
class Event:
    """One subtitle event of an SRT file."""

    # Start and end timestamps, kept as the "HH:MM:SS,mmm" strings from the file.
    start: str
    end: str
    # Text of the event. SRT.from_str always stores it with a trailing "\n".
    content: str


class SRT:
    """An SRT file held in memory as an ordered list of events."""

    def __init__(self):
        self.events: list[Event] = []

    @staticmethod
    def from_str(text: str) -> "SRT":
        """Parse the text of an SRT file.

        Events are separated by one or more blank lines. Each event consists of
        a counter line (1, 2, 3, ... in order), a timing line
        ("HH:MM:SS,mmm --> HH:MM:SS,mmm") and zero or more content lines.

        A leading byte order mark and CRLF/CR line endings are tolerated, as is
        a file that ends with a single newline or with extra blank lines.

        Raises:
            ParseError: if an event has no timing line, its counter is not the
                expected number, or its timing line is malformed.
        """
        srt = SRT()

        # Normalise input that was decoded without BOM or newline handling, so
        # that splitting on "\n" below sees the real structure.
        text = text.removeprefix("\ufeff").replace("\r\n", "\n").replace("\r", "\n")

        # Strip stray newlines so that a file ending in a single "\n" does not
        # leak an extra empty content line into its last event.
        blocks = (block.strip("\n") for block in re.split(r"\n{2,}", text))
        event_strs = [block for block in blocks if block.strip()]

        for counter, event_str in enumerate(event_strs, 1):
            lines = event_str.split("\n")
            if len(lines) < 2:
                raise ParseError("Missing timing info", event_str)
            counter_str, timing_str, *content_lines = lines

            # A non-numeric counter is a parse error like any other wrong one.
            try:
                counter_matches = int(counter_str) == counter
            except ValueError:
                counter_matches = False
            if not counter_matches:
                raise ParseError(
                    f"Invalid counter '{counter_str}'; expected {counter}", event_str
                )

            match = _TIMING_REGEX.fullmatch(timing_str)
            if match is None:
                raise ParseError(f"Invalid timing info '{timing_str}'", event_str)

            # The appended "" makes the content end with a newline.
            content = "\n".join(content_lines + [""])

            srt.events.append(Event(match[1], match[2], content))

        return srt

    def __str__(self) -> str:
        """Serialise back to SRT text, renumbering the events from 1."""
        return "".join(
            f"{counter}\n{event.start} --> {event.end}\n{event.content}\n"
            for counter, event in enumerate(self.events, 1)
        )


class ParseError(ValueError):
    """Raised by SRT.from_str when an event cannot be parsed.

    Subclasses ValueError so that callers which caught the ValueError formerly
    raised for a non-numeric counter keep working.

    Attributes:
        reason: short description of what was wrong.
        event_str: text of the offending event.
    """

    def __init__(self, reason: str, event_str: str):
        super().__init__(f"{reason}\nwhile parsing event:\n{event_str}")
        self.reason = reason
        self.event_str = event_str
+classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", +] +dependencies = [ + "click==8.1.*" +] + +[project.scripts] +srtfilter = "srtfilter.srtfilter_cli:main" + +[project.urls] +Homepage = "https://github.com/arjunsatarkar/srtfilter" +Issues = "https://github.com/arjunsatarkar/srtfilter/issues" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" diff --git a/pyrightconfig.json b/pyrightconfig.json index 864fa90..5bd770c 100644 --- a/pyrightconfig.json +++ b/pyrightconfig.json @@ -1,3 +1,4 @@ { - "strict": ["."] -} \ No newline at end of file + "include": ["src"], + "strict": ["src"] +} diff --git a/rebreak_lines.py b/rebreak_lines.py deleted file mode 100755 index 54473e2..0000000 --- a/rebreak_lines.py +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env python3 -""" -Given an SRT file, remove existing line breaks and reinsert them automatically -at word boundaries so as not to exceed 42 characters per line, trying to keep -lines within a single subtitle event at roughly the same length. - -NOTE: Using this script is *generally a bad idea*; like many aspects of -subtitling, placing line breaks benefits from contextual judgement. However, -if an existing subtitle file has no line breaks or far too many, as is the case -sometimes, this is an easy way to improve readability. 
-""" -import click -import parse_srt -import math -import sys -import typing - -# May still be exceeded if there are no word boundaries to wrap at -MAX_LINE_LENGTH = 42 - - -@click.command() -@click.argument("in_file_path") -def main(in_file_path: str): - with open(in_file_path) as f: - text = f.read() - srt = parse_srt.SRT.from_str(text) - - for event in srt.events: - event.content = rebreak(event.content) - - sys.stdout.write(str(srt)) - - -def rebreak(text: str) -> str: - get_target_line_num: typing.Callable[[int], int] = lambda length: math.ceil( - length / MAX_LINE_LENGTH - ) - text = " ".join(text.split("\n")) - target_line_num = get_target_line_num(len(text)) - - lines: list[str] = [] - for _ in range(target_line_num): - partition_at = round(len(text) / target_line_num) - 1 - - # Move to a word boundary - steps_backward = 0 - for steps_backward, c in enumerate(text[partition_at::-1]): - if c.isspace(): - break - if partition_at - steps_backward != 0: - partition_at -= steps_backward - else: - # Moving the partition backward would give us an empty line, so - # move forward instead to ensure we always make progress. 
- steps_forward = 0 - for steps_forward, c in enumerate(text[partition_at:]): - if c.isspace(): - break - partition_at += steps_forward - - lines.append(text[: partition_at + 1].strip()) - text = text[partition_at + 1 :] - target_line_num = get_target_line_num(len(text)) - - return ("\n".join(lines) if lines else text) + "\n" - - -if __name__ == "__main__": - main() diff --git a/src/srtfilter/__init__.py b/src/srtfilter/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/srtfilter/__main__.py b/src/srtfilter/__main__.py new file mode 100644 index 0000000..16b5050 --- /dev/null +++ b/src/srtfilter/__main__.py @@ -0,0 +1,4 @@ +if __name__ == "__main__": + from .srtfilter_cli import main + + main() diff --git a/src/srtfilter/filters/rebreak_lines.py b/src/srtfilter/filters/rebreak_lines.py new file mode 100644 index 0000000..5c94aa6 --- /dev/null +++ b/src/srtfilter/filters/rebreak_lines.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +""" +Given an SRT event, remove existing line breaks and insert new ones. All +insertion will be at word boundaries, lines will usually not exceed 42 +characters, and lines within the event will if possible be roughly the same +length. + +NOTE: Using this filter is *generally a bad idea*; like many aspects of +subtitling, placing line breaks benefits from contextual judgement. However, +if an existing subtitle file has no line breaks or far too many, as is the case +sometimes, this is an easy way to improve readability. +""" +import math +import typing +from .. 
# May still be exceeded if there are no word boundaries to wrap at
MAX_LINE_LENGTH = 42


def filter(event: "parse.Event") -> "parse.Event":
    """Re-break the content of *event* in place and return the same object."""
    event.content = rebreak(event.content)
    return event


def rebreak(text: str, max_line_length: int = MAX_LINE_LENGTH) -> str:
    """Remove the line breaks in *text* and insert new ones at word boundaries.

    The text is split into as few lines as *max_line_length* allows, aiming for
    lines of roughly equal length. A line is longer than *max_line_length* only
    when it contains no whitespace to break at. No text is ever dropped.

    Returns the re-broken text terminated by a single newline; empty or
    whitespace-only input yields just the newline.

    Raises:
        ValueError: if *max_line_length* is less than 1.
    """
    if max_line_length < 1:
        raise ValueError("max_line_length must be at least 1")

    get_target_line_num: typing.Callable[[int], int] = lambda length: math.ceil(
        length / max_line_length
    )

    # Strip so the trailing newline of an event is not counted as content;
    # otherwise a line of exactly max_line_length characters would be split.
    remaining = " ".join(text.split("\n")).strip()

    lines: list[str] = []
    # Loop until the text is used up instead of a precomputed number of times:
    # lines may come out shorter or longer than planned, and a fixed count then
    # either loses the tail of the text or divides by a target of zero.
    while remaining:
        target_line_num = get_target_line_num(len(remaining))
        if target_line_num == 1:
            # Everything left fits on one line.
            lines.append(remaining)
            break

        partition_at = round(len(remaining) / target_line_num) - 1

        # Move to a word boundary: the nearest whitespace at or before the
        # ideal position. Index 0 is excluded because breaking there would
        # produce an empty line.
        split_at = next(
            (i for i in range(partition_at, 0, -1) if remaining[i].isspace()),
            None,
        )
        if split_at is None:
            # No boundary behind us, so move forward instead to ensure we
            # always make progress; with no whitespace at all, take the rest.
            split_at = next(
                (
                    i
                    for i in range(partition_at + 1, len(remaining))
                    if remaining[i].isspace()
                ),
                len(remaining) - 1,
            )

        lines.append(remaining[: split_at + 1].strip())
        remaining = remaining[split_at + 1 :].lstrip()

    return "\n".join(lines) + "\n"
# Command-line entry point: parse the SRT file at IN_FILE_PATH, apply every
# filter named in --filter (whitespace-separated, in the order given) to all
# events, and write the resulting SRT text to stdout.
# This is a comment rather than a docstring on purpose: click would use a
# docstring as the command's --help text.
@click.command()
@click.argument("in_file_path")
@click.option("--filter", "filter_arg", default="")
def main(in_file_path: str, filter_arg: str):
    with open(in_file_path) as in_file:
        contents = in_file.read()
    subtitles = parse.SRT.from_str(contents)

    for requested in filter_arg.split():
        if requested == "rebreak_lines":
            apply_filter = rebreak_lines.filter
        else:
            raise InvalidFilterError(requested)
        # Each filter sees the events as left by the filters before it.
        subtitles.events = list(map(apply_filter, subtitles.events))

    sys.stdout.write(str(subtitles))


class InvalidFilterError(Exception):
    """Raised by main when --filter names an unknown filter.

    The unknown name is the exception's only argument.
    """


if __name__ == "__main__":
    main()