aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore2
-rw-r--r--parse_srt.py79
-rwxr-xr-xrebreak_lines.py64
-rw-r--r--requirements.txt6
4 files changed, 151 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1c43c45
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/venv/
+__pycache__/
diff --git a/parse_srt.py b/parse_srt.py
new file mode 100644
index 0000000..acd72c0
--- /dev/null
+++ b/parse_srt.py
@@ -0,0 +1,79 @@
+from __future__ import annotations
+import dataclasses
+import enum
+import itertools
+import re
+from typing import List
+
+
@dataclasses.dataclass
class Event:
    """One subtitle event: a start/end timestamp pair and its text.

    Every field defaults to ``None`` so an event can be created empty and
    filled in piece by piece.
    """

    # Start timestamp string, e.g. "00:00:01,000"; None until set.
    start: str | None = dataclasses.field(default=None)
    # End timestamp string in the same format; None until set.
    end: str | None = dataclasses.field(default=None)
    # Subtitle text of the event; None when no text has been attached.
    content: str | None = dataclasses.field(default=None)
+
+
class SRT:
    """Ordered collection of subtitle events parsed from SubRip (SRT) text.

    ``str(srt)`` serializes the events back to SRT text, numbering them
    consecutively from 1.
    """

    # A single "HH:MM:SS,mmm" timestamp, captured as one group.
    _TIMESTAMP_CAPTURE = r"(\d\d:\d\d:\d\d,\d\d\d)"
    # A whole timing line; group 1 is the start timestamp, group 2 the end.
    _TIMING_RE = re.compile(rf"{_TIMESTAMP_CAPTURE} --> {_TIMESTAMP_CAPTURE}")

    def __init__(self):
        self.events: List[Event] = []

    @staticmethod
    def from_str(text: str) -> SRT:
        """Parse SRT text into an ``SRT`` object.

        Each event is expected as a counter line, a timing line, and zero
        or more content lines, terminated by a blank line or the end of
        the text. Counters must count up from 1 without gaps. A leading
        byte-order mark and CRLF line endings are tolerated.

        Args:
            text: The complete contents of an SRT file.

        Returns:
            An ``SRT`` whose ``events`` are in file order. The ``content``
            of an event is its lines, each terminated by a newline, or
            ``None`` if the event had no content lines.

        Raises:
            ParseError: On a missing, non-numeric or out-of-sequence
                counter, a malformed timing line, or a blank line / end
                of input where a timing line was expected.
        """

        class ParseState(enum.Enum):
            COUNTER = enum.auto()
            TIMING = enum.auto()
            CONTENT = enum.auto()

        srt = SRT()
        expected_counter = 1
        state = ParseState.COUNTER
        # Event under construction; created once its timing line is read.
        event = None
        line_num = 0

        # A byte-order mark would make the first counter line unparseable.
        lines = text.removeprefix("\ufeff").split("\n")
        for line_num, raw_line in enumerate(lines, 1):
            # Tolerate CRLF input that was not read with universal newlines.
            line = raw_line.rstrip("\r")

            if not line:
                if state is ParseState.TIMING:
                    raise ParseError(f"Unexpected blank line (line {line_num})")
                if state is ParseState.CONTENT:
                    srt.events.append(event)
                    event = None
                    state = ParseState.COUNTER
                # Extra blank lines between events are ignored.
                continue

            if state is ParseState.COUNTER:
                try:
                    counter = int(line)
                except ValueError:
                    # Report non-numeric counters as a parse error too.
                    counter = None
                if counter != expected_counter:
                    raise ParseError(
                        f"Invalid counter, expected {expected_counter} (line {line_num})"
                    )
                expected_counter += 1
                state = ParseState.TIMING
            elif state is ParseState.TIMING:
                timing = SRT._TIMING_RE.fullmatch(line)
                if timing is None:
                    raise ParseError(f"Invalid timing info (line {line_num})")
                event = Event(start=timing[1], end=timing[2])
                state = ParseState.CONTENT
            else:
                event.content = (event.content or "") + f"{line}\n"

        # The text may end without a trailing blank line; do not lose the
        # event that was still being read.
        if state is ParseState.CONTENT:
            srt.events.append(event)
        elif state is ParseState.TIMING:
            # Same condition as a blank line right after a counter.
            raise ParseError(f"Unexpected end of input (line {line_num})")
        return srt

    def __str__(self) -> str:
        """Serialize the events as SRT text, renumbered from 1."""
        chunks = []
        for counter, event in enumerate(self.events, 1):
            # An event without content lines has content None; emit an
            # empty body rather than the literal text "None".
            content = "" if event.content is None else event.content
            chunks.append(f"{counter}\n{event.start} --> {event.end}\n{content}\n")
        return "".join(chunks)
+
+
class ParseError(Exception):
    """Signals that SRT text could not be parsed.

    The message describes the problem and the 1-based line number at
    which it was detected.
    """
diff --git a/rebreak_lines.py b/rebreak_lines.py
new file mode 100755
index 0000000..41594a3
--- /dev/null
+++ b/rebreak_lines.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+"""
+Given an SRT file, remove existing line breaks and reinsert them automatically
+at word boundaries so as not to exceed 42 characters per line, trying to keep
+lines within a single subtitle event at roughly the same length.
+
+NOTE: Using this script is *generally a bad idea*; like many aspects of
+subtitling, placing line breaks benefits from contextual judgement. However,
+if an existing subtitle file has no line breaks or far too many, as is the case
+sometimes, this is an easy way to improve readability.
+"""
+import click
+import parse_srt
+import math
+import sys
+from typing import List
+
# Target upper bound, in characters, for each line produced by rebreak().
# May still be exceeded if there are no word boundaries to wrap at
MAX_LINE_LENGTH = 42
+
+
@click.command()
@click.argument("in_file_path")
def main(in_file_path: str):
    """Re-break the lines of every event in the SRT file IN_FILE_PATH.

    The resulting SRT text is written to standard output; the input file
    is not modified.
    """
    # NOTE(review): the file is read with the platform's default text
    # encoding; pass an explicit encoding here if inputs are known to be
    # UTF-8.
    with open(in_file_path) as f:
        text = f.read()
    srt = parse_srt.SRT.from_str(text)

    for event in srt.events:
        # The parser leaves content as None for an event with no text
        # lines; there is nothing to re-break and rebreak() needs a str.
        if event.content is not None:
            event.content = rebreak(event.content)

    sys.stdout.write(str(srt))
+
+
def rebreak(text: str, *, max_line_length: int | None = None) -> str:
    """Re-wrap subtitle text at word boundaries into lines of similar length.

    Existing line breaks are replaced by spaces. The text is then split
    into the smallest number of lines that could fit the limit, cutting
    each line at the whitespace closest before its ideal (evenly divided)
    length. If there is no whitespace before that point the cut moves
    forward to the next whitespace, so a line can exceed the limit when a
    single word is too long. No text is ever dropped.

    Args:
        text: Content of one subtitle event, with or without line breaks.
        max_line_length: Maximum characters per line; defaults to the
            module-level ``MAX_LINE_LENGTH``.

    Returns:
        The re-wrapped text, lines joined by newlines, with one trailing
        newline. Empty or whitespace-only input yields just a newline.

    Raises:
        ValueError: If ``max_line_length`` is less than 1.
    """
    if max_line_length is None:
        max_line_length = MAX_LINE_LENGTH
    if max_line_length < 1:
        raise ValueError("max_line_length must be at least 1")

    # Strip the ends so a trailing newline does not count towards the
    # length and so the remaining text never starts or ends with whitespace.
    remaining = text.replace("\n", " ").strip()

    lines: List[str] = []
    # Loop until the text is used up rather than a precomputed number of
    # times: word boundaries can force more lines than first estimated.
    while remaining:
        lines_needed = math.ceil(len(remaining) / max_line_length)
        if lines_needed == 1:
            lines.append(remaining)
            break

        # Aim for lines of equal length; never more than max_line_length.
        ideal_length = round(len(remaining) / lines_needed)

        # Cutting at a whitespace index i yields a line of i characters,
        # so search backwards from the ideal length itself. Index 0 is
        # never whitespace because `remaining` is kept stripped.
        cut = next(
            (i for i in range(ideal_length, 0, -1) if remaining[i].isspace()),
            None,
        )
        if cut is None:
            # No boundary early enough: accept an over-long line.
            cut = next(
                (
                    i
                    for i in range(ideal_length + 1, len(remaining))
                    if remaining[i].isspace()
                ),
                None,
            )
        if cut is None:
            # No word boundary left at all; keep the rest on one line.
            lines.append(remaining)
            break

        lines.append(remaining[:cut].rstrip())
        remaining = remaining[cut + 1 :].lstrip()

    return "\n".join(lines) + "\n"
+
+
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..befb51e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+black==24.10.0
+click==8.1.7
+mypy-extensions==1.0.0
+packaging==24.2
+pathspec==0.12.1
+platformdirs==4.3.6