#!/usr/bin/env python3
# rebreak_lines.py
"""
Given an SRT file, remove existing line breaks and reinsert them automatically
at word boundaries so as not to exceed 42 characters per line, trying to keep
lines within a single subtitle event at roughly the same length.

NOTE: Using this script is *generally a bad idea*; like many aspects of
subtitling, placing line breaks benefits from contextual judgement. However,
if an existing subtitle file has no line breaks or far too many, as is the case
sometimes, this is an easy way to improve readability.
"""
import math
import sys
from typing import List, Optional

import click

import parse_srt

# May still be exceeded if there are no word boundaries to wrap at
MAX_LINE_LENGTH = 42


@click.command()
@click.argument("in_file_path")
def main(in_file_path: str):
    # Command-line entry point: read the SRT file at `in_file_path`, re-wrap
    # the text of every subtitle event, and print the resulting SRT to stdout.
    #
    # NOTE: documented with comments rather than a docstring on purpose --
    # click uses a command's docstring as its --help text, so adding one would
    # change what the program prints.
    with open(in_file_path) as srt_file:
        raw_srt = srt_file.read()

    subtitles = parse_srt.SRT.from_str(raw_srt)
    for cue in subtitles.events:
        # Each event is re-wrapped independently of its neighbours.
        cue.content = rebreak(cue.content)

    rendered = str(subtitles)
    sys.stdout.write(rendered)


def rebreak(text: str, max_line_length: Optional[int] = None) -> str:
    """Re-wrap the text of one subtitle event into evenly sized lines.

    Existing newlines are replaced by spaces, then the text is split at
    whitespace into lines of roughly equal length that do not exceed the
    limit. No text is ever discarded: the loop runs until the whole input has
    been placed on a line.

    Args:
        text: Content of a single subtitle event, with or without a trailing
            newline.
        max_line_length: Maximum number of characters per line. Defaults to
            the module-level ``MAX_LINE_LENGTH``. A run of text without any
            whitespace that is longer than the limit is kept intact, so such
            a line still exceeds the limit.

    Returns:
        The re-wrapped text, terminated by a single newline. Empty or
        whitespace-only input yields a lone newline.

    Raises:
        ValueError: If ``max_line_length`` is smaller than 1.
    """
    if max_line_length is None:
        max_line_length = MAX_LINE_LENGTH
    if max_line_length < 1:
        raise ValueError("max_line_length must be at least 1")

    # Strip the joined text so the event's trailing newline is not counted as
    # content; otherwise text of exactly the maximum length would be split.
    remaining = " ".join(text.split("\n")).strip()

    lines: List[str] = []
    while remaining:
        # Every line "costs" its characters plus the one separator that
        # follows it, so both the text and the capacity are padded by one.
        padded_length = len(remaining) + 1
        target_line_num = math.ceil(padded_length / (max_line_length + 1))
        if target_line_num == 1:
            lines.append(remaining)
            break

        # Index at which the separator would sit if all lines were equal.
        ideal = round(padded_length / target_line_num) - 1

        # Prefer the closest whitespace at or before the ideal position: this
        # keeps the line within the limit. Index 0 is never whitespace because
        # `remaining` is always stripped.
        break_at = next(
            (i for i in range(ideal, 0, -1) if remaining[i].isspace()),
            None,
        )
        if break_at is None:
            # No boundary behind us; fall back to the first one ahead, even
            # though that line will be longer than intended.
            break_at = next(
                (
                    i
                    for i in range(ideal + 1, len(remaining))
                    if remaining[i].isspace()
                ),
                None,
            )
        if break_at is None:
            # No whitespace left at all: the rest cannot be wrapped.
            lines.append(remaining)
            break

        lines.append(remaining[:break_at].rstrip())
        # The line count is recomputed from what is left on the next pass, so
        # a short line here simply results in an additional line later.
        remaining = remaining[break_at + 1:].lstrip()

    return "\n".join(lines) + "\n"


# Run the click command only when this file is executed as a script, so that
# importing the module (e.g. to reuse rebreak) has no side effects.
if __name__ == "__main__":
    main()