# Export carriage drawings whose crop area is detected heuristically.
#
# Usage: extract_wagons_heuristic <first page> <target>...
#
# The n-th target names the drawing on page <first page> + n - 1 of the
# separated PDF (tmp-<page>.pdf). A target of "x" marks a page to skip.
# Each remaining page is converted to svg/<target>.svg with inkscape and
# then handed to lib/export-carriage.py, which writes png/<target>.png.
function extract_wagons_heuristic {
	local start=$1
	shift

	# Use a counted loop: {1..$#} would expand to "1 0" when no targets
	# are passed and run the body on non-existent entries.
	local -i count=$# i page
	local target

	for (( i = 1; i <= count; i++ )); do
		target=$@[$i]

		if [[ $target == x ]]; then
			continue
		fi

		page=$(( start + i - 1 ))

		echo "Page ${page}: $target"
		inkscape --export-filename=svg/${target}.svg tmp-${page}.pdf &> /dev/null
		lib/export-carriage.py svg/${target}.svg png/${target}.png
	done
}
#!/usr/bin/env python3
"""Crop a carriage drawing out of an SVG page and export it as PNG.

The script asks Inkscape for the bounding boxes of all objects in the
input SVG, guesses the vertical band that contains the carriage drawing,
deletes path and tspan objects outside of that band, and exports the
remaining area to the output file.

Usage: export-carriage.py INFILE.svg OUTFILE.png
"""

import subprocess
import sys

import numpy as np


class SVGObject:
    """Bounding box of a single SVG object.

    ``attrlist`` is one record of ``inkscape --query-all`` output split at
    commas: ``[ID, X, Y, W, H]``. ``x_`` and ``y_`` are the coordinates of
    the corner opposite to (``x``, ``y``).
    """

    def __init__(self, attrlist):
        self.id = attrlist[0]
        self.x = float(attrlist[1])
        self.y = float(attrlist[2])
        self.w = float(attrlist[3])
        self.h = float(attrlist[4])
        self.x_ = self.x + self.w
        self.y_ = self.y + self.h

    def is_path(self):
        """Return True if the object ID starts with "path"."""
        return self.id.startswith("path")

    def is_tspan(self):
        """Return True if the object ID starts with "tspan"."""
        return self.id.startswith("tspan")

    def __repr__(self):
        return (
            f"SVGObject({self.id!r}, x={self.x}, y={self.y}, "
            f"w={self.w}, h={self.h})"
        )


def parse_query_output(text):
    """Turn ``inkscape --query-all`` output into a list of SVGObject.

    Each non-blank line is expected to look like ``ID,X,Y,W,H``.
    """
    return [SVGObject(line.split(",")) for line in text.splitlines() if line.strip()]


def find_vertical_crop(objects):
    """Return ``(crop_y_min, crop_y_max)`` of the band holding the drawing.

    Paths that are wider than 10 and between 2 and 4 high are counted per
    integer row of their y position. Rows containing more than two such
    paths are candidates; the band spans from one unit above the first
    candidate row to four units below the last one.

    Raises ValueError if no row qualifies.
    """
    ys = [o.y for o in objects if o.is_path() and 2 < o.h < 4 and o.w > 10]

    # Positions are floats; floor them to integer rows before counting.
    # np.unique (unlike np.bincount) also copes with negative rows.
    rows, counts = np.unique(np.floor(ys).astype(int), return_counts=True)
    candidates = rows[counts > 2]

    if candidates.size == 0:
        raise ValueError("no row with more than two matching paths found")

    return int(candidates.min()) - 1, int(candidates.max()) + 4


def plan_export(objects):
    """Compute the export area and the IDs of the objects to delete.

    Returns ``((x_min, y_min, x_max, y_max), ids)``. ``ids`` lists every
    path or tspan that is not fully inside the vertical band returned by
    find_vertical_crop. The horizontal extent is taken from the paths that
    are fully inside the band.

    Raises ValueError if the band cannot be determined or contains no path.
    """
    crop_y_min, crop_y_max = find_vertical_crop(objects)

    ids_to_delete = []
    x_positions = []

    for o in objects:
        inside = o.y >= crop_y_min and o.y_ <= crop_y_max
        # is_path / is_tspan must be called: a bare method reference is
        # always truthy and would match every object.
        if (o.is_path() or o.is_tspan()) and not inside:
            ids_to_delete.append(o.id)
        elif o.is_path() and inside:
            x_positions.append(o.x)
            x_positions.append(o.x_)

    if not x_positions:
        raise ValueError("no path lies completely inside the detected band")

    crop_x_min = int(np.min(x_positions))
    crop_x_max = int(np.max(x_positions)) + 1

    return (crop_x_min, crop_y_min, crop_x_max, crop_y_max), ids_to_delete


def main(infile, outfile):
    """Export the carriage drawing found in ``infile`` to ``outfile``.

    Raises subprocess.CalledProcessError if the bounding box query fails
    and ValueError if no drawing can be located.
    """
    ret = subprocess.run(
        ["inkscape", "--query-all", infile], capture_output=True, check=True
    )

    # objects = [[ID, X, Y, W, H], ...]
    objects = parse_query_output(ret.stdout.decode("utf-8"))

    (crop_x_min, crop_y_min, crop_x_max, crop_y_max), ids_to_delete = plan_export(
        objects
    )

    export_area = f"--export-area={crop_x_min}:{crop_y_min}:{crop_x_max}:{crop_y_max}"
    select_objects = "--select={}".format(",".join(ids_to_delete))

    # NOTE(review): the exit status of the export is not checked; pass
    # check=True if a failed export should abort the caller.
    subprocess.run(
        [
            "xvfb-run",
            "inkscape",
            export_area,
            select_objects,
            "--verb=EditDelete",
            "--batch-process",
            "--export-dpi=600",
            "-o",
            outfile,
            infile,
        ]
    )


if __name__ == "__main__":
    if len(sys.argv) != 3:
        sys.exit(f"usage: {sys.argv[0]} INFILE.svg OUTFILE.png")
    main(*sys.argv[1:])