├── .gitignore ├── Cargo.toml ├── LICENSE ├── examples └── text.rs └── src ├── entry.rs ├── lib.rs ├── text.rs ├── tree.rs └── util.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /Cargo.lock 3 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pdf_text" 3 | version = "0.1.0" 4 | edition = "2021" 5 | authors = ["Sebastian Köln "] 6 | keywords = ["pdf", "text", "extract"] 7 | license = "MIT" 8 | description = "PDF text extraction" 9 | 10 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 11 | 12 | [dependencies] 13 | pdf = { git = "https://github.com/pdf-rs/pdf", features = ["cache"] } 14 | pdf_render = { git = "https://github.com/pdf-rs/pdf_render" } 15 | font = { git = "https://github.com/pdf-rs/font" } 16 | itertools = "*" 17 | log = "*" 18 | ordered-float = "*" 19 | serde = { version = "*", features = ["derive"] } 20 | unicode-normalization = "0.1.19" 21 | 22 | pathfinder_geometry = { git = "https://github.com/servo/pathfinder" } 23 | pathfinder_color = { git = "https://github.com/servo/pathfinder" } 24 | pathfinder_content = { git = "https://github.com/servo/pathfinder" } 25 | table = { git = "https://github.com/s3bk/table", features = ["serde"] } 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 PDF-rs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies 
of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /examples/text.rs: -------------------------------------------------------------------------------- 1 | use itertools::Itertools; 2 | use pdf::file::FileOptions; 3 | 4 | fn main() { 5 | let input = std::env::args_os().nth(1).expect("no file given"); 6 | let file = FileOptions::cached().open(&input).expect("can't read PDF"); 7 | let resolver = file.resolver(); 8 | 9 | for (page_nr, page) in file.pages().enumerate() { 10 | let page = page.expect("can't read page"); 11 | let flow = pdf_text::run(&file, &page, &resolver).expect("can't render page"); 12 | println!("# page {}", page_nr + 1); 13 | for run in flow.runs { 14 | for line in run.lines { 15 | println!("{}", line.words.iter().map(|w| &w.text).format(" ")); 16 | } 17 | println!(); 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/entry.rs: -------------------------------------------------------------------------------- 1 | use serde::{Serialize, Deserialize}; 2 | use table::Table; 3 | 4 | use crate::util::{Rect, CellContent}; 5 | 6 | #[derive(Serialize, Deserialize)] 7 | pub struct Word { 8 | pub text: String, 9 | 
pub rect: Rect, 10 | } 11 | #[derive(Serialize, Deserialize)] 12 | pub struct Line { 13 | pub words: Vec, 14 | } 15 | #[derive(Serialize, Deserialize)] 16 | pub struct Run { 17 | pub lines: Vec, 18 | pub kind: RunType, 19 | } 20 | 21 | #[derive(Serialize, Deserialize)] 22 | pub struct Flow { 23 | pub lines: Vec, 24 | pub runs: Vec, 25 | } 26 | #[derive(Serialize, Deserialize)] 27 | pub enum RunType { 28 | ParagraphContinuation, 29 | Paragraph, 30 | Header, 31 | Cell, 32 | } 33 | 34 | impl Flow { 35 | pub fn new() -> Self { 36 | Flow { 37 | lines: vec![], 38 | runs: vec![] 39 | } 40 | } 41 | pub fn add_line(&mut self, words: Vec, kind: RunType) { 42 | if words.len() > 0 { 43 | self.runs.push(Run { 44 | lines: vec![Line { words }], 45 | kind 46 | }); 47 | } 48 | } 49 | pub fn add_table(&mut self, table: Table) { 50 | 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashSet; 2 | 3 | use entry::Flow; 4 | use pdf::{backend::Backend, object::{Page, Resolve}, PdfError}; 5 | use pdf_render::{tracer::{TraceCache, Tracer, DrawItem}, Fill, render_pattern, render_page, FillMode}; 6 | 7 | mod tree; 8 | mod util; 9 | mod text; 10 | pub mod entry; 11 | 12 | pub fn run(file: &pdf::file::CachedFile, page: &Page, resolve: &impl Resolve) -> Result { 13 | let cache = TraceCache::new(); 14 | 15 | let mut clip_paths = vec![]; 16 | let mut tracer = Tracer::new(&cache, &mut clip_paths); 17 | 18 | render_page(&mut tracer, resolve, &page, Default::default())?; 19 | 20 | let bbox = tracer.view_box(); 21 | 22 | let items = tracer.finish(); 23 | let mut patterns = HashSet::new(); 24 | for item in items.iter() { 25 | if let DrawItem::Vector(ref v) = item { 26 | if let Some(FillMode { color: Fill::Pattern(id), .. }) = v.fill { 27 | patterns.insert(id); 28 | } 29 | if let Some((FillMode { color: Fill::Pattern(id), .. 
}, _)) = v.stroke { 30 | patterns.insert(id); 31 | } 32 | } 33 | } 34 | 35 | let mut spans = vec![]; 36 | let mut lines = vec![]; 37 | let mut visit_item = |item| { 38 | match item { 39 | DrawItem::Text(t, _) if bbox.intersects(t.rect) => { 40 | spans.push(t); 41 | } 42 | DrawItem::Vector(path) if bbox.intersects(path.outline.bounds()) => { 43 | for contour in path.outline.contours() { 44 | use pathfinder_content::{outline::ContourIterFlags, segment::SegmentKind}; 45 | for segment in contour.iter(ContourIterFlags::empty()) { 46 | match segment.kind { 47 | SegmentKind::Line => lines.push([ 48 | segment.baseline.from_x(), 49 | segment.baseline.from_y(), 50 | segment.baseline.to_x(), 51 | segment.baseline.to_y() 52 | ]), 53 | _ => {} 54 | } 55 | } 56 | } 57 | 58 | } 59 | _ => {} 60 | } 61 | }; 62 | 63 | for &p in patterns.iter() { 64 | let pattern = match resolve.get(p) { 65 | Ok(p) => p, 66 | Err(e) => { 67 | log::warn!("failed to load pattern: {:?}", e); 68 | continue; 69 | } 70 | }; 71 | let mut pat_tracer = Tracer::new(&cache, &mut clip_paths); 72 | 73 | render_pattern(&mut pat_tracer, &*pattern, resolve)?; 74 | let pat_items = pat_tracer.finish(); 75 | for item in pat_items { 76 | visit_item(item); 77 | } 78 | } 79 | 80 | for item in items { 81 | visit_item(item); 82 | } 83 | 84 | let root = tree::build(&spans, bbox, &lines); 85 | let mut flow = Flow::new(); 86 | tree::items(&mut flow, &spans, &root, bbox.min_x()); 87 | Ok(flow) 88 | } -------------------------------------------------------------------------------- /src/text.rs: -------------------------------------------------------------------------------- 1 | use pathfinder_geometry::vector::Vector2F; 2 | use pdf_render::TextSpan; 3 | use itertools::{Itertools}; 4 | use unicode_normalization::UnicodeNormalization; 5 | use crate::{util::avg, entry::Word, util::Rect}; 6 | 7 | pub fn concat_text<'a>(out: &mut String, items: impl Iterator + Clone) -> Vec { 8 | let mut words = vec![]; 9 | 10 | let gaps = 
items.clone() 11 | .flat_map(|s| { 12 | let tr_inv = s.transform.matrix.inverse(); 13 | let pos = (tr_inv * s.transform.vector).x(); 14 | s.chars.iter() 15 | .filter(|c| !s.text[c.offset..].chars().next().unwrap().is_whitespace()) 16 | // (left edge, right edge, font size) 17 | .map(move |c| (c.pos + pos, c.pos + pos + c.width, s.font_size)) 18 | }) 19 | .tuple_windows() 20 | // skip things that go in reverse 21 | .filter(|(a, b)| b.0 > a.0) 22 | // compute the distance between the right edge of the left char and the left edge of the right char 23 | // and clamp it to a minimum of 0.01 and maximum of half the mean font size 24 | .map(|(a, b)| (b.0 - a.1).max(0.01).min(0.25 * (a.2 + b.2))); 25 | 26 | // compute the average font size of all chars 27 | let font_size = avg(items.clone().map(|s| s.font_size)).unwrap(); 28 | //gaps.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap()); 29 | 30 | // set the threshold at twice the average gap, clamped to half the font size 31 | let space_gap = (0.5 * font_size).min(2.0 * avg(gaps).unwrap_or(0.0)); //2.0 * gaps[gaps.len()/2]; 32 | let mut end = 0.; // trailing edge of the last char 33 | let mut trailing_space = out.chars().last().map(|c| c.is_whitespace()).unwrap_or(true); 34 | let mut word_start_pos = 0.0; 35 | let mut word_start_idx = out.len(); 36 | let mut y_min = f32::INFINITY; 37 | let mut y_max = -f32::INFINITY; 38 | let mut word_start = true; 39 | let mut word_end = 0.0; 40 | 41 | for span in items { 42 | let mut pos = 0; // byte index of last char into span.text 43 | let tr_inv = span.transform.matrix.inverse(); 44 | let x_off = (tr_inv * span.transform.vector).x(); 45 | for c in span.chars.iter() { 46 | 47 | let s = &span.text[pos..c.offset]; 48 | if c.offset > 0 { 49 | let is_whitespace = s.chars().all(|c| c.is_whitespace()); 50 | if !trailing_space || !is_whitespace { 51 | out.extend(s.nfkc()); 52 | } 53 | trailing_space = is_whitespace; 54 | } 55 | if !trailing_space && c.pos + x_off > end + space_gap { 56 | 
words.push(Word { 57 | text: out[word_start_idx..].into(), 58 | rect: Rect { 59 | x: word_start_pos, 60 | y: y_min, 61 | h: y_max - y_min, 62 | w: word_end - word_start_pos 63 | } 64 | }); 65 | 66 | out.push(' '); 67 | trailing_space = true; 68 | word_start = true; 69 | word_start_idx = out.len(); 70 | } 71 | pos = c.offset; 72 | end = c.pos + x_off + c.width; 73 | if c.offset == 0 || !trailing_space { 74 | word_end = (span.transform.matrix * Vector2F::new(end, 0.0)).x(); 75 | } 76 | 77 | if word_start { 78 | y_min = span.rect.min_y(); 79 | y_max = span.rect.max_y(); 80 | word_start_pos = (span.transform.matrix * Vector2F::new(c.pos + x_off, 0.0)).x(); 81 | word_start = false; 82 | } else { 83 | y_min = y_min.min(span.rect.min_y()); 84 | y_max = y_max.max(span.rect.max_y()); 85 | } 86 | } 87 | trailing_space = span.text[pos..].chars().all(|c| c.is_whitespace()); 88 | 89 | out.extend(span.text[pos..].nfkc()); 90 | } 91 | words.push(Word { 92 | text: out[word_start_idx..].into(), 93 | rect: Rect { 94 | x: word_start_pos, 95 | y: y_min, 96 | h: y_max - y_min, 97 | w: word_end - word_start_pos 98 | } 99 | }); 100 | 101 | words 102 | } 103 | -------------------------------------------------------------------------------- /src/tree.rs: -------------------------------------------------------------------------------- 1 | use pdf_render::TextSpan; 2 | use pathfinder_geometry::{ 3 | vector::Vector2F, 4 | rect::RectF 5 | }; 6 | #[cfg(feature="ocr")] 7 | use tesseract_plumbing::Text; 8 | 9 | use std::collections::BTreeSet; 10 | use std::iter::once; 11 | use std::sync::Arc; 12 | use itertools::{Itertools}; 13 | use ordered_float::NotNan; 14 | use crate::entry::{Flow, Line, Run, RunType, Word}; 15 | use crate::util::{is_number, avg, CellContent}; 16 | use crate::text::{concat_text}; 17 | use std::mem::take; 18 | use table::Table; 19 | 20 | pub fn build(spans: &[TextSpan], bbox: RectF, lines: &[[f32; 4]]) -> Node { 21 | if spans.len() == 0 { 22 | return Node::singleton(&[]); 23 | 
} 24 | 25 | let mut boxes: Vec<(RectF, usize)> = spans.iter().enumerate().map(|(i, t)| (t.rect, i)).collect(); 26 | let mut boxes = boxes.as_mut_slice(); 27 | 28 | let avg_font_size = avg(spans.iter().map(|s| s.font_size)).unwrap(); 29 | let probaby_header = |boxes: &[(RectF, usize)]| { 30 | let class = classify(boxes.iter().filter_map(|&(_, i)| spans.get(i))); 31 | if matches!(class, Class::Header | Class::Number) { 32 | return true; 33 | } 34 | let f = avg(boxes.iter().filter_map(|&(_, i)| spans.get(i)).map(|s| s.font_size)).unwrap(); 35 | f > avg_font_size 36 | }; 37 | let probably_footer = |boxes: &mut [(RectF, usize)]| { 38 | sort_x(boxes); 39 | let x_gaps: Vec = gaps(avg_font_size, boxes, |r| (r.min_x(), r.max_x())) 40 | .collect(); 41 | 42 | let count = split_by(boxes, &x_gaps, |r| r.min_x()).filter(|cell| probaby_header(cell)).count(); 43 | count == x_gaps.len() + 1 44 | }; 45 | 46 | sort_y(boxes); 47 | let (top, bottom) = top_bottom_gap(boxes, bbox); 48 | if let Some(bottom) = bottom { 49 | if probably_footer(&mut boxes[bottom..]) { 50 | boxes = &mut boxes[..bottom]; 51 | } 52 | } 53 | if let Some(top) = top { 54 | if probaby_header(&mut boxes[..top]) { 55 | boxes = &mut boxes[top..]; 56 | } 57 | } 58 | sort_x(boxes); 59 | let (left, right) = left_right_gap(boxes, bbox); 60 | if let Some(right) = right { 61 | if probaby_header(&boxes[right..]) { 62 | boxes = &mut boxes[..right]; 63 | } 64 | } 65 | if let Some(left) = left { 66 | if probaby_header(&boxes[..left]) { 67 | boxes = &mut boxes[left..]; 68 | } 69 | } 70 | let lines = analyze_lines(lines); 71 | split(boxes, &spans, &lines) 72 | } 73 | 74 | fn analyze_lines(lines: &[[f32; 4]]) -> Lines { 75 | let mut hlines = BTreeSet::new(); 76 | let mut vlines = BTreeSet::new(); 77 | 78 | for &[x1, y1, x2, y2] in lines { 79 | if x1 == x2 { 80 | vlines.insert(NotNan::new(x1).unwrap()); 81 | } else if y1 == y2 { 82 | hlines.insert(NotNan::new(y1).unwrap()); 83 | } 84 | } 85 | 86 | fn dedup(lines: impl Iterator>) -> 
Vec<(f32, f32)> { 87 | let threshold = 10.0; 88 | let mut out = vec![]; 89 | let mut lines = lines.map(|f| *f).peekable(); 90 | while let Some(start) = lines.next() { 91 | let mut last = start; 92 | while let Some(&p) = lines.peek() { 93 | if last + threshold > p { 94 | last = p; 95 | lines.next(); 96 | } else { 97 | break; 98 | } 99 | } 100 | out.push((start, last)); 101 | } 102 | out 103 | } 104 | 105 | let hlines = dedup(hlines.iter().cloned()); 106 | let vlines = dedup(vlines.iter().cloned()); 107 | 108 | let mut line_grid = vec![false; vlines.len() * hlines.len()]; 109 | for &[x1, y1, x2, y2] in lines { 110 | if x1 == x2 { 111 | let v_idx = vlines.iter().position(|&(a, b)| a <= x1 && x1 <= b).unwrap_or(vlines.len()); 112 | let h_start = hlines.iter().position(|&(a, b)| y1 >= a).unwrap_or(hlines.len()); 113 | let h_end = hlines.iter().position(|&(a, b)| y2 <= b).unwrap_or(hlines.len()); 114 | for h in h_start .. h_end { 115 | line_grid[v_idx * hlines.len() + h] = true; 116 | } 117 | } else if y1 == y2 { 118 | let h_idx = hlines.iter().position(|&(a, b)| a <= y1 && y1 <= b).unwrap_or(hlines.len()); 119 | let v_start = vlines.iter().position(|&(a, b)| x1 >= a).unwrap_or(vlines.len()); 120 | let v_end = vlines.iter().position(|&(a, b)| x2 <= b).unwrap_or(vlines.len()); 121 | for v in v_start .. 
v_end { 122 | line_grid[v * hlines.len() + h_idx] = true; 123 | } 124 | } 125 | } 126 | 127 | 128 | //println!("hlines: {:?}", hlines); 129 | //println!("vlines: {:?}", vlines); 130 | 131 | Lines { hlines, vlines, line_grid } 132 | } 133 | 134 | pub struct Lines { 135 | hlines: Vec<(f32, f32)>, 136 | vlines: Vec<(f32, f32)>, 137 | line_grid: Vec, 138 | } 139 | 140 | #[derive(Copy, Clone, Debug)] 141 | struct Span { 142 | start: NotNan, 143 | end: NotNan, 144 | } 145 | impl Span { 146 | fn horiz(rect: &RectF) -> Option { 147 | Self::new(rect.min_x(), rect.max_x()) 148 | } 149 | fn vert(rect: &RectF) -> Option { 150 | Self::new(rect.min_y(), rect.max_y()) 151 | } 152 | fn new(mut start: f32, mut end: f32) -> Option { 153 | if start > end { 154 | std::mem::swap(&mut start, &mut end); 155 | } 156 | Some(Span { 157 | start: NotNan::new(start).ok()?, 158 | end: NotNan::new(end).ok()?, 159 | }) 160 | } 161 | fn intersect(self, other: Span) -> Option { 162 | if self.start <= other.end && other.start <= self.end { 163 | Some(Span { 164 | start: self.start.max(other.start), 165 | end: self.end.min(other.end), 166 | }) 167 | } else { 168 | None 169 | } 170 | } 171 | fn union(self, other: Span) -> Option { 172 | if self.start <= other.end && other.start <= self.end { 173 | Some(Span { 174 | start: self.start.min(other.start), 175 | end: self.end.max(other.end) 176 | }) 177 | } else { 178 | None 179 | } 180 | } 181 | } 182 | 183 | pub fn split2(boxes: &mut [(RectF, usize)], spans: &[TextSpan], lines_info: &Lines) -> Node { 184 | use std::mem::replace; 185 | 186 | #[derive(Debug)] 187 | enum LineTag { 188 | Unknown, 189 | Text, 190 | Table, 191 | } 192 | 193 | sort_y(boxes); 194 | let mut lines = vec![]; 195 | let mut y = Span::vert(&boxes[0].0).unwrap(); 196 | let mut items = vec![boxes[0]]; 197 | 198 | let build_line = |boxes: &[(RectF, usize)]| -> (LineTag, Span, Vec<(Span, Vec)>) { 199 | let mut line = vec![]; 200 | let mut x = Span::horiz(&boxes[0].0).unwrap(); 201 | let 
mut y = Span::vert(&boxes[0].0).unwrap(); 202 | let mut items = vec![boxes[0].1]; 203 | 204 | for &(rect, i) in &boxes[1..] { 205 | y = y.union(Span::vert(&rect).unwrap()).unwrap(); 206 | let x2 = Span::horiz(&rect).unwrap(); 207 | if let Some(u) = x.union(x2) { 208 | x = u; 209 | items.push(i); 210 | } else { 211 | line.push((x, replace(&mut items, vec![i]))); 212 | x = x2; 213 | } 214 | } 215 | line.push((x, items)); 216 | 217 | let f = avg(boxes.iter().filter_map(|&(_, i)| spans.get(i)).map(|s| s.font_size)).unwrap(); 218 | 219 | let max_gap = line.iter().tuple_windows().map(|(l, r)| r.0.start - l.0.end).max(); 220 | let tag = match max_gap { 221 | None => LineTag::Unknown, 222 | Some(x) if x.into_inner() < 0.3 * f => LineTag::Text, 223 | Some(_) => LineTag::Table, 224 | }; 225 | 226 | (tag, y, line) 227 | }; 228 | 229 | let mut line = vec![boxes[0]]; 230 | for &(rect, i) in &boxes[1..] { 231 | let y2 = Span::vert(&rect).unwrap(); 232 | if let Some(overlap) = y.intersect(y2) { 233 | y = overlap; 234 | } else { 235 | sort_x(&mut line); 236 | lines.push(build_line(&line)); 237 | line.clear(); 238 | y = y2 239 | } 240 | line.push((rect, i)); 241 | } 242 | sort_x(&mut line); 243 | lines.push(build_line(&line)); 244 | 245 | 246 | let mut vparts = vec![]; 247 | let mut start = 0; 248 | while let Some(p) = lines[start..].iter().position(|(tag, _, line)| matches!(tag, LineTag::Unknown | LineTag::Table)) { 249 | let table_start = start + p; 250 | let table_end = lines[table_start+1..].iter().position(|(tag, _, _)| matches!(tag, LineTag::Text)).map(|e| table_start+1+e).unwrap_or(lines.len()); 251 | 252 | for &(_, y, ref line) in &lines[start..table_start] { 253 | vparts.push((y, Node::Final { indices: line.iter().flat_map(|(_, indices)| indices.iter().cloned()).collect() })); 254 | } 255 | 256 | let lines = &lines[table_start..table_end]; 257 | start = table_end; 258 | 259 | let mut columns: Vec = vec![]; 260 | for (_, _, line) in lines.iter() { 261 | for &(x, ref parts) 
in line.iter() { 262 | // find any column that is contained in this 263 | let mut found = 0; 264 | for span in columns.iter_mut() { 265 | if let Some(overlap) = span.intersect(x) { 266 | *span = overlap; 267 | found += 1; 268 | } 269 | } 270 | if found == 0 { 271 | columns.push(x); 272 | } 273 | } 274 | } 275 | let avg_vgap = avg(lines.iter().map(|(_, y, _)| y).tuple_windows().map(|(a, b)| *(b.start - a.end))); 276 | 277 | columns.sort_by_key(|s| s.start); 278 | 279 | let mut buf = String::new(); 280 | 281 | let d_threshold = avg_vgap.unwrap_or(0.0); 282 | let mut prev_end = None; 283 | 284 | let mut table: Table> = Table::empty(lines.len() as u32, columns.len() as u32); 285 | 286 | let mut row = 0; 287 | for (_, span, line) in lines { 288 | let mut col = 0; 289 | 290 | let combine = prev_end.map(|y: NotNan| { 291 | if *(span.start - y) < d_threshold { 292 | !lines_info.hlines.iter().map(|(a, b)| 0.5 * (a+b)).any(|l| *y < l && *span.start > l) 293 | } else { 294 | false 295 | } 296 | }).unwrap_or(false); 297 | 298 | if !combine { 299 | row += 1; 300 | } 301 | 302 | for &(x, ref parts) in line { 303 | let mut cols = columns.iter().enumerate() 304 | .filter(|&(_, &x2)| x.intersect(x2).is_some()) 305 | .map(|(i, _)| i); 306 | 307 | let first_col = cols.next().unwrap(); 308 | let last_col = cols.last().unwrap_or(first_col); 309 | 310 | if let Some(cell) = combine.then(|| table.get_cell_value_mut(row, first_col as u32)).flatten() { 311 | // append to previous line 312 | cell.extend_from_slice(parts); 313 | } else { 314 | let colspan = (last_col - first_col) as u32 + 1; 315 | let rowspan = 1; 316 | table.set_cell(parts.clone(), row, first_col as u32, rowspan, colspan); 317 | } 318 | col = last_col + 1; 319 | } 320 | prev_end = Some(span.end); 321 | } 322 | let y = Span { start: lines[0].1.start, end: lines.last().unwrap().1.end }; 323 | vparts.push((y, Node::Table { table })); 324 | } 325 | for &(_, y, ref line) in &lines[start..] 
{ 326 | vparts.push((y, Node::Final { indices: line.iter().flat_map(|(_, indices)| indices.iter().cloned()).collect() })); 327 | } 328 | 329 | if vparts.len() > 1 { 330 | let y = vparts.iter().tuple_windows().map(|(a, b)| 0.5 * (a.0.end + b.0.start).into_inner()).collect(); 331 | Node::Grid { 332 | tag: NodeTag::Complex, 333 | x: vec![], 334 | y, 335 | cells: vparts.into_iter().map(|(_, n)| n).collect() 336 | } 337 | } else { 338 | vparts.pop().unwrap().1 339 | } 340 | } 341 | 342 | #[derive(Debug)] 343 | pub enum Node { 344 | Final { indices: Vec }, 345 | Grid { x: Vec, y: Vec, cells: Vec, tag: NodeTag }, 346 | Table { table: Table> }, 347 | } 348 | impl Node { 349 | fn tag(&self) -> NodeTag { 350 | match *self { 351 | Node::Grid { tag, .. } => tag, 352 | Node::Table { .. } => NodeTag::Complex, 353 | Node::Final { .. } => NodeTag::Singleton, 354 | } 355 | } 356 | fn indices(&self, out: &mut Vec) { 357 | match *self { 358 | Node::Final { ref indices } => out.extend_from_slice(&indices), 359 | Node::Grid { ref cells, .. 
} => { 360 | for n in cells { 361 | n.indices(out); 362 | } 363 | } 364 | Node::Table { ref table } => { 365 | out.extend( 366 | table.values() 367 | .flat_map(|v| v.value.iter()) 368 | .cloned() 369 | ); 370 | } 371 | } 372 | } 373 | fn singleton(nodes: &[(RectF, usize)]) -> Self { 374 | Node::Final { indices: nodes.iter().map(|t| t.1).collect() } 375 | } 376 | } 377 | 378 | #[derive(PartialOrd, Ord, Eq, PartialEq, Clone, Copy, Debug)] 379 | pub enum NodeTag { 380 | Singleton, 381 | Line, 382 | Paragraph, 383 | Complex, 384 | } 385 | 386 | pub fn items(mut flow: &mut Flow, spans: &[TextSpan], node: &Node, x_anchor: f32) { 387 | match *node { 388 | Node::Final { ref indices } => { 389 | if indices.len() > 0 { 390 | let node_spans = indices.iter().flat_map(|&i| spans.get(i)); 391 | let bbox = node_spans.clone().map(|s| s.rect).reduce(|a, b| a.union_rect(b)).unwrap(); 392 | let class = classify(node_spans.clone()); 393 | let mut text = String::new(); 394 | let words = concat_text(&mut text, node_spans); 395 | 396 | let t = match class { 397 | Class::Header => RunType::Header, 398 | _ => RunType::Paragraph, 399 | }; 400 | flow.add_line(words, t); 401 | } 402 | } 403 | Node::Grid { ref x, ref y, ref cells, tag } => { 404 | match tag { 405 | NodeTag::Singleton | 406 | NodeTag::Line => { 407 | let mut indices = vec![]; 408 | node.indices(&mut indices); 409 | let line_spans = indices.iter().flat_map(|&i| spans.get(i)); 410 | let bbox: RectF = line_spans.clone().map(|s| s.rect).reduce(|a, b| a.union_rect(b)).unwrap().into(); 411 | 412 | let mut text = String::new(); 413 | let words = concat_text(&mut text, line_spans.clone()); 414 | let class = classify(line_spans.clone()); 415 | 416 | let t = match class { 417 | Class::Header => RunType::Header, 418 | _ => RunType::Paragraph, 419 | }; 420 | flow.add_line(words, t); 421 | } 422 | NodeTag::Paragraph => { 423 | assert_eq!(x.len(), 0); 424 | let mut lines: Vec<(RectF, usize)> = vec![]; 425 | let mut indices = vec![]; 426 | 
for n in cells { 427 | let start = indices.len(); 428 | n.indices(&mut indices); 429 | if indices.len() > start { 430 | let cell_spans = indices[start..].iter().flat_map(|&i| spans.get(i)); 431 | let bbox = cell_spans.map(|s| s.rect).reduce(|a, b| a.union_rect(b)).unwrap().into(); 432 | lines.push((bbox, indices.len())); 433 | } 434 | } 435 | 436 | let para_spans = indices.iter().flat_map(|&i| spans.get(i)); 437 | let class = classify(para_spans.clone()); 438 | let bbox = lines.iter().map(|t| t.0).reduce(|a, b| a.union_rect(b)).unwrap(); 439 | let line_height = avg(para_spans.map(|s| s.rect.height())).unwrap(); 440 | // classify the lines by this vertical line 441 | let left_margin = bbox.min_x() + 0.5 * line_height; 442 | 443 | // count how many are right and left of the split. 444 | let mut left = 0; 445 | let mut right = 0; 446 | 447 | for (line_bbox, _) in lines.iter() { 448 | if line_bbox.min_x() >= left_margin { 449 | right += 1; 450 | } else { 451 | left += 1; 452 | } 453 | } 454 | 455 | // typically paragraphs are indented to the right and longer than 2 lines. 456 | // then there will be a higher left count than right count. 
457 | let indent = left > right; 458 | 459 | let mut para_start = 0; 460 | let mut line_start = 0; 461 | let mut text = String::new(); 462 | let mut para_bbox = RectF::default(); 463 | let mut flow_lines = vec![]; 464 | for &(line_bbox, end) in lines.iter() { 465 | if line_start != 0 { 466 | // if a line is indented (or outdented), it marks a new paragraph 467 | if (line_bbox.min_x() >= left_margin) == indent { 468 | flow.runs.push(Run { 469 | lines: take(&mut flow_lines), 470 | kind: match class { 471 | Class::Header => RunType::Header, 472 | _ => RunType::Paragraph 473 | } 474 | }); 475 | para_start = line_start; 476 | } else { 477 | text.push('\n'); 478 | } 479 | } 480 | if end > line_start { 481 | let words = concat_text(&mut text, indices[line_start..end].iter().flat_map(|&i| spans.get(i))); 482 | 483 | if words.len() > 0 { 484 | flow_lines.push(Line { words }); 485 | } 486 | } 487 | if para_start == line_start { 488 | para_bbox = line_bbox; 489 | } else { 490 | para_bbox = para_bbox.union_rect(line_bbox); 491 | } 492 | line_start = end; 493 | } 494 | 495 | flow.runs.push(Run { 496 | lines: flow_lines, 497 | kind: match class { 498 | Class::Header => RunType::Header, 499 | _ => RunType::Paragraph 500 | } 501 | }); 502 | } 503 | NodeTag::Complex => { 504 | let x_anchors = once(x_anchor).chain(x.iter().cloned()).cycle(); 505 | for (node, x) in cells.iter().zip(x_anchors) { 506 | items(flow, spans, node, x); 507 | } 508 | } 509 | } 510 | } 511 | Node::Table { ref table } => { 512 | if let Some(bbox) = table.values() 513 | .flat_map(|v| v.value.iter().flat_map(|&i| spans.get(i).map(|s| s.rect))) 514 | .reduce(|a, b| a.union_rect(b)) { 515 | let table = table.flat_map(|indices| { 516 | if indices.len() == 0 { 517 | None 518 | } else { 519 | let line_spans = indices.iter().flat_map(|&i| spans.get(i)); 520 | let bbox: RectF = line_spans.clone().map(|s| s.rect).reduce(|a, b| a.union_rect(b)).unwrap().into(); 521 | 522 | let mut text = String::new(); 523 | 
concat_text(&mut text, line_spans.clone()); 524 | Some(CellContent { 525 | text, 526 | rect: bbox.into(), 527 | }) 528 | } 529 | }); 530 | flow.add_table(table); 531 | } 532 | } 533 | } 534 | } 535 | 536 | 537 | pub fn render(w: &mut String, spans: &[TextSpan], node: &Node, bbox: RectF) { 538 | _render(w, spans, node, bbox, 0) 539 | } 540 | fn _render(w: &mut String, spans: &[TextSpan], node: &Node, bbox: RectF, level: usize) { 541 | use std::fmt::Write; 542 | 543 | match *node { 544 | Node::Final { ref indices } => { 545 | /* 546 | for i in start..end { 547 | if let Span::Text(ref t) = spans[i] { 548 | write!(w, r#"").unwrap(); 553 | } 554 | } 555 | */ 556 | 557 | if indices.len() > 0 { 558 | let class = classify(indices.iter().cloned().filter_map(|i| spans.get(i))); 559 | 560 | for &i in indices.iter() { 561 | let r = spans[i].rect; 562 | write!(w, r#""#, 563 | r.min_x(), r.max_x(), r.max_y(), r.max_y(), 564 | class 565 | ); 566 | } 567 | } 568 | } 569 | Node::Grid { ref x, ref y, ref cells, tag } => { 570 | use std::iter::once; 571 | let columns = x.len() + 1; 572 | write!(w, r#""#, 573 | bbox.min_x(), bbox.min_y(), bbox.width(), bbox.height(), tag 574 | ); 575 | 576 | for (j, ((min_y, max_y), row)) in once(bbox.min_y()).chain(y.iter().cloned()).chain(once(bbox.max_y())).tuple_windows().zip(cells.chunks_exact(columns)).enumerate() { 577 | if j > 0 { 578 | writeln!(w, r#""#, 579 | bbox.min_x(), bbox.max_x(), min_y, min_y); 580 | } 581 | 582 | for (i, ((min_x, max_x), cell)) in once(bbox.min_x()).chain(x.iter().cloned()).chain(once(bbox.max_x())).tuple_windows().zip(row).enumerate() { 583 | if i > 0 { 584 | writeln!(w, r#""#, 585 | min_x, min_x, bbox.min_y(), bbox.max_y()); 586 | } 587 | 588 | let bbox = RectF::from_points(Vector2F::new(min_x, min_y), Vector2F::new(max_x, max_y)); 589 | _render(w, spans, cell, bbox, level+1); 590 | } 591 | } 592 | } 593 | Node::Table { .. 
} => {

        }
    }
}

/// Recursively partitions `boxes` into a layout tree by cutting at whitespace gaps.
///
/// The widest horizontal and vertical gap between boxes is measured first. Every
/// gap that is at least half as wide as the larger of the two (but no less than
/// `1.0` horizontally and `0.1` vertically) becomes a cut position.
///
/// * Fewer than two boxes, or no gap at all on either axis: a singleton node.
/// * No gap reaches the threshold: the boxes are grouped by [`overlapping_lines`].
/// * More than one cut on both axes: the work is delegated to `split2`.
/// * Otherwise the boxes are cut into rows (and each row into cells), and every
///   piece is split again recursively.
///
/// `boxes` is reordered in place. `spans` and `lines` are only forwarded to the
/// recursive calls and to `split2`.
fn split(boxes: &mut [(RectF, usize)], spans: &[TextSpan], lines: &Lines) -> Node {
    let num_boxes = boxes.len();
    if num_boxes < 2 {
        return Node::singleton(boxes);
    }

    // Each gap scan walks the slice in order, so sort along the axis being measured.
    sort_x(boxes);
    let max_x_gap = dist_x(boxes);
    sort_y(boxes);
    let max_y_gap = dist_y(boxes);

    // Weight of vertical gaps relative to horizontal ones.
    let x_y_ratio = 1.0;

    let max_gap = match (max_x_gap, max_y_gap) {
        (Some((x, _)), Some((y, _))) => x.max(y * x_y_ratio),
        (Some((x, _)), None) => x,
        (None, Some((y, _))) => y * x_y_ratio,
        (None, None) => {
            sort_x(boxes);
            return Node::singleton(boxes);
        }
    };
    let x_threshold = (max_gap * 0.5).max(1.0);
    let y_threshold = (max_gap * 0.5 / x_y_ratio).max(0.1);

    // `boxes` is still sorted by y at this point.
    let y_gaps: Vec<f32> = gaps(y_threshold, boxes, |r| (r.min_y(), r.max_y())).collect();

    sort_x(boxes);
    let x_gaps: Vec<f32> = gaps(x_threshold, boxes, |r| (r.min_x(), r.max_x())).collect();

    if x_gaps.is_empty() && y_gaps.is_empty() {
        return overlapping_lines(boxes);
    }

    if x_gaps.len() > 1 && y_gaps.len() > 1 {
        return split2(boxes, spans, lines);
    }

    let mut cells = vec![];
    sort_y(boxes);
    for row in split_by(boxes, &y_gaps, |r| r.min_y()) {
        if x_gaps.is_empty() {
            cells.push(split(row, spans, lines));
        } else {
            sort_x(row);
            for cell in split_by(row, &x_gaps, |r| r.min_x()) {
                sort_y(cell);
                // Every cell must be strictly smaller, otherwise the recursion would not end.
                assert!(cell.len() < num_boxes);
                cells.push(split(cell, spans, lines));
            }
        }
    }

    assert!(!x_gaps.is_empty() || !y_gaps.is_empty());

    // Only vertical cuts between line-like cells give a `Line`; only horizontal cuts
    // between line-like cells give a `Paragraph`; everything else is `Complex`.
    let only_lines = || cells.iter().all(|n| n.tag() <= NodeTag::Line);
    let tag = match (y_gaps.is_empty(), x_gaps.is_empty()) {
        (true, _) if only_lines() => NodeTag::Line,
        (false, true) if only_lines() => NodeTag::Paragraph,
        _ => NodeTag::Complex,
    };

    Node::Grid {
        x: x_gaps,
        y: y_gaps,
        cells,
        tag,
    }
}

/// Vertical-only variant of [`split`]: cuts `boxes` into rows at every vertical gap
/// that is at least 80% of the widest one and recurses into each row.
///
/// The gap scan walks the slice in order and this function does not sort it first,
/// so the caller presumably has to pass boxes sorted by `min_y` (TODO confirm).
// NOTE(review): only the recursive self-call is visible in this part of the file,
// hence the `dead_code` allowance; confirm whether the function is still needed.
#[allow(dead_code)]
fn split_v(boxes: &mut [(RectF, usize)]) -> Node {
    let num_boxes = boxes.len();
    if num_boxes < 2 {
        return Node::singleton(boxes);
    }

    let max_gap = match dist_y(boxes) {
        Some((y, _)) => y,
        None => {
            sort_x(boxes);
            return Node::singleton(boxes);
        }
    };
    let threshold = max_gap * 0.8;

    let y_gaps: Vec<f32> = gaps(threshold, boxes, |r| (r.min_y(), r.max_y())).collect();

    let mut cells = vec![];
    for row in split_by(boxes, &y_gaps, |r| r.min_y()) {
        // Every row must be strictly smaller, otherwise the recursion would not end.
        assert!(row.len() < num_boxes);
        cells.push(split_v(row));
    }

    let tag = if cells.iter().all(|n| n.tag() <= NodeTag::Line) {
        NodeTag::Paragraph
    } else {
        NodeTag::Complex
    };

    Node::Grid {
        x: vec![],
        y: y_gaps,
        cells,
        tag,
    }
}

/// Shared implementation of [`top_bottom_gap`] and [`left_right_gap`].
///
/// `span` projects a rectangle onto the axis of interest; `start` and `extent`
/// describe the bounding box along that axis. The first gap counts as the "near"
/// gap when it begins within the first 20% of the extent; the "far" gap must begin
/// beyond 80% of it. Each reported value is the index (into `boxes`) of the first
/// box after the gap.
// NOTE(review): when the first gap is not a near gap, only that same first gap is
// tested against the far limit, so a far gap that follows a mid-range gap is not
// reported. This mirrors the previous behaviour; confirm whether it is intended.
fn edge_gaps<'a>(
    boxes: &'a [(RectF, usize)],
    span: impl Fn(&RectF) -> (f32, f32) + 'a,
    start: f32,
    extent: f32,
) -> (Option<usize>, Option<usize>) {
    if boxes.len() < 2 {
        return (None, None);
    }

    let near_limit = start + extent * 0.2;
    let far_limit = start + extent * 0.8;

    let mut gaps = gap_list(boxes, span);
    match gaps.next() {
        Some((pos, _, near)) if pos < near_limit => {
            let far = gaps
                .last()
                .filter(|&(pos, _, _)| pos > far_limit)
                .map(|(_, _, idx)| idx);
            (Some(near), far)
        }
        Some((pos, _, far)) if pos > far_limit => (None, Some(far)),
        _ => (None, None),
    }
}

/// Looks for a vertical gap near the top and near the bottom of `bbox`.
///
/// Returns `(top, bottom)`, each being the index of the first box after the
/// respective gap, or `None` if no such gap was found. Fewer than two boxes always
/// yield `(None, None)`.
///
/// The scan walks `boxes` in slice order, so they are presumably expected to be
/// sorted by `min_y` (TODO confirm against the callers). The slice is not modified.
fn top_bottom_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option<usize>, Option<usize>) {
    edge_gaps(boxes, |r| (r.min_y(), r.max_y()), bbox.min_y(), bbox.height())
}

/// Looks for a horizontal gap near the left and near the right edge of `bbox`.
///
/// Returns `(left, right)`, each being the index of the first box after the
/// respective gap, or `None` if no such gap was found. Fewer than two boxes always
/// yield `(None, None)`.
///
/// The scan walks `boxes` in slice order, so they are presumably expected to be
/// sorted by `min_x` (TODO confirm against the callers). The slice is not modified.
fn left_right_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option<usize>, Option<usize>) {
    edge_gaps(boxes, |r| (r.min_x(), r.max_x()), bbox.min_x(), bbox.width())
}

/// Sorts `boxes` by their left edge.
///
/// Uses a total order on `f32`, so a NaN coordinate no longer causes a panic.
fn sort_x(boxes: &mut [(RectF, usize)]) {
    boxes.sort_unstable_by(|a, b| a.0.min_x().total_cmp(&b.0.min_x()));
}

/// Sorts `boxes` by their top edge.
///
/// Uses a total order on `f32`, so a NaN coordinate no longer causes a panic.
fn sort_y(boxes: &mut [(RectF, usize)]) {
    boxes.sort_unstable_by(|a, b| a.0.min_y().total_cmp(&b.0.min_y()));
}

/// Groups boxes that are not separated by clear gaps into lines.
///
/// The boxes are sorted by `min_y` and scanned in that order. A new line starts
/// whenever a box's vertical centre lies more than half the average box height
/// below the centre of the box that started the current line. Each line is then
/// sorted by `min_x`.
///
/// Returns the single line directly if only one was found, otherwise a
/// `Paragraph` grid whose `y` entries are the bottom edges of all lines but the
/// last. An empty slice yields an empty singleton instead of panicking.
fn overlapping_lines(boxes: &mut [(RectF, usize)]) -> Node {
    sort_y(boxes);
    let avg_height = match avg(boxes.iter().map(|(r, _)| r.height())) {
        Some(height) => height,
        // `avg` only returns `None` for an empty input.
        None => return Node::singleton(&[]),
    };

    let mut y_center = boxes[0].0.center().y();
    let mut lines = vec![];
    let mut y_splits = vec![];

    let mut start = 0;
    loop {
        let limit = 0.5 * avg_height + y_center;
        let next_line = boxes[start..]
            .iter()
            .position(|(r, _)| r.center().y() > limit);
        let end = match next_line {
            Some(offset) => start + offset,
            None => break,
        };

        // The box at `end` starts the next line and is not touched by the sort below.
        y_center = boxes[end].0.center().y();

        let line = &mut boxes[start..end];
        sort_x(line);
        let bbox = line
            .iter()
            .map(|&(r, _)| r)
            .reduce(|a, b| a.union_rect(b))
            .unwrap();

        y_splits.push(bbox.max_y());
        lines.push(Node::singleton(line));

        start = end;
    }

    // Whatever remains forms the last line.
    sort_x(&mut boxes[start..]);
    lines.push(Node::singleton(&boxes[start..]));

    match lines.len() {
        1 => lines.pop().unwrap(),
        _ => Node::Grid {
            x: vec![],
            y: y_splits,
            cells: lines,
            tag: NodeTag::Paragraph,
        },
    }
}

/// Lists all gaps between consecutive boxes along one axis.
///
/// `span` maps a rectangle to its `(min, max)` extent on that axis. The boxes are
/// visited in slice order while tracking the largest `max` seen so far; whenever a
/// box starts beyond that value, `(gap_start, gap_end, index)` is yielded, where
/// `index` is the position of that box in `boxes`.
///
/// An empty slice yields no gaps.
fn gap_list<'a>(
    boxes: &'a [(RectF, usize)],
    span: impl Fn(&RectF) -> (f32, f32) + 'a,
) -> impl Iterator<Item = (f32, f32, usize)> + 'a {
    let mut rest = boxes.iter();
    // The fallback is never observed: with no first box there are no further boxes either.
    let mut last_max = rest.next().map_or(f32::NEG_INFINITY, |(r, _)| span(r).1);
    rest.enumerate().filter_map(move |(idx, (r, _))| {
        let (min, max) = span(r);
        // `idx` counts from the second box, hence the `+ 1`.
        let gap = (min > last_max).then_some((last_max, min, idx + 1));
        last_max = max.max(last_max);
        gap
    })
}

/// Yields the midpoint of every gap along one axis that is at least `threshold` wide.
///
/// `span` maps a rectangle to its `(min, max)` extent on that axis. The boxes are
/// visited in slice order while tracking the largest `max` seen so far.
///
/// An empty slice yields no gaps.
fn gaps<'a>(
    threshold: f32,
    boxes: &'a [(RectF, usize)],
    span: impl Fn(&RectF) -> (f32, f32) + 'a,
) -> impl Iterator<Item = f32> + 'a {
    let mut rest = boxes.iter();
    // The fallback is never observed: with no first box there are no further boxes either.
    let mut last_max = rest.next().map_or(f32::NEG_INFINITY, |(r, _)| span(r).1);
    rest.filter_map(move |(r, _)| {
        let (min, max) = span(r);
        let gap = (min - last_max >= threshold).then_some(0.5 * (last_max + min));
        last_max = max.max(last_max);
        gap
    })
}

/// Finds the widest gap reported by [`gap_list`].
///
/// Returns `(width, midpoint)`, or `None` if there is no gap at all.
fn max_gap(boxes: &[(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32)) -> Option<(f32, f32)> {
    gap_list(boxes, span)
        .max_by_key(|&(a, b, _)| {
            // `gap_list` only yields gaps with `b > a`, so the difference is never NaN.
            NotNan::new(b - a).expect("width of a non-empty gap is never NaN")
        })
        .map(|(a, b, _)| (b - a, 0.5 * (a + b)))
}

/// Widest horizontal gap as `(width, midpoint)`; see [`max_gap`].
fn dist_x(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> {
    max_gap(boxes, |r| (r.min_x(), r.max_x()))
}

/// Widest vertical gap as `(width, midpoint)`; see [`max_gap`].
fn dist_y(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> {
    max_gap(boxes, |r| (r.min_y(), r.max_y()))
}

/// Cuts `list` into consecutive sub-slices at the positions given in `at`.
///
/// `by` maps a rectangle to the coordinate that is compared against the cut
/// positions. For every position, the next sub-slice ends right before the first
/// remaining box whose coordinate is greater than it. One more sub-slice with all
/// remaining boxes follows at the end, so `at.len() + 1` slices are produced.
fn split_by<'a>(
    list: &'a mut [(RectF, usize)],
    at: &'a [f32],
    by: impl Fn(&RectF) -> f32,
) -> impl Iterator<Item = &'a mut [(RectF, usize)]> {
    SplitBy {
        data: list,
        points: at.iter().copied(),
        by,
        end: false,
    }
}

/// Iterator state behind [`split_by`].
struct SplitBy<'a, I, F> {
    /// Boxes that have not been handed out yet.
    data: &'a mut [(RectF, usize)],
    /// Remaining cut positions.
    points: I,
    /// Coordinate of a box that is compared against the cut positions.
    by: F,
    /// Set once the final remainder has been handed out.
    end: bool,
}

impl<'a, I, F> Iterator for SplitBy<'a, I, F>
where
    I: Iterator<Item = f32>,
    F: Fn(&RectF) -> f32,
{
    type Item = &'a mut [(RectF, usize)];

    fn next(&mut self) -> Option<Self::Item> {
        if self.end {
            return None;
        }
        match self.points.next() {
            Some(p) => {
                let idx = self
                    .data
                    .iter()
                    .position(|(r, _)| (self.by)(r) > p)
                    .unwrap_or(self.data.len());
                // Move the slice out of `self` so both halves keep the full lifetime `'a`.
                let (head, tail) = take(&mut self.data).split_at_mut(idx);
                self.data = tail;
                Some(head)
            }
            None => {
                // No cut positions left: hand out the remainder exactly once.
                self.end = true;
                Some(take(&mut self.data))
            }
        }
    }
}

use super::util::Tri;

/// Coarse classification of a group of text spans, as produced by [`classify`].
#[derive(Copy, Clone, Debug, PartialEq)]
enum Class {
    Number,
    Header,
    Paragraph,
    Mixed,
}

/// Counts how often a boolean property was observed as `true` and as `false`.
#[derive(Debug, Default)]
struct TriCount {
    tru: usize,
    fal: usize,
}

impl TriCount {
    /// Creates a counter with no observations.
    fn new() -> Self {
        Self::default()
    }

    /// Records one observation.
    fn add(&mut self, b: bool) {
        if b {
            self.tru += 1;
        } else {
            self.fal += 1;
        }
    }

    /// Summarises the observations: `Unknown` if there were none, `True` or `False`
    /// if they all agree, otherwise `Maybe` with the fraction that was `true`.
    fn count(&self) -> Tri {
        match (self.fal, self.tru) {
            (0, 0) => Tri::Unknown,
            (0, _) => Tri::True,
            (_, 0) => Tri::False,
            (f, t) => Tri::Maybe(t as f32 / (t + f) as f32),
        }
    }
}

/// Classifies a group of spans by three properties: whether every span's text is a
/// number, whether the font names contain `"Bold"`, and whether all spans share the
/// same font object as the first span that has one.
///
/// * all numeric and a uniform font: `Number`
/// * all bold and a uniform font: `Header`
/// * no bold at all, or a mix of bold and non-bold: `Paragraph`
/// * anything else: `Mixed`
fn classify<'a>(spans: impl Iterator<Item = &'a TextSpan>) -> Class {
    use pdf_render::FontEntry;

    let mut bold = TriCount::new();
    let mut numeric = TriCount::new();
    let mut uniform = TriCount::new();
    // Address of the first font seen; later fonts are compared by identity, not by value.
    let mut first_font: Option<*const FontEntry> = None;

    for s in spans {
        numeric.add(is_number(&s.text));
        if let Some(ref font) = s.font {
            bold.add(font.name.contains("Bold"));
            let font_ptr = Arc::as_ptr(font);
            match first_font {
                None => first_font = Some(font_ptr),
                Some(first) => uniform.add(font_ptr == first),
            }
        }
    }
    // Counts the first font as matching itself, so `uniform` is never `Unknown`.
    uniform.add(true);

    match (numeric.count(), bold.count(), uniform.count()) {
        (Tri::True, _, Tri::True) => Class::Number,
        (_, Tri::True, Tri::True) => Class::Header,
        (_, Tri::False | Tri::Maybe(_), _) => Class::Paragraph,
        _ => Class::Mixed,
    }
}

--------------------------------------------------------------------------------
/src/util.rs:
--------------------------------------------------------------------------------
use pathfinder_geometry::rect::RectF;
use serde::{Serialize, Deserialize};


/// Returns `true` if `s` is non-empty and consists only of the ASCII digits `0`-`9`.
pub fn is_number(s: &str) -> bool {
    !s.is_empty() && s.chars().all(|c| c.is_ascii_digit())
}

/// Arithmetic mean of the values produced by `iter`, or `None` if it is empty.
pub fn avg(iter: impl Iterator<Item = f32>) -> Option<f32> {
    let (sum, count) = iter.fold((0.0_f32, 0_usize), |(sum, count), value| {
        (sum + value, count + 1)
    });
    if count > 0 {
        Some(sum / count as f32)
    } else {
        None
    }
}

/// Outcome of tallying a boolean property over several items.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum Tri {
    /// The property was false for every item.
    False,
    /// The property was true for every item.
    True,
    /// The property was true for the given fraction of the items.
    Maybe(f32),
    /// No item was observed.
    Unknown,
}

/// Plain, serialisable rectangle given by its origin and size.
#[derive(Copy, Clone, Debug, Serialize, Deserialize)]
#[repr(C)]
pub struct Rect {
    pub x: f32,
    pub y: f32,
    pub w: f32,
    pub h: f32
}

impl From<RectF> for Rect {
    /// Copies origin and size of `r`.
    fn from(r: RectF) -> Self {
        Rect {
            x: r.origin_x(),
            y: r.origin_y(),
            w: r.width(),
            h: r.height()
        }
    }
}

/// Text of a cell together with the rectangle it is associated with.
#[derive(Clone, Debug, Serialize)]
pub struct CellContent {
    pub text: String,
    pub rect: Rect,
}
--------------------------------------------------------------------------------