8 | {
9 | public T Instance { get; set; }
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/src/PdfTextReader/Execution/PipelineInputCache.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 |
6 | namespace PdfTextReader.Execution
7 | {
/// <summary>
/// Caches one <typeparamref name="TD"/> value per page, keeping a separate
/// page list for every key type passed to the generic methods.
/// </summary>
/// <typeparam name="TD">Type of the cached per-page value.</typeparam>
// NOTE(review): the type parameters (TD on the class, T on the methods) are
// restored from how TD and typeof(T) are used in the body - confirm against callers.
class PipelineInputCache<TD> where TD : class
{
    // One fixed-size page list per key type, created lazily by GetCache.
    readonly Dictionary<Type, Document> _dictDocuments = new Dictionary<Type, Document>();

    // -1 means SetSize has not been called yet.
    int _numberOfPages = -1;

    /// <summary>Sets the number of slots allocated for page lists created afterwards.</summary>
    /// <param name="size">Number of slots; must be greater than zero.</param>
    public void SetSize(int size)
    {
        if (size <= 0)
            PdfReaderException.AlwaysThrow("Invalid size");

        _numberOfPages = size;
    }

    /// <summary>Returns the page list registered for <typeparamref name="T"/>, creating it on first use.</summary>
    Document GetCache<T>()
    {
        if (!_dictDocuments.TryGetValue(typeof(T), out Document cache) || cache == null)
        {
            // Without a valid size the backing array cannot be allocated
            // (new TD[-1] fails with an unrelated OverflowException), so
            // report the real cause instead.
            if (_numberOfPages <= 0)
                PdfReaderException.AlwaysThrow("Cache size was not set");

            cache = new Document(_numberOfPages);
            _dictDocuments[typeof(T)] = cache;
        }

        return cache;
    }

    /// <summary>Reads the slot at index <paramref name="pageNumber"/>; null when nothing was stored.</summary>
    public TD FromCache<T>(int pageNumber)
    {
        return GetCache<T>()[pageNumber];
    }

    /// <summary>Stores <paramref name="result"/> in the slot at index <paramref name="pageNumber"/>.</summary>
    public void StoreCache<T>(int pageNumber, TD result)
    {
        GetCache<T>()[pageNumber] = result;
    }

    /// <summary>List pre-filled with <c>size</c> null entries so slots can be assigned by index.</summary>
    class Document : List<TD>
    {
        public Document(int size) : base(new TD[size]) { }
    }
}
51 | }
52 |
--------------------------------------------------------------------------------
/src/PdfTextReader/Execution/PipelinePageStats.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 |
5 | namespace PdfTextReader.Execution
6 | {
/// <summary>
/// Holder for a per-page statistics value of type <typeparamref name="T"/>.
/// It currently exposes no members.
/// </summary>
class PipelinePageStats<T>
{
    // Not read or written anywhere inside this class.
    T _internalProperty;
}
11 | }
12 |
--------------------------------------------------------------------------------
/src/PdfTextReader/Execution/PipelinePdfLog.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Text;
5 |
6 | namespace PdfTextReader.Execution
7 | {
/// <summary>
/// Collects check messages raised while processing a PDF and reports which
/// pages received at least one of them.
/// </summary>
class PipelinePdfLog
{
    readonly List<PipelinePdfLogEntry> _log = new List<PipelinePdfLogEntry>();

    /// <summary>One logged message with the page and component it refers to.</summary>
    class PipelinePdfLogEntry
    {
        public int PageNumber;
        public Type Component;
        public string Message;
    }

    /// <summary>Appends one entry to the log.</summary>
    /// <param name="pageNumber">Page the message refers to.</param>
    /// <param name="component">Type of the component reporting the message.</param>
    /// <param name="message">Message text.</param>
    public void LogCheck(int pageNumber, Type component, string message)
    {
        _log.Add(new PipelinePdfLogEntry()
        {
            PageNumber = pageNumber,
            Component = component,
            Message = message
        });
    }

    /// <summary>Not implemented: the body is empty, so nothing is read or written.</summary>
    public void SaveErrors(string inputfile, string outputfile)
    {
    }

    /// <summary>Returns the distinct page numbers that have entries, in ascending order.</summary>
    /// <remarks>The query is deferred; it reflects the log at enumeration time.</remarks>
    public IEnumerable<int> GetErrors()
    {
        return _log.Select(t => t.PageNumber).Distinct().OrderBy(t => t);
    }
}
39 | }
40 |
--------------------------------------------------------------------------------
/src/PdfTextReader/Execution/PipelineResult.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 | using PdfTextReader.Base;
5 |
6 | namespace PdfTextReader.Execution
7 | {
/// <summary>Exposes the result produced by a pipeline stage.</summary>
/// <typeparam name="T">Type of the result.</typeparam>
// NOTE(review): T is assumed to be declared on the interface rather than on
// GetResults - confirm against the implementing classes.
interface IPipelineResults<T>
{
    /// <summary>Returns the result.</summary>
    T GetResults();
}
12 | }
13 |
--------------------------------------------------------------------------------
/src/PdfTextReader/ExecutionStats/PrintAnalyticsExtensions.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using PdfTextReader.Execution;
3 | using PdfTextReader.Parser;
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Text;
7 |
8 | namespace PdfTextReader.ExecutionStats
9 | {
/// <summary>
/// Extension methods that attach a file-based log to a text pipeline by
/// forwarding to <c>Log(filename)</c>.
/// </summary>
// NOTE(review): the three overloads below are textually identical; whatever
// type arguments distinguish them are not visible here - confirm before editing.
static class PrintAnalyticsExtensions
{
    /// <summary>Forwards to <c>pipelineText.Log(filename)</c> and returns its result.</summary>
    public static PipelineText PrintAnalytics(this PipelineText pipelineText, string filename)
        => pipelineText.Log(filename);

    /// <summary>Forwards to <c>pipelineText.Log(filename)</c> and returns its result.</summary>
    public static PipelineText PrintAnalytics(this PipelineText pipelineText, string filename)
        => pipelineText.Log(filename);

    /// <summary>Forwards to <c>pipelineText.Log(filename)</c> and returns its result.</summary>
    public static PipelineText PrintAnalytics(this PipelineText pipelineText, string filename)
        => pipelineText.Log(filename);
}
28 | }
29 |
--------------------------------------------------------------------------------
/src/PdfTextReader/ExecutionStats/PrintDebugCount.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 | using System.IO;
6 |
7 | namespace PdfTextReader.ExecutionStats
8 | {
/// <summary>
/// Log structure that counts how many items of type <typeparamref name="T"/>
/// were logged and writes the total when the log ends.
/// </summary>
// NOTE(review): the type parameter is restored from the use of typeof(T) and
// the T-typed Log parameter; ILogStructure<T> is assumed - confirm.
class PrintDebugCount<T> : ILogStructure<T>
{
    readonly string _message = typeof(T).Name;
    int _count = 0;

    /// <summary>Writes nothing at the start of the log.</summary>
    public void StartLog(TextWriter input)
    {
    }

    /// <summary>Writes "&lt;type name&gt;: &lt;count&gt;" to <paramref name="input"/>.</summary>
    public void EndLog(TextWriter input)
    {
        input.WriteLine($"{_message}: {_count}");
    }

    /// <summary>Counts the item; its content is not inspected or written.</summary>
    public void Log(TextWriter input, T data)
    {
        _count++;
    }
}
29 | }
30 |
--------------------------------------------------------------------------------
/src/PdfTextReader/ExecutionStats/PrintDebugExtensions.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Execution;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 |
6 | namespace PdfTextReader.ExecutionStats
7 | {
/// <summary>Extension methods that attach console debug logs to a text pipeline.</summary>
// NOTE(review): the generic arguments are restored from the dangling '>' in
// "Log>(Console.Out)" and from the PrintDebugCount / PrintDebugPrint classes of
// this namespace - confirm against PipelineText.Log.
static class PrintDebugExtensions
{
    /// <summary>Logs through <c>PrintDebugCount</c> to <see cref="Console.Out"/>.</summary>
    public static PipelineText<T> DebugCount<T>(this PipelineText<T> pipelineText)
    {
        return pipelineText.Log<PrintDebugCount<T>>(Console.Out);
    }

    /// <summary>Logs through <c>PrintDebugPrint</c> to <see cref="Console.Out"/>.</summary>
    public static PipelineText<T> DebugPrint<T>(this PipelineText<T> pipelineText)
    {
        return pipelineText.Log<PrintDebugPrint<T>>(Console.Out);
    }
}
20 | }
21 |
--------------------------------------------------------------------------------
/src/PdfTextReader/ExecutionStats/PrintDebugPrint.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 | using System.IO;
6 |
7 | namespace PdfTextReader.ExecutionStats
8 | {
/// <summary>
/// Log structure that prints a header naming <typeparamref name="T"/> and then
/// the string form of every logged item, each followed by a blank line.
/// </summary>
// NOTE(review): the type parameter is restored from the use of typeof(T) and
// the T-typed Log parameter; ILogStructure<T> is assumed - confirm.
class PrintDebugPrint<T> : ILogStructure<T>
{
    readonly string _message = typeof(T).Name;

    /// <summary>Writes "DebugPrint: &lt;type name&gt;".</summary>
    public void StartLog(TextWriter input)
    {
        input.WriteLine("DebugPrint: " + _message);
    }

    /// <summary>Writes nothing at the end of the log.</summary>
    public void EndLog(TextWriter input)
    {
    }

    /// <summary>Writes <paramref name="data"/> as text, then an empty line.</summary>
    public void Log(TextWriter input, T data)
    {
        // A null item prints as an empty line instead of throwing.
        input.WriteLine(data?.ToString());
        input.WriteLine("");
    }
}
29 | }
30 |
--------------------------------------------------------------------------------
/src/PdfTextReader/ExecutionStats/ShowParserWarnings.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using PdfTextReader.Execution;
3 | using System;
4 | using System.Collections.Generic;
5 | using System.Linq;
6 | using System.Text;
7 |
8 | namespace PdfTextReader.ExecutionStats
9 | {
/// <summary>
/// Merges the page errors reported by the layout, overlap and
/// unhandled-exception validations into one sorted list.
/// </summary>
// NOTE(review): the type arguments of Calculate and of the return type are not
// visible here - confirm against PipelineStats.
class ShowParserWarnings
{
    /// <summary>Returns the distinct pages reported by any of the three validations, ascending.</summary>
    public IEnumerable GetPages(PipelineStats statistics)
    {
        var layoutStats = (ValidateLayout)statistics.Calculate();
        var overlapStats = (ValidateOverlap)statistics.Calculate();
        var unhandledStats = (ValidateUnhandledExceptions)statistics.Calculate();

        // Materialise each source before combining them.
        var layoutPages = layoutStats.GetPageErrors().ToList();
        var overlapPages = overlapStats.GetPageErrors().ToList();
        var unhandledPages = unhandledStats.GetPageErrors().ToList();

        var combined = layoutPages.Concat(overlapPages).Concat(unhandledPages);

        return combined.Distinct().OrderBy(p => p).ToList();
    }
}
30 | }
31 |
--------------------------------------------------------------------------------
/src/PdfTextReader/ExecutionStats/TextInfo.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 | using PdfTextReader.Base;
5 |
6 | namespace PdfTextReader.ExecutionStats
7 | {
/// <summary>Plain record of a text's font name, size, style and content.</summary>
class TextInfo
{
    public string FontName;
    public float FontSize;
    public string FontStyle;
    public string Text;

    /// <summary>Copies font name, size, style and text from <paramref name="line"/>.</summary>
    public TextInfo(TextLine line)
    {
        FontName = line.FontName;
        FontSize = line.FontSize;
        FontStyle = line.FontStyle;
        Text = line.Text;
    }

    /// <summary>Creates an entry with the given font attributes and an empty text.</summary>
    public TextInfo(string fontName, string fontStyle, float fontSize)
    {
        FontName = fontName;
        FontStyle = fontStyle;
        FontSize = fontSize;
        Text = "";
    }
}
31 | }
32 |
--------------------------------------------------------------------------------
/src/PdfTextReader/ExecutionStats/ValidateFooter.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using PdfTextReader.PDFCore;
3 | using System;
4 | using System.Collections.Generic;
5 | using System.Text;
6 |
7 | namespace PdfTextReader.ExecutionStats
8 | {
/// <summary>
/// Summarises footer statistics: how many pages have no footer and the
/// average footer height of the pages that have one.
/// </summary>
// NOTE(review): the element type of the stats (ICalculateStats / IEnumerable
// type argument) is not visible here.
class ValidateFooter : ICalculateStats
{
    // Footers taller than this are rejected.
    const float statRegionTooLarge = 200f;

    /// <summary>
    /// Returns an anonymous object with PagesWithoutFooter and
    /// AverageFooterHeight. Null entries are ignored. When no page has a
    /// footer the average is 0/0, i.e. NaN.
    /// </summary>
    public object Calculate(IEnumerable stats)
    {
        int pagesWithoutFooter = 0;
        int pagesWithFooter = 0;
        float heightSum = 0;

        foreach (var pageStat in stats)
        {
            if (pageStat == null)
                continue;

            if (!pageStat.HasFooter)
            {
                pagesWithoutFooter++;
                continue;
            }

            float footerHeight = (float)pageStat.FooterHeight;

            if (footerHeight > statRegionTooLarge)
            {
                PdfReaderException.AlwaysThrow("height > statRegionTooLarge");
            }

            heightSum += footerHeight;
            pagesWithFooter++;
        }

        return new
        {
            PagesWithoutFooter = pagesWithoutFooter,
            AverageFooterHeight = heightSum / pagesWithFooter
        };
    }
}
49 | }
50 |
--------------------------------------------------------------------------------
/src/PdfTextReader/ExecutionStats/ValidateOverlap.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 |
6 | namespace PdfTextReader.ExecutionStats
7 | {
/// <summary>
/// Keeps the per-page overlap statistics and reports the pages on which
/// overlapping blocks were recorded.
/// </summary>
// NOTE(review): type arguments are restored from the comparisons against
// StatsBlocksOverlapped.Empty and from the int yielded by GetPageErrors.
class ValidateOverlap : ICalculateStats<StatsBlocksOverlapped>
{
    /// <summary>One entry per page, null where the page had no overlap. Set by <see cref="Calculate"/>.</summary>
    public IList<StatsBlocksOverlapped> Results { get; private set; }

    /// <summary>Yields index + 1 for every entry of <see cref="Results"/> that is neither null nor Empty.</summary>
    public IEnumerable<int> GetPageErrors()
    {
        for (int i = 0; i < Results.Count; i++)
        {
            if ((Results[i] != null) && (Results[i] != StatsBlocksOverlapped.Empty))
                yield return i + 1;
        }
    }

    /// <summary>Stores the statistics, replacing Empty entries with null, and returns this instance.</summary>
    public object Calculate(IEnumerable<StatsBlocksOverlapped> stats)
    {
        var result = new List<StatsBlocksOverlapped>();

        foreach (var s in stats)
        {
            var r = (s == StatsBlocksOverlapped.Empty) ? null : s;

            result.Add(r);
        }

        Results = result;

        return this;
    }
}
37 | }
38 |
--------------------------------------------------------------------------------
/src/PdfTextReader/ExecutionStats/ValidateUnhandledExceptions.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 |
6 | namespace PdfTextReader.ExecutionStats
7 | {
/// <summary>
/// Keeps the per-page unhandled-exception statistics and reports the page
/// number of every non-null entry.
/// </summary>
// NOTE(review): the element type of the stats (something exposing PageNumber)
// is not visible here.
class ValidateUnhandledExceptions : ICalculateStats
{
    /// <summary>Entries in the order received by <see cref="Calculate"/>.</summary>
    public IList Results { get; private set; }

    /// <summary>Yields the PageNumber of every non-null entry of <see cref="Results"/>.</summary>
    public IEnumerable GetPageErrors()
    {
        foreach (var entry in Results)
        {
            if (entry != null)
                yield return entry.PageNumber;
        }
    }

    /// <summary>Copies the statistics into <see cref="Results"/> and returns this instance.</summary>
    public object Calculate(IEnumerable stats)
    {
        var collected = new List();

        foreach (var item in stats)
            collected.Add(item);

        Results = collected;

        return this;
    }
}
37 | }
38 |
--------------------------------------------------------------------------------
/src/PdfTextReader/IVirtualFS.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Text;
5 |
6 | namespace PdfTextReader
7 | {
/// <summary>Abstraction over file access that hands out streams by file name.</summary>
public interface IVirtualFS
{
    /// <summary>Returns a stream for <paramref name="filename"/>, presumably opened for reading.</summary>
    Stream OpenReader(string filename);

    /// <summary>Returns a stream for <paramref name="filename"/>, presumably opened for writing.</summary>
    Stream OpenWriter(string filename);
}
13 | }
14 |
--------------------------------------------------------------------------------
/src/PdfTextReader/PDFCore/AddImageSpace.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Execution;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Linq;
5 | using System.Text;
6 | using PdfTextReader.PDFText;
7 | using PdfTextReader.Base;
8 |
9 | namespace PdfTextReader.PDFCore
10 | {
/// <summary>
/// Appends the image blocks found by <c>PreProcessImages</c> to the blocks of
/// the page being processed.
/// </summary>
// NOTE(review): the element type of the _images list is not visible here.
class AddImageSpace : IProcessBlock
{
    private List _images;

    /// <summary>Captures the image blocks; fails when <c>parserImage.Images</c> is null.</summary>
    public AddImageSpace(PreProcessImages parserImage)
    {
        var imagePage = parserImage.Images;

        if (imagePage == null)
        {
            PdfReaderException.AlwaysThrow("AddImageSpace requires PreProcessImages");
        }

        _images = imagePage.AllBlocks.ToList();
    }

    /// <summary>Returns a new page holding the blocks of <paramref name="page"/> followed by the image blocks.</summary>
    public BlockPage Process(BlockPage page)
    {
        if (_images == null)
        {
            PdfReaderException.AlwaysThrow("AddImageSpace requires PreProcessImages");
        }

        var combined = new BlockPage();

        foreach (var pageBlock in page.AllBlocks)
            combined.Add(pageBlock);

        foreach (var imageBlock in _images)
            combined.Add(imageBlock);

        return combined;
    }
}
48 | }
49 |
--------------------------------------------------------------------------------
/src/PdfTextReader/PDFCore/AddTableSpace.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Execution;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Linq;
5 | using System.Text;
6 | using PdfTextReader.Base;
7 |
8 | namespace PdfTextReader.PDFCore
9 | {
/// <summary>
/// Appends the table blocks found by <c>IdentifyTables</c> to the blocks of
/// the page being processed.
/// </summary>
// NOTE(review): the element type of the _tables list is not visible here.
class AddTableSpace : IProcessBlock
{
    private List _tables;

    /// <summary>Captures the table blocks; fails when <c>parserTable.PageTables</c> is null.</summary>
    public AddTableSpace(PDFCore.IdentifyTables parserTable)
    {
        var tablePage = parserTable.PageTables;

        if (tablePage == null)
        {
            PdfReaderException.AlwaysThrow("AddTableSpace requires IdentifyTables");
        }

        _tables = tablePage.AllBlocks.ToList();
    }

    /// <summary>Returns a new page holding the blocks of <paramref name="page"/> followed by the table blocks.</summary>
    public BlockPage Process(BlockPage page)
    {
        if (_tables == null)
        {
            PdfReaderException.AlwaysThrow("AddTableSpace requires IdentifyTables");
        }

        var combined = new BlockPage();

        foreach (var pageBlock in page.AllBlocks)
            combined.Add(pageBlock);

        foreach (var tableBlock in _tables)
            combined.Add(tableBlock);

        return combined;
    }
}
47 | }
48 |
--------------------------------------------------------------------------------
/src/PdfTextReader/PDFCore/BasicFirstPageStats.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Linq;
5 | using System.Text;
6 |
7 | namespace PdfTextReader.PDFCore
8 | {
/// <summary>
/// Records the horizontal extent of the first page that is processed and
/// shares it through a static instance.
/// </summary>
class BasicFirstPageStats : IProcessBlock
{
    public float MinX { get; private set; }
    public float MaxX { get; private set; }
    public float PageWidth { get; private set; }
    public float TabStop { get; private set; }

    // The first instance that successfully measured a page; shared by all instances.
    static BasicFirstPageStats Global = null;

    /// <summary>The shared instance, or null when no page has been measured yet.</summary>
    public BasicFirstPageStats Stats
    {
        get
        {
            return Global;
        }
    }

    /// <summary>Clears the shared instance so the next processed page is measured again.</summary>
    [Obsolete]
    public static void Reset()
    {
        Global = null;
    }

    /// <summary>Stores the tab stop on this instance (not on the shared one).</summary>
    public void SetTabStop(float tabstop)
    {
        TabStop = tabstop;
    }

    /// <summary>Measures <paramref name="page"/> unless a shared instance already exists.</summary>
    void SetupPage(BlockPage page)
    {
        if (Global != null)
            return;

        var blocks = page.AllBlocks;

        MinX = blocks.Min(b => b.GetX());
        MaxX = blocks.Max(b => b.GetX() + b.GetWidth());
        PageWidth = MaxX - MinX;

        // Publish only after the measurements succeeded: Min/Max throw on a
        // page without blocks, and publishing first would leave a zeroed
        // instance that suppresses every later measurement.
        Global = this;
    }

    /// <summary>Measures the page if needed and returns it unchanged.</summary>
    public BlockPage Process(BlockPage page)
    {
        SetupPage(page);

        return page;
    }
}
56 | }
57 |
--------------------------------------------------------------------------------
/src/PdfTextReader/PDFCore/DouIgnoreLongDotSequence.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 | using PdfTextReader.Base;
5 | using System.Linq;
6 |
7 | namespace PdfTextReader.PDFCore
8 | {
// Bug #37: "omisses" can span multiple columns.
// Workaround: shrink the width of lines that contain a long run of dots.
// Should run after GroupLines, so that an "omisses" run is not split apart.
class DouIgnoreLongDotSequence : IProcessBlock
{
    /// <summary>
    /// Returns a new page with the same blocks; every line whose text contains
    /// the dot run has its Width divided by 4. The blocks are modified in place,
    /// and every block must be a <c>BlockLine</c>.
    /// </summary>
    public BlockPage Process(BlockPage page)
    {
        var output = new BlockPage();

        foreach (var item in page.AllBlocks)
        {
            var line = (BlockLine)item;
            bool hasLongDotRun = line.GetText().Contains("....................");

            if (hasLongDotRun)
            {
                // reduce to a quarter of the original width
                line.Width /= 4;
            }

            output.Add(line);
        }

        return output;
    }
}
34 | }
35 |
--------------------------------------------------------------------------------
/src/PdfTextReader/PDFCore/FilterHeaderFooter.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 | using System.Linq;
6 |
7 | namespace PdfTextReader.PDFCore
8 | {
/// <summary>
/// Splits a page by the header and footer limits: <see cref="Process"/> keeps
/// the blocks strictly between them, <see cref="Validate"/> returns the rest.
/// </summary>
class FilterHeaderFooter : IProcessBlock, IValidateBlock
{
    private float _headerH = float.NaN;
    private float _footerH = float.NaN;

    /// <summary>Reads both limits from <paramref name="data"/>; fails when either is NaN.</summary>
    public FilterHeaderFooter(HeaderFooterData data)
    {
        _headerH = data.HeaderH;
        _footerH = data.FooterH;

        bool limitsMissing = float.IsNaN(_headerH) || float.IsNaN(_footerH);

        if (limitsMissing)
            PdfReaderException.AlwaysThrow("FilterHeaderFooter requires HeaderFooterData");
    }

    /// <summary>Returns the blocks whose GetH() is above the footer limit and below the header limit.</summary>
    public BlockPage Process(BlockPage page)
    {
        var body = new BlockPage();

        foreach (var block in page.AllBlocks)
        {
            bool isContent = block.GetH() > _footerH && block.GetH() < _headerH;

            if (!isContent)
                continue;

            body.Add(block);
        }

        return body;
    }

    /// <summary>Returns the blocks whose GetH() is at or below the footer limit, or at or above the header limit.</summary>
    public BlockPage Validate(BlockPage page)
    {
        var removed = new BlockPage();

        foreach (var block in page.AllBlocks)
        {
            bool isHeaderOrFooter = block.GetH() <= _footerH || block.GetH() >= _headerH;

            if (isHeaderOrFooter)
                removed.Add(block);
        }

        return removed;
    }
}
53 | }
54 |
--------------------------------------------------------------------------------
/src/PdfTextReader/PDFCore/FinalBlockResultData.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 |
6 | namespace PdfTextReader.PDFCore
7 | {
/// <summary>Pass-through stage that remembers the last page it was given.</summary>
class FinalBlockResultData : IProcessBlockData
{
    /// <summary>The page most recently passed to <see cref="Process"/>.</summary>
    public BlockPage LastResult { get; private set; }

    /// <summary>Stores <paramref name="page"/> in <see cref="LastResult"/> and returns it unchanged.</summary>
    public BlockPage Process(BlockPage page)
    {
        return LastResult = page;
    }

    /// <summary>Does nothing: no state is taken from <paramref name="cache"/>.</summary>
    public void UpdateInstance(object cache)
    {
    }
}
22 | }
23 |
--------------------------------------------------------------------------------
/src/PdfTextReader/PDFCore/HeaderFooterData.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 | using System.Linq;
6 |
7 | namespace PdfTextReader.PDFCore
8 | {
/// <summary>
/// Holds the header and footer limits (NaN until set) and remembers the last
/// page passed through it.
/// </summary>
class HeaderFooterData : IProcessBlockData
{
    public float HeaderH = float.NaN;
    public float FooterH = float.NaN;

    /// <summary>The page most recently passed to <see cref="Process"/>.</summary>
    public BlockPage LastResult { get; private set; }

    /// <summary>Stores <paramref name="page"/> in <see cref="LastResult"/> and returns it unchanged.</summary>
    public BlockPage Process(BlockPage page)
    {
        return LastResult = page;
    }

    /// <summary>Copies both limits from <paramref name="cache"/>, which must be a <see cref="HeaderFooterData"/>.</summary>
    public void UpdateInstance(object cache)
    {
        var source = (HeaderFooterData)cache;

        HeaderH = source.HeaderH;
        FooterH = source.FooterH;
    }
}
30 | }
31 |
--------------------------------------------------------------------------------
/src/PdfTextReader/PDFCore/IdentifyTablesData.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using PdfTextReader.Execution;
3 | using System;
4 | using System.Collections.Generic;
5 | using System.Linq;
6 | using System.Text;
7 |
8 | namespace PdfTextReader.PDFCore
9 | {
/// <summary>
/// Holds the pages and footer line produced by table identification and
/// remembers the last page passed through it.
/// </summary>
class IdentifyTablesData : IProcessBlockData
{
    public bool Ready { get; set; }
    public BlockPage PageTables { get; set; }
    public BlockPage PageLines { get; set; }
    public BlockPage PageBackground { get; set; }
    public TableCell PageFooterLine { get; set; }

    /// <summary>The page most recently passed to <see cref="Process"/>.</summary>
    public BlockPage LastResult { get; set; }

    /// <summary>Stores <paramref name="page"/> in <see cref="LastResult"/> and returns it unchanged.</summary>
    public BlockPage Process(BlockPage page)
    {
        return LastResult = page;
    }

    /// <summary>Copies every property from <paramref name="cache"/>, which must be an <see cref="IdentifyTablesData"/>.</summary>
    public void UpdateInstance(object cache)
    {
        var source = (IdentifyTablesData)cache;

        LastResult = source.LastResult;
        Ready = source.Ready;
        PageTables = source.PageTables;
        PageLines = source.PageLines;
        PageBackground = source.PageBackground;
        PageFooterLine = source.PageFooterLine;
    }
}
37 | }
38 |
--------------------------------------------------------------------------------
/src/PdfTextReader/PDFCore/MergeBlockLines.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Text;
5 | using PdfTextReader.Base;
6 |
7 | namespace PdfTextReader.PDFCore
8 | {
/// <summary>
/// Joins consecutive block sets when the last line of one overlaps the first
/// line of the next. Every block of the page must be a <c>BlockSet</c>.
/// </summary>
// NOTE(review): any type argument of BlockSet is not visible here.
class MergeBlockLines : IProcessBlock
{
    /// <summary>Returns a new page of freshly created block sets; the input sets are not modified.</summary>
    public BlockPage Process(BlockPage page)
    {
        var merged = new BlockPage();
        BlockSet previous = null;

        foreach (var block in page.AllBlocks)
        {
            var current = (BlockSet)block;

            if ((previous != null) && CanBeMerged(previous, current))
            {
                // extend the set started earlier
                previous.AddRange(current);
                continue;
            }

            // start a new set holding the lines of the current one
            var fresh = new BlockSet();
            fresh.AddRange(current);

            merged.Add(fresh);

            previous = fresh;
        }

        return merged;
    }

    /// <summary>True when the last line of <paramref name="a"/> overlaps the first line of <paramref name="b"/>.</summary>
    bool CanBeMerged(BlockSet a, BlockSet b)
    {
        return Block.HasOverlap(a.Last(), b.First());
    }
}
47 | }
48 |
--------------------------------------------------------------------------------
/src/PdfTextReader/PDFCore/PageInfoStats.cs:
--------------------------------------------------------------------------------
1 | using Newtonsoft.Json;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 |
6 | namespace PdfTextReader.PDFCore
7 | {
/// <summary>Carries the header information extracted for a page.</summary>
class PageInfoStats
{
    /// <summary>Header values as strings.</summary>
    public class HeaderInfo
    {
        public string ISSN;
        public string Local;
        public string DataDia;
        public string DataYMD;
        public string JornalAnoSupl;
        public string JornalEdicao;
    }

    /// <summary>The header set through <see cref="SetInfo"/>, or null.</summary>
    public HeaderInfo Header { get; private set; }

    /// <summary>Replaces <see cref="Header"/>.</summary>
    public void SetInfo(HeaderInfo headerInfo)
    {
        Header = headerInfo;
    }

    /// <summary>Returns the header as indented JSON, or an empty string when there is none.</summary>
    public override string ToString()
    {
        return Header == null
            ? ""
            : JsonConvert.SerializeObject(Header, Formatting.Indented);
    }
}
35 | }
36 |
--------------------------------------------------------------------------------
/src/PdfTextReader/PDFCore/ProcessPdfTextData.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 |
6 | namespace PdfTextReader.PDFCore
7 | {
/// <summary>Pass-through stage that remembers the last page it was given.</summary>
class ProcessPdfTextData : IProcessBlockData
{
    /// <summary>The page most recently passed to <see cref="Process"/> or copied by <see cref="UpdateInstance"/>.</summary>
    public BlockPage LastResult { get; private set; }

    /// <summary>Stores <paramref name="page"/> in <see cref="LastResult"/> and returns it unchanged.</summary>
    public BlockPage Process(BlockPage page)
    {
        return LastResult = page;
    }

    /// <summary>Copies <see cref="LastResult"/> from <paramref name="cache"/>; fails when it is null.</summary>
    public void UpdateInstance(object cache)
    {
        var source = (ProcessPdfTextData)cache;

        if (source == null)
        {
            PdfReaderException.AlwaysThrow("Null cache value");
        }

        LastResult = source.LastResult;
    }
}
28 | }
29 |
--------------------------------------------------------------------------------
/src/PdfTextReader/PDFCore/RemoveBlockHidden.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 |
6 | namespace PdfTextReader.PDFCore
7 | {
/// <summary>
/// Separates hidden blocks from the rest: <see cref="Process"/> drops them,
/// <see cref="Validate"/> returns only them.
/// </summary>
class RemoveBlockHidden : IProcessBlock, IValidateBlock
{
    /// <summary>Returns a new page with every block that is not a <c>BlockHidden</c>.</summary>
    public BlockPage Process(BlockPage page)
    {
        var visible = new BlockPage();

        foreach (var block in page.AllBlocks)
        {
            if (!(block is BlockHidden))
            {
                visible.Add(block);
            }
        }

        return visible;
    }

    /// <summary>Returns a new page with only the <c>BlockHidden</c> blocks.</summary>
    public BlockPage Validate(BlockPage page)
    {
        var hidden = new BlockPage();

        foreach (var block in page.AllBlocks)
        {
            if (!(block is BlockHidden))
                continue;

            hidden.Add(block);
        }

        return hidden;
    }
}
40 | }
41 |
--------------------------------------------------------------------------------
/src/PdfTextReader/PDFCore/RemoveHeader.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Linq;
5 | using System.Text;
6 |
7 | namespace PdfTextReader.PDFCore
8 | {
/// <summary>
/// Treats the blocks within 1 unit of the largest GetH() on the page as the
/// header: <see cref="Process"/> removes them, <see cref="Validate"/> returns them.
/// </summary>
class RemoveHeader : IProcessBlock, IValidateBlock
{
    // A header region taller than this is rejected by Validate.
    const float statRegionTooLarge = 200f;

    /// <summary>Returns the blocks below the header threshold; an empty page is returned as is.</summary>
    public BlockPage Process(BlockPage page)
    {
        if (!page.AllBlocks.Any())
            return page;

        float tolerance = 1f;
        float threshold = page.AllBlocks.Max(b => b.GetH()) - tolerance;

        var body = new BlockPage();

        body.AddRange(page.AllBlocks.Where(b => b.GetH() < threshold));

        return body;
    }

    /// <summary>
    /// Returns the blocks at or above the header threshold; fails when their
    /// combined height exceeds the limit. An empty page is returned as is.
    /// </summary>
    public BlockPage Validate(BlockPage page)
    {
        if (!page.AllBlocks.Any())
            return page;

        float tolerance = 1f;
        float threshold = page.AllBlocks.Max(b => b.GetH()) - tolerance;

        var header = new BlockPage();

        header.AddRange(page.AllBlocks.Where(b => b.GetH() >= threshold));

        float headerHeight = header.AllBlocks.GetHeight();

        if (headerHeight > statRegionTooLarge)
        {
            PdfReaderException.AlwaysThrow("height > statRegionTooLarge");
        }

        return header;
    }
}
51 | }
52 |
--------------------------------------------------------------------------------
/src/PdfTextReader/PDFCore/RemoveImageTexts.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Execution;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Linq;
5 | using System.Text;
6 | using PdfTextReader.PDFText;
7 | using PdfTextReader.Base;
8 |
9 | namespace PdfTextReader.PDFCore
10 | {
/// <summary>Drops every block that overlaps an image found by <c>PreProcessImages</c>.</summary>
// NOTE(review): the element type of the _images list is not visible here.
class RemoveImageTexts : IProcessBlock
{
    private List _images;

    /// <summary>Captures the image blocks; fails when <c>parseImage.Images</c> is null.</summary>
    public RemoveImageTexts(PreProcessImages parseImage)
    {
        var imagePage = parseImage.Images;

        if (imagePage == null)
        {
            PdfReaderException.AlwaysThrow("RemoveImageTexts requires PreProcessImages");
        }

        _images = imagePage.AllBlocks.ToList();
    }

    /// <summary>Returns a new page with the blocks that overlap none of the images.</summary>
    public BlockPage Process(BlockPage page)
    {
        if (_images == null)
        {
            PdfReaderException.AlwaysThrow("RemoveImageTexts requires PreProcessImages");
        }

        var kept = new BlockPage();

        foreach (var block in page.AllBlocks)
        {
            // Any stops at the first overlapping image
            bool overlapsImage = _images.Any(image => Block.HasOverlap(image, block));

            if (overlapsImage)
                continue;

            kept.Add(block);
        }

        return kept;
    }
}
58 | }
59 |
--------------------------------------------------------------------------------
/src/PdfTextReader/PDFCore/RemoveTableDotChar.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 |
6 | namespace PdfTextReader.PDFCore
7 | {
/// <summary>
/// Separates blocks whose text is exactly "." from the rest:
/// <see cref="Process"/> drops them, <see cref="Validate"/> returns only them.
/// </summary>
class RemoveTableDotChar : IProcessBlock, IValidateBlock
{
    /// <summary>Returns a new page without the blocks whose text is ".".</summary>
    public BlockPage Process(BlockPage page)
    {
        var kept = new BlockPage();

        foreach (var block in page.AllBlocks)
        {
            if (block.GetText() == ".")
                continue;

            kept.Add(block);
        }

        return kept;
    }

    /// <summary>Returns a new page with only the blocks whose text is ".".</summary>
    public BlockPage Validate(BlockPage page)
    {
        var dots = new BlockPage();

        foreach (var block in page.AllBlocks)
        {
            if (block.GetText() != ".")
                continue;

            dots.Add(block);
        }

        return dots;
    }
}
36 | }
37 |
--------------------------------------------------------------------------------
/src/PdfTextReader/PDFCore/RemoveTableText.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Execution;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Linq;
5 | using System.Text;
6 | using PdfTextReader.Base;
7 |
8 | namespace PdfTextReader.PDFCore
9 | {
/// <summary>Drops every block that overlaps a table found by <c>IdentifyTables</c>.</summary>
// NOTE(review): the element type of the _tables list is not visible here.
class RemoveTableText : IProcessBlock
{
    private List _tables;

    /// <summary>Captures the table blocks; fails when <c>parserTable.PageTables</c> is null.</summary>
    public RemoveTableText(PDFCore.IdentifyTables parserTable)
    {
        var tablePage = parserTable.PageTables;

        if (tablePage == null)
        {
            PdfReaderException.AlwaysThrow("RemoveTableText requires IdentifyTables");
        }

        _tables = tablePage.AllBlocks.ToList();
    }

    /// <summary>Returns a new page with the blocks that overlap none of the tables.</summary>
    public BlockPage Process(BlockPage page)
    {
        if (_tables == null)
        {
            PdfReaderException.AlwaysThrow("RemoveTableText requires IdentifyTables");
        }

        var kept = new BlockPage();

        foreach (var block in page.AllBlocks)
        {
            // Any stops at the first overlapping table
            bool overlapsTable = _tables.Any(table => Block.HasOverlap(table, block));

            if (overlapsTable)
                continue;

            kept.Add(block);
        }

        return kept;
    }
}
57 | }
58 |
--------------------------------------------------------------------------------
/src/PdfTextReader/PDFCore/SetIdentifyTablesCompatibility.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using PdfTextReader.PDFText;
3 | using System;
4 | using System.Collections.Generic;
5 | using System.Text;
6 |
7 | namespace PdfTextReader.PDFCore
8 | {
/// <summary>
/// Pipeline stage that synchronises an <c>IdentifyTables</c> instance with its
/// <c>IdentifyTablesData</c> and leaves the page untouched.
/// </summary>
class SetIdentifyTablesCompatibility : IProcessBlock
{
    private readonly IdentifyTables _pre;
    private readonly IdentifyTablesData _data;

    public SetIdentifyTablesCompatibility(IdentifyTables pre, IdentifyTablesData data)
    {
        _pre = pre;
        _data = data;
    }

    /// <summary>
    /// When <paramref name="data"/> is not ready, fills it from
    /// <paramref name="pre"/> (failing if pre has no tables, lines or
    /// background) and marks it ready; then hands data to pre.SetCompatibility.
    /// </summary>
    public void SetCompatibility(IdentifyTables pre, IdentifyTablesData data)
    {
        if (!data.Ready)
        {
            bool nothingAvailable = pre.PageTables == null
                && pre.PageLines == null
                && pre.PageBackground == null;

            if (nothingAvailable)
                PdfReaderException.AlwaysThrow("there is no data available");

            data.PageFooterLine = pre.PageFooterLine;
            data.PageTables = pre.PageTables;
            data.PageLines = pre.PageLines;
            data.PageBackground = pre.PageBackground;
            data.Ready = true;
        }

        // set the compatibility between IdentifyTables and IdentifyTablesData
        pre.SetCompatibility(data);
    }

    /// <summary>Runs the synchronisation and returns <paramref name="page"/> unchanged.</summary>
    public BlockPage Process(BlockPage page)
    {
        SetCompatibility(_pre, _data);

        return page;
    }
}
46 | }
47 |
--------------------------------------------------------------------------------
/src/PdfTextReader/PDFCore/SetProcessImageCompatibility.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using PdfTextReader.PDFText;
3 | using System;
4 | using System.Collections.Generic;
5 | using System.Text;
6 |
7 | namespace PdfTextReader.PDFCore
8 | {
/// <summary>
/// Pipeline block that links a <c>PreProcessImages</c> instance with a
/// <c>ProcessImageData</c> instance. The processed page is returned unchanged.
/// </summary>
class SetProcessImageCompatibility : IProcessBlock
{
    private readonly PreProcessImages _pre;
    private readonly ProcessImageData _data;

    /// <summary>Stores the pair that <see cref="Process"/> links on every call.</summary>
    public SetProcessImageCompatibility(PreProcessImages pre, ProcessImageData data)
    {
        _pre = pre;
        _data = data;
    }

    /// <summary>
    /// Reports "Null image" through <c>PdfReaderException.AlwaysThrow</c> when
    /// <c>data.Images</c> is null, then passes <paramref name="data"/> to
    /// <c>pre.SetCompatibility</c>.
    /// </summary>
    public void SetCompatibility(PreProcessImages pre, ProcessImageData data)
    {
        bool imagesMissing = data.Images == null;
        if (imagesMissing)
        {
            PdfReaderException.AlwaysThrow("Null image");
        }

        // link the PreProcessImages instance with the ProcessImageData instance
        pre.SetCompatibility(data);
    }

    /// <summary>Runs the linking step; the page itself is not modified.</summary>
    public BlockPage Process(BlockPage page)
    {
        SetCompatibility(_pre, _data);

        return page;
    }
}
37 | }
38 |
--------------------------------------------------------------------------------
/src/PdfTextReader/PDFCore/ShowBlocksets.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 |
6 | namespace PdfTextReader.PDFCore
7 | {
/// <summary>
/// Pipeline block that flattens the columns of every segment of a
/// <c>BlockPage2</c> into a new <c>BlockPage</c>.
/// </summary>
class ShowBlocksets : IProcessBlock
{
    /// <summary>
    /// Collects <c>Columns</c> from each entry of <c>Segments</c> into a fresh page.
    /// The input must be a <c>BlockPage2</c>; otherwise the failure is reported
    /// through <c>PdfReaderException.AlwaysThrow</c>.
    /// </summary>
    public BlockPage Process(BlockPage page2)
    {
        BlockPage2 layoutPage = page2 as BlockPage2;

        if (layoutPage == null)
        {
            PdfReaderException.AlwaysThrow("ShowBlocksets must execute AFTER OrganizePageLayout");
        }

        BlockPage flattened = new BlockPage();

        foreach (var segment in layoutPage.Segments)
        {
            flattened.AddRange(segment.Columns);
        }

        return flattened;
    }
}
27 | }
28 |
--------------------------------------------------------------------------------
/src/PdfTextReader/Parser/AggregateAnexo.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.TextStructures;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Linq;
5 | using System.Text;
6 | using PdfTextReader.Base;
7 |
8 | namespace PdfTextReader.Parser
9 | {
/// <summary>
/// Aggregator that attaches contents whose title mentions "anexo" to the first
/// content of the group as <c>Anexo</c> entries.
/// </summary>
class AggregateAnexo : IAggregateStructure
{
    /// <summary>True when the lower-cased title contains "anexo".</summary>
    public bool Aggregate(Conteudo line)
    {
        return line.Titulo.ToLower().Contains("anexo");
    }

    /// <summary>
    /// Returns the first element of <paramref name="conteudos"/> (mutated in place)
    /// with a new <c>Anexos</c> list holding one entry per remaining element,
    /// built from that element's <c>Titulo</c> and <c>Corpo</c>.
    /// </summary>
    public Conteudo Create(List conteudos)
    {
        Conteudo main = conteudos[0];
        main.Anexos = new List();

        // Everything after the first element becomes an attachment of the first.
        foreach (var extra in conteudos.Skip(1))
        {
            main.Anexos.Add(new Anexo()
            {
                Titulo = extra.Titulo,
                Texto = extra.Corpo
            });
        }

        return main;
    }

    /// <summary>No initialization is required.</summary>
    public void Init(Conteudo line)
    {
    }
}
50 | }
51 |
--------------------------------------------------------------------------------
/src/PdfTextReader/Parser/AggregateSingularBody.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.TextStructures;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Linq;
5 | using System.Text;
6 | using PdfTextReader.Base;
7 |
8 | namespace PdfTextReader.Parser
9 | {
/// <summary>
/// Aggregator that merges "seção"/"capítulo" contents into the body of the first
/// content of the group.
/// </summary>
class AggregateSingularBody : IAggregateStructure
{
    /// <summary>True when the lower-cased title contains "seção" or "capítulo".</summary>
    public bool Aggregate(Conteudo line)
    {
        string loweredTitle = line.Titulo.ToLower();

        return loweredTitle.Contains("seção") || loweredTitle.Contains("capítulo");
    }

    /// <summary>
    /// Appends the title and body of every element after the first to the first
    /// element's <c>Corpo</c>, separated by newlines, and returns that first element.
    /// Both the first element and the appended elements' titles are mutated.
    /// </summary>
    public Conteudo Create(List conteudos)
    {
        Conteudo merged = conteudos[0];

        for (int index = 1; index < conteudos.Count; index++)
        {
            Conteudo current = conteudos[index];

            // Check whether the law's title (chapter) ended up in the hierarchy;
            // every hierarchy part mentioning it is prepended to the title.
            string[] hierarchyParts = current.Hierarquia.Split(':');
            foreach (string part in hierarchyParts)
            {
                if (!part.Contains("CAPÍTULO"))
                {
                    continue;
                }

                current.Titulo = part + "\n" + current.Titulo;
            }

            merged.Corpo = merged.Corpo + "\n" + current.Titulo + "\n" + current.Corpo;
        }

        return merged;
    }

    /// <summary>No initialization is required.</summary>
    public void Init(Conteudo line)
    {
    }
}
48 | }
49 |
--------------------------------------------------------------------------------
/src/PdfTextReader/Parser/Artigo.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 |
5 | namespace PdfTextReader.Parser
6 | {
/// <summary>
/// Plain data holder for one article: its metadata, its content and its
/// attachments. Property order is kept as declared.
/// </summary>
class Artigo
{
    /// <summary>Metadata describing the article.</summary>
    public Metadados Metadados { get; set; }

    /// <summary>Main content of the article.</summary>
    public Conteudo Conteudo { get; set; }

    /// <summary>Attachments ("anexos") of the article; null until assigned.</summary>
    public List Anexos { get; set; }
}
13 | }
14 |
--------------------------------------------------------------------------------
/src/PdfTextReader/Parser/Autor.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 |
5 | namespace PdfTextReader.Parser
6 | {
/// <summary>Plain data holder for an author entry.</summary>
class Autor
{
    /// <summary>Signature text ("assinatura"); null until assigned.</summary>
    public string Assinatura { get; set; }

    /// <summary>Position or office text ("cargo"); null until assigned.</summary>
    public string Cargo { get; set; }
}
12 | }
13 |
--------------------------------------------------------------------------------
/src/PdfTextReader/Parser/Content.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 | using PdfTextReader.Base;
5 |
6 | namespace PdfTextReader.Parser
7 | {
/// <summary>
/// A <c>TextStructure</c> tagged with the kind of content it represents.
/// </summary>
class Content : TextStructure
{
    /// <summary>Kind of content carried by this structure.</summary>
    public TipoDoConteudo ContentType { get; set; }

    /// <summary>Creates an empty instance.</summary>
    public Content()
    {
    }

    /// <summary>
    /// Copies font name, size, style, text and alignment from
    /// <paramref name="structure"/> and tags the result with <paramref name="type"/>.
    /// Other members of <paramref name="structure"/> are not copied.
    /// </summary>
    public Content(TextStructure structure, TipoDoConteudo type)
    {
        FontName = structure.FontName;
        FontSize = structure.FontSize;
        FontStyle = structure.FontStyle;
        Text = structure.Text;
        TextAlignment = structure.TextAlignment;
        ContentType = type;
    }
}
24 | }
25 |
--------------------------------------------------------------------------------
/src/PdfTextReader/Parser/Conteudo.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 | using PdfTextReader.Base;
5 |
6 | namespace PdfTextReader.Parser
7 | {
/// <summary>
/// Plain data holder for the content of one article. Property order is kept as
/// declared.
/// </summary>
class Conteudo
{
    // For internal use
    public int IntenalId { get; set; }
    public int Page { get; set; }
    public string PID { get; set; }

    // Textual parts of the content
    public string Hierarquia { get; set; }
    public string Titulo { get; set; }
    public string Corpo { get; set; }
    public List Autor { get; set; }
    public string Caput { get; set; }
    public string Grade { get; set; }
    public string Data { get; set; }
    public string Setor { get; set; }
    public string Departamento { get; set; }

    public string[] HierarquiaTitulo { get; set; }
    public string Texto { get; set; }

    // Temporary: kept here for the time being
    public List Anexos { get; set; }

    /// <summary>Returns <see cref="Titulo"/> (which may be null).</summary>
    public override string ToString()
    {
        return Titulo;
    }
}
33 | }
34 |
--------------------------------------------------------------------------------
/src/PdfTextReader/Parser/HifenUtil.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 | using System.Text.RegularExpressions;
5 |
6 | namespace PdfTextReader.Parser
7 | {
/// <summary>
/// Helper that rejoins words split by a hyphen at the end of a line.
/// </summary>
class HifenUtil
{
    // Group 1: optional "-m", "-s" or "-t" immediately before the split.
    // Group 2: the single character right before the "-\n".
    // Group 3: the run of non-space characters after the "-\n", plus one optional
    //          trailing space (group 4).
    static readonly Regex _pattern = new Regex(@"(-[mst])?(.)-\n([^ ]+( )?)");

    /// <summary>
    /// Replaces every "-\n" break matched by the pattern. The hyphen is kept when
    /// group 1 matched or when the character before or after the break is an ASCII
    /// digit; otherwise it is dropped. Each rewritten match has its trailing
    /// whitespace trimmed and ends with a single "\n".
    /// </summary>
    /// <param name="texto">Text to process; must not be null.</param>
    /// <returns>The text with the matched breaks rewritten.</returns>
    public static string ExtrairHifen(string texto)
    {
        return _pattern.Replace(texto, match =>
        {
            GroupCollection groups = match.Groups;

            bool isMesoclise = groups[1].Success;
            char charAntes = groups[2].Value[0];
            char charDepois = groups[3].Value[0];

            bool keep = isMesoclise || IsNumber(charAntes) || IsNumber(charDepois);
            string keepHifen = keep ? "-" : "";

            // Every "-\n" inside the match is rewritten the same way.
            return match.Value.Replace("-\n", keepHifen).TrimEnd() + "\n";
        });
    }

    /// <summary>True for the ASCII digits '0' through '9'.</summary>
    static bool IsNumber(char ch)
    {
        return ch >= '0' && ch <= '9';
    }
}
48 | }
49 |
--------------------------------------------------------------------------------
/src/PdfTextReader/Parser/InjectFilename.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.PDFCore;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 |
6 | namespace PdfTextReader.Parser
7 | {
/// <summary>
/// Plain data holder that carries a file name together with page info statistics.
/// </summary>
class InjectFilename
{
    /// <summary>File name; null until assigned.</summary>
    public string Filename { get; set; }

    /// <summary>Page info statistics associated with the file; null until assigned.</summary>
    public PageInfoStats InfoStats { get; set; }
}
13 | }
14 |
--------------------------------------------------------------------------------
/src/PdfTextReader/Parser/Metadados.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 |
5 | namespace PdfTextReader.Parser
6 | {
/// <summary>
/// Plain data holder for article metadata. Property order is kept as declared.
/// </summary>
class Metadados
{
    // Identification of the article and of the publication it belongs to
    public string Nome { get; set; }
    public string SecaoDoDiario { get; set; }
    public string DataPublicacao { get; set; }
    public string TipoDoArtigo { get; set; }
    public string Grade { get; set; }

    // Location of the article
    public int NumeroDaPagina { get; set; }
    public string PdfLink { get; set; }

    // Identifiers and title
    public string IdMateria { get; set; }
    public string NumeroDaEdicao { get; set; }
    public string NumeroDoJornal { get; set; }
    public string Titulo { get; set; }
}
22 | }
23 |
--------------------------------------------------------------------------------
/src/PdfTextReader/Parser/ProcessParserJson.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Linq;
5 | using System.Text;
6 | using PdfTextReader.Base;
7 | using Newtonsoft.Json;
8 |
9 | namespace PdfTextReader.Parser
10 | {
/// <summary>
/// Writes <c>Artigo</c> instances as indented JSON files through <c>VirtualFS</c>.
/// </summary>
class ProcessParserJson
{
    /// <summary>
    /// Serializes <paramref name="artigo"/> as indented JSON and writes it to
    /// "<paramref name="doc"/>.json" opened with <c>VirtualFS.OpenWrite</c>.
    /// </summary>
    public void Write(Artigo artigo, string doc)
    {
        // TODO: fix it
        // The name derived from the article was rolled back; the caller's name is used as-is.
        string targetName = $"{doc}.json";

        var settings = new JsonSerializerSettings() { Formatting = Formatting.Indented };

        using (Stream virtualStream = VirtualFS.OpenWrite(targetName))
        {
            string json = JsonConvert.SerializeObject(artigo, settings);

            using (var writer = new StreamWriter(virtualStream))
            {
                writer.Write(json);
            }
        }
    }

    /// <summary>
    /// Writes each article to its own file, named <paramref name="doc"/> followed by
    /// a 1-based counter.
    /// </summary>
    public void WriteJson(IEnumerable artigos, string doc)
    {
        int counter = 0;

        foreach (var artigo in artigos)
        {
            counter++;
            Write(artigo, doc + counter);
        }
    }
}
42 | }
43 |
--------------------------------------------------------------------------------
/src/PdfTextReader/Parser/TipoDoConteudo.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 | using PdfTextReader.Base;
5 |
6 | namespace PdfTextReader.Parser
7 | {
/// <summary>
/// Kinds of content a text structure can be tagged with. The numeric values are
/// the ones the declaration order already implied.
/// </summary>
public enum TipoDoConteudo
{
    Título = 0,
    Grade = 1,
    Corpo = 2,
    Assinatura = 3,
    Data = 4,
    Caput = 5,
    Cargo = 6,
    Setor = 7,
    Departamento = 8
}
20 | }
21 |
--------------------------------------------------------------------------------
/src/PdfTextReader/Parser/TransformExemplo.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.TextStructures;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Linq;
5 | using System.Text;
6 | using PdfTextReader.Base;
7 |
8 | namespace PdfTextReader.Parser
9 | {
10 | //
11 | //
12 | // TODO: rewrite using IFilterStructure
13 | //
14 | //
/// <summary>
/// Example transformation: passes each structure through on its own and drops the
/// ones whose text contains "password".
/// </summary>
class TransformExemplo : IAggregateStructure
{
    /// <summary>Always false: multiple lines are never aggregated.</summary>
    public bool Aggregate(TextStructure line) => false;

    /// <summary>
    /// Returns the single element of <paramref name="textStructureList"/>, or null
    /// when its text contains "password".
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The list does not hold exactly one element.
    /// </exception>
    public TextStructure Create(List textStructureList)
    {
        if (textStructureList.Count != 1)
        {
            throw new InvalidOperationException("impossible");
        }

        var single = textStructureList[0];

        // filter password out
        return single.Text.Contains("password") ? null : single;
    }

    /// <summary>No initialization is required.</summary>
    public void Init(TextStructure line)
    {
    }
}
41 | }
42 |
--------------------------------------------------------------------------------
/src/PdfTextReader/ParserStages/StageConvertContent.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using PdfTextReader.Parser;
3 | using PdfTextReader.TextStructures;
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Text;
7 |
8 | namespace PdfTextReader.ParserStages
9 | {
/// <summary>
/// Stage that runs <c>ConvertText()</c> on the context's text pipeline and stores
/// the result back in the context.
/// </summary>
class StageConvertContent
{
    private readonly string _input;
    private readonly string _output;
    private readonly StageContext _context;

    /// <summary>Captures the context together with its input and output folders.</summary>
    public StageConvertContent(StageContext context)
    {
        _input = context.InputFolder;
        _output = context.OutputFolder;
        _context = context;
    }

    /// <summary>Replaces the context's text pipeline with its converted version.</summary>
    public void Process()
    {
        var converted = _context.GetPipelineText().ConvertText();

        _context.SetPipelineText(converted);
    }
}
33 | }
34 |
--------------------------------------------------------------------------------
/src/PdfTextReader/ParserStages/StageConvertStructText.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using PdfTextReader.Parser;
3 | using PdfTextReader.TextStructures;
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Text;
7 |
8 | namespace PdfTextReader.ParserStages
9 | {
/// <summary>
/// Stage that runs <c>ConvertText(true)</c> on the context's text pipeline, logs
/// it to "&lt;OutputFilePrefix&gt;-text-version.txt" and stores the result back in
/// the context.
/// </summary>
class StageConvertStructText
{
    private readonly string _input;
    private readonly string _output;
    private readonly StageContext _context;

    /// <summary>Captures the context together with its input and output folders.</summary>
    public StageConvertStructText(StageContext context)
    {
        _input = context.InputFolder;
        _output = context.OutputFolder;
        _context = context;
    }

    /// <summary>Converts, logs and stores the text pipeline.</summary>
    public void Process()
    {
        var source = _context.GetPipelineText();

        var converted = source.ConvertText(true);
        var logged = converted.Log($"{_context.OutputFilePrefix}-text-version.txt");

        _context.SetPipelineText(logged);
    }
}
34 | }
35 |
--------------------------------------------------------------------------------
/src/PdfTextReader/ParserStages/StageDbgFlow.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Execution;
2 | using PdfTextReader.PDFCore;
3 | using PdfTextReader.PDFText;
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Drawing;
7 | using System.Text;
8 |
9 | namespace PdfTextReader.ParserStages
10 | {
/// <summary>
/// Debug stage that reads "&lt;InputFilePrefix&gt;.pdf", writes
/// "&lt;OutputFilePrefix&gt;-dbg0-flow.pdf" and runs <see cref="Flow"/> through
/// <c>StageProcess</c>.
/// </summary>
class StageDbgFlow
{
    private readonly string _input;
    private readonly string _output;
    private readonly StageContext _context;

    /// <summary>Captures the context together with its input and output folders.</summary>
    public StageDbgFlow(StageContext context)
    {
        _input = context.InputFolder;
        _output = context.OutputFolder;
        _context = context;
    }

    /// <summary>Configures input and output files and runs the debug flow.</summary>
    public void Process()
    {
        string basename = _context.Basename;
        Pipeline pipeline = _context.GetPipeline();

        var source = pipeline.Input($"{_context.InputFilePrefix}.pdf");
        var target = source.Output($"{_context.OutputFilePrefix}-dbg0-flow.pdf");

        target.StageProcess(Flow);
    }

    /// <summary>
    /// Parses the page three times, showing each result in a different color
    /// (blue, orange, yellow), and finally shows lines in black.
    /// </summary>
    void Flow(PipelineInputPdf.PipelineInputPdfPage page)
    {
        page
            .ParsePdf()
                .Show(Color.Blue)
            .ParsePdf()
                .Show(Color.Orange)
            .ParsePdf()
                .Show(Color.Yellow)
                .ShowLine(Color.Black);
    }
}
45 | }
46 |
--------------------------------------------------------------------------------
/src/PdfTextReader/ParserStages/StageExtractHeaderDOU.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Execution;
2 | using PdfTextReader.Parser;
3 | using PdfTextReader.PDFCore;
4 | using PdfTextReader.PDFText;
5 | using System;
6 | using System.Collections.Generic;
7 | using System.Drawing;
8 | using System.Text;
9 |
10 | namespace PdfTextReader.ParserStages
11 | {
/// <summary>
/// Stage that parses page 1 of "&lt;InputFilePrefix&gt;.pdf", publishes the
/// extracted info statistics through a global instance and writes them to
/// "&lt;OutputFilePrefix&gt;-header.txt".
/// </summary>
class StageExtractHeaderDOU
{
    private readonly string _input;
    private readonly string _output;
    private readonly StageContext _context;

    /// <summary>Captures the context together with its input and output folders.</summary>
    public StageExtractHeaderDOU(StageContext context)
    {
        _input = context.InputFolder;
        _output = context.OutputFolder;
        _context = context;
    }

    /// <summary>Extracts the header information of the first page.</summary>
    public void Process()
    {
        string basename = _context.Basename;
        Pipeline pipeline = _context.GetPipeline();

        var firstPage = pipeline
            .Input($"{_context.InputFilePrefix}.pdf")
            .Page(1)
            .ParsePdf()
            .ParseBlock();

        var infoStats = firstPage.CreateInstance().InfoStats;
        string content = infoStats.ToString();

        // Share the file name and the statistics with later consumers of the context.
        var filename = _context.CreateGlobalInstance();
        filename.Filename = _context.Basename;
        filename.InfoStats = infoStats;

        _context.WriteFile("header", $"{_context.OutputFilePrefix}-header.txt", content);
    }
}
46 | }
47 |
--------------------------------------------------------------------------------
/src/PdfTextReader/PdfTextReader.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | netstandard2.0
5 | Debug;Release;CORE
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/src/PdfTextReader/Program.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.IO;
3 | using System.Diagnostics;
4 |
5 | namespace PdfTextReader
6 | {
/// <summary>Console entry point.</summary>
public class Program
{
    /// <summary>
    /// With the arguments "extract &lt;value&gt;" runs
    /// <c>ExampleStages.ExtractHeader</c> on the value and exits. Otherwise runs a
    /// fixed stage, prints the elapsed milliseconds and waits for a key press.
    /// </summary>
    public static void Main(string[] args)
    {
        bool extractRequested = args.Length == 2 && args[0] == "extract";
        if (extractRequested)
        {
            ExampleStages.ExtractHeader(args[1]);
            return;
        }

        Console.WriteLine("PDF Text Reader");

        Stopwatch watch = Stopwatch.StartNew();
        Program3.ProcessStage("2010_04_19_p_anvisa", 1);
        watch.Stop();

        long elapsedMs = watch.ElapsedMilliseconds;
        Console.WriteLine($"Elapsed time was: {elapsedMs}");

        Console.ReadKey();
    }
}
30 | }
31 |
--------------------------------------------------------------------------------
/src/PdfTextReader/TextStructures/AfterFilterTextSegments.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.TextStructures;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Linq;
5 | using System.Text;
6 | using PdfTextReader.Base;
7 |
8 | namespace PdfTextReader.Parser
9 | {
/// <summary>
/// Aggregator that merges title-less segments into the segment that precedes them.
/// </summary>
class AfterFilterTextSegments : IAggregateStructure
{
    /// <summary>True when the segment has no title entries.</summary>
    public bool Aggregate(TextSegment line) => line.Title.Length == 0;

    /// <summary>
    /// Builds a new segment with the first element's title and the concatenated
    /// bodies of all elements. A failure is reported through
    /// <c>PdfReaderException.Throw</c> when any element after the first has a title.
    /// </summary>
    public TextSegment Create(List _structures)
    {
        if (_structures.Count == 1)
        {
            var only = _structures[0];

            return new TextSegment()
            {
                Title = only.Title,
                Body = only.Body
            };
        }

        var title = _structures[0].Title;
        var body = _structures.SelectMany(s => s.Body).ToArray();

        // Only the first element is expected to carry a title.
        int additionalTitles = _structures.Skip(1).Count(s => s.Title.Length > 0);
        if (additionalTitles > 0)
        {
            PdfReaderException.Throw("s.Title.Length > 0");
        }

        return new TextSegment()
        {
            Title = title,
            Body = body
        };
    }

    /// <summary>No initialization is required.</summary>
    public void Init(TextSegment line)
    {
    }
}
46 | }
47 |
--------------------------------------------------------------------------------
/src/PdfTextReader/TextStructures/AnalyzeLines.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 | using System.IO;
6 |
7 | namespace PdfTextReader.TextStructures
8 | {
/// <summary>
/// Logger that writes margins, text, font information and the spacing before and
/// after each line.
/// </summary>
class AnalyzeLines : ILogStructure
{
    /// <summary>Nothing is written at the start of the log.</summary>
    public void StartLog(TextWriter input)
    {
    }

    /// <summary>Writes one separator-delimited entry describing <paramref name="line"/>.</summary>
    /// <param name="input">Writer that receives the entry.</param>
    /// <param name="line">Line to describe.</param>
    public void Log(TextWriter input, TextLine line)
    {
        input.WriteLine("-----------------------------------");

        float? afterSpace = line.AfterSpace;
        float? beforeSpace = line.BeforeSpace;

        input.WriteLine($"Margins: (LEFT: {line.MarginLeft}, RIGHT: {line.MarginRight})");

        input.Write($"TEXT: {line.Text}");
        input.WriteLine($" ({line.FontName}, {line.FontSize.ToString("0.00")}, {line.FontStyle})");
        input.WriteLine($" (AfterSpace: {afterSpace})");
        // Previously this entry printed afterSpace a second time.
        input.WriteLine($" (BeforeSpace: {beforeSpace})");
        input.WriteLine();

        input.WriteLine("");
    }

    /// <summary>Nothing is written at the end of the log.</summary>
    public void EndLog(TextWriter input)
    {
    }
}
37 | }
38 |
--------------------------------------------------------------------------------
/src/PdfTextReader/TextStructures/AnalyzeLinesCenterRight.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 | using System.IO;
6 |
7 | namespace PdfTextReader.TextStructures
8 | {
/// <summary>
/// Logger that describes only the lines whose <c>CenteredAt</c> value lies strictly
/// between 28 and 29.
/// </summary>
class AnalyzeLinesCenterRight : ILogStructure
{
    /// <summary>Nothing is written at the start of the log.</summary>
    public void StartLog(TextWriter input)
    {
    }

    /// <summary>
    /// Writes margins, text, font information and spacing for
    /// <paramref name="line"/> when its center falls in the (28, 29) range;
    /// otherwise writes nothing.
    /// </summary>
    /// <param name="input">Writer that receives the entry.</param>
    /// <param name="line">Line to describe.</param>
    public void Log(TextWriter input, TextLine line)
    {
        float? afterSpace = line.AfterSpace;
        float? beforeSpace = line.BeforeSpace;

        float? lineCenter = (float)line.CenteredAt;

        if (lineCenter < 29 && lineCenter > 28)
        {
            input.WriteLine("-----------------------------------");
            input.WriteLine($"Margins: (LEFT: {line.MarginLeft}, RIGHT: {line.MarginRight}, CENTER: {line.CenteredAt})");

            input.Write($"TEXT: {line.Text}");
            input.WriteLine($" ({line.FontName}, {line.FontSize.ToString("0.00")}, {line.FontStyle})");
            input.WriteLine($" (AfterSpace: {afterSpace})");
            // Previously this entry printed afterSpace a second time.
            input.WriteLine($" (BeforeSpace: {beforeSpace})");
            input.WriteLine();

            input.WriteLine("");
        }
    }

    /// <summary>Nothing is written at the end of the log.</summary>
    public void EndLog(TextWriter input)
    {
    }
}
47 | }
48 |
--------------------------------------------------------------------------------
/src/PdfTextReader/TextStructures/AnalyzePageInfo.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 | using System.IO;
6 |
7 | namespace PdfTextReader.TextStructures
8 | {
/// <summary>
/// Logger that writes, for each instance, the page reported by the index together
/// with a running identifier and the instance's single-line text.
/// </summary>
class AnalyzePageInfo : ILogStructure2
{
    ITransformIndexTree _index;
    int _structureId = 0;

    /// <summary>Stores the index used to look up the page of each instance.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="index"/> is null.</exception>
    public void Init(ITransformIndexTree index)
    {
        if (index == null)
        {
            // Name the offending parameter so the failure is self-explanatory.
            throw new ArgumentNullException(nameof(index));
        }

        _index = index;
    }

    /// <summary>
    /// Writes "Page {page}: {id} [{text}]" where the text is
    /// <c>instance.ToString()</c> with newlines replaced by spaces, then advances
    /// the running identifier.
    /// </summary>
    public void Log(TextWriter input, T instance)
    {
        int page = _index.FindPageStart(instance);

        input.WriteLine($"Page {page}: {_structureId} [{instance.ToString().Replace("\n", " ")}]");

        _structureId++;
    }

    /// <summary>Nothing is written at the start of the log.</summary>
    public void StartLog(TextWriter input)
    {
    }

    /// <summary>Nothing is written at the end of the log.</summary>
    public void EndLog(TextWriter input)
    {
    }
}
40 | }
41 |
--------------------------------------------------------------------------------
/src/PdfTextReader/TextStructures/AnalyzeSegmentTextVersion.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.IO;
5 | using System.Linq;
6 | using System.Text;
7 |
8 | namespace PdfTextReader.TextStructures
9 | {
/// <summary>
/// Logger that writes a plain-text version of each segment: title text, a blank
/// line, body text and two blank lines.
/// </summary>
class AnalyzeSegmentTextVersion : ILogStructure
{
    /// <summary>Nothing is written at the start of the log.</summary>
    public void StartLog(TextWriter input)
    {
    }

    /// <summary>Writes the title and body text of <paramref name="data"/>.</summary>
    public void Log(TextWriter input, TextSegment data)
    {
        // Title block followed by one blank line.
        input.WriteLine(data.TitleText);
        input.WriteLine();

        // Body block followed by two blank lines.
        input.WriteLine(data.BodyText);
        input.WriteLine();
        input.WriteLine();
    }

    /// <summary>Nothing is written at the end of the log.</summary>
    public void EndLog(TextWriter input)
    {
    }
}
29 | }
30 |
--------------------------------------------------------------------------------
/src/PdfTextReader/TextStructures/AnalyzeSegmentTitles.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 | using System.IO;
6 |
7 | namespace PdfTextReader.TextStructures
8 | {
/// <summary>
/// Logger that writes text, font information and trailing space of every title
/// entry of a segment.
/// </summary>
class AnalyzeSegmentTitles : ILogStructure
{
    /// <summary>Nothing is written at the start of the log.</summary>
    public void StartLog(TextWriter input)
    {
    }

    /// <summary>Writes a separator followed by one entry per title of <paramref name="segment"/>.</summary>
    public void Log(TextWriter input, TextSegment segment)
    {
        input.WriteLine("-----------------------------------");

        foreach (var entry in segment.Title)
        {
            float? spaceAfter = entry.AfterSpace;
            string fontInfo = $" ({entry.FontName}, {entry.FontSize.ToString("0.00")}, {entry.FontStyle})";

            input.Write(entry.Text);
            input.WriteLine(fontInfo);
            input.WriteLine($" ({spaceAfter})");
            input.WriteLine();
        }

        input.WriteLine("");
    }

    /// <summary>Nothing is written at the end of the log.</summary>
    public void EndLog(TextWriter input)
    {
    }
}
36 | }
37 |
--------------------------------------------------------------------------------
/src/PdfTextReader/TextStructures/AnalyzeSegments.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.IO;
5 | using System.Text;
6 |
7 | namespace PdfTextReader.TextStructures
8 | {
/// <summary>
/// Logger that writes the title and body counts of a segment and the alignment of
/// each body entry.
/// </summary>
class AnalyzeSegments : ILogStructure
{
    /// <summary>Nothing is written at the start of the log.</summary>
    public void StartLog(TextWriter input)
    {
    }

    /// <summary>Writes the counts and per-entry alignments of <paramref name="data"/>.</summary>
    public void Log(TextWriter input, TextSegment data)
    {
        int titleCount = data.Title.Length;
        int bodyCount = data.Body.Length;

        input.WriteLine("-----------------------------------");

        input.WriteLine($"Title Count: {titleCount}");
        input.WriteLine($"Body Count: {bodyCount}");
        input.WriteLine();

        input.WriteLine($"Body Alignments: ");
        foreach (var entry in data.Body)
        {
            input.WriteLine($"Text: {entry.Text} ==>> {entry.TextAlignment}");
        }

        input.WriteLine("");
    }

    /// <summary>Nothing is written at the end of the log.</summary>
    public void EndLog(TextWriter input)
    {
    }
}
36 | }
37 |
--------------------------------------------------------------------------------
/src/PdfTextReader/TextStructures/AnalyzeSegments2.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.IO;
5 | using System.Linq;
6 | using System.Text;
7 |
8 | namespace PdfTextReader.TextStructures
9 | {
/// <summary>
/// Logger that writes each segment as comma-separated rows: a header row, the text
/// of the last title entry (when there is one) and one row per body entry.
/// </summary>
class AnalyzeSegments2 : ILogStructure
{
    /// <summary>Nothing is written at the start of the log.</summary>
    public void StartLog(TextWriter input)
    {
    }

    /// <summary>Writes the rows for <paramref name="data"/> followed by separator rows.</summary>
    public void Log(TextWriter input, TextSegment data)
    {
        input.WriteLine("Text,FontName,FontSize,FontStyle,MarginLeft,MarginRight,TextAlignment,AfterSpace");

        bool hasTitle = data.Title.Length > 0;
        if (hasTitle)
        {
            input.WriteLine(data.Title.Last().Text);
        }

        foreach (var row in data.Body)
        {
            // Commas inside the text would break the column layout.
            string safeText = row.Text.Replace(",", ";");

            input.WriteLine($"{safeText},{row.FontName},{row.FontSize},{row.FontStyle},{row.MarginLeft},{row.MarginRight},{row.TextAlignment},{row.AfterSpace}");
        }

        input.WriteLine("");
        input.WriteLine("--,--,--,--,--,--,--,--,");
        input.WriteLine("--,--,--,--,--,--,--,--,");
        input.WriteLine("");
    }

    /// <summary>Nothing is written at the end of the log.</summary>
    public void EndLog(TextWriter input)
    {
    }
}
38 | }
39 |
--------------------------------------------------------------------------------
/src/PdfTextReader/TextStructures/AnalyzeStructures.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 | using System.IO;
6 |
7 | namespace PdfTextReader.TextStructures
8 | {
/// <summary>
/// Logger that writes alignment, text, font information, back-color flag and
/// trailing space of each structure.
/// </summary>
class AnalyzeStructures : ILogStructure
{
    /// <summary>Nothing is written at the start of the log.</summary>
    public void StartLog(TextWriter input)
    {
    }

    /// <summary>Writes one separator-delimited entry describing <paramref name="structure"/>.</summary>
    public void Log(TextWriter input, TextStructure structure)
    {
        input.WriteLine("-----------------------------------");

        float? spaceAfter = structure.AfterSpace;

        input.WriteLine($"Aligment: {structure.TextAlignment}");

        string fontInfo = $" ({structure.FontName}, {structure.FontSize.ToString("0.00")}, {structure.FontStyle} - {structure.HasBackColor})";

        input.Write(structure.Text);
        input.WriteLine(fontInfo);
        input.WriteLine($" ({spaceAfter})");
        input.WriteLine();

        input.WriteLine("");
    }

    /// <summary>Nothing is written at the end of the log.</summary>
    public void EndLog(TextWriter input)
    {
    }
}
35 | }
36 |
--------------------------------------------------------------------------------
/src/PdfTextReader/TextStructures/CreateContent.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using PdfTextReader.Execution;
3 | using PdfTextReader.Parser;
4 | using PdfTextReader.PDFCore;
5 | using System;
6 | using System.Collections.Generic;
7 | using System.Text;
8 |
9 | namespace PdfTextReader.TextStructures
10 | {
/// <summary>
/// Pass-through aggregator: never groups segments and returns each one unchanged.
/// The constructor arguments are accepted but not used.
/// </summary>
class CreateContent : IAggregateStructure
{
    /// <summary>Accepts the statistics objects; none of them is stored.</summary>
    public CreateContent(BasicFirstPageStats basicFirstPageStats, PipelinePageStats teste, PipelineDocumentStats docstats)
    {
    }

    /// <summary>Always false: segments are never aggregated.</summary>
    public bool Aggregate(TextSegment line) => false;

    /// <summary>Returns the first element of <paramref name="input"/>.</summary>
    public TextSegment Create(List input) => input[0];

    /// <summary>No initialization is required.</summary>
    public void Init(TextSegment line)
    {
    }
}
31 | }
32 |
--------------------------------------------------------------------------------
/src/PdfTextReader/TextStructures/CreateTextLineIndex.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.TextStructures;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Linq;
5 | using System.Text;
6 | using PdfTextReader.Base;
7 |
8 | namespace PdfTextReader.Parser
9 | {
/// <summary>
/// Pass-through aggregator for <c>TextLine</c>. It does no work of its own;
/// running it indirectly creates an index for <c>TextLine</c>.
/// </summary>
class CreateTextLineIndex : IAggregateStructure
{
    /// <summary>Always false: lines are never aggregated.</summary>
    public bool Aggregate(TextLine line) => false;

    /// <summary>Returns the first element of <paramref name="lines"/>.</summary>
    public TextLine Create(List lines) => lines[0];

    /// <summary>No initialization is required.</summary>
    public void Init(TextLine line)
    {
    }
}
28 | }
29 |
--------------------------------------------------------------------------------
/src/PdfTextReader/TextStructures/GenerateArtigoGN4.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using PdfTextReader.Parser;
3 | using System;
4 | using System.Collections.Generic;
5 | using System.IO;
6 | using System.Text;
7 |
8 | namespace PdfTextReader.TextStructures
9 | {
/// <summary>
/// Writes each <c>Artigo</c> by first rendering it with
/// <c>ProcessParser2.XMLWriter</c> and then converting that text with
/// <c>Converter2GN.Convert</c>.
/// </summary>
class GenerateArtigoGN4 : ILogMultipleStructure
{
    ProcessParser2 _procParser = new ProcessParser2();
    Converter2GN _convert = new Converter2GN();
    private InjectFilename _filename;

    /// <summary>Stores the file name information used when converting articles.</summary>
    public GenerateArtigoGN4(InjectFilename filename)
    {
        this._filename = filename;
    }

    /// <summary>Uses the content's <c>PID</c> as the identifier of the article.</summary>
    public string CreateId(Artigo data)
    {
        return data.Conteudo.PID;
    }

    /// <summary>
    /// Renders <paramref name="data"/> into an in-memory buffer, converts the
    /// rendered text and writes the result to <paramref name="input"/>, which is
    /// closed when the writer is disposed.
    /// </summary>
    /// <param name="id">Identifier passed through to the converter.</param>
    /// <param name="input">Destination stream.</param>
    /// <param name="data">Article to write.</param>
    public void Log(string id, Stream input, Artigo data)
    {
        string article;

        // Both the buffer and the reader are released once the text is extracted.
        using (var memstream = new MemoryStream())
        {
            _procParser.XMLWriter(data, memstream);
            memstream.Seek(0, SeekOrigin.Begin);

            using (var reader = new StreamReader(memstream))
            {
                article = reader.ReadToEnd();
            }
        }

        // _filename is dereferenced directly here, so no null-conditional is
        // needed on it below; only the nested members may be missing.
        string pdfname = _filename.Filename;
        string edition = _filename.InfoStats?.Header?.JornalEdicao ?? "";
        string result = _convert.Convert(pdfname, id, article, edition);

        using (var writer = new StreamWriter(input))
        {
            writer.Write(result);
        }
    }
}
47 | }
48 |
--------------------------------------------------------------------------------
/src/PdfTextReader/TextStructures/GenerateArtigoTmp.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using PdfTextReader.Parser;
3 | using System;
4 | using System.Collections.Generic;
5 | using System.IO;
6 | using System.Text;
7 |
8 | namespace PdfTextReader.TextStructures
9 | {
/// <summary>
/// Writes each <see cref="Artigo"/> as XML straight to its log stream,
/// numbering the outputs sequentially.
/// </summary>
/// <remarks>
/// NOTE(review): the interface type argument is inferred from the Artigo
/// parameters below — confirm against the ILogMultipleStructure declaration.
/// </remarks>
class GenerateArtigoTmp : ILogMultipleStructure<Artigo>
{
    int _id = 0;
    readonly ProcessParser2 _procParser = new ProcessParser2();

    /// <summary>Returns the current counter as text, then increments it (first id is "0").</summary>
    public string CreateId(Artigo data) => (_id++).ToString();

    /// <summary>Delegates to <c>ProcessParser2.XMLWriter</c> to write <paramref name="data"/> to <paramref name="input"/>.</summary>
    public void Log(string id, Stream input, Artigo data) => _procParser.XMLWriter(data, input);
}
25 | }
26 |
--------------------------------------------------------------------------------
/src/PdfTextReader/TextStructures/ShowStructureCentral.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Drawing;
5 | using System.Text;
6 |
7 | namespace PdfTextReader.TextStructures
8 | {
/// <summary>
/// Debug overlay that highlights centred structures in red and
/// right-aligned structures in blue.
/// </summary>
/// <remarks>
/// NOTE(review): the interface type argument is inferred from the LogPdf
/// parameter type — confirm against the ILogStructurePdf declaration.
/// </remarks>
class ShowStructureCentral : ILogStructurePdf<TextStructure>
{
    /// <summary>No set-up is required.</summary>
    public void StartLogPdf(IPipelineDebug pipeline)
    {
    }

    /// <summary>
    /// Draws the lines of <paramref name="data"/> when its alignment is
    /// CENTER (red) or RIGHT (blue); any other alignment draws nothing.
    /// </summary>
    public void LogPdf(IPipelineDebug pipeline, TextStructure data)
    {
        var alignment = data.TextAlignment;

        if (alignment == TextAlignment.CENTER)
        {
            pipeline.ShowLine(data.Lines, Color.Red);
        }
        else if (alignment == TextAlignment.RIGHT)
        {
            pipeline.ShowLine(data.Lines, Color.Blue);
        }
    }

    /// <summary>No tear-down is required.</summary>
    public void EndLogPdf(IPipelineDebug pipeline)
    {
    }
}
32 | }
33 |
--------------------------------------------------------------------------------
/src/PdfTextReader/TextStructures/ShowTitleSegment.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Base;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Drawing;
5 | using System.Text;
6 |
7 | namespace PdfTextReader.TextStructures
8 | {
9 | class ShowTitleSegment : ILogStructurePdf
10 | {
11 | int _id = 0;
12 |
13 | public void EndLogPdf(IPipelineDebug pipeline)
14 | {
15 | }
16 |
17 | public void LogPdf(IPipelineDebug pipeline, TextSegment data)
18 | {
19 | var titles = data.OriginalTitle;
20 |
21 | if (titles.Length == 0)
22 | return;
23 |
24 | for(int i=0; i
8 | {
9 | public int Id { get; set; }
10 | public TO Key { get; set; }
11 | public TI Start { get; set; }
12 | public TI End { get; set; }
13 | public List Items { get; set; }
14 | }
/// <summary>
/// Index entry that associates a key with a start id and an end id.
/// </summary>
/// <typeparam name="T">Type of the key stored in the entry.</typeparam>
class TransformIndexEntry2<T>
{
    /// <summary>Identifier of this entry.</summary>
    public int Id { get; set; }

    /// <summary>Key the entry is stored under.</summary>
    public T Key { get; set; }

    /// <summary>Id of the first item covered by the entry.</summary>
    public int StartId { get; set; }

    /// <summary>Id of the last item covered by the entry.</summary>
    public int EndId { get; set; }
}
22 | }
23 |
--------------------------------------------------------------------------------
/src/PdfTextReader/VirtualFS.Static.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Diagnostics;
4 | using System.IO;
5 | using System.Linq;
6 | using System.Text;
7 |
8 | namespace PdfTextReader.Base
9 | {
/// <summary>
/// Static entry points that forward file access to the currently
/// configured <see cref="IVirtualFS"/> instance.
/// </summary>
partial class VirtualFS : IVirtualFS
{
    /// <summary>Opens <paramref name="filename"/> for reading through the configured file system.</summary>
    public static Stream OpenRead(string filename)
    {
        return g_vfs.OpenReader(filename);
    }

    /// <summary>Opens <paramref name="filename"/> for writing through the configured file system.</summary>
    public static Stream OpenWrite(string filename)
    {
        return g_vfs.OpenWriter(filename);
    }

    /// <summary>Replaces the file system used by all static helpers of this class.</summary>
    /// <exception cref="ArgumentNullException">When <paramref name="virtualFS"/> is null.</exception>
    [DebuggerHidden]
    public static void ConfigureFileSystem(IVirtualFS virtualFS)
    {
        if (virtualFS == null)
        {
            // Report the offending parameter, not the name of its type.
            throw new ArgumentNullException(nameof(virtualFS));
        }

        g_vfs = virtualFS;
    }

    // iText.Kernel.Pdf

    /// <summary>Wraps the read stream for <paramref name="filename"/> in an iText PdfReader.</summary>
    public static iText.Kernel.Pdf.PdfReader OpenPdfReader(string filename)
    {
        return new iText.Kernel.Pdf.PdfReader(OpenRead(filename));
    }

    /// <summary>Wraps the write stream for <paramref name="filename"/> in an iText PdfWriter.</summary>
    public static iText.Kernel.Pdf.PdfWriter OpenPdfWriter(string filename)
    {
        return new iText.Kernel.Pdf.PdfWriter(OpenWrite(filename));
    }

    /// <summary>Wraps the write stream for <paramref name="filename"/> in a StreamWriter.</summary>
    public static StreamWriter OpenStreamWriter(string filename)
    {
        return new StreamWriter(OpenWrite(filename));
    }
}
47 | }
48 |
--------------------------------------------------------------------------------
/src/PdfTextReader/VirtualFS.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.IO;
5 | using System.Linq;
6 | using System.Text;
7 |
8 | namespace PdfTextReader.Base
9 | {
/// <summary>
/// Default <see cref="IVirtualFS"/> implementation backed by the local disk.
/// </summary>
partial class VirtualFS : IVirtualFS
{
    // File system used by the static helpers until ConfigureFileSystem replaces it.
    static IVirtualFS g_vfs = new VirtualFS();

    /// <summary>Opens an existing file for reading and traces the access to the debug output.</summary>
    public Stream OpenReader(string filename)
    {
        System.Diagnostics.Debug.WriteLine($"READ: {filename}");
        return new FileStream(filename, FileMode.Open, FileAccess.Read);
    }

    /// <summary>
    /// Creates (or truncates) <paramref name="filename"/> for writing,
    /// creating its folder first when it does not exist yet.
    /// </summary>
    public Stream OpenWriter(string filename)
    {
        System.Diagnostics.Debug.WriteLine($"WRITE: {filename}");

        string folderName = Path.GetDirectoryName(filename);

        // GetDirectoryName yields null or "" when the path has no folder part;
        // the file then lands in the current directory and nothing needs creating.
        if (!String.IsNullOrEmpty(folderName) && !Directory.Exists(folderName))
        {
            // CreateDirectory handles both rooted paths and paths relative to
            // the current directory, including missing intermediate folders.
            Directory.CreateDirectory(folderName);
        }

        return new FileStream(filename, FileMode.Create);
    }
}
41 | }
42 |
--------------------------------------------------------------------------------
/src/PdfToImageFunction/PdfToImageFunction.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 | netstandard2.0
4 | v2
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 | PreserveNewest
17 |
18 |
19 | PreserveNewest
20 |
21 |
22 | PreserveNewest
23 |
24 |
25 | PreserveNewest
26 |
27 |
28 | PreserveNewest
29 |
30 |
31 | PreserveNewest
32 | Never
33 |
34 |
35 | PreserveNewest
36 |
37 |
38 |
--------------------------------------------------------------------------------
/src/PdfToImageFunction/Properties/PublishProfiles/FunctionApp20180412035249 - Web Deploy.pubxml:
--------------------------------------------------------------------------------
1 |
2 |
6 |
7 |
8 | MSDeploy
9 | AzureWebSite
10 | Release
11 | Any CPU
12 | http://functionapp20180412035249.azurewebsites.net
13 | False
14 | False
15 | functionapp20180412035249.scm.azurewebsites.net:443
16 | /subscriptions/eb6659ac-634f-4460-8e5c-c92db0afcabb/resourcegroups/casa-civil-br/providers/Microsoft.Web/sites/FunctionApp20180412035249
17 | FunctionApp20180412035249
18 | True
19 | WMSVC
20 | True
21 | $FunctionApp20180412035249
22 | <_SavePWD>True
23 | False
24 |
25 |
--------------------------------------------------------------------------------
/src/PdfToImageFunction/host.json:
--------------------------------------------------------------------------------
1 | {
2 | // Value indicating the timeout duration for all functions.
3 | // In Dynamic SKUs, the valid range is from 1 second to 10 minutes and the default value is 5 minutes.
4 | // In Paid SKUs there is no limit and the default is no timeout.
5 | "functionTimeout": "00:05:00",
6 | "queues": {
7 | // The maximum interval in milliseconds between
8 | // queue polls. The default is 1 minute.
9 | "maxPollingInterval": 2000,
10 |
11 | // The visibility timeout that will be applied to messages that fail processing
12 | // (i.e. the time interval between retries). The default is zero.
13 | "visibilityTimeout": "00:00:30",
14 |
15 | // The number of queue messages to retrieve and process in
16 | // parallel (per job function). The default is 16 and the maximum is 32.
17 | "batchSize": 4,
18 |
19 | // The number of times to try processing a message before
20 | // moving it to the poison queue. The default is 5.
21 | "maxDequeueCount": 5
22 | }
23 | }
--------------------------------------------------------------------------------
/src/PdfToImageFunction/pdf/D141.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/src/PdfToImageFunction/pdf/D141.pdf
--------------------------------------------------------------------------------
/src/PdfToImageFunction/temp/readme.txt:
--------------------------------------------------------------------------------
1 | This folder is used by Ghostscript to write the images it renders from the PDF file.
--------------------------------------------------------------------------------
/src/QueueConsole/Config.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.Extensions.Configuration;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Diagnostics;
5 |
6 | namespace QueueConsole
7 | {
/// <summary>
/// Reads settings from the optional <c>appsettings.json</c> file and from
/// the command line; command-line providers are added last.
/// </summary>
class Config
{
    private readonly IConfigurationRoot _config;

    /// <param name="args">Command-line arguments handed to the command-line provider.</param>
    public Config(string[] args)
    {
        var builder = new ConfigurationBuilder();
        builder.AddJsonFile("appsettings.json", optional: true, reloadOnChange: true);
        builder.AddCommandLine(args);

        _config = builder.Build();
    }

    /// <summary>Returns the value stored under <paramref name="configName"/>.</summary>
    /// <exception cref="NotConfigured">When no value is present.</exception>
    [DebuggerHidden]
    public string Get(string configName)
    {
        string found = _config[configName];

        if (found != null)
        {
            return found;
        }

        throw new NotConfigured(configName);
    }

    /// <summary>Returns the value stored under <paramref name="configName"/>, or null when it is absent.</summary>
    [DebuggerHidden]
    public string TryGet(string configName)
    {
        return _config[configName];
    }

    /// <summary>Raised by <see cref="Get"/> for a setting that has no value.</summary>
    class NotConfigured : Exception
    {
        /// <summary>Name of the missing setting.</summary>
        public readonly string Name;

        public NotConfigured(string name)
            : base($"configuration '{name}' not found")
        {
            Name = name;
        }
    }
}
48 | }
49 |
--------------------------------------------------------------------------------
/src/QueueConsole/MainConsole.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Azure.Queue;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 | using System.Threading.Tasks;
6 |
7 | namespace QueueConsole
8 | {
/// <summary>
/// Interactive console that first pushes typed lines onto an Azure queue
/// and then drains the queue, echoing every message.
/// </summary>
class MainConsole
{
    /// <summary>Runs the writer phase to completion, then the reader phase, blocking the caller.</summary>
    public void Run(string queueSas)
    {
        Task writing = RunWriterAsync(queueSas);
        writing.Wait();

        Console.WriteLine("Read messages:");

        Task reading = RunReaderAsync(queueSas);
        reading.Wait();
    }

    /// <summary>Enqueues each console line until an empty line (or end of input) is read.</summary>
    public async Task RunWriterAsync(string queueSas)
    {
        var queue = await AzureQueue.OpenAsync(queueSas);

        string line;
        while (!String.IsNullOrEmpty(line = Console.ReadLine()))
        {
            await queue.AddMessageAsync(line);
        }
    }

    /// <summary>Prints and completes queued messages until the queue returns none.</summary>
    public async Task RunReaderAsync(string queueSas)
    {
        var queue = await AzureQueue.OpenAsync(queueSas);

        for (var received = await queue.TryGetMessageAsync();
             received != null;
             received = await queue.TryGetMessageAsync())
        {
            Console.WriteLine($"message: {received.Content}");

            received.Done();
        }
    }
}
50 | }
51 |
--------------------------------------------------------------------------------
/src/QueueConsole/Program.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.WindowsAzure.Storage;
2 | using PdfTextReader.Azure;
3 | using System;
4 | using System.Collections.Generic;
5 | using System.Text.RegularExpressions;
6 | using System.Threading.Tasks;
7 |
8 | namespace QueueConsole
9 | {
/// <summary>
/// Entry point: runs the PDF-to-image worker when an input storage account
/// is configured, otherwise the interactive queue console.
/// </summary>
class Program
{
    static Config _config = null;

    static void Main(string[] args)
    {
        Console.WriteLine("Console Queue");

        _config = new Config(args);

        // All settings are optional here; the mode is chosen from what is present.
        string inputStorageAccount = _config.TryGet("INPUT_STORAGE_ACCOUNT");
        string queueStorageAccount = _config.TryGet("QUEUE_STORAGE_ACCOUNT");
        string queueName = _config.TryGet("QUEUE_NAME");
        string queueSas = _config.TryGet("QUEUE_SAS");

        if (String.IsNullOrWhiteSpace(inputStorageAccount))
        {
            var console = new MainConsole();
            console.Run(queueSas);
            return;
        }

        var worker = new MainPdfToImage();
        worker.Run(inputStorageAccount, queueStorageAccount, queueName);
    }
}
36 | }
37 |
--------------------------------------------------------------------------------
/src/QueueConsole/QueueConsole.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Exe
5 | netcoreapp2.0
6 | Debug;Release;CORE
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 | PreserveNewest
25 |
26 |
27 | PreserveNewest
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/src/QueueConsole/appsettings.json:
--------------------------------------------------------------------------------
1 | {
2 | "INPUT_STORAGE_ACCOUNT": "",
3 | "QUEUE_STORAGE_ACCOUNT": "",
4 | "QUEUE_NAME": ""
5 | }
6 |
--------------------------------------------------------------------------------
/src/Validator/File.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 |
5 | namespace Validator
6 | {
/// <summary>
/// Immutable pair of a folder and a file name.
/// </summary>
class File
{
    /// <summary>Folder given at construction.</summary>
    public readonly string Folder;

    /// <summary>File name given at construction.</summary>
    public readonly string Filename;

    /// <summary>Stores both values as given; no validation is performed.</summary>
    public File(string folder, string filename)
    {
        this.Folder = folder;
        this.Filename = filename;
    }
}
18 | }
19 |
--------------------------------------------------------------------------------
/src/Validator/GeneralProcess.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 |
5 | namespace Validator
6 | {
/// <summary>
/// Runner that hands every PDF to <c>PdfTextReader.ProgramValidator</c>.
/// </summary>
class GeneralProcess : IRunner
{
    /// <summary>Matches every PDF file.</summary>
    public string FilePattern
    {
        get { return "*.pdf"; }
    }

    /// <summary>Processes one file, passing its name, its folder and <paramref name="outputname"/> through unchanged.</summary>
    public void Run(File file, string outputname)
    {
        PdfTextReader.ProgramValidator.Process(file.Filename, file.Folder, outputname);
    }
}
19 | }
20 |
--------------------------------------------------------------------------------
/src/Validator/IRunner.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 |
5 | namespace Validator
6 | {
/// <summary>
/// A unit of work that is executed once per input file.
/// </summary>
interface IRunner
{
    /// <summary>Processes <paramref name="file"/>; <paramref name="outputname"/> names the output location.</summary>
    void Run(File file, string outputname);

    /// <summary>File-name pattern of the inputs this runner handles, e.g. <c>*.pdf</c>.</summary>
    string FilePattern { get; }
}
12 |
/// <summary>
/// An <see cref="IRunner"/> that additionally exposes a closing step.
/// </summary>
interface IRunner2 : IRunner
{
    /// <summary>
    /// Closing step that receives the output folder.
    /// NOTE(review): presumably invoked once after all files have run — confirm against the caller.
    /// </summary>
    void Close(string outputfolder);
}
17 |
18 | }
19 |
--------------------------------------------------------------------------------
/src/Validator/Process2010.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 |
5 | namespace Validator
6 | {
/// <summary>
/// Runner for the 2010 validator, restricted to the December 2010 files.
/// </summary>
/// <remarks>
/// Example invocation noted by the original author:
/// <c>CMD C:\PDF\2010\ c:\pdf\output 2010</c>.
/// </remarks>
class Process2010 : IRunner
{
    // Number of files handed to the validator so far.
    int _totalProcessed = 0;

    /// <summary>Matches <c>DO1_2010_12_??.pdf</c>, i.e. one file per day of December 2010.</summary>
    public string FilePattern
    {
        get { return "DO1_2010_12_??.pdf"; }
    }

    /// <summary>Processes one file and counts it.</summary>
    public void Run(File file, string outputname)
    {
        PdfTextReader.ProgramValidator2010.Process(file.Filename, file.Folder, outputname);

        _totalProcessed += 1;
    }
}
28 | }
29 |
--------------------------------------------------------------------------------
/src/Validator/Process2012.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 |
5 | namespace Validator
6 | {
/// <summary>
/// Runner for the 2012 validator; each input gets its own output folder.
/// </summary>
class Process2012 : IRunner
{
    // Number of files handed to the validator so far.
    int _totalProcessed = 0;

    /// <summary>Matches every PDF file.</summary>
    public string FilePattern
    {
        get { return "*.pdf"; }
    }

    /// <summary>
    /// Obtains the output folder from <c>FileList.CreateOutputFolder</c>,
    /// processes the file into it and counts it.
    /// </summary>
    public void Run(File file, string outputname)
    {
        string name = file.Filename;
        string target = FileList.CreateOutputFolder(outputname, name);

        PdfTextReader.ProgramValidator2012.Process(name, file.Folder, target);

        _totalProcessed += 1;
    }
}
32 | }
33 |
--------------------------------------------------------------------------------
/src/Validator/Process2016.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Text;
5 |
6 | namespace Validator
7 | {
/// <summary>
/// Runner for the 2016 validator; each input gets its own output folder
/// and a global statistics file is written on close.
/// </summary>
class Process2016 : IRunner2
{
    // Number of files handed to the validator so far.
    int _totalProcessed = 0;

    /// <summary>Matches every PDF file.</summary>
    public string FilePattern
    {
        get { return "*.pdf"; }
    }

    /// <summary>
    /// Obtains the output folder from <c>FileList.CreateOutputFolder</c>,
    /// processes the file into it and counts it.
    /// </summary>
    public void Run(File file, string outputname)
    {
        string name = file.Filename;
        string target = FileList.CreateOutputFolder(outputname, name);

        PdfTextReader.ProgramValidator2016.Process(name, file.Folder, target);

        _totalProcessed += 1;
    }

    /// <summary>Asks <c>ProgramValidatorXML</c> to create the final stats at <c>GlobalArticlePrecision.txt</c> under <paramref name="outputfolder"/>.</summary>
    public void Close(string outputfolder)
    {
        string statsPath = $"{outputfolder}/GlobalArticlePrecision.txt";

        ProgramValidatorXML.CreateFinalStats(statsPath);
    }
}
30 | }
31 |
--------------------------------------------------------------------------------
/src/Validator/ProcessDefault.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 |
5 | namespace Validator
6 | {
/// <summary>
/// Runner for the default validator; each input gets its own output folder.
/// </summary>
class ProcessDefault : IRunner
{
    // Number of files handed to the validator so far.
    int _totalProcessed = 0;

    /// <summary>Matches every PDF file.</summary>
    public string FilePattern
    {
        get { return "*.pdf"; }
    }

    /// <summary>
    /// Obtains the output folder from <c>FileList.CreateOutputFolder</c>,
    /// processes the file into it and counts it.
    /// </summary>
    public void Run(File file, string outputname)
    {
        string name = file.Filename;
        string target = FileList.CreateOutputFolder(outputname, name);

        PdfTextReader.ProgramValidatorDefault.Process(name, file.Folder, target);

        _totalProcessed += 1;
    }
}
25 | }
26 |
--------------------------------------------------------------------------------
/src/Validator/Validate2010.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 |
5 | namespace Validator
6 | {
/// <summary>
/// Runner that feeds files to <c>PdfTextReader.ValidatorPipeline</c> and
/// accumulates the error count it reports.
/// </summary>
/// <remarks>
/// Example invocation noted by the original author:
/// <c>CMD c:\pdf\output_6 c:\pdf\valid valid2010</c>.
/// </remarks>
class Validate2010 : IRunner
{
    // Number of files validated so far.
    int _totalProcessed = 0;

    // Sum of the error counts returned for those files.
    int _totalErrors = 0;

    /// <summary>Matches every PDF file.</summary>
    public string FilePattern
    {
        get { return "*.pdf"; }
    }

    /// <summary>Validates one file and adds its result to the running totals.</summary>
    public void Run(File file, string outputname)
    {
        int found = PdfTextReader.ValidatorPipeline.Process(file.Filename, file.Folder, outputname);

        _totalProcessed += 1;
        _totalErrors = _totalErrors + found;
    }
}
27 | }
28 |
--------------------------------------------------------------------------------
/src/Validator/Validator.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Exe
5 | netcoreapp2.0
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/src/WebFrontendImages/Logic/ImageProcessing.cs:
--------------------------------------------------------------------------------
1 | using SixLabors.ImageSharp;
2 | using SixLabors.ImageSharp.Formats.Jpeg;
3 | using SixLabors.ImageSharp.PixelFormats;
4 | using SixLabors.ImageSharp.Processing;
5 | using SixLabors.ImageSharp.Processing.Transforms;
6 | using SixLabors.Primitives;
7 | using System;
8 | using System.Collections.Generic;
9 | using System.IO;
10 | using System.Linq;
11 | using System.Threading.Tasks;
12 |
13 | namespace WebFrontendImages.Logic
14 | {
/// <summary>
/// JPEG image helpers built on ImageSharp.
/// </summary>
public class ImageProcessing
{
    static readonly JpegDecoder JPEG = new JpegDecoder();

    /// <summary>
    /// Decodes a JPEG from <paramref name="stream"/>, crops it and returns the
    /// re-encoded JPEG as a new in-memory stream positioned at its start.
    /// </summary>
    /// <param name="stream">Source JPEG data.</param>
    /// <param name="tx">Left edge of the crop as a fraction of the image width.</param>
    /// <param name="ty">Top edge of the crop as a fraction of the image height.</param>
    /// <param name="tw">Crop width as a fraction of the image width.</param>
    /// <param name="th">Crop height as a fraction of the image height.</param>
    public static Stream Crop(Stream stream, float tx, float ty, float tw, float th)
    {
        Stream result = new MemoryStream();

        using (var picture = Image.Load(stream, JPEG))
        {
            // Fractions are scaled by the decoded size and truncated to whole pixels.
            int left = (int)(picture.Width * tx);
            int top = (int)(picture.Height * ty);
            int width = (int)(picture.Width * tw);
            int height = (int)(picture.Height * th);

            var region = new Rectangle(left, top, width, height);

            picture.Mutate(context => context.Crop(region));
            picture.Save(result, new JpegEncoder());
        }

        // Rewind so the caller can read the encoded image from the beginning.
        result.Seek(0, SeekOrigin.Begin);

        return result;
    }
}
43 | }
44 |
--------------------------------------------------------------------------------
/src/WebFrontendImages/Logic/ImageSource.cs:
--------------------------------------------------------------------------------
1 | using Microsoft.WindowsAzure.Storage.Blob;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.IO;
5 | using System.Linq;
6 | using System.Threading.Tasks;
7 |
8 | namespace WebFrontendImages.Logic
9 | {
/// <summary>
/// Reads blobs from a single Azure Storage container.
/// </summary>
public class ImageSource
{
    readonly CloudBlobContainer _container;

    /// <param name="storageUrl">Absolute URI of the blob container.</param>
    /// <exception cref="ArgumentNullException">When <paramref name="storageUrl"/> is null or empty.</exception>
    public ImageSource(string storageUrl)
    {
        if (String.IsNullOrEmpty(storageUrl))
            throw new ArgumentNullException(nameof(storageUrl));

        _container = new CloudBlobContainer(new Uri(storageUrl));
    }

    /// <summary>Opens the blob named <paramref name="filename"/> for reading.</summary>
    /// <returns>A readable stream over the blob content.</returns>
    /// <remarks>
    /// NOTE(review): the Task&lt;Stream&gt; return type is inferred from the
    /// awaited OpenReadAsync call; the type argument was missing in the source.
    /// </remarks>
    public async Task<Stream> GetAsync(string filename)
    {
        var blob = _container.GetBlobReference(filename);

        return await blob.OpenReadAsync();
    }
}
31 | }
32 |
--------------------------------------------------------------------------------
/src/WebFrontendImages/Program.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.IO;
4 | using System.Linq;
5 | using System.Threading.Tasks;
6 | using Microsoft.AspNetCore;
7 | using Microsoft.AspNetCore.Hosting;
8 | using Microsoft.Extensions.Configuration;
9 | using Microsoft.Extensions.Logging;
10 |
11 | namespace WebFrontendImages
12 | {
/// <summary>
/// Entry point of the WebFrontendImages web application.
/// </summary>
public class Program
{
    /// <summary>Builds the web host and runs it until shutdown.</summary>
    public static void Main(string[] args)
    {
        BuildWebHost(args).Run();
    }

    /// <summary>
    /// Creates the host from the default builder.
    /// </summary>
    /// <remarks>
    /// NOTE(review): UseStartup had lost its type argument; Startup is assumed
    /// to be the project's startup class — confirm.
    /// </remarks>
    public static IWebHost BuildWebHost(string[] args) =>
        WebHost.CreateDefaultBuilder(args)
            .UseStartup<Startup>()
            .Build();
}
25 | }
26 |
--------------------------------------------------------------------------------
/src/WebFrontendImages/Properties/launchSettings.json:
--------------------------------------------------------------------------------
1 | {
2 | "iisSettings": {
3 | "windowsAuthentication": false,
4 | "anonymousAuthentication": true,
5 | "iisExpress": {
6 | "applicationUrl": "http://localhost:52069/",
7 | "sslPort": 0
8 | }
9 | },
10 | "profiles": {
11 | "IIS Express": {
12 | "commandName": "IISExpress",
13 | "launchBrowser": true,
14 | "launchUrl": "api/values",
15 | "environmentVariables": {
16 | "ASPNETCORE_ENVIRONMENT": "Development"
17 | }
18 | },
19 | "WebFrontendImages": {
20 | "commandName": "Project",
21 | "launchBrowser": true,
22 | "launchUrl": "api/values",
23 | "environmentVariables": {
24 | "ASPNETCORE_ENVIRONMENT": "Development"
25 | },
26 | "applicationUrl": "http://localhost:52070/"
27 | }
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/src/WebFrontendImages/WebFrontendImages.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | netcoreapp2.0
5 | 5e47a648-480b-4066-8538-b5cb625c4453
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/src/WebFrontendImages/appsettings.Development.json:
--------------------------------------------------------------------------------
1 | {
2 | "Logging": {
3 | "IncludeScopes": false,
4 | "LogLevel": {
5 | "Default": "Debug",
6 | "System": "Information",
7 | "Microsoft": "Information"
8 | }
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
/src/WebFrontendImages/appsettings.json:
--------------------------------------------------------------------------------
1 | {
2 | "Logging": {
3 | "IncludeScopes": false,
4 | "Debug": {
5 | "LogLevel": {
6 | "Default": "Warning"
7 | }
8 | },
9 | "Console": {
10 | "LogLevel": {
11 | "Default": "Warning"
12 | }
13 | }
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/src/WebFrontendImages/wwwroot/test.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
10 |
11 |
32 |
33 |
--------------------------------------------------------------------------------
/test/PdfTextReader.Test/PdfTextReader.Tests.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | netcoreapp2.0
5 |
6 | false
7 |
8 | Debug;Release;CORE
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/test/PdfTextReader.Test/UnitTest1.cs:
--------------------------------------------------------------------------------
1 | using PdfTextReader.Azure;
2 | using System;
3 | using System.IO;
4 | using Xunit;
5 |
6 | namespace PdfTextReader.Test
7 | {
/// <summary>
/// Test for <c>PdfImageConverter</c>: converts a sample PDF and checks
/// that one image stream is produced per page.
/// </summary>
/// <remarks>
/// NOTE(review): the paths below are specific to one developer machine, so
/// the test cannot pass elsewhere — consider moving them to test settings.
/// </remarks>
public class UnitTest1
{
    // The method awaits nothing, so it is a plain synchronous test; with
    // "async void" the runner could not reliably observe a failure.
    [Fact]
    public void Test1()
    {
        var pdfFile = @"C:\Users\visouza\Repos\DOU-OCR\data\pdf\D141.pdf";
        int pdfPages = 48;
        var gs = @"C:\Program Files\gs\gs9.23\bin\gswin64.exe";
        var tempFolder = @"C:\temp\dou";

        PdfImageConverter imageConverter = new PdfImageConverter(gs, tempFolder, "102.4");

        Stream[] pdfPageImageList = null;

        // Release the input file handle even when conversion or the assertion fails.
        using (var pdfInput = File.OpenRead(pdfFile))
        {
            // The array of streams follows page number - 1: page 1 is index 0.
            imageConverter.GenerateImage(pdfInput, ref pdfPageImageList);
        }

        Assert.NotNull(pdfPageImageList);
        Assert.Equal(pdfPages, pdfPageImageList.Length);
    }
}
31 | }
32 |
--------------------------------------------------------------------------------
| |