├── BuildTablesFromPdf.Renderer ├── Resources │ ├── HTML_Footer.txt │ └── HTML_Header.txt ├── FileOpen.cs ├── FileSystemBrowserHelper.cs ├── packages.config ├── frmNotepad.cs ├── Properties │ ├── Settings.settings │ ├── AssemblyInfo.cs │ ├── Settings.Designer.cs │ ├── Resources.Designer.cs │ └── Resources.resx ├── Program.cs ├── App.config ├── frmNotepad.Designer.cs ├── HtmlConverter.cs ├── FileOpen.designer.cs ├── BuildTablesFromPdf.Renderer.csproj ├── frmNotepad.resx ├── frmRenderer.resx ├── FileOpen.resx └── frmRenderer.cs ├── BuildTablesFromPdf.Console ├── Example.pdf ├── ExampleSource.xlsx ├── packages.config ├── App.config ├── Program.cs ├── Properties │ └── AssemblyInfo.cs ├── Example.pdf.html └── BuildTablesFromPdf.Console.csproj ├── packages ├── iTextSharp-LGPL.4.1.6 │ ├── lib │ │ └── iTextSharp.dll │ └── iTextSharp-LGPL.4.1.6.nupkg └── repositories.config ├── BuildTablesFromPdf.Engine.Test ├── TestFiles │ ├── Test1.pdf │ ├── Test2.pdf │ └── Test3.pdf ├── packages.config ├── Properties │ └── AssemblyInfo.cs ├── SimpleTextExtractorTest.cs ├── BuildTablesFromPdfTest.cs ├── PdfArrayDataTypeTest.cs └── BuildTablesFromPdf.Engine.Test.csproj ├── BuildTablesFromPdf.Engine ├── Statements │ ├── PointStatement.cs │ ├── SingleLineStatement.cs │ ├── FillPathStatement.cs │ ├── StrokePathStatement.cs │ ├── CloseStrokePathStatement.cs │ ├── GreyColorStatement.cs │ ├── StrokingColorStatement.cs │ ├── NonStrokingColorStatement.cs │ ├── SetLineWidthStatement.cs │ ├── SetLineDashPatternStatement.cs │ ├── LineCapStyleStatement.cs │ ├── LineJoinStyleStatement.cs │ ├── LineToStatement.cs │ ├── SetPointStatement.cs │ ├── BezierCurveStatement.cs │ ├── TextObjectStatementLine.cs │ ├── MultiLineStatement.cs │ ├── ColorStatement.cs │ ├── RectangleStatement.cs │ ├── TextObjectStatement.cs │ └── Statement.cs ├── packages.config ├── IPageContent.cs ├── PageCollection.cs ├── FontInfo.cs ├── CMap │ ├── BFRangeCollection.cs │ ├── NameCharacter.cs │ ├── EncodingDifferenceCollection.cs │ 
├── EncodingDifference.cs │ ├── BFChar.cs │ ├── BFRange.cs │ ├── EncodingDifferenceToUnicode.cs │ └── CMapToUnicode.cs ├── Tables │ ├── Row.cs │ ├── Column.cs │ └── Table.cs ├── BuildTablesFromPdf.Engine.csproj.DotSettings ├── GraphicState.cs ├── Color.cs ├── PdfDataTypes │ ├── PdfObjectDataType.cs │ ├── PdfNumericDataType.cs │ ├── PdfBooleanDataType.cs │ ├── PdfStringDataType.cs │ ├── PdfHexStringDataType.cs │ └── PdfArrayDataType.cs ├── Properties │ └── AssemblyInfo.cs ├── Paragraph.cs ├── Point.cs ├── BuildTablesFromPdf.Engine.csproj ├── Matrix.cs ├── PdfFontHelper.cs ├── SimpleTextExtractor.cs └── Page.cs ├── LICENSE ├── README.md ├── .gitattributes ├── BuildTablesFromPdf.sln └── .gitignore /BuildTablesFromPdf.Renderer/Resources/HTML_Footer.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Console/Example.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bubibubi/ExtractTablesFromPdf/HEAD/BuildTablesFromPdf.Console/Example.pdf -------------------------------------------------------------------------------- /BuildTablesFromPdf.Renderer/FileOpen.cs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bubibubi/ExtractTablesFromPdf/HEAD/BuildTablesFromPdf.Renderer/FileOpen.cs -------------------------------------------------------------------------------- /BuildTablesFromPdf.Console/ExampleSource.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bubibubi/ExtractTablesFromPdf/HEAD/BuildTablesFromPdf.Console/ExampleSource.xlsx -------------------------------------------------------------------------------- /packages/iTextSharp-LGPL.4.1.6/lib/iTextSharp.dll: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bubibubi/ExtractTablesFromPdf/HEAD/packages/iTextSharp-LGPL.4.1.6/lib/iTextSharp.dll -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine.Test/TestFiles/Test1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bubibubi/ExtractTablesFromPdf/HEAD/BuildTablesFromPdf.Engine.Test/TestFiles/Test1.pdf -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine.Test/TestFiles/Test2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bubibubi/ExtractTablesFromPdf/HEAD/BuildTablesFromPdf.Engine.Test/TestFiles/Test2.pdf -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine.Test/TestFiles/Test3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bubibubi/ExtractTablesFromPdf/HEAD/BuildTablesFromPdf.Engine.Test/TestFiles/Test3.pdf -------------------------------------------------------------------------------- /BuildTablesFromPdf.Renderer/FileSystemBrowserHelper.cs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bubibubi/ExtractTablesFromPdf/HEAD/BuildTablesFromPdf.Renderer/FileSystemBrowserHelper.cs -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/Statements/PointStatement.cs: -------------------------------------------------------------------------------- 1 | namespace BuildTablesFromPdf.Engine.Statements 2 | { 3 | class PointStatement : SingleLineStatement 4 | { 5 | } 6 | } -------------------------------------------------------------------------------- 
/packages/iTextSharp-LGPL.4.1.6/iTextSharp-LGPL.4.1.6.nupkg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bubibubi/ExtractTablesFromPdf/HEAD/packages/iTextSharp-LGPL.4.1.6/iTextSharp-LGPL.4.1.6.nupkg -------------------------------------------------------------------------------- /BuildTablesFromPdf.Console/packages.config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/packages.config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Renderer/packages.config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine.Test/packages.config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Console/App.config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/Statements/SingleLineStatement.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | 3 | namespace BuildTablesFromPdf.Engine.Statements 4 | { 5 | class SingleLineStatement : Statement 6 | { 7 | public string RawContent { get; set; } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Renderer/Resources/HTML_Header.txt: 
// File: /BuildTablesFromPdf.Engine/Statements/FillPathStatement.cs
namespace BuildTablesFromPdf.Engine.Statements
{
    /// <summary>
    /// Statement type that declares nothing beyond one shared instance, <see cref="Value"/>.
    /// </summary>
    internal class FillPathStatement : SingleLineStatement
    {
        /// <summary>Instance created once, when the type is initialized.</summary>
        public static readonly FillPathStatement Value = new FillPathStatement();
    }
}

// File: /BuildTablesFromPdf.Engine/Statements/StrokePathStatement.cs
namespace BuildTablesFromPdf.Engine.Statements
{
    /// <summary>
    /// Statement type that declares nothing beyond one shared instance, <see cref="Value"/>.
    /// </summary>
    internal class StrokePathStatement : SingleLineStatement
    {
        /// <summary>Instance created once, when the type is initialized.</summary>
        public static readonly StrokePathStatement Value = new StrokePathStatement();
    }
}

// File: /BuildTablesFromPdf.Engine/Statements/CloseStrokePathStatement.cs
namespace BuildTablesFromPdf.Engine.Statements
{
    /// <summary>
    /// Statement type that declares nothing beyond one shared instance, <see cref="Value"/>.
    /// </summary>
    internal class CloseStrokePathStatement : SingleLineStatement
    {
        /// <summary>Instance created once, when the type is initialized.</summary>
        public static readonly CloseStrokePathStatement Value = new CloseStrokePathStatement();
    }
}

// File: /BuildTablesFromPdf.Engine/Statements/GreyColorStatement.cs
namespace BuildTablesFromPdf.Engine.Statements
{
    /// <summary>
    /// Color statement that keeps the raw text it was built from.
    /// </summary>
    internal class GreyColorStatement : ColorStatement
    {
        /// <summary>Creates the statement and stores its raw text.</summary>
        /// <param name="rawContent">Text assigned to <c>RawContent</c>.</param>
        public GreyColorStatement(string rawContent)
        {
            this.RawContent = rawContent;
        }
    }
}
// File: /BuildTablesFromPdf.Engine/Statements/StrokingColorStatement.cs
namespace BuildTablesFromPdf.Engine.Statements
{
    /// <summary>
    /// Color statement that keeps the raw text it was built from.
    /// </summary>
    internal class StrokingColorStatement : ColorStatement
    {
        /// <param name="rawContent">Text assigned to <c>RawContent</c>.</param>
        public StrokingColorStatement(string rawContent)
        {
            RawContent = rawContent;
        }
    }
}

// File: /BuildTablesFromPdf.Engine/IPageContent.cs
using System;

namespace BuildTablesFromPdf.Engine
{
    /// <summary>
    /// Contract for a page element that can receive text and answer containment queries.
    /// </summary>
    public interface IPageContent
    {
        /// <summary>Adds <paramref name="content"/> associated with <paramref name="point"/>.</summary>
        void AddText(Point point, string content);

        /// <summary>Tells whether <paramref name="point"/> belongs to this element.</summary>
        bool Contains(Point point);

        /// <summary>Tells whether the coordinate <paramref name="y"/> belongs to this element.</summary>
        bool Contains(double y);

        /// <summary>Y coordinate reported by the element.</summary>
        double Y { get; }
    }
}

// File: /BuildTablesFromPdf.Engine/Statements/NonStrokingColorStatement.cs
namespace BuildTablesFromPdf.Engine.Statements
{
    /// <summary>
    /// Color statement that keeps the raw text it was built from.
    /// </summary>
    internal class NonStrokingColorStatement : ColorStatement
    {
        /// <param name="rawContent">Text assigned to <c>RawContent</c>.</param>
        public NonStrokingColorStatement(string rawContent)
        {
            RawContent = rawContent;
        }
    }
}

// File: /BuildTablesFromPdf.Engine/Statements/SetLineWidthStatement.cs
namespace BuildTablesFromPdf.Engine.Statements
{
    /// <summary>
    /// Single-line statement that keeps the raw text it was built from.
    /// </summary>
    internal class SetLineWidthStatement : SingleLineStatement
    {
        /// <param name="rawContent">Text assigned to <c>RawContent</c>.</param>
        public SetLineWidthStatement(string rawContent)
        {
            RawContent = rawContent;
        }
    }
}

// File: /BuildTablesFromPdf.Engine/Statements/SetLineDashPatternStatement.cs
namespace BuildTablesFromPdf.Engine.Statements
{
    /// <summary>
    /// Single-line statement that keeps the raw text it was built from.
    /// </summary>
    internal class SetLineDashPatternStatement : SingleLineStatement
    {
        /// <param name="rawContent">Text assigned to <c>RawContent</c>.</param>
        public SetLineDashPatternStatement(string rawContent)
        {
            RawContent = rawContent;
        }
    }
}

// File: /BuildTablesFromPdf.Engine/Statements/LineCapStyleStatement.cs
using System;

namespace BuildTablesFromPdf.Engine.Statements
{
    /// <summary>
    /// Single-line statement that keeps the raw text it was built from.
    /// </summary>
    internal class LineCapStyleStatement : SingleLineStatement
    {
        /// <param name="rawContent">Text assigned to <c>RawContent</c>.</param>
        public LineCapStyleStatement(string rawContent)
        {
            RawContent = rawContent;
        }
    }
}

// File: /BuildTablesFromPdf.Engine/PageCollection.cs
using System;
using System.Collections.Generic;
using iTextSharp.text.pdf;

namespace BuildTablesFromPdf.Engine
{
    /// <summary>
    /// List of pages together with the reader they came from and a list of errors.
    /// </summary>
    // NOTE(review): the generic arguments of both List usages were lost in this copy of the file.
    // <Page> is assumed from the class name; <string> for Errors is a guess - confirm both
    // against the original source before merging.
    public class PageCollection : List<Page>
    {
        /// <summary>Errors attached to the collection; not initialized by this class.</summary>
        public List<string> Errors { get; set; }

        /// <summary>Reader attached to the collection; not initialized by this class.</summary>
        public PdfReader PdfReader { get; set; }
    }
}

// File: /BuildTablesFromPdf.Engine/Statements/LineJoinStyleStatement.cs
using System;

namespace BuildTablesFromPdf.Engine.Statements
{
    /// <summary>
    /// Single-line statement that keeps the raw text it was built from.
    /// </summary>
    internal class LineJoinStyleStatement : SingleLineStatement
    {
        /// <param name="rawContent">Text assigned to <c>RawContent</c>.</param>
        public LineJoinStyleStatement(string rawContent)
        {
            RawContent = rawContent;
        }
    }
}

// File: /BuildTablesFromPdf.Engine/FontInfo.cs
using BuildTablesFromPdf.Engine.CMap;

namespace BuildTablesFromPdf.Engine
{
    /// <summary>
    /// Plain data holder for a font height and two character-mapping objects.
    /// </summary>
    public class FontInfo
    {
        public double FontHeight { get; set; }
        public CMapToUnicode CMapToUnicode { get; set; }
        public EncodingDifferenceToUnicode EncodingDifferenceToUnicode { get; set; }
    }
}

// File: /BuildTablesFromPdf.Engine/CMap/BFRangeCollection.cs
using System;
using System.Collections.Generic;
using System.Linq;

namespace BuildTablesFromPdf.Engine.CMap
{
    /// <summary>
    /// List of <see cref="BFRange"/> items searchable by character id.
    /// </summary>
    public class BFRangeCollection : List<BFRange>
    {
        /// <summary>
        /// Returns the range whose <c>BeginChar</c>..<c>EndChar</c> interval (both inclusive) contains
        /// <paramref name="id"/>, or <c>null</c> when no range does.
        /// </summary>
        /// <exception cref="InvalidOperationException">More than one range contains <paramref name="id"/>.</exception>
        public BFRange Find(int id)
        {
            return this.SingleOrDefault(_ => _.BeginChar <= id && _.EndChar >= id);
        }
    }
}

// File: /BuildTablesFromPdf.Engine/CMap/NameCharacter.cs
using System;

namespace BuildTablesFromPdf.Engine.CMap
{
    /// <summary>
    /// Pair of a name and a character.
    /// </summary>
    public class NameCharacter
    {
        public string Name { get; set; }
        public char Character { get; set; }

        /// <summary>Formats the pair as "Name => Character".</summary>
        public override string ToString()
        {
            return string.Format("{0} => {1}", Name, Character);
        }
    }
}

// File: /BuildTablesFromPdf.Engine/Tables/Row.cs
namespace BuildTablesFromPdf.Engine.Tables
{
    /// <summary>
    /// Table row described by an index and a begin/end Y pair.
    /// </summary>
    public class Row
    {
        public double BeginY { get; set; }
        public double EndY { get; set; }
        public int Index { get; set; }

        /// <summary>Formats the row as "Index: i, BeginY-EndY".</summary>
        public override string ToString()
        {
            return string.Format("Index: {0}, {1}-{2}", Index, BeginY, EndY);
        }
    }
}

// File: /BuildTablesFromPdf.Engine/Tables/Column.cs
namespace BuildTablesFromPdf.Engine.Tables
{
    /// <summary>
    /// Table column described by an index and a begin/end X pair.
    /// </summary>
    public class Column
    {
        public double BeginX { get; set; }
        public double EndX { get; set; }
        public int Index { get; set; }

        /// <summary>Formats the column as "Index: i, BeginX-EndX".</summary>
        public override string ToString()
        {
            return string.Format("Index: {0}, {1}-{2}", Index, BeginX, EndX);
        }
    }
}

// File: /BuildTablesFromPdf.Engine/CMap/EncodingDifferenceCollection.cs
using System;
using System.Collections.Generic;
using System.Linq;

namespace BuildTablesFromPdf.Engine.CMap
{
    /// <summary>
    /// List of <see cref="EncodingDifference"/> items searchable by character id.
    /// </summary>
    public class EncodingDifferenceCollection : List<EncodingDifference>
    {
        /// <summary>
        /// Returns the item whose <c>BeginChar</c>..<c>EndChar</c> interval (both inclusive) contains
        /// <paramref name="id"/>, or <c>null</c> when no item does.
        /// </summary>
        /// <exception cref="InvalidOperationException">More than one item contains <paramref name="id"/>.</exception>
        public EncodingDifference Find(int id)
        {
            return this.SingleOrDefault(_ => _.BeginChar <= id && _.EndChar >= id);
        }
    }
}

// File: /BuildTablesFromPdf.Renderer/frmNotepad.cs
using System;
using System.Windows.Forms;

namespace BuildTablesFromPdf.Renderer
{
    /// <summary>
    /// Form that displays a block of text.
    /// </summary>
    public partial class frmNotepad : Form
    {
        /// <summary>Puts <paramref name="content"/> into the text control and shows the form.</summary>
        public void Start(string content)
        {
            txtNotepad.Text = content;
            Show();
        }


        public frmNotepad()
        {
            InitializeComponent();
        }
    }
}
// File: /BuildTablesFromPdf.Engine/GraphicState.cs
namespace BuildTablesFromPdf.Engine
{
    /// <summary>
    /// Holds a transform matrix and a color.
    /// </summary>
    public class GraphicState
    {
        public Matrix TransformMatrix { get; set; }
        public Color Color { get; set; }

        /// <summary>
        /// Returns a new <see cref="GraphicState"/> whose two properties are assigned from this instance.
        /// </summary>
        // NOTE(review): Color is a struct, so it is copied by value. Whether TransformMatrix is
        // copied or shared with the clone depends on how Matrix is declared - confirm.
        public GraphicState Clone()
        {
            return new GraphicState()
            {
                TransformMatrix = TransformMatrix,
                Color = Color
            };
        }
    }
}

// File: /BuildTablesFromPdf.Engine/Color.cs
namespace BuildTablesFromPdf.Engine
{
    /// <summary>
    /// Immutable three-component color.
    /// </summary>
    public struct Color
    {
        /// <summary>Color with all three components set to 1.</summary>
        public static readonly Color White = new Color(1, 1, 1);

        public Color(float r, float g, float b)
        {
            R = r;
            G = g;
            B = b;
        }

        public readonly float R;
        public readonly float G;
        public readonly float B;

        /// <summary>
        /// Returns <c>true</c> when every component is greater than .95, so near-white colors qualify too.
        /// </summary>
        public bool IsWhite()
        {
            return R > .95 && G > .95 && B > .95;
        }
    }
}

// File: /BuildTablesFromPdf.Console/Program.cs
using System;
using BuildTablesFromPdf.Engine;

// https://pdftables.com/pdf-converter-for-business

namespace BuildTablesFromPdf
{
    class Program
    {
        /// <summary>
        /// Reads "Example.PDF", processes its first page, prints it and waits for Enter.
        /// </summary>
        static void Main(string[] args)
        {

            var pages = ContentExtractor.Read(@"Example.PDF");
            var page = pages[0];

            page.DetermineTableStructures();
            page.DetermineParagraphs();
            page.FillContent();

            Console.WriteLine(page);

            Console.ReadLine();
        }
    }
}

// File: /BuildTablesFromPdf.Renderer/Program.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace BuildTablesFromPdf.Renderer
{
    static class Program
    {
        /// <summary>
        /// The main entry point for the application.
        /// </summary>
        [STAThread]
        static void Main()
        {
            Application.EnableVisualStyles();
            Application.SetCompatibleTextRenderingDefault(false);
            Application.Run(new frmRenderer());
        }
    }
}

// File: /BuildTablesFromPdf.Engine/CMap/EncodingDifference.cs
using System;
using System.Collections.Generic;

namespace BuildTablesFromPdf.Engine.CMap
{
    /// <summary>
    /// Run of named characters that starts at a given character code.
    /// </summary>
    public class EncodingDifference
    {
        /// <param name="beginChar">Code assigned to <see cref="BeginChar"/>.</param>
        public EncodingDifference(int beginChar)
        {
            BeginChar = beginChar;
            // NOTE(review): the generic argument was lost in this copy of the file;
            // NameCharacter is assumed from the property name - confirm.
            NameCharacters = new List<NameCharacter>();
        }

        public int BeginChar { get; private set; }

        /// <summary>
        /// Last code of the run: <c>BeginChar + NameCharacters.Count - 1</c>.
        /// While the list is empty this is <c>BeginChar - 1</c>.
        /// </summary>
        public int EndChar
        {
            get { return BeginChar + NameCharacters.Count - 1; }
        }


        public List<NameCharacter> NameCharacters { get; private set; }
    }
}

// File: /BuildTablesFromPdf.Engine/Statements/LineToStatement.cs
using System;

namespace BuildTablesFromPdf.Engine.Statements
{
    /// <summary>
    /// Draw a line from the current point to the specified point
    /// </summary>
    class LineToStatement : PointStatement
    {
        /// <summary>Stores the raw text and the point parsed from it with <c>Point.Parse</c>.</summary>
        public LineToStatement(string rawContent)
        {
            RawContent = rawContent;
            _point = Point.Parse(rawContent);
        }

        private Point _point;

        public Point Point
        {
            get { return _point; }
            set { _point = value; }
        }
    }
}

// File: /BuildTablesFromPdf.Engine/Statements/SetPointStatement.cs
using System;

namespace BuildTablesFromPdf.Engine.Statements
{

    /// <summary>
    /// Set the new begin point for the graphic operations
    /// </summary>
    class SetPointStatement : PointStatement
    {
        private Point _point;

        /// <summary>Stores the raw text and the point parsed from it with <c>Point.Parse</c>.</summary>
        public SetPointStatement(string rawContent)
        {
            RawContent = rawContent;
            _point = Point.Parse(rawContent);
        }

        public Point Point
        {
            get { return _point; }
            set { _point = value; }
        }
    }
}

// File: /BuildTablesFromPdf.Engine/Statements/BezierCurveStatement.cs
using System.Globalization;

namespace BuildTablesFromPdf.Engine.Statements
{
    /// <summary>
    /// Draw a bezier curve from the current point to the specified point using intermediate points
    /// </summary>
    class BezierCurveStatement : SingleLineStatement
    {
        /// <summary>
        /// Stores the raw text and reads the end point from its fifth and sixth space-separated tokens.
        /// </summary>
        /// <param name="rawContent">Statement text; must contain at least six space-separated tokens.</param>
        /// <exception cref="System.IndexOutOfRangeException">Fewer than six tokens are present.</exception>
        /// <exception cref="System.FormatException">The fifth or sixth token is not a number.</exception>
        public BezierCurveStatement(string rawContent)
        {
            RawContent = rawContent;

            // Tokenize once; the original split the same string once per coordinate.
            string[] operands = rawContent.Split(' ');
            float x = float.Parse(operands[4], NumberFormatInfo.InvariantInfo);
            float y = float.Parse(operands[5], NumberFormatInfo.InvariantInfo);

            ToPoint = new Point(x, y);
        }

        /// <summary>End point of the curve.</summary>
        public readonly Point ToPoint;

    }
}
// File: /BuildTablesFromPdf.Engine/PdfDataTypes/PdfObjectDataType.cs
using System;
using BuildTablesFromPdf.Engine.Statements;

namespace BuildTablesFromPdf.Engine
{
    /// <summary>
    /// Helpers for reading a token that starts with '/' out of a content string.
    /// </summary>
    public static class PdfObjectDataType
    {
        /// <summary>
        /// Returns <c>true</c> when the character at index <paramref name="i"/> is '/'.
        /// </summary>
        public static bool IsStartChar(string content, int i)
        {
            return content[i] == '/';
        }

        /// <summary>
        /// Reads the token that starts at index <paramref name="i"/> and runs up to (not including)
        /// the next character for which <c>Statement.IsSeparator</c> is true, or to the end of
        /// <paramref name="content"/>.
        /// </summary>
        /// <param name="content">Text to read from.</param>
        /// <param name="i">On entry, the index of the leading '/'; on return, the index just past the token.</param>
        /// <returns>The token text, including the leading '/'.</returns>
        /// <exception cref="ArgumentException">The character at <paramref name="i"/> is not '/'.</exception>
        public static string GetRawData(string content, ref int i)
        {
            if (!IsStartChar(content, i))
                throw new ArgumentException("Content is not a PdfObjectDataType", "content");

            int start = i;
            // Stop at the end of the text as well: a token that is last in the content has no
            // trailing separator, and indexing past the end would throw.
            while (i < content.Length && !Statement.IsSeparator(content[i]))
                i++;
            return content.Substring(start, i - start);
        }
    }
}

// File: /BuildTablesFromPdf.Engine/Statements/TextObjectStatementLine.cs
using System;

namespace BuildTablesFromPdf.Engine.Statements
{
    /// <summary>
    /// One piece of text with its position, font and font height.
    /// </summary>
    public class TextObjectStatementLine : ICloneable
    {
        public Point Position { get; set; }
        public string Content { get; set; }
        public FontInfo Font { get; set; }

        public double FontHeight { get; set; }

        #region ICloneable Members

        /// <summary>
        /// Returns a new instance whose four properties are assigned from this one.
        /// <see cref="Font"/> is assigned by reference, so both instances share the same <see cref="FontInfo"/>.
        /// </summary>
        public TextObjectStatementLine Clone()
        {
            return new TextObjectStatementLine()
            {
                Position = Position,
                Content = Content,
                Font = Font,
                FontHeight = FontHeight
            };
        }


        object ICloneable.Clone()
        {
            return Clone();
        }

        #endregion
    }
}

// File: /BuildTablesFromPdf.Engine/Statements/MultiLineStatement.cs
using System;
using System.Collections.Generic;
using iTextSharp.text.pdf;

namespace BuildTablesFromPdf.Engine.Statements
{
    /// <summary>
    /// Base class for statements that accumulate several raw content entries.
    /// </summary>
    public abstract class MultiLineStatement : Statement
    {
        public PdfReader PdfReader { get; private set; }
        public int PageNumber { get; private set; }
        public Matrix BaseTransformMatrix { get; private set; }

        /// <summary>
        /// Stores the three arguments and creates an empty <see cref="RawContent"/> list.
        /// </summary>
        public MultiLineStatement(PdfReader pdfReader, int pageNumber, Matrix baseTransformMatrix)
        {
            PdfReader = pdfReader;
            PageNumber = pageNumber;
            BaseTransformMatrix = baseTransformMatrix;
            // NOTE(review): the generic argument was lost in this copy of the file;
            // string is assumed (one entry per raw line) - confirm against the original source.
            RawContent = new List<string>();
        }

        /// <summary>
        /// Closes the multi line statement.
        /// </summary>
        public abstract void CloseMultiLineStatement();

        public List<string> RawContent { get; private set; }
    }
}
6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/PdfDataTypes/PdfNumericDataType.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | 3 | namespace BuildTablesFromPdf.Engine 4 | { 5 | public static class PdfNumericDataType 6 | { 7 | public static bool IsStartChar(string content, int i) 8 | { 9 | return IsValidChar(content, i); 10 | } 11 | 12 | public static bool IsValidChar(string content, int i) 13 | { 14 | return content[i] >= '0' && content[i] <= '9' || content[i] == '.' || content[i] == '+' || content[i] == '-'; 15 | } 16 | 17 | 18 | public static string GetRawData(string content, ref int i) // collects the run of digit/'.'/'+'/'-' chars starting at i; i is left on the first unread char 19 | { 20 | if (!IsStartChar(content, i)) 21 | throw new ArgumentException("Content is not a PdfNumericDataType", "content"); 22 | 23 | string data = String.Empty; 24 | while (i < content.Length && IsValidChar(content, i)) // bound check: the number may be the last thing in content 25 | { 26 | data += content[i]; 27 | i++; 28 | } 29 | return data; 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/CMap/BFChar.cs: -------------------------------------------------------------------------------- 1 | using System.Globalization; 2 | using BuildTablesFromPdf.Engine.Statements; 3 | 4 | namespace BuildTablesFromPdf.Engine.CMap 5 | { 6 | public class BFChar : BFRange 7 | { 8 | public static BFRange Parse(string s, ref int startPosition) 9 | { 10 | string sBeginChar; 11 | string sUnicodeChar; 12 | 13 | Statement.SkipSpace(s, ref startPosition); 14 | sBeginChar = PdfHexStringDataType.GetRawData(s, ref startPosition); 15 | Statement.SkipSpace(s, ref startPosition); 16 | sUnicodeChar = PdfHexStringDataType.GetRawData(s, ref startPosition); 17 | 18 | int beginChar = int.Parse(sBeginChar.Substring(1, sBeginChar.Length - 2), NumberStyles.HexNumber); 19 | int unicodeChar = int.Parse(sUnicodeChar.Substring(1, 
sUnicodeChar.Length - 2), NumberStyles.HexNumber); 20 | 21 | return new BFRange() 22 | { 23 | BeginChar = beginChar, 24 | EndChar = beginChar, 25 | UnicodeChar = unicodeChar 26 | }; 27 | } 28 | 29 | } 30 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Umberto Ballestrazzi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/Statements/ColorStatement.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Globalization; 3 | 4 | namespace BuildTablesFromPdf.Engine.Statements 5 | { 6 | class ColorStatement : SingleLineStatement 7 | { 8 | public Color Color 9 | { 10 | get 11 | { 12 | string[] parts = RawContent.Split(new [] {' '}, StringSplitOptions.RemoveEmptyEntries); 13 | 14 | if (parts.Length > 0 && parts[0].StartsWith("/P")) // based on a pattern space 15 | return Color.White; 16 | if (parts.Length == 4) 17 | return new Color(float.Parse(parts[0], NumberFormatInfo.InvariantInfo), float.Parse(parts[1], NumberFormatInfo.InvariantInfo), float.Parse(parts[2], NumberFormatInfo.InvariantInfo)); 18 | if (parts.Length == 2) 19 | return new Color(float.Parse(parts[0], NumberFormatInfo.InvariantInfo), float.Parse(parts[0], NumberFormatInfo.InvariantInfo), float.Parse(parts[0], NumberFormatInfo.InvariantInfo)); 20 | else 21 | return new Color(float.Parse(parts[0], NumberFormatInfo.InvariantInfo), float.Parse(parts[0], NumberFormatInfo.InvariantInfo), float.Parse(parts[0], NumberFormatInfo.InvariantInfo)); 22 | } 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/PdfDataTypes/PdfBooleanDataType.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using BuildTablesFromPdf.Engine.Statements; 3 | 4 | namespace BuildTablesFromPdf.Engine 5 | { 6 | public static class PdfBooleanDataType 7 | { 8 | public static bool IsBoolean(string content, int i) 9 | { 10 | return IsBooleanTrue(content, i) || IsBooleanFalse(content, i); 11 | } 12 | 13 | private static bool IsBooleanFalse(string content, int i) // true when "false" starts at i and is followed by a separator or the end of content 14 | { 15 | return 16 | content.Length >= i + 5 && 17 | content.Substring(i, 5) == "false" && 18 | 
(content.Length == i + 5 || Statement.IsSeparator(content[i + 5])); 19 | } 20 | 21 | private static bool IsBooleanTrue(string content, int i) // true when "true" starts at i and is followed by a separator or the end of content 22 | { 23 | return 24 | content.Length >= i + 4 && 25 | content.Substring(i, 4) == "true" && 26 | (content.Length == i + 4 || Statement.IsSeparator(content[i + 4])); 27 | } 28 | 29 | 30 | public static string GetRawData(string content, ref int i) // returns the keyword at i and advances i past it; throws ArgumentException when no boolean starts at i 31 | { 32 | if (IsBooleanTrue(content, i)) 33 | { 34 | i += 4; 35 | return "true"; 36 | } 37 | else if (IsBooleanFalse(content, i)) 38 | { 39 | i += 5; 40 | return "false"; 41 | } 42 | else 43 | throw new ArgumentException("content is not a boolean"); 44 | } 45 | } 46 | } -------------------------------------------------------------------------------- /BuildTablesFromPdf.Console/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.InteropServices; 3 | 4 | // General Information about an assembly is controlled through the following 5 | // set of attributes. Change these attribute values to modify the information 6 | // associated with an assembly. 7 | [assembly: AssemblyTitle("BuildTablesFromPdf")] 8 | [assembly: AssemblyDescription("")] 9 | [assembly: AssemblyConfiguration("")] 10 | [assembly: AssemblyCompany("")] 11 | [assembly: AssemblyProduct("BuildTablesFromPdf")] 12 | [assembly: AssemblyCopyright("Copyright © 2017-2021")] 13 | [assembly: AssemblyTrademark("")] 14 | [assembly: AssemblyCulture("")] 15 | 16 | // Setting ComVisible to false makes the types in this assembly not visible 17 | // to COM components. If you need to access a type in this assembly from 18 | // COM, set the ComVisible attribute to true on that type. 
19 | [assembly: ComVisible(false)] 20 | 21 | // The following GUID is for the ID of the typelib if this project is exposed to COM 22 | [assembly: Guid("ee9bb99c-dd0c-4405-a6e4-2214d297e566")] 23 | 24 | // Version information for an assembly consists of the following four values: 25 | // 26 | // Major Version 27 | // Minor Version 28 | // Build Number 29 | // Revision 30 | // 31 | // You can specify all the values or you can default the Build and Revision Numbers 32 | // by using the '*' as shown below: 33 | // [assembly: AssemblyVersion("1.0.*")] 34 | [assembly: AssemblyVersion("1.0.3.0")] 35 | [assembly: AssemblyFileVersion("1.0.2.0")] 36 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.InteropServices; 3 | 4 | // General Information about an assembly is controlled through the following 5 | // set of attributes. Change these attribute values to modify the information 6 | // associated with an assembly. 7 | [assembly: AssemblyTitle("BuildTablesFromPdf.Engine")] 8 | [assembly: AssemblyDescription("")] 9 | [assembly: AssemblyConfiguration("")] 10 | [assembly: AssemblyCompany("")] 11 | [assembly: AssemblyProduct("BuildTablesFromPdf.Engine")] 12 | [assembly: AssemblyCopyright("Copyright © 2017-2023")] 13 | [assembly: AssemblyTrademark("")] 14 | [assembly: AssemblyCulture("")] 15 | 16 | // Setting ComVisible to false makes the types in this assembly not visible 17 | // to COM components. If you need to access a type in this assembly from 18 | // COM, set the ComVisible attribute to true on that type. 
19 | [assembly: ComVisible(false)] 20 | 21 | // The following GUID is for the ID of the typelib if this project is exposed to COM 22 | [assembly: Guid("3414dbb0-904c-4699-bc9e-9a08d25f55ff")] 23 | 24 | // Version information for an assembly consists of the following four values: 25 | // 26 | // Major Version 27 | // Minor Version 28 | // Build Number 29 | // Revision 30 | // 31 | // You can specify all the values or you can default the Build and Revision Numbers 32 | // by using the '*' as shown below: 33 | // [assembly: AssemblyVersion("1.0.*")] 34 | [assembly: AssemblyVersion("1.0.5.0")] 35 | [assembly: AssemblyFileVersion("1.0.5.0")] 36 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Renderer/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.InteropServices; 3 | 4 | // General Information about an assembly is controlled through the following 5 | // set of attributes. Change these attribute values to modify the information 6 | // associated with an assembly. 7 | [assembly: AssemblyTitle("BuildTablesFromPdf.Renderer")] 8 | [assembly: AssemblyDescription("")] 9 | [assembly: AssemblyConfiguration("")] 10 | [assembly: AssemblyCompany("")] 11 | [assembly: AssemblyProduct("BuildTablesFromPdf.Renderer")] 12 | [assembly: AssemblyCopyright("Copyright © 2017-2021")] 13 | [assembly: AssemblyTrademark("")] 14 | [assembly: AssemblyCulture("")] 15 | 16 | // Setting ComVisible to false makes the types in this assembly not visible 17 | // to COM components. If you need to access a type in this assembly from 18 | // COM, set the ComVisible attribute to true on that type. 
19 | [assembly: ComVisible(false)] 20 | 21 | // The following GUID is for the ID of the typelib if this project is exposed to COM 22 | [assembly: Guid("05d22f0f-4b8a-4481-95bc-53b5083251cf")] 23 | 24 | // Version information for an assembly consists of the following four values: 25 | // 26 | // Major Version 27 | // Minor Version 28 | // Build Number 29 | // Revision 30 | // 31 | // You can specify all the values or you can default the Build and Revision Numbers 32 | // by using the '*' as shown below: 33 | // [assembly: AssemblyVersion("1.0.*")] 34 | [assembly: AssemblyVersion("1.0.3.0")] 35 | [assembly: AssemblyFileVersion("1.0.2.0")] 36 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/PdfDataTypes/PdfStringDataType.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | 3 | namespace BuildTablesFromPdf.Engine 4 | { 5 | public static class PdfStringDataType 6 | { 7 | public static string GetContentFromEscapedContent(string escapedContent) 8 | { 9 | if (escapedContent == null) throw new ArgumentNullException("escapedContent"); 10 | 11 | if (!escapedContent.StartsWith("(") || !escapedContent.EndsWith(")")) 12 | throw new ArgumentException(String.Format("Error retrieving content from escaped content '{0}'", escapedContent), "escapedContent"); 13 | 14 | return System.Text.RegularExpressions.Regex.Unescape(escapedContent.Substring(1, escapedContent.Length - 2)); 15 | } 16 | 17 | public static bool IsStartChar(string content, int i) 18 | { 19 | return content[i] == '('; 20 | } 21 | 22 | public static string GetRawData(string content, ref int i) 23 | { 24 | if (!IsStartChar(content, i)) 25 | throw new ArgumentException("The content is not a PdfStringDataType"); 26 | 27 | string data = String.Empty; 28 | while (content[i] != ')') 29 | { 30 | if (content[i] == '\\') 31 | { 32 | data += content[i]; 33 | i++; 34 | } 35 | data += content[i]; 36 | i++; 37 | } 38 | 
data += content[i]; 39 | i++; 40 | return data; 41 | } 42 | } 43 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # State of the library 2 | The library works with only a few PDFs, for two main reasons: 3 | 1. The transformation matrix and the graphic state are not handled 4 | 2. The fonts/encodings are not correctly handled 5 | 6 | # ExtractTablesFromPdf 7 | Extract tables (and paragraphs outside tables) from a PDF 8 | 9 | 10 | ## License limitations 11 | **(please read before use)** 12 | 13 | This software is released under the MIT license but uses iTextSharp v.4.1.6, which is released under the MPL/LGPL licenses. Before using this software you should also agree to the iTextSharp v.4.1.6 license. 14 | Also, take care if you upgrade iTextSharp because newer versions are released under AGPL. 15 | 16 | ## What's this 17 | PDF is a file format used to define device-independent page output. 18 | This project intends to retrieve text and tables from a PDF. 19 | 20 | The main part is the **Engine**. 21 | 22 | The **Renderer** is a debug window to understand what's happening. 23 | 24 | ## Usage 25 | 26 | Call 27 | 28 | var pages = ExtractText.Read(fileName); 29 | 30 | to read all the pages. 31 | 32 | Then, for every page, call 33 | 34 | Page.DetermineTableStructures(); 35 | Page.DetermineParagraphs(); 36 | Page.FillContent(); 37 | 38 | To check if you already called the methods above, use 39 | 40 | Page.IsRefreshed 41 | 42 | After that you'll be able to access 43 | 44 | Page.Contents 45 | 46 | Contents is a collection of IPageContent ordered from top of page to bottom. 
47 | An IPageContent can be a 48 | - Paragraph that contains text (Content) 49 | - Table that contains a matrix of text (Content[,]) 50 | 51 | 52 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine.Test/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // General Information about an assembly is controlled through the following 6 | // set of attributes. Change these attribute values to modify the information 7 | // associated with an assembly. 8 | [assembly: AssemblyTitle("BuildTablesFromPdf.Engine.Test")] 9 | [assembly: AssemblyDescription("")] 10 | [assembly: AssemblyConfiguration("")] 11 | [assembly: AssemblyCompany("")] 12 | [assembly: AssemblyProduct("BuildTablesFromPdf.Engine.Test")] 13 | [assembly: AssemblyCopyright("Copyright © 2017")] 14 | [assembly: AssemblyTrademark("")] 15 | [assembly: AssemblyCulture("")] 16 | 17 | // Setting ComVisible to false makes the types in this assembly not visible 18 | // to COM components. If you need to access a type in this assembly from 19 | // COM, set the ComVisible attribute to true on that type. 
20 | [assembly: ComVisible(false)] 21 | 22 | // The following GUID is for the ID of the typelib if this project is exposed to COM 23 | [assembly: Guid("50ffe8d8-d89b-42d9-8a44-6337a48c5843")] 24 | 25 | // Version information for an assembly consists of the following four values: 26 | // 27 | // Major Version 28 | // Minor Version 29 | // Build Number 30 | // Revision 31 | // 32 | // You can specify all the values or you can default the Build and Revision Numbers 33 | // by using the '*' as shown below: 34 | // [assembly: AssemblyVersion("1.0.*")] 35 | [assembly: AssemblyVersion("1.0.3.0")] 36 | [assembly: AssemblyFileVersion("1.0.3.0")] 37 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Console/Example.pdf.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 |

This is the first paragraph

T1.H1T1.H2T1.H3T1.H4T1.H5T1.H6
123456
24681012
369121518
This is a table (because the text is inside lines

This is the second paragraph

T2.H1T2.H2T2.H3T2.H4
Outside T21234
2468
36912

The page content is

Paragraph

Table

Table

Paragraph

Table (with text outside)

This paragraphs

11 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/Statements/RectangleStatement.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Globalization; 3 | 4 | namespace BuildTablesFromPdf.Engine.Statements 5 | { 6 | internal class RectangleStatement : SingleLineStatement 7 | { 8 | public RectangleStatement(string rawContent) 9 | { 10 | RawContent = rawContent; 11 | Corner = Point.Parse(rawContent); 12 | Width = float.Parse(rawContent.Split(' ')[2], NumberFormatInfo.InvariantInfo); 13 | Height = float.Parse(rawContent.Split(' ')[3], NumberFormatInfo.InvariantInfo); 14 | } 15 | 16 | public readonly Point Corner; 17 | public readonly float Width; 18 | public readonly float Height; 19 | 20 | public List GetLines() 21 | { 22 | var lines = new List(); 23 | if (Corner != new Point(Corner.X + Width, Corner.Y)) 24 | lines.Add(new Line(Corner, new Point(Corner.X + Width, Corner.Y))); 25 | if (new Point(Corner.X + Width, Corner.Y) != new Point(Corner.X + Width, Corner.Y + Height)) 26 | lines.Add(new Line(new Point(Corner.X + Width, Corner.Y), new Point(Corner.X + Width, Corner.Y + Height))); 27 | if (new Point(Corner.X , Corner.Y + Height) != new Point(Corner.X + Width, Corner.Y + Height)) 28 | lines.Add(new Line(new Point(Corner.X , Corner.Y + Height), new Point(Corner.X + Width, Corner.Y + Height))); 29 | if (Corner != new Point(Corner.X, Corner.Y + Height)) 30 | lines.Add(new Line(Corner, new Point(Corner.X, Corner.Y + Height))); 31 | return lines; 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /BuildTablesFromPdf.Renderer/Properties/Settings.Designer.cs: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // This code was generated by a tool. 
4 | // Runtime Version:4.0.30319.42000 5 | // 6 | // Changes to this file may cause incorrect behavior and will be lost if 7 | // the code is regenerated. 8 | // 9 | //------------------------------------------------------------------------------ 10 | 11 | namespace BuildTablesFromPdf.Renderer.Properties { 12 | 13 | 14 | [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()] 15 | [global::System.CodeDom.Compiler.GeneratedCodeAttribute("Microsoft.VisualStudio.Editors.SettingsDesigner.SettingsSingleFileGenerator", "12.0.0.0")] 16 | internal sealed partial class Settings : global::System.Configuration.ApplicationSettingsBase { 17 | 18 | private static Settings defaultInstance = ((Settings)(global::System.Configuration.ApplicationSettingsBase.Synchronized(new Settings()))); 19 | 20 | public static Settings Default { 21 | get { 22 | return defaultInstance; 23 | } 24 | } 25 | 26 | [global::System.Configuration.UserScopedSettingAttribute()] 27 | [global::System.Diagnostics.DebuggerNonUserCodeAttribute()] 28 | [global::System.Configuration.DefaultSettingValueAttribute("")] 29 | public string FileName { 30 | get { 31 | return ((string)(this["FileName"])); 32 | } 33 | set { 34 | this["FileName"] = value; 35 | } 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/Statements/TextObjectStatement.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Globalization; 4 | using System.Linq; 5 | using BuildTablesFromPdf.Engine.CMap; 6 | using iTextSharp.text.pdf; 7 | 8 | namespace BuildTablesFromPdf.Engine.Statements 9 | { 10 | // From BT to ET 11 | public class TextObjectStatement:MultiLineStatement 12 | { 13 | 14 | public TextObjectStatement(PdfReader pdfReader, int pageNumber, Matrix baseTransformMatrix) 15 | : base(pdfReader, pageNumber, baseTransformMatrix) 16 | { 17 | Lines = new 
List(); 18 | } 19 | 20 | public List Lines { get; private set; } 21 | 22 | public override void CloseMultiLineStatement() 23 | { 24 | 25 | } 26 | 27 | // ReSharper disable once InconsistentNaming 28 | public static string GetTJContent(string rawContent, CMapToUnicode cMapToUnicode, EncodingDifferenceToUnicode encodingDifferenceToUnicode) 29 | { 30 | string content; 31 | string rawArray = rawContent.Remove(rawContent.Length - 2).Trim(); 32 | if (string.IsNullOrWhiteSpace(rawArray)) 33 | return null; 34 | PdfArrayDataType pdfArrayDataType = PdfArrayDataType.Parse(rawArray); 35 | content = string.Empty; 36 | foreach (string item in pdfArrayDataType.Elements.Where(_ => _ is string)) 37 | { 38 | string escapedContent; 39 | escapedContent = item.Trim(); 40 | content += 41 | PdfHexStringDataType.IsStartChar(escapedContent) ? 42 | PdfFontHelper.ToUnicode(PdfHexStringDataType.GetHexContent(escapedContent), cMapToUnicode, encodingDifferenceToUnicode).ToString() : 43 | PdfFontHelper.ToUnicode(PdfStringDataType.GetContentFromEscapedContent(escapedContent), cMapToUnicode, encodingDifferenceToUnicode); 44 | } 45 | return content; 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine.Test/SimpleTextExtractorTest.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.IO; 3 | using CR.Test; 4 | using Microsoft.VisualStudio.TestTools.UnitTesting; 5 | 6 | namespace BuildTablesFromPdf.Engine.Test 7 | { 8 | [TestClass] 9 | public class SimpleTextExtractorTest 10 | { 11 | 12 | private const string Path = @"TestFiles\"; 13 | 14 | 15 | [TestMethod] 16 | public void SimpleTextExtractorTestRun() 17 | { 18 | var pdfFileList = Directory.GetFiles(Path, "*.pdf"); 19 | 20 | foreach (var pdfFilePath in pdfFileList) 21 | CheckFile(pdfFilePath); 22 | 23 | } 24 | 25 | 26 | private static void CheckFile(string pdfFilePath) 27 | { 28 | 
Console.WriteLine("Reading " + System.IO.Path.GetFileName(pdfFilePath)); 29 | string fileContent = SimpleTextExtractor.ReadPdfFile(pdfFilePath); 30 | 31 | 32 | string txtFileName = pdfFilePath + ".Text.txt"; 33 | 34 | if (File.Exists(txtFileName)) 35 | { 36 | Console.WriteLine("Testing " + System.IO.Path.GetFileName(pdfFilePath)); 37 | string txtFileContent = File.ReadAllText(txtFileName); 38 | if (txtFileContent != fileContent) 39 | { 40 | string[] txtFileLines = txtFileContent.Replace("\r\n", "\r").Split('\r'); 41 | string[] txtLines = fileContent.Replace("\r\n", "\r").Split('\r'); 42 | string diff = MHDiff.GetDiff(txtFileLines, txtLines); 43 | Console.WriteLine("Files are different"); 44 | Console.WriteLine(diff); 45 | throw new Exception("Wrong content in file " + pdfFilePath); 46 | } 47 | } 48 | else 49 | { 50 | Console.WriteLine(System.IO.Path.GetFileName(pdfFilePath) + " NOT TESTED!!!"); 51 | Console.WriteLine("Creating txt file " + txtFileName); 52 | File.WriteAllText(txtFileName, fileContent); 53 | } 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Renderer/frmNotepad.Designer.cs: -------------------------------------------------------------------------------- 1 | namespace BuildTablesFromPdf.Renderer 2 | { 3 | partial class frmNotepad 4 | { 5 | /// 6 | /// Required designer variable. 7 | /// 8 | private System.ComponentModel.IContainer components = null; 9 | 10 | /// 11 | /// Clean up any resources being used. 12 | /// 13 | /// true if managed resources should be disposed; otherwise, false. 14 | protected override void Dispose(bool disposing) 15 | { 16 | if (disposing && (components != null)) 17 | { 18 | components.Dispose(); 19 | } 20 | base.Dispose(disposing); 21 | } 22 | 23 | #region Windows Form Designer generated code 24 | 25 | /// 26 | /// Required method for Designer support - do not modify 27 | /// the contents of this method with the code editor. 
28 | /// 29 | private void InitializeComponent() 30 | { 31 | this.txtNotepad = new System.Windows.Forms.TextBox(); 32 | this.SuspendLayout(); 33 | // 34 | // txtNotepad 35 | // 36 | this.txtNotepad.Dock = System.Windows.Forms.DockStyle.Fill; 37 | this.txtNotepad.Location = new System.Drawing.Point(0, 0); 38 | this.txtNotepad.Multiline = true; 39 | this.txtNotepad.Name = "txtNotepad"; 40 | this.txtNotepad.ScrollBars = System.Windows.Forms.ScrollBars.Both; 41 | this.txtNotepad.Size = new System.Drawing.Size(455, 432); 42 | this.txtNotepad.TabIndex = 0; 43 | this.txtNotepad.WordWrap = false; 44 | // 45 | // frmNotepad 46 | // 47 | this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F); 48 | this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font; 49 | this.ClientSize = new System.Drawing.Size(455, 432); 50 | this.Controls.Add(this.txtNotepad); 51 | this.Name = "frmNotepad"; 52 | this.Text = "Notepad"; 53 | this.ResumeLayout(false); 54 | this.PerformLayout(); 55 | 56 | } 57 | 58 | #endregion 59 | 60 | private System.Windows.Forms.TextBox txtNotepad; 61 | } 62 | } -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine.Test/BuildTablesFromPdfTest.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.IO; 3 | using CR.Test; 4 | using Microsoft.VisualStudio.TestTools.UnitTesting; 5 | 6 | namespace BuildTablesFromPdf.Engine.Test 7 | { 8 | [TestClass] 9 | public class BuildTablesFromPdfTest 10 | { 11 | 12 | private const string Path = @"TestFiles\"; 13 | 14 | 15 | [TestMethod] 16 | public void BuildTablesFromPdfTestRun() 17 | { 18 | var pdfFileList = Directory.GetFiles(Path, "*.pdf"); 19 | 20 | foreach (var pdfFilePath in pdfFileList) 21 | CheckFile(pdfFilePath); 22 | 23 | } 24 | 25 | 26 | private static void CheckFile(string pdfFilePath) 27 | { 28 | Console.WriteLine("Reading " + System.IO.Path.GetFileName(pdfFilePath)); 29 | PageCollection pages 
= ContentExtractor.ReadPdfFileAndRefreshContent(pdfFilePath); 30 | string fileContent = string.Empty; 31 | foreach (Page page in pages) 32 | { 33 | fileContent += "======================================================\r\n"; 34 | fileContent += page.ToString(); 35 | } 36 | 37 | 38 | string txtFileName = pdfFilePath + ".txt"; 39 | 40 | if (File.Exists(txtFileName)) 41 | { 42 | Console.WriteLine("Testing " + System.IO.Path.GetFileName(pdfFilePath)); 43 | string txtFileContent = File.ReadAllText(txtFileName); 44 | if (txtFileContent != fileContent) 45 | { 46 | string[] txtFileLines = txtFileContent.Replace("\r\n", "\r").Split('\r'); 47 | string[] txtLines = fileContent.Replace("\r\n", "\r").Split('\r'); 48 | string diff = MHDiff.GetDiff(txtFileLines, txtLines); 49 | Console.WriteLine("Files are different"); 50 | Console.WriteLine(diff); 51 | throw new Exception("Wrong content in file " + pdfFilePath); 52 | } 53 | } 54 | else 55 | { 56 | Console.WriteLine(System.IO.Path.GetFileName(pdfFilePath) + " NOT TESTED!!!"); 57 | Console.WriteLine("Creating txt file " + txtFileName); 58 | File.WriteAllText(txtFileName, fileContent); 59 | } 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 
11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 
53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Renderer/HtmlConverter.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using BuildTablesFromPdf.Engine; 3 | using BuildTablesFromPdf.Engine.Tables; 4 | 5 | namespace BuildTablesFromPdf.Renderer 6 | { 7 | public static class HtmlConverter 8 | { 9 | public static string Convert(PageCollection pages) 10 | { 11 | string content = string.Empty; 12 | foreach (Page page in pages) 13 | { 14 | if (!string.IsNullOrEmpty(content)) 15 | content += "
"; 16 | foreach (IPageContent pageContent in page.Contents) 17 | { 18 | if (pageContent is Paragraph) 19 | { 20 | content += string.Format("

{0}

", ((Paragraph)pageContent).Content); 21 | } 22 | else if (pageContent is Table) 23 | { 24 | Table table = (Table)pageContent; 25 | content += ""; 26 | for (int rowIndex = 0; rowIndex < table.Rows.Count; rowIndex++) 27 | { 28 | content += ""; 29 | for (int columnIndex = 0; columnIndex < table.Columns.Count + 2; columnIndex++) 30 | { 31 | string borderStyle = 32 | columnIndex == 0 || columnIndex == table.Columns.Count + 1 ? 33 | "style=\"border: none;\"" : 34 | "" 35 | ; 36 | 37 | if (rowIndex == 0) 38 | { 39 | content += ""; 42 | } 43 | else 44 | { 45 | content += ""; 48 | } 49 | } 50 | content += ""; 51 | } 52 | content += "
"; 40 | content += table[rowIndex, columnIndex]; 41 | content += ""; 46 | content += table[rowIndex, columnIndex]; 47 | content += "
"; 53 | } 54 | else 55 | { 56 | content += pageContent.ToString().Replace("\r\n", "
"); 57 | } 58 | } 59 | } 60 | return Properties.Resources.HTML_Header + content + Properties.Resources.HTML_Footer; 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine.Test/PdfArrayDataTypeTest.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using Microsoft.VisualStudio.TestTools.UnitTesting; 3 | 4 | namespace BuildTablesFromPdf.Engine.Test 5 | { 6 | [TestClass] 7 | public class PdfArrayDataTypeTest 8 | { 9 | [TestMethod] 10 | public void TestGetRawDataBaseTest() 11 | { 12 | var inputData = "[(:)19(-)] TJ\n0 -13.65 Td\n(\"$$7.#8-8:;9-) Tj\n0 -14.7 Td\n(+) Tj\n[(8)75(\")75(89<9#8-6)(\")75(89-)] TJ\n144 29.2 Td\n(\"96) Tj\n0 -14.65 Td\n($.!!9#8) Tj\n[(-)31(\"$$7.#8)] TJ\n-65.6 -12.1 Td\n/c 7 Tf\n(=>?@\\)*-ABC-DE'>F) Tj\n17.55 12.75 Td\n(=>?@\\)*-&GH) Tj\n14.95 12.5 Td\n(IJKL\\)*) Tj\n-13.8 30 Td\n(=>?@\\)*-MN') Tj\n46.9 -56.85 Td\n/c 8 Tf\n(O5POQP4O45) Tj\n-144 40 Td\n(/ \"#-#7-) Tj\n144 0.85 Td\n(\"931-O42O-OR1S-4O41-13R3-5OQ) Tj\n0 14.75 Td\n(OR1S4O4113R35OQ) Tj\n-10.45 -15.6 Td\n(T) Tj\n0 -12.5 Td\n(T) Tj\n0 -14.65 Td\n(T) Tj\n0 -12.5 Td\n(T) Tj\n0.1 55.25 Td\n(T) Tj\n-77.05 -14.1 Td\n/c 7 Tf\n(UV'WK\\)*-=>?@J\\)-U\\)GX\\)*-MN'\\)*) Tj\nET\nQ\nBT\n1 0 0 1 240.35 -205.7 Tm\n0 0 0 sc\n/c 8 Tf\n(Y!7<-) Tj\n115.15 1.95 Td\n(ZK) Tj\n17.85 -0.25 Td\n(T) Tj\n9.6 1.1 Td\n(OQPO[P4O44) Tj\n63.7 -0.85 Td\n[(8)75(\\\\)] TJ\n23.85 0 Td\n(U\\)]) Tj\n15.25 0 Td\n(T) Tj\n7.1 0.85 Td\n(5QPQ4P4O44) Tj\n-467.5 -19.95 Td\n/g 9 Tf\n( !\"#$%&'\\(\"\\)*) Tj\n519.8 0 Td\n(+\\(,\"$%) Tj\n-23.2 -11.5 Td\n/c 8 Tf\n(-- \"0"; 13 | int i = 0; 14 | var outputData = PdfArrayDataType.GetRawData(inputData, ref i); 15 | Assert.AreEqual("[(:)19(-)]", outputData); 16 | } 17 | 18 | [TestMethod] 19 | public void TestGetRawDataTextWithSquareBracketTest() 20 | { 21 | var inputData = "[(:])19(-)] TJ\n0 -13.65 Td\n(\"$$7.#8-8:;9-) Tj\n0 -14.7 Td\n(+) 
Tj\n[(8)75(\")75(89<9#8-6)(\")75(89-)] TJ\n144 29.2 Td\n(\"96) Tj\n0 -14.65 Td\n($.!!9#8) Tj\n[(-)31(\"$$7.#8)] TJ\n-65.6 -12.1 Td\n/c 7 Tf\n(=>?@\\)*-ABC-DE'>F) Tj\n17.55 12.75 Td\n(=>?@\\)*-&GH) Tj\n14.95 12.5 Td\n(IJKL\\)*) Tj\n-13.8 30 Td\n(=>?@\\)*-MN') Tj\n46.9 -56.85 Td\n/c 8 Tf\n(O5POQP4O45) Tj\n-144 40 Td\n(/ \"#-#7-) Tj\n144 0.85 Td\n(\"931-O42O-OR1S-4O41-13R3-5OQ) Tj\n0 14.75 Td\n(OR1S4O4113R35OQ) Tj\n-10.45 -15.6 Td\n(T) Tj\n0 -12.5 Td\n(T) Tj\n0 -14.65 Td\n(T) Tj\n0 -12.5 Td\n(T) Tj\n0.1 55.25 Td\n(T) Tj\n-77.05 -14.1 Td\n/c 7 Tf\n(UV'WK\\)*-=>?@J\\)-U\\)GX\\)*-MN'\\)*) Tj\nET\nQ\nBT\n1 0 0 1 240.35 -205.7 Tm\n0 0 0 sc\n/c 8 Tf\n(Y!7<-) Tj\n115.15 1.95 Td\n(ZK) Tj\n17.85 -0.25 Td\n(T) Tj\n9.6 1.1 Td\n(OQPO[P4O44) Tj\n63.7 -0.85 Td\n[(8)75(\\\\)] TJ\n23.85 0 Td\n(U\\)]) Tj\n15.25 0 Td\n(T) Tj\n7.1 0.85 Td\n(5QPQ4P4O44) Tj\n-467.5 -19.95 Td\n/g 9 Tf\n( !\"#$%&'\\(\"\\)*) Tj\n519.8 0 Td\n(+\\(,\"$%) Tj\n-23.2 -11.5 Td\n/c 8 Tf\n(-- \"0"; 22 | int i = 0; 23 | var outputData = PdfArrayDataType.GetRawData(inputData, ref i); 24 | Assert.AreEqual("[(:])19(-)]", outputData); 25 | } 26 | 27 | 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/CMap/BFRange.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Globalization; 3 | using BuildTablesFromPdf.Engine.Statements; 4 | 5 | namespace BuildTablesFromPdf.Engine.CMap 6 | { 7 | public class BFRange 8 | { 9 | public int BeginChar { get; set; } 10 | public int EndChar { get; set; } 11 | public int? 
UnicodeChar { get; set; } 12 | public int[] UnicodeChars { get; set; } 13 | 14 | public static BFRange Parse(string s, ref int startPosition) 15 | { 16 | string sBeginChar; 17 | string sEndChar; 18 | 19 | Statement.SkipSpace(s, ref startPosition); 20 | sBeginChar = PdfHexStringDataType.GetRawData(s, ref startPosition); 21 | Statement.SkipSpace(s, ref startPosition); 22 | sEndChar = PdfHexStringDataType.GetRawData(s, ref startPosition); 23 | Statement.SkipSpace(s, ref startPosition); 24 | 25 | int beginChar = int.Parse(sBeginChar.Substring(1, sBeginChar.Length - 2), NumberStyles.HexNumber); 26 | int endChar = int.Parse(sEndChar.Substring(1, sEndChar.Length - 2), NumberStyles.HexNumber); 27 | 28 | if (PdfArrayDataType.IsStartChar(s, startPosition)) 29 | { 30 | var rawData = PdfArrayDataType.GetRawData(s, ref startPosition); 31 | var array = PdfArrayDataType.Parse(rawData); 32 | var unicodeChars = new int[array.Elements.Count]; 33 | for (int i = 0; i < array.Elements.Count; i++) 34 | { 35 | string sUnicodeChar; 36 | int unused = 0; 37 | sUnicodeChar = PdfHexStringDataType.GetRawData(array.StringElements[i], ref unused); 38 | int unicodeChar = int.Parse(sUnicodeChar.Substring(1, sUnicodeChar.Length - 2), NumberStyles.HexNumber); 39 | unicodeChars[i] = unicodeChar; 40 | } 41 | 42 | return new BFRange() 43 | { 44 | BeginChar = beginChar, 45 | EndChar = endChar, 46 | UnicodeChars = unicodeChars 47 | }; 48 | 49 | } 50 | else 51 | { 52 | string sUnicodeChar; 53 | sUnicodeChar = PdfHexStringDataType.GetRawData(s, ref startPosition); 54 | 55 | int unicodeChar = int.Parse(sUnicodeChar.Substring(1, sUnicodeChar.Length - 2), NumberStyles.HexNumber); 56 | 57 | return new BFRange() 58 | { 59 | BeginChar = beginChar, 60 | EndChar = endChar, 61 | UnicodeChar = unicodeChar 62 | }; 63 | } 64 | } 65 | 66 | public override string ToString() 67 | { 68 | return string.Format("{0}-{1} {2}({3})", BeginChar, EndChar, UnicodeChar, (char)UnicodeChar.GetValueOrDefault('?')); 69 | } 70 | } 71 | 
} 72 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2013 4 | VisualStudioVersion = 12.0.30723.0 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BuildTablesFromPdf.Console", "BuildTablesFromPdf.Console\BuildTablesFromPdf.Console.csproj", "{4079BDE2-6CAD-43C3-8C1B-A134840D95C1}" 7 | EndProject 8 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BuildTablesFromPdf.Engine", "BuildTablesFromPdf.Engine\BuildTablesFromPdf.Engine.csproj", "{9883FA0A-CB8E-4053-A3E9-58FFE5269320}" 9 | EndProject 10 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BuildTablesFromPdf.Renderer", "BuildTablesFromPdf.Renderer\BuildTablesFromPdf.Renderer.csproj", "{DF805107-8789-4B69-8BE0-74AAF113AC79}" 11 | EndProject 12 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{8CE7025E-9E95-4174-8CB8-D952AF0B77A1}" 13 | ProjectSection(SolutionItems) = preProject 14 | LICENSE = LICENSE 15 | README.md = README.md 16 | EndProjectSection 17 | EndProject 18 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BuildTablesFromPdf.Engine.Test", "BuildTablesFromPdf.Engine.Test\BuildTablesFromPdf.Engine.Test.csproj", "{0EEDBCC7-E56A-435E-B41F-09953B8D9CFC}" 19 | EndProject 20 | Global 21 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 22 | Debug|Any CPU = Debug|Any CPU 23 | Release|Any CPU = Release|Any CPU 24 | EndGlobalSection 25 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 26 | {4079BDE2-6CAD-43C3-8C1B-A134840D95C1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 27 | {4079BDE2-6CAD-43C3-8C1B-A134840D95C1}.Debug|Any CPU.Build.0 = Debug|Any CPU 28 | {4079BDE2-6CAD-43C3-8C1B-A134840D95C1}.Release|Any CPU.ActiveCfg = Release|Any CPU 29 | 
{4079BDE2-6CAD-43C3-8C1B-A134840D95C1}.Release|Any CPU.Build.0 = Release|Any CPU 30 | {9883FA0A-CB8E-4053-A3E9-58FFE5269320}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 31 | {9883FA0A-CB8E-4053-A3E9-58FFE5269320}.Debug|Any CPU.Build.0 = Debug|Any CPU 32 | {9883FA0A-CB8E-4053-A3E9-58FFE5269320}.Release|Any CPU.ActiveCfg = Release|Any CPU 33 | {9883FA0A-CB8E-4053-A3E9-58FFE5269320}.Release|Any CPU.Build.0 = Release|Any CPU 34 | {DF805107-8789-4B69-8BE0-74AAF113AC79}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 35 | {DF805107-8789-4B69-8BE0-74AAF113AC79}.Debug|Any CPU.Build.0 = Debug|Any CPU 36 | {DF805107-8789-4B69-8BE0-74AAF113AC79}.Release|Any CPU.ActiveCfg = Release|Any CPU 37 | {DF805107-8789-4B69-8BE0-74AAF113AC79}.Release|Any CPU.Build.0 = Release|Any CPU 38 | {0EEDBCC7-E56A-435E-B41F-09953B8D9CFC}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 39 | {0EEDBCC7-E56A-435E-B41F-09953B8D9CFC}.Debug|Any CPU.Build.0 = Debug|Any CPU 40 | {0EEDBCC7-E56A-435E-B41F-09953B8D9CFC}.Release|Any CPU.ActiveCfg = Release|Any CPU 41 | {0EEDBCC7-E56A-435E-B41F-09953B8D9CFC}.Release|Any CPU.Build.0 = Release|Any CPU 42 | EndGlobalSection 43 | GlobalSection(SolutionProperties) = preSolution 44 | HideSolutionNode = FALSE 45 | EndGlobalSection 46 | EndGlobal 47 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/Paragraph.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Diagnostics; 4 | using System.Linq; 5 | 6 | namespace BuildTablesFromPdf.Engine 7 | { 8 | [DebuggerDisplay("{DebuggerDisplay}")] 9 | public class Paragraph : IPageContent, IFormattable 10 | { 11 | private List _Contents = new List(); 12 | 13 | public Paragraph(double y) 14 | { 15 | 16 | Y = y; 17 | } 18 | 19 | public double Y { get; private set; } 20 | 21 | public string Content 22 | { 23 | get 24 | { 25 | string result = null; 26 | foreach (ParagraphContent content in 
_Contents.OrderBy(_ => _.Point.X)) 27 | { 28 | if (result == null) 29 | result = content.Content; 30 | else 31 | result = result + " " + content.Content; 32 | } 33 | 34 | return result; 35 | } 36 | 37 | } 38 | 39 | public void AddText(Point point, string content) 40 | { 41 | if (!Contains(point)) 42 | throw new InvalidOperationException("The point is not on the paragraph"); 43 | 44 | _Contents.Add(new ParagraphContent(point, content)); 45 | } 46 | 47 | public bool Contains(Point point) 48 | { 49 | return Y - ContentExtractor.Tolerance < point.Y && point.Y < Y + ContentExtractor.Tolerance * 3; 50 | } 51 | 52 | public bool Contains(double y) 53 | { 54 | return Y - ContentExtractor.Tolerance < y && y < Y + ContentExtractor.Tolerance * 3; 55 | } 56 | 57 | #region IFormattable 58 | 59 | // ReSharper disable once UnusedMember.Local 60 | private string DebuggerDisplay 61 | { 62 | get { return ToString("d"); } 63 | } 64 | 65 | public override string ToString() 66 | { 67 | return ToString(""); 68 | } 69 | 70 | public string ToString(string format) 71 | { 72 | switch (format) 73 | { 74 | case "s": 75 | case "": 76 | case null: 77 | return Content; 78 | case "d": 79 | return string.Format("{0} {1}", Y, Content); 80 | default: 81 | throw new FormatException(); 82 | } 83 | } 84 | 85 | public string ToString(string format, IFormatProvider formatProvider) 86 | { 87 | return ToString(format); 88 | } 89 | 90 | #endregion 91 | 92 | 93 | private class ParagraphContent 94 | { 95 | public ParagraphContent(Point point, string content) 96 | { 97 | Point = point; 98 | Content = content; 99 | } 100 | 101 | public Point Point { get; private set; } 102 | public string Content { get; private set; } 103 | } 104 | 105 | } 106 | 107 | } 108 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/PdfDataTypes/PdfHexStringDataType.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | 3 | 
namespace BuildTablesFromPdf.Engine 4 | { 5 | public static class PdfHexStringDataType 6 | { 7 | 8 | public static bool IsStartChar(string content, int i) 9 | { 10 | return content[i] == '<'; 11 | } 12 | 13 | public static bool IsStartChar(string content) 14 | { 15 | return content.Length > 0 && content[0] == '<'; 16 | } 17 | 18 | 19 | public static string GetRawData(string content, ref int i) 20 | { 21 | if (!IsStartChar(content, i)) 22 | throw new ArgumentException("The content is not a PdfStringDataType"); 23 | 24 | string data = String.Empty; 25 | while (content[i] != '>') 26 | { 27 | data += content[i]; 28 | i++; 29 | } 30 | data += content[i]; 31 | i++; 32 | return data; 33 | } 34 | 35 | public static string GetContent(string escapedContent) 36 | { 37 | if (escapedContent == null) throw new ArgumentNullException("escapedContent"); 38 | escapedContent = escapedContent.Trim(); 39 | if (!escapedContent.StartsWith("<") || !escapedContent.EndsWith(">")) 40 | throw new ArgumentException(String.Format("Error retrieving content from escaped content '{0}'", escapedContent), "escapedContent"); 41 | 42 | string content = string.Empty; 43 | for (int i = 1; i < escapedContent.Length - 1; i += 2) 44 | { 45 | char c = (char)int.Parse(escapedContent.Substring(i, 2), System.Globalization.NumberStyles.HexNumber); 46 | content += c; 47 | } 48 | 49 | return content; 50 | } 51 | 52 | public static int[] GetHexContent(string escapedContent) 53 | { 54 | if (escapedContent == null) throw new ArgumentNullException("escapedContent"); 55 | 56 | if (!escapedContent.StartsWith("<") || !escapedContent.EndsWith(">")) 57 | throw new ArgumentException(String.Format("Error retrieving content from escaped content '{0}'", escapedContent), "escapedContent"); 58 | 59 | string hexContentString = escapedContent.Substring(1, escapedContent.Length - 2); 60 | if (hexContentString.Length < 5) 61 | return new[] { int.Parse(hexContentString, System.Globalization.NumberStyles.HexNumber) }; 62 | 63 | if 
((hexContentString.Length & 0x01) != 0) 64 | throw new ArgumentException("Odd number of hex characters"); 65 | 66 | int[] content; 67 | if ((hexContentString.Length & 0x03) != 0) 68 | { 69 | content = new int[hexContentString.Length >> 1]; 70 | // Bytes 71 | for (int i = 0; i < hexContentString.Length >> 1; i++) 72 | content[i] = int.Parse(hexContentString.Substring(i << 1, 2), System.Globalization.NumberStyles.HexNumber); 73 | } 74 | else 75 | { 76 | content = new int[hexContentString.Length >> 2]; 77 | // Words 78 | for (int i = 0; i < hexContentString.Length >> 2; i ++) 79 | content[i] = int.Parse(hexContentString.Substring(i << 2, 4), System.Globalization.NumberStyles.HexNumber); 80 | } 81 | 82 | return content; 83 | } 84 | 85 | } 86 | } -------------------------------------------------------------------------------- /BuildTablesFromPdf.Console/BuildTablesFromPdf.Console.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | AnyCPU 7 | {4079BDE2-6CAD-43C3-8C1B-A134840D95C1} 8 | Exe 9 | Properties 10 | BuildTablesFromPdf 11 | BuildTablesFromPdf 12 | v4.5 13 | 512 14 | 15 | 16 | AnyCPU 17 | true 18 | full 19 | false 20 | bin\Debug\ 21 | DEBUG;TRACE 22 | prompt 23 | 4 24 | 25 | 26 | AnyCPU 27 | pdbonly 28 | true 29 | bin\Release\ 30 | TRACE 31 | prompt 32 | 4 33 | 34 | 35 | 36 | ..\packages\iTextSharp-LGPL.4.1.6\lib\iTextSharp.dll 37 | True 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | Always 55 | 56 | 57 | Always 58 | 59 | 60 | 61 | 62 | 63 | {9883fa0a-cb8e-4053-a3e9-58ffe5269320} 64 | BuildTablesFromPdf.Engine 65 | 66 | 67 | 68 | 75 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Renderer/Properties/Resources.Designer.cs: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // This 
code was generated by a tool. 4 | // Runtime Version:4.0.30319.42000 5 | // 6 | // Changes to this file may cause incorrect behavior and will be lost if 7 | // the code is regenerated. 8 | // 9 | //------------------------------------------------------------------------------ 10 | 11 | namespace BuildTablesFromPdf.Renderer.Properties { 12 | using System; 13 | 14 | 15 | /// 16 | /// A strongly-typed resource class, for looking up localized strings, etc. 17 | /// 18 | // This class was auto-generated by the StronglyTypedResourceBuilder 19 | // class via a tool like ResGen or Visual Studio. 20 | // To add or remove a member, edit your .ResX file then rerun ResGen 21 | // with the /str option, or rebuild your VS project. 22 | [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "4.0.0.0")] 23 | [global::System.Diagnostics.DebuggerNonUserCodeAttribute()] 24 | [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()] 25 | internal class Resources { 26 | 27 | private static global::System.Resources.ResourceManager resourceMan; 28 | 29 | private static global::System.Globalization.CultureInfo resourceCulture; 30 | 31 | [global::System.Diagnostics.CodeAnalysis.SuppressMessageAttribute("Microsoft.Performance", "CA1811:AvoidUncalledPrivateCode")] 32 | internal Resources() { 33 | } 34 | 35 | /// 36 | /// Returns the cached ResourceManager instance used by this class. 
37 | /// 38 | [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)] 39 | internal static global::System.Resources.ResourceManager ResourceManager { 40 | get { 41 | if (object.ReferenceEquals(resourceMan, null)) { 42 | global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("BuildTablesFromPdf.Renderer.Properties.Resources", typeof(Resources).Assembly); 43 | resourceMan = temp; 44 | } 45 | return resourceMan; 46 | } 47 | } 48 | 49 | /// 50 | /// Overrides the current thread's CurrentUICulture property for all 51 | /// resource lookups using this strongly typed resource class. 52 | /// 53 | [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)] 54 | internal static global::System.Globalization.CultureInfo Culture { 55 | get { 56 | return resourceCulture; 57 | } 58 | set { 59 | resourceCulture = value; 60 | } 61 | } 62 | 63 | /// 64 | /// Looks up a localized string similar to </body> 65 | ///</html>. 66 | /// 67 | internal static string HTML_Footer { 68 | get { 69 | return ResourceManager.GetString("HTML_Footer", resourceCulture); 70 | } 71 | } 72 | 73 | /// 74 | /// Looks up a localized string similar to <html> 75 | ///<head> 76 | ///</head> 77 | ///<body> 78 | ///. 
79 | /// 80 | internal static string HTML_Header { 81 | get { 82 | return ResourceManager.GetString("HTML_Header", resourceCulture); 83 | } 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/CMap/EncodingDifferenceToUnicode.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using iTextSharp.text.pdf; 3 | 4 | namespace BuildTablesFromPdf.Engine.CMap 5 | { 6 | public class EncodingDifferenceToUnicode 7 | { 8 | public EncodingDifferenceToUnicode() 9 | { 10 | EncodingDifferences = new EncodingDifferenceCollection(); 11 | } 12 | 13 | public EncodingDifferenceCollection EncodingDifferences { get; private set; } 14 | 15 | 16 | public int ConvertToUnicode(int cid) 17 | { 18 | var encodingDifference = EncodingDifferences.Find(cid); 19 | if (encodingDifference == null) 20 | return cid; 21 | 22 | return encodingDifference.NameCharacters[cid - encodingDifference.BeginChar].Character; 23 | } 24 | 25 | public char ConvertToUnicodeChar(int cid) 26 | { 27 | return (char)ConvertToUnicode(cid); 28 | } 29 | 30 | public char ConvertToUnicodeChar(char cid) 31 | { 32 | return (char)ConvertToUnicode(cid); 33 | } 34 | 35 | 36 | public string ConvertToString(string content) 37 | { 38 | string convert = string.Empty; 39 | foreach (char c in content) 40 | convert += ConvertToUnicodeChar(c); 41 | return convert; 42 | } 43 | 44 | public string ConvertToString(int[] content) 45 | { 46 | string convert = string.Empty; 47 | foreach (int c in content) 48 | convert += ConvertToUnicodeChar(c); 49 | return convert; 50 | } 51 | 52 | 53 | /// 54 | /// Parses the specified string. 55 | /// 56 | /// The font dictionary. 
57 | /// 58 | /// The EncodingDifferenceToUnicode or null if the characters map directly to unicode 59 | /// 60 | /// 61 | /// Name found before Number 62 | /// or 63 | /// In /Differences only Numbers and Names are allowed 64 | /// 65 | public static EncodingDifferenceToUnicode Parse(PdfDictionary fontDictionary) 66 | { 67 | EncodingDifferenceToUnicode parse = new EncodingDifferenceToUnicode(); 68 | 69 | var encodingDictionaryReference = fontDictionary.GetAsDict(PdfName.ENCODING); 70 | if (encodingDictionaryReference == null) 71 | return null; 72 | 73 | // An /Encoding dictionary is not required to carry /Differences; without it there is nothing to remap, so report "maps directly". 74 | PdfArray differencesArray = encodingDictionaryReference.GetAsArray(PdfName.DIFFERENCES); 75 | if (differencesArray == null) return null; 76 | EncodingDifference encodingDifference = null; 77 | // A number starts a new run of codes; each following name is appended to the current run. 78 | foreach (var item in differencesArray.ArrayList) 79 | { 80 | if (item is PdfNumber) 81 | { 82 | encodingDifference = new EncodingDifference(((PdfNumber) item).IntValue); 83 | parse.EncodingDifferences.Add(encodingDifference); 84 | } 85 | else if (item is PdfName) 86 | { 87 | if (encodingDifference == null) 88 | throw new InvalidOperationException("Name found before Number"); 89 | string name = ((PdfName) item).ToString().Substring(1); 90 | var nameCharacter = NameCharacterCollection.Instance.Find(name); 91 | encodingDifference.NameCharacters.Add(nameCharacter); 92 | } 93 | else 94 | { 95 | throw new InvalidOperationException("In /Differences only Numbers and Names are allowed"); 96 | } 97 | } 98 | 99 | return parse; 100 | } 101 | 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/Statements/Statement.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | 3 | namespace BuildTablesFromPdf.Engine.Statements 4 | { 5 | public class Statement 6 | { 7 | public static string GetNextStatement(string content, ref int i) 8 | { 9 | string statement = ""; 10 | bool readingStatement = false; 11 | 12 | if (i >= content.Length) 13 | return null;
14 | 15 | while (i < content.Length) 16 | { 17 | if (IsSpace(content[i])) 18 | { 19 | if (readingStatement) 20 | { 21 | i++; 22 | return statement.Trim(); 23 | } 24 | 25 | statement += " "; 26 | i++; 27 | } 28 | else if (readingStatement && IsSeparator(content[i])) 29 | { 30 | return statement; 31 | } 32 | else if (PdfNumericDataType.IsStartChar(content, i)) 33 | { 34 | // string parameter 35 | statement += PdfNumericDataType.GetRawData(content, ref i); 36 | } 37 | else if (PdfStringDataType.IsStartChar(content, i)) 38 | { 39 | // string parameter 40 | statement += PdfStringDataType.GetRawData(content, ref i); 41 | } 42 | else if (PdfArrayDataType.IsStartChar(content, i)) 43 | { 44 | // array parameter 45 | statement += PdfArrayDataType.GetRawData(content, ref i); 46 | } 47 | else if (PdfHexStringDataType.IsStartChar(content, i)) 48 | { 49 | // hex string parameter 50 | statement += PdfHexStringDataType.GetRawData(content, ref i); 51 | } 52 | else if (content[i] == 't' && i + 5 < content.Length && content.Substring(i, 5) == "true ") 53 | { 54 | // boolean true parameter 55 | statement += "true "; 56 | i += 5; 57 | } 58 | else if (content[i] == 'f' && i + 6 < content.Length && content.Substring(i, 6) == "false ") 59 | { 60 | // boolean false parameter 61 | statement += "false "; 62 | i += 6; 63 | } 64 | else if (PdfObjectDataType.IsStartChar(content, i)) 65 | { 66 | // hex string parameter 67 | statement += PdfObjectDataType.GetRawData(content, ref i); 68 | } 69 | else 70 | { 71 | statement += content[i]; 72 | readingStatement = true; 73 | i++; 74 | } 75 | } 76 | 77 | return statement; 78 | } 79 | 80 | public static bool IsSeparator(char c) 81 | { 82 | switch (c) 83 | { 84 | case' ': 85 | case '\n': 86 | case '[': 87 | case '(': 88 | case '<': 89 | return true; 90 | default: 91 | return false; 92 | } 93 | } 94 | 95 | public static bool IsSpace(char c) 96 | { 97 | switch (c) 98 | { 99 | case ' ': 100 | case '\n': 101 | return true; 102 | default: 103 | return false; 
104 | } 105 | } 106 | 107 | public static void SkipSpace(string s, ref int i) 108 | { 109 | while (i < s.Length && IsSpace(s[i])) 110 | i++; 111 | } 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/CMap/CMapToUnicode.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using BuildTablesFromPdf.Engine.Statements; 3 | 4 | namespace BuildTablesFromPdf.Engine.CMap 5 | { 6 | public class CMapToUnicode 7 | { 8 | public CMapToUnicode() 9 | { 10 | BFRanges = new BFRangeCollection(); 11 | } 12 | 13 | public BFRangeCollection BFRanges { get; private set; } 14 | 15 | public int ConvertToUnicode(int cid) 16 | { 17 | var bfRange = BFRanges.Find(cid); 18 | if (bfRange == null) 19 | return cid; 20 | 21 | if (bfRange.UnicodeChar == null) 22 | { 23 | return bfRange.UnicodeChars[cid - bfRange.BeginChar]; 24 | } 25 | else 26 | { 27 | return cid - bfRange.BeginChar + bfRange.UnicodeChar.Value; 28 | } 29 | } 30 | 31 | public char ConvertToUnicodeChar(int cid) 32 | { 33 | return Convert.ToChar(ConvertToUnicode(cid)); 34 | } 35 | 36 | public char ConvertToUnicodeChar(char cid) 37 | { 38 | return Convert.ToChar(ConvertToUnicode(cid)); 39 | } 40 | 41 | 42 | public string ConvertToString(string content) 43 | { 44 | string convert = string.Empty; 45 | foreach (char c in content) 46 | convert += ConvertToUnicodeChar(c); 47 | return convert; 48 | } 49 | 50 | public string ConvertToString(int[] content) 51 | { 52 | string convert = string.Empty; 53 | foreach (int c in content) 54 | convert += ConvertToUnicodeChar(c); 55 | return convert; 56 | } 57 | 58 | /// 59 | /// Parses the specified string. 60 | /// 61 | /// The string. 
62 | /// The CMapToUnicode or null if the characters map directly to unicode 63 | public static CMapToUnicode Parse(string s) 64 | { 65 | CMapToUnicode parse = new CMapToUnicode(); 66 | 67 | InternalParseBFRange(s, parse); 68 | InternalParseBFChar(s, parse); 69 | 70 | return parse.BFRanges.Count == 0 ? null : parse; 71 | } 72 | // Adds every entry found between each "beginbfrange" and the following "endbfrange" in s to parse.BFRanges. 73 | private static void InternalParseBFRange(string s, CMapToUnicode parse) 74 | { 75 | string bfRange; 76 | int beginBfRangePosition = s.IndexOf("beginbfrange", StringComparison.CurrentCultureIgnoreCase); 77 | while (beginBfRangePosition != -1) 78 | { 79 | beginBfRangePosition += 12; // length of "beginbfrange" 80 | 81 | int endBfRangePosition = s.IndexOf("endbfrange", beginBfRangePosition, StringComparison.CurrentCultureIgnoreCase); 82 | bfRange = s.Substring(beginBfRangePosition, endBfRangePosition - beginBfRangePosition); 83 | 84 | int i = 0; 85 | Statement.SkipSpace(bfRange, ref i); 86 | while (i < bfRange.Length) 87 | { 88 | parse.BFRanges.Add(BFRange.Parse(bfRange, ref i)); 89 | Statement.SkipSpace(bfRange, ref i); 90 | } 91 | beginBfRangePosition = s.IndexOf("beginbfrange", endBfRangePosition, StringComparison.CurrentCultureIgnoreCase); 92 | } 93 | } 94 | // Adds every entry found between each "beginbfchar" and the following "endbfchar" in s to parse.BFRanges. 95 | private static void InternalParseBFChar(string s, CMapToUnicode parse) 96 | { 97 | string bfChar; 98 | int beginBfCharPosition = s.IndexOf("beginbfchar", StringComparison.CurrentCultureIgnoreCase); 99 | while (beginBfCharPosition != -1) 100 | { 101 | beginBfCharPosition += 11; // length of "beginbfchar"; skipping 12 dropped the first character of the section when no whitespace followed the keyword 102 | 103 | int endBfCharPosition = s.IndexOf("endbfchar", beginBfCharPosition, StringComparison.CurrentCultureIgnoreCase); 104 | bfChar = s.Substring(beginBfCharPosition, endBfCharPosition - beginBfCharPosition); 105 | 106 | int i = 0; 107 | Statement.SkipSpace(bfChar, ref i); 108 | while (i < bfChar.Length) 109 | { 110 | parse.BFRanges.Add(BFChar.Parse(bfChar, ref i)); 111 | Statement.SkipSpace(bfChar, ref i); 112 | } 113 | beginBfCharPosition = s.IndexOf("beginbfchar", endBfCharPosition,
StringComparison.CurrentCultureIgnoreCase); 114 | } 115 | } 116 | 117 | 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Renderer/FileOpen.designer.cs: -------------------------------------------------------------------------------- 1 | namespace BuildTablesFromPdf.Renderer 2 | { 3 | partial class FileOpen 4 | { 5 | /// 6 | /// Required designer variable. 7 | /// 8 | private System.ComponentModel.IContainer components = null; 9 | 10 | /// 11 | /// Clean up any resources being used. 12 | /// 13 | /// true if managed resources should be disposed; otherwise, false. 14 | protected override void Dispose(bool disposing) 15 | { 16 | if (disposing && (components != null)) 17 | { 18 | components.Dispose(); 19 | } 20 | base.Dispose(disposing); 21 | } 22 | 23 | #region Component Designer generated code 24 | /// 25 | /// Required method for Designer support - do not modify 26 | /// the contents of this method with the code editor. 
27 | /// 28 | private void InitializeComponent() 29 | { 30 | this.components = new System.ComponentModel.Container(); 31 | System.ComponentModel.ComponentResourceManager resources = new System.ComponentModel.ComponentResourceManager(typeof(FileOpen)); 32 | this.openFileDialog = new System.Windows.Forms.OpenFileDialog(); 33 | this.btnFile = new System.Windows.Forms.Button(); 34 | this.txtFile = new System.Windows.Forms.TextBox(); 35 | this.toolTip = new System.Windows.Forms.ToolTip(this.components); 36 | this.btnBrowse = new System.Windows.Forms.Button(); 37 | this.SuspendLayout(); 38 | // 39 | // btnFile 40 | // 41 | this.btnFile.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Right))); 42 | this.btnFile.FlatStyle = System.Windows.Forms.FlatStyle.Popup; 43 | this.btnFile.Image = ((System.Drawing.Image)(resources.GetObject("btnFile.Image"))); 44 | this.btnFile.Location = new System.Drawing.Point(194, 1); 45 | this.btnFile.Name = "btnFile"; 46 | this.btnFile.Size = new System.Drawing.Size(18, 18); 47 | this.btnFile.TabIndex = 1; 48 | this.btnFile.TabStop = false; 49 | // 50 | // txtFile 51 | // 52 | this.txtFile.AllowDrop = true; 53 | this.txtFile.Anchor = ((System.Windows.Forms.AnchorStyles)((((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom) 54 | | System.Windows.Forms.AnchorStyles.Left) 55 | | System.Windows.Forms.AnchorStyles.Right))); 56 | this.txtFile.Location = new System.Drawing.Point(0, 0); 57 | this.txtFile.Name = "txtFile"; 58 | this.txtFile.Size = new System.Drawing.Size(192, 20); 59 | this.txtFile.TabIndex = 0; 60 | // 61 | // btnBrowse 62 | // 63 | this.btnBrowse.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Right))); 64 | this.btnBrowse.FlatStyle = System.Windows.Forms.FlatStyle.Popup; 65 | this.btnBrowse.Image = 
((System.Drawing.Image)(resources.GetObject("btnBrowse.Image"))); 66 | this.btnBrowse.Location = new System.Drawing.Point(213, 1); 67 | this.btnBrowse.Name = "btnBrowse"; 68 | this.btnBrowse.Size = new System.Drawing.Size(18, 18); 69 | this.btnBrowse.TabIndex = 2; 70 | this.btnBrowse.TabStop = false; 71 | // 72 | // FileOpen 73 | // 74 | this.Controls.Add(this.btnBrowse); 75 | this.Controls.Add(this.btnFile); 76 | this.Controls.Add(this.txtFile); 77 | this.Name = "FileOpen"; 78 | this.Size = new System.Drawing.Size(232, 24); 79 | this.ResumeLayout(false); 80 | this.PerformLayout(); 81 | 82 | } 83 | 84 | 85 | #endregion 86 | 87 | /// 88 | /// Dialog object used to browse for a file 89 | /// 90 | private System.Windows.Forms.OpenFileDialog openFileDialog; 91 | 92 | /// 93 | /// Button that opens the file search dialog 94 | /// 95 | private System.Windows.Forms.Button btnFile; 96 | 97 | /// 98 | /// Control that holds the file path 99 | /// 100 | private System.Windows.Forms.TextBox txtFile; 101 | 102 | /// 103 | /// Shows messages about the objects contained in the control 104 | /// 105 | private System.Windows.Forms.ToolTip toolTip; 106 | 107 | /// 108 | /// Opens the folder in which the file is located 109 | /// 110 | private System.Windows.Forms.Button btnBrowse; 111 | 112 | 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/PdfDataTypes/PdfArrayDataType.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Globalization; 4 | 5 | namespace BuildTablesFromPdf.Engine 6 | { 7 | public class PdfArrayDataType 8 | { 9 | public PdfArrayDataType() 10 | { 11 | StringElements = new List(); 12 | Elements = new List(); 13 | } 14 | // StringElements holds the raw text of each element; Elements holds the parsed value (raw string, float, or nested PdfArrayDataType). 15 | public List StringElements { get; private set; } 16 | public List Elements { get; private set; } 17 | 18 | 19 |
/// <summary>
/// Parses the textual representation of a PDF array.
/// </summary>
/// <param name="s">Array text; leading and trailing whitespace is trimmed before parsing.</param>
/// <returns>A new <see cref="PdfArrayDataType"/> filled with the parsed elements.</returns>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
/// <exception cref="InvalidOperationException">The trimmed text is not enclosed in '[' and ']'.</exception>
/// <exception cref="ArgumentException">A character that starts no known element is found inside the array.</exception>
public static PdfArrayDataType Parse(string s)
{
    if (s == null)
        throw new ArgumentNullException("s");

    s = s.Trim();
    if (!s.StartsWith("[", StringComparison.Ordinal) || !s.EndsWith("]", StringComparison.Ordinal))
        throw new InvalidOperationException(string.Format("{0} is not a valid array", s));

    PdfArrayDataType pdfArrayDataType = new PdfArrayDataType();
    int i = 0;
    InternalParse(pdfArrayDataType, s, ref i);
    return pdfArrayDataType;
}

/// <summary>
/// Reads the elements of the array whose opening bracket is at index <paramref name="i"/>
/// and appends them to <paramref name="pdfArrayDataType"/>.
/// </summary>
/// <param name="pdfArrayDataType">Array that receives the elements.</param>
/// <param name="s">Text being parsed.</param>
/// <param name="i">
/// On entry, the index of the opening '['. On return, the index of the matching ']'
/// (the bracket itself is not consumed), or <c>s.Length</c> when no closing bracket was found.
/// </param>
/// <remarks>
/// Every element is stored twice: its raw text in <c>StringElements</c> and its typed value in
/// <c>Elements</c> (the raw text for strings and hex strings, a <see cref="float"/> for numbers,
/// a nested <see cref="PdfArrayDataType"/> for arrays).
/// </remarks>
/// <exception cref="ArgumentException">A character that starts no known element is found.</exception>
private static void InternalParse(PdfArrayDataType pdfArrayDataType, string s, ref int i)
{
    // Step over the opening '['.
    i++;
    while (i < s.Length)
    {
        if (PdfStringDataType.IsStartChar(s, i))
        {
            string item = PdfStringDataType.GetRawData(s, ref i);
            pdfArrayDataType.StringElements.Add(item);
            pdfArrayDataType.Elements.Add(item);
        }
        else if (PdfHexStringDataType.IsStartChar(s, i))
        {
            string item = PdfHexStringDataType.GetRawData(s, ref i);
            pdfArrayDataType.StringElements.Add(item);
            pdfArrayDataType.Elements.Add(item);
        }
        else if (PdfNumericDataType.IsStartChar(s, i))
        {
            string item = PdfNumericDataType.GetRawData(s, ref i);
            pdfArrayDataType.StringElements.Add(item);
            pdfArrayDataType.Elements.Add(float.Parse(item, NumberFormatInfo.InvariantInfo));
        }
        else if (PdfArrayDataType.IsStartChar(s, i))
        {
            // GetRawData leaves i just past the nested array's closing bracket, so the nested
            // elements must be read with a separate index that starts at the nested '['.
            // Re-using i would fill the nested array with whatever FOLLOWS it in the text.
            int nestedStart = i;
            string item = PdfArrayDataType.GetRawData(s, ref i);
            pdfArrayDataType.StringElements.Add(item);

            PdfArrayDataType innerPdfArrayDataType = new PdfArrayDataType();
            InternalParse(innerPdfArrayDataType, s, ref nestedStart);
            pdfArrayDataType.Elements.Add(innerPdfArrayDataType);
        }
        else if (s[i] == ']')
            return;
        else if (s[i] == ' ')
            i++;
        else if (s[i] == '\n')
            i++;
        else
            throw new ArgumentException(string.Format("{0} is not an array", s));
    }
}


/// <summary>
/// Tells whether the character at index <paramref name="i"/> opens an array.
/// </summary>
/// <param name="content">Text being inspected.</param>
/// <param name="i">Index of the character to test; must be a valid index of <paramref name="content"/>.</param>
/// <returns><c>true</c> when the character is '['.</returns>
public static bool IsStartChar(string content, int i)
{
    return content[i] == '[';
}

80 | public static string GetRawData(string content, ref int i) 81 | { 82 | if (!IsStartChar(content, i))
83 | throw new ArgumentException("The content is not a PdfArrayDataType"); 84 | 85 | string data = "["; 86 | 87 | i++; 88 | while (i < content.Length) 89 | { 90 | if (PdfStringDataType.IsStartChar(content, i)) 91 | { 92 | string item = PdfStringDataType.GetRawData(content, ref i); 93 | data += item; 94 | } 95 | else if (PdfHexStringDataType.IsStartChar(content, i)) 96 | { 97 | string item = PdfHexStringDataType.GetRawData(content, ref i); 98 | data += item; 99 | } 100 | else if (PdfNumericDataType.IsStartChar(content, i)) 101 | { 102 | string item = PdfNumericDataType.GetRawData(content, ref i); 103 | data += item; 104 | } 105 | else if (PdfArrayDataType.IsStartChar(content, i)) 106 | { 107 | string item = PdfArrayDataType.GetRawData(content, ref i); 108 | data += item; 109 | } 110 | else if (content[i] == ']') 111 | { 112 | data += content[i]; 113 | i++; 114 | return data; 115 | } 116 | else if (content[i] == ' ') 117 | i++; 118 | else if (content[i] == '\n') 119 | i++; 120 | else 121 | throw new ArgumentException(string.Format("{0} is not an array", content)); 122 | } 123 | 124 | return data; 125 | } 126 | 127 | 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/Point.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Globalization; 3 | 4 | namespace BuildTablesFromPdf.Engine 5 | { 6 | public struct Point 7 | { 8 | public static readonly Point Origin = new Point(0, 0); 9 | 10 | #region == 11 | 12 | public bool Equals(Point other) 13 | { 14 | return Math.Abs(X - other.X) < ContentExtractor.Tolerance && Math.Abs(Y - other.Y) < ContentExtractor.Tolerance; 15 | } 16 | 17 | public override bool Equals(object obj) 18 | { 19 | if (ReferenceEquals(null, obj)) return false; 20 | return obj is Point && Equals((Point) obj); 21 | } 22 | 23 | public override int GetHashCode() 24 | { 25 | unchecked 26 | { 27 | return 
(X.GetHashCode() * 397) ^ Y.GetHashCode(); 28 | } 29 | } 30 |
/// <summary>Equality operator; gives the same answer as <c>left.Equals(right)</c>.</summary>
public static bool operator ==(Point left, Point right)
{
    bool areEqual = left.Equals(right);
    return areEqual;
}

/// <summary>Inequality operator; the negation of <c>left.Equals(right)</c>.</summary>
public static bool operator !=(Point left, Point right)
{
    bool areEqual = left.Equals(right);
    return !areEqual;
}

#endregion

#region >, >=, <, <=

// Ordering used by the four operators below: points are compared on X first; only when the
// X values differ by less than ContentExtractor.Tolerance is Y looked at, and Y values
// closer than the tolerance make the two points count as equal.

/// <summary>True when <paramref name="left"/> comes strictly after <paramref name="right"/>.</summary>
public static bool operator >(Point left, Point right)
{
    bool xMatches = Math.Abs(left.X - right.X) < ContentExtractor.Tolerance;
    if (!xMatches)
        return left.X > right.X;

    bool yMatches = Math.Abs(left.Y - right.Y) < ContentExtractor.Tolerance;
    return !yMatches && left.Y > right.Y;
}

/// <summary>True when <paramref name="left"/> comes after <paramref name="right"/> or both count as equal.</summary>
public static bool operator >=(Point left, Point right)
{
    bool xMatches = Math.Abs(left.X - right.X) < ContentExtractor.Tolerance;
    if (!xMatches)
        return left.X > right.X;

    bool yMatches = Math.Abs(left.Y - right.Y) < ContentExtractor.Tolerance;
    return yMatches || left.Y > right.Y;
}

/// <summary>True when <paramref name="left"/> comes before <paramref name="right"/> or both count as equal.</summary>
public static bool operator <=(Point left, Point right)
{
    bool xMatches = Math.Abs(left.X - right.X) < ContentExtractor.Tolerance;
    if (!xMatches)
        return left.X < right.X;

    bool yMatches = Math.Abs(left.Y - right.Y) < ContentExtractor.Tolerance;
    return yMatches || left.Y < right.Y;
}


/// <summary>True when <paramref name="left"/> comes strictly before <paramref name="right"/>.</summary>
public static bool operator <(Point left, Point right)
{
    bool xMatches = Math.Abs(left.X - right.X) < ContentExtractor.Tolerance;
    if (!xMatches)
        return left.X < right.X;

    bool yMatches = Math.Abs(left.Y - right.Y) < ContentExtractor.Tolerance;
    return !yMatches && left.Y < right.Y;
}

#endregion

/// <summary>Horizontal coordinate.</summary>
public readonly double X;

/// <summary>Vertical coordinate.</summary>
public readonly double Y;

/// <summary>Creates a point from its two coordinates.</summary>
/// <param name="x">Value stored in <see cref="X"/>.</param>
/// <param name="y">Value stored in <see cref="Y"/>.</param>
public Point(double x, double y)
{
    X = x;
    Y = y;
}

121 | public static Point
Parse(string rawContent) 122 | { 123 | var splittedRawContent = rawContent.Split(new[] {' '}, StringSplitOptions.RemoveEmptyEntries); 124 | float x = float.Parse(splittedRawContent[0], NumberFormatInfo.InvariantInfo); 125 | float y = float.Parse(splittedRawContent[1], NumberFormatInfo.InvariantInfo); 126 | return new Point(x, y); 127 | } 128 | 129 | public override string ToString() 130 | { 131 | return string.Format("({0}, {1})", X, Y); 132 | } 133 | 134 | public float Distance(Point point) 135 | { 136 | return (float) Math.Sqrt((X - point.X) * (X - point.X) + (Y - point.Y) * (Y - point.Y)); 137 | } 138 | 139 | public bool IsValid() 140 | { 141 | if (X < 0 || Y < 0) 142 | return false; 143 | 144 | if (X > 10000 || Y > 10000) 145 | return false; 146 | 147 | return true; 148 | } 149 | 150 | public Point Rotate(int pageRotation) 151 | { 152 | switch (pageRotation) 153 | { 154 | case 0: 155 | return new Point(X, 800 - Y); 156 | case 90: 157 | return new Point(Y, X); 158 | default: 159 | return this; 160 | } 161 | } 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Renderer/BuildTablesFromPdf.Renderer.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | AnyCPU 7 | {DF805107-8789-4B69-8BE0-74AAF113AC79} 8 | WinExe 9 | Properties 10 | BuildTablesFromPdf.Renderer 11 | BuildTablesFromPdf.Renderer 12 | v4.5 13 | 512 14 | 15 | 16 | AnyCPU 17 | true 18 | full 19 | false 20 | bin\Debug\ 21 | DEBUG;TRACE 22 | prompt 23 | 4 24 | 25 | 26 | AnyCPU 27 | pdbonly 28 | true 29 | bin\Release\ 30 | TRACE 31 | prompt 32 | 4 33 | 34 | 35 | 36 | ..\packages\iTextSharp-LGPL.4.1.6\lib\iTextSharp.dll 37 | True 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | UserControl 53 | 54 | 55 | FileOpen.cs 56 | 57 | 58 | 59 | Form 60 | 61 | 62 | frmNotepad.cs 63 | 64 | 65 | Form 66 | 67 | 68 | frmRenderer.cs 69 | 70 | 71 | 72 | 73 | 74 | 
FileOpen.cs 75 | 76 | 77 | frmNotepad.cs 78 | 79 | 80 | frmRenderer.cs 81 | 82 | 83 | ResXFileCodeGenerator 84 | Resources.Designer.cs 85 | Designer 86 | 87 | 88 | True 89 | Resources.resx 90 | True 91 | 92 | 93 | 94 | SettingsSingleFileGenerator 95 | Settings.Designer.cs 96 | 97 | 98 | True 99 | Settings.settings 100 | True 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | {9883fa0a-cb8e-4053-a3e9-58ffe5269320} 109 | BuildTablesFromPdf.Engine 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 126 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine.Test/BuildTablesFromPdf.Engine.Test.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Debug 5 | AnyCPU 6 | {0EEDBCC7-E56A-435E-B41F-09953B8D9CFC} 7 | Library 8 | Properties 9 | BuildTablesFromPdf.Engine.Test 10 | BuildTablesFromPdf.Engine.Test 11 | v4.5 12 | 512 13 | {3AC096D0-A1C2-E12C-1390-A8335801FDAB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} 14 | 10.0 15 | $(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion) 16 | $(ProgramFiles)\Common Files\microsoft shared\VSTT\$(VisualStudioVersion)\UITestExtensionPackages 17 | False 18 | UnitTest 19 | 20 | 21 | true 22 | full 23 | false 24 | bin\Debug\ 25 | DEBUG;TRACE 26 | prompt 27 | 4 28 | 29 | 30 | pdbonly 31 | true 32 | bin\Release\ 33 | TRACE 34 | prompt 35 | 4 36 | 37 | 38 | 39 | ..\packages\iTextSharp-LGPL.4.1.6\lib\iTextSharp.dll 40 | True 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | {9883fa0a-cb8e-4053-a3e9-58ffe5269320} 66 | BuildTablesFromPdf.Engine 67 | 68 | 69 | 70 | 71 | 72 | Always 73 | 74 | 75 | Always 76 | 77 | 78 | Always 79 | 80 | 81 | 82 | 83 | 84 | 85 | False 86 | 87 | 88 | False 89 | 90 | 91 | False 92 | 93 | 94 | False 95 | 96 | 97 | 98 | 99 | 100 | 101 | 108 | -------------------------------------------------------------------------------- 
/BuildTablesFromPdf.Engine/BuildTablesFromPdf.Engine.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | AnyCPU 7 | {9883FA0A-CB8E-4053-A3E9-58FFE5269320} 8 | Library 9 | Properties 10 | BuildTablesFromPdf.Engine 11 | BuildTablesFromPdf.Engine 12 | v4.5 13 | 512 14 | 15 | 16 | true 17 | full 18 | false 19 | bin\Debug\ 20 | DEBUG;TRACE 21 | prompt 22 | 4 23 | 24 | 25 | pdbonly 26 | true 27 | bin\Release\ 28 | TRACE 29 | prompt 30 | 4 31 | 32 | 33 | 34 | ..\packages\iTextSharp-LGPL.4.1.6\lib\iTextSharp.dll 35 | True 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 111 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Renderer/frmNotepad.resx: -------------------------------------------------------------------------------- 1 | 2 | 3 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | text/microsoft-resx 110 | 111 | 112 | 2.0 113 | 114 | 115 | System.Resources.ResXResourceReader, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 116 | 117 | 118 | System.Resources.ResXResourceWriter, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 119 | 120 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Renderer/frmRenderer.resx: -------------------------------------------------------------------------------- 1 | 2 | 3 | 62 | 63 | 64 | 65 | 66 | 
67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | text/microsoft-resx 110 | 111 | 112 | 2.0 113 | 114 | 115 | System.Resources.ResXResourceReader, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 116 | 117 | 118 | System.Resources.ResXResourceWriter, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 119 | 120 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/Matrix.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Globalization; 3 | 4 | namespace BuildTablesFromPdf.Engine 5 | { 6 | public class Matrix 7 | { 8 | 9 | public static readonly Matrix Identity = new Matrix(); 10 | 11 | private Matrix() 12 | {} 13 | 14 | public Matrix(double a, double b, double c, double d, double e, double f) 15 | { 16 | _a = a; 17 | _b = b; 18 | _c = c; 19 | _d = d; 20 | _e = e; 21 | _f = f; 22 | } 23 | 24 | private double _a = 1; 25 | private double _b = 0; 26 | private double _c = 0; 27 | private double _d = 1; 28 | private double _e = 0; 29 | private double _f = 0; 30 | 31 | // ReSharper disable InconsistentNaming 32 | public double a 33 | { 34 | get { return _a; } 35 | } 36 | 37 | public double b 38 | { 39 | get { return _b; } 40 | } 41 | 42 | public double c 43 | { 44 | get { return _c; } 45 | } 46 | 47 | public double d 48 | { 49 | get { return _d; } 50 | } 51 | 52 | public double e 53 | { 54 | get { return _e; } 55 | } 56 | 57 | public double f 58 | { 59 | get { return _f; } 60 | } 61 | // ReSharper restore InconsistentNaming 62 | 63 | 64 | public Matrix GetRotationMatrix() 65 | { 66 | return new Matrix(a, b, c, d, 0, 0); 67 | } 68 | 69 | public Matrix GetTranslationMatrix() 70 | { 71 | return new 
Matrix(0, 0, 0, 0, e, f); 72 | } 73 | 74 | 75 |
/// <summary>
/// Parses a transformation matrix written as six space-separated numbers
/// <c>a b c d e f</c> in invariant-culture format.
/// </summary>
/// <param name="s">Text to parse. Tokens after the sixth are ignored.</param>
/// <returns>The parsed matrix.</returns>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
/// <exception cref="FormatException">
/// Fewer than six tokens are present, or a token is not a valid number.
/// </exception>
public static Matrix Parse(string s)
{
    if (s == null)
        throw new ArgumentNullException("s");

    string[] parts = s.Split(new[] {' '}, StringSplitOptions.RemoveEmptyEntries);
    if (parts.Length < 6)
        throw new FormatException("s is not a transformation matrix");

    return new Matrix(
        double.Parse(parts[0], NumberFormatInfo.InvariantInfo),
        double.Parse(parts[1], NumberFormatInfo.InvariantInfo),
        double.Parse(parts[2], NumberFormatInfo.InvariantInfo),
        double.Parse(parts[3], NumberFormatInfo.InvariantInfo),
        double.Parse(parts[4], NumberFormatInfo.InvariantInfo),
        double.Parse(parts[5], NumberFormatInfo.InvariantInfo));
}

/// <summary>
/// Non-throwing variant of <see cref="Parse"/> for malformed input.
/// </summary>
/// <param name="s">Text to parse; null is reported as a failure.</param>
/// <param name="trasformationMatrix">
/// The parsed matrix on success; a new identity matrix on failure.
/// </param>
/// <returns><c>true</c> when <paramref name="s"/> was parsed, <c>false</c> otherwise.</returns>
public static bool TryParse(string s, out Matrix trasformationMatrix)
{
    // Failure result; replaced below when parsing succeeds.
    trasformationMatrix = new Matrix();
    if (s == null)
        return false;

    try
    {
        trasformationMatrix = Parse(s);
        return true;
    }
    catch (FormatException)
    {
        // Too few tokens or a token that is not a number.
        return false;
    }
    catch (OverflowException)
    {
        // A token outside the range of double (thrown by double.Parse on .NET Framework).
        return false;
    }
}

/// <summary>
/// Composes two transformations. Transforming a point with the result is the same as
/// transforming it with <paramref name="l"/> first and with <paramref name="r"/> second.
/// </summary>
/// <returns>The composed matrix, or null when either operand is null.</returns>
public static Matrix operator *(Matrix l, Matrix r)
{
    // ReferenceEquals avoids going through the overloaded == operator for a plain null test.
    if (ReferenceEquals(l, null) || ReferenceEquals(r, null))
        return null;

    return new Matrix(
        l.a * r.a + l.b * r.c,
        l.a * r.b + l.b * r.d,
        l.c * r.a + l.d * r.c,
        l.c * r.b + l.d * r.d,
        l.e * r.a + l.f * r.c + r.e,
        l.e * r.b + l.f * r.d + r.f);
}


/// <summary>
/// Transforms a point with a matrix.
/// </summary>
/// <returns>The transformed point, or <paramref name="l"/> unchanged when <paramref name="r"/> is null.</returns>
public static Point operator *(Point l, Matrix r)
{
    if (ReferenceEquals(r, null))
        return l;

    double x = r.TransformX(l.X, l.Y);
    double y = r.TransformY(l.X, l.Y);

    return new Point(x, y);
}

/// <summary>Returns the transformed X coordinate of (x, y): <c>a * x + c * y + e</c>.</summary>
public double TransformX(double x, double y)
{
    return a * x + c * y + e;
}

/// <summary>Returns the transformed Y coordinate of (x, y): <c>b * x + d * y + f</c>.</summary>
public double TransformY(double x, double y)
{
    return b * x + d * y + f;
}

151 | public Point
TransformPoint(Point point) 152 | { 153 | return new Point(TransformX(point.X, point.Y), TransformY(point.X, point.Y)); 154 | } 155 | 156 | public Line TransformLine(Line line) 157 | { 158 | return new Line(TransformPoint(line.StartPoint), TransformPoint(line.EndPoint)); 159 | } 160 | 161 | #region == 162 | 163 | protected bool Equals(Matrix other) 164 | { 165 | return _a.Equals(other._a) && _b.Equals(other._b) && _c.Equals(other._c) && _d.Equals(other._d) && _e.Equals(other._e) && _f.Equals(other._f); 166 | } 167 | 168 | public override bool Equals(object obj) 169 | { 170 | if (ReferenceEquals(null, obj)) return false; 171 | if (ReferenceEquals(this, obj)) return true; 172 | if (obj.GetType() != this.GetType()) return false; 173 | return Equals((Matrix)obj); 174 | } 175 | 176 | public override int GetHashCode() 177 | { 178 | unchecked 179 | { 180 | int hashCode = _a.GetHashCode(); 181 | hashCode = (hashCode * 397) ^ _b.GetHashCode(); 182 | hashCode = (hashCode * 397) ^ _c.GetHashCode(); 183 | hashCode = (hashCode * 397) ^ _d.GetHashCode(); 184 | hashCode = (hashCode * 397) ^ _e.GetHashCode(); 185 | hashCode = (hashCode * 397) ^ _f.GetHashCode(); 186 | return hashCode; 187 | } 188 | } 189 | 190 | public static bool operator ==(Matrix left, Matrix right) 191 | { 192 | return Equals(left, right); 193 | } 194 | 195 | public static bool operator !=(Matrix left, Matrix right) 196 | { 197 | return !Equals(left, right); 198 | } 199 | 200 | #endregion 201 | 202 | public override string ToString() 203 | { 204 | return string.Format("|{0}|{1}|0 - {2}|{3}|0 - {4}|{5}|1", a, b, c, d, e, f); 205 | } 206 | } 207 | } 208 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Renderer/Properties/Resources.resx: -------------------------------------------------------------------------------- 1 | 2 | 3 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 
| 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | text/microsoft-resx 110 | 111 | 112 | 2.0 113 | 114 | 115 | System.Resources.ResXResourceReader, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 116 | 117 | 118 | System.Resources.ResXResourceWriter, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 119 | 120 | 121 | 122 | ..\Resources\HTML_Footer.txt;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;Windows-1252 123 | 124 | 125 | ..\Resources\HTML_Header.txt;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;Windows-1252 126 | 127 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Engine/PdfFontHelper.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using System.Text; 6 | using BuildTablesFromPdf.Engine.CMap; 7 | using iTextSharp.text.pdf; 8 | 9 | namespace BuildTablesFromPdf.Engine 10 | { 11 | static class PdfFontHelper 12 | { 13 | public static PdfDictionary GetFont(PdfReader pdfReader, int pageNumber, string fontKey) 14 | { 15 | PdfDictionary resources = pdfReader.GetPageN(pageNumber).GetAsDict(PdfName.RESOURCES); 16 | return FindFontDictionary(resources, fontKey); 17 | } 18 | 19 | public static void ExtractFontNameOfPdf(string sourceFileName) 20 | { 21 | using (Stream pdfStream = new FileStream(sourceFileName, FileMode.Open)) 22 | { 23 | ExtractFontNameOfPdf(new PdfReader(pdfStream)); 24 | } 25 | } 26 | 27 | public static void ExtractFontNameOfPdf(PdfReader pdfReader) 28 | { 29 | List set = new List(); 30 | 31 | for (int pageNumber = 1; pageNumber <= pdfReader.NumberOfPages; pageNumber++) 32 | { 33 | PdfDictionary resources = 
pdfReader.GetPageN(pageNumber).GetAsDict(PdfName.RESOURCES); 34 | ProcessResources(set, resources); 35 | } 36 | 37 | foreach (BaseFont item in set) 38 | Console.WriteLine(item.PostscriptFontName + " " + item.FontType.ToString()); 39 | } 40 |
/// <summary>
/// Writes to the console the PostScript name and font type of every font found in the
/// resources of one page.
/// </summary>
/// <param name="pdfReader">Reader of the document to inspect.</param>
/// <param name="pageNumber">
/// Zero-based page index: one is added before calling <c>GetPageN</c>, which is 1 based.
/// </param>
public static void ExtractFontNameOfPdf(PdfReader pdfReader, int pageNumber)
{
    List<BaseFont> set = new List<BaseFont>();

    // GetPageN parameter is 1 based
    PdfDictionary resources = pdfReader.GetPageN(pageNumber + 1).GetAsDict(PdfName.RESOURCES);
    ProcessResources(set, resources);

    foreach (BaseFont item in set)
        Console.WriteLine(item.PostscriptFontName + " " + item.FontType.ToString());
}


/// <summary>
/// Collects the fonts referenced by a resources dictionary, first descending into every
/// entry of its XObject dictionary and then reading its Font dictionary.
/// </summary>
/// <param name="baseFonts">List that receives the fonts.</param>
/// <param name="resources">Dictionary to inspect; null is ignored.</param>
private static void ProcessResources(List<BaseFont> baseFonts, PdfDictionary resources)
{
    if (resources == null)
        return;
    PdfDictionary xObjects = resources.GetAsDict(PdfName.XOBJECT);
    if (xObjects != null)
    {
        foreach (PdfName key in xObjects.Keys)
        {
            ProcessResources(baseFonts, xObjects.GetAsDict(key));
        }
    }

    PdfDictionary fonts = resources.GetAsDict(PdfName.FONT);

    if (fonts == null)
        return;
    foreach (PdfName key in fonts.Keys)
    {
        // A font entry that is not an indirect reference is skipped instead of
        // failing the whole scan with an InvalidCastException.
        PRIndirectReference iRef = fonts.Get(key) as PRIndirectReference;
        if (iRef != null)
            baseFonts.Add(BaseFont.CreateFont(iRef));
    }
}


/// <summary>
/// Loads the ToUnicode CMap of a font used by a page.
/// </summary>
/// <param name="pdfReader">Reader of the document.</param>
/// <param name="pageNumber">Page number, passed to <c>GetPageN</c> as is.</param>
/// <param name="fontKey">Resource name of the font, compared with <c>PdfName.ToString()</c>.</param>
/// <returns>
/// The parsed CMap, or null when the font is not found or has no ToUnicode stream.
/// </returns>
public static CMapToUnicode GetFontCMapToUnicode(PdfReader pdfReader, int pageNumber, string fontKey)
{
    PdfDictionary resources = pdfReader.GetPageN(pageNumber).GetAsDict(PdfName.RESOURCES);
    var fontDict = FindFontDictionary(resources, fontKey);
    if (fontDict == null)
        return null;

    // "as" also covers a ToUnicode entry that is present but is not a stream.
    PRStream toUnicodeStream = PdfReader.GetPdfObject(fontDict.Get(PdfName.TOUNICODE)) as PRStream;
    if (toUnicodeStream == null)
        return null;
    string toUnicode = Encoding.UTF8.GetString(PdfReader.GetStreamBytes(toUnicodeStream));

    return CMapToUnicode.Parse(toUnicode);
}


/// <summary>
/// Looks up a font dictionary by resource name, searching the entries of the XObject
/// dictionary before the Font dictionary of <paramref name="resources"/>.
/// </summary>
/// <param name="resources">Dictionary to search; null yields null.</param>
/// <param name="fontKey">Resource name of the font, compared with <c>PdfName.ToString()</c>.</param>
/// <returns>The font dictionary, or null when no matching font dictionary is found.</returns>
private static PdfDictionary FindFontDictionary(PdfDictionary resources, string fontKey)
{
    if (resources == null)
        return null;

    PdfDictionary xObjects = resources.GetAsDict(PdfName.XOBJECT);
    if (xObjects != null)
    {
        foreach (PdfName key in xObjects.Keys)
        {
            PdfDictionary nested = FindFontDictionary(xObjects.GetAsDict(key), fontKey);
            if (nested != null)
                return nested;
        }
    }

    PdfDictionary fonts = resources.GetAsDict(PdfName.FONT);
    if (fonts == null)
        return null;

    PdfName pdfName = fonts.Keys.Cast<PdfName>().FirstOrDefault(_ => _.ToString() == fontKey);
    if (pdfName == null)
        return null;

    // An entry that does not resolve to a dictionary is reported as "not found".
    return PdfReader.GetPdfObject(fonts.Get(pdfName)) as PdfDictionary;
}


/// <summary>
/// Converts encoded text to Unicode using the CMap when available, otherwise the encoding
/// differences, otherwise returns <paramref name="content"/> unchanged.
/// </summary>
public static string ToUnicode(string content, CMapToUnicode cMapToUnicode, EncodingDifferenceToUnicode encodingDifferenceToUnicode)
{
    if (cMapToUnicode != null)
        return cMapToUnicode.ConvertToString(content);
    else if (encodingDifferenceToUnicode != null)
        return encodingDifferenceToUnicode.ConvertToString(content);
    else
        return content;
}

/// <summary>
/// Converts character codes to Unicode using the CMap when available, otherwise the
/// encoding differences, otherwise treats every code as a Unicode code point.
/// </summary>
public static string ToUnicode(int[] content, CMapToUnicode cMapToUnicode, EncodingDifferenceToUnicode encodingDifferenceToUnicode)
{
    if (cMapToUnicode != null)
        return cMapToUnicode.ConvertToString(content);
    else if (encodingDifferenceToUnicode != null)
        return encodingDifferenceToUnicode.ConvertToString(content);
    else
    {
        // Each code occupies four bytes after the copy, so the buffer is UTF-32 in the
        // platform's byte order. Decoding it as UTF-16 would insert a '\0' after every
        // character.
        byte[] byteContent = new byte[content.Length * sizeof(int)];
        Buffer.BlockCopy(content, 0, byteContent, 0, byteContent.Length);
        Encoding utf32 = new UTF32Encoding(!BitConverter.IsLittleEndian, false);
        return utf32.GetString(byteContent);
    }
}
154 | 155
| public static char ToUnicode(int character, CMapToUnicode cMapToUnicode, EncodingDifferenceToUnicode encodingDifferenceToUnicode) 156 | { 157 | if (cMapToUnicode != null) 158 | return cMapToUnicode.ConvertToUnicodeChar(character); 159 | else if (encodingDifferenceToUnicode != null) 160 | return encodingDifferenceToUnicode.ConvertToUnicodeChar(character); 161 | else 162 | return Convert.ToChar(character); 163 | } 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /BuildTablesFromPdf.Renderer/FileOpen.resx: -------------------------------------------------------------------------------- 1 | 2 | 3 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | text/microsoft-resx 110 | 111 | 112 | 2.0 113 | 114 | 115 | System.Resources.ResXResourceReader, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 116 | 117 | 118 | System.Resources.ResXResourceWriter, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 119 | 120 | 121 | 17, 17 122 | 123 | 124 | 125 | 126 | iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6 127 | JgAAgIQAAPoAAACA6AAAdTAAAOpgAAA6mAAAF3CculE8AAAAi0lEQVQ4T8WSiwnAIAxE3alzuEZGcQd3 128 | cifrGa2x/lqQ9uAQEu8lgmqbnD18dio9F0JSryAlbIUZUpkGUAbIcIFIWVLeGNNCLoCz7dSBU5SFQtgv 129 | NqYKAyBskqKsGiCfcHO4gxMAIiqQzwDOcX8NwKSOc38/AEIzOnwWXJhZa93/DyiCvHI3/KOUOgH62sG2 130 | tSHJRwAAAABJRU5ErkJggg== 131 | 132 | 133 | 134 | 141, 17 135 | 136 | 137 | 138 | iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6 139 | JgAAgIQAAPoAAACA6AAAdTAAAOpgAAA6mAAAF3CculE8AAAAmElEQVQ4T5WRAQ7EIAgEfbpP82ctK6wB 140 | Sm2dhHjF3cn1rpHe+/U1Fq1BYMen5I+Ap1UiuBhjzKmQSJ4IBGZ/FXGPzCwRfr1mIVltJbifRcAyyZJM 141 | 
KUAJrDI+nwiALzOEM0seAgBJVeaJIdyjF5ivohfrP5elPGuJkq3AIxv9LZJkK/AhDdYSidcCOYqJEnd3 142 | gpcclxdFubUbgXzvaEtwAmcAAAAASUVORK5CYII= 143 | 144 | 145 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Ww][Ii][Nn]32/ 27 | [Aa][Rr][Mm]/ 28 | [Aa][Rr][Mm]64/ 29 | bld/ 30 | [Bb]in/ 31 | [Oo]bj/ 32 | [Ll]og/ 33 | [Ll]ogs/ 34 | 35 | # Visual Studio 2015/2017 cache/options directory 36 | .vs/ 37 | # Uncomment if you have tasks that create the project's static files in wwwroot 38 | #wwwroot/ 39 | 40 | # Visual Studio 2017 auto generated files 41 | Generated\ Files/ 42 | 43 | # MSTest test Results 44 | [Tt]est[Rr]esult*/ 45 | [Bb]uild[Ll]og.* 46 | 47 | # NUnit 48 | *.VisualState.xml 49 | TestResult.xml 50 | nunit-*.xml 51 | 52 | # Build Results of an ATL Project 53 | [Dd]ebugPS/ 54 | [Rr]eleasePS/ 55 | dlldata.c 56 | 57 | # Benchmark Results 58 | BenchmarkDotNet.Artifacts/ 59 | 60 | # .NET Core 61 | project.lock.json 62 | project.fragment.lock.json 63 | artifacts/ 64 | 65 | # ASP.NET Scaffolding 66 | ScaffoldingReadMe.txt 67 | 68 | # StyleCop 69 | StyleCopReport.xml 70 | 71 | # Files built by Visual Studio 72 | *_i.c 73 | *_p.c 74 | *_h.h 75 | *.ilk 76 | *.meta 77 | *.obj 78 | *.iobj 79 | *.pch 80 | *.pdb 81 | *.ipdb 82 | *.pgc 83 | *.pgd 84 | *.rsp 85 | *.sbr 86 | *.tlb 87 | *.tli 
88 | *.tlh 89 | *.tmp 90 | *.tmp_proj 91 | *_wpftmp.csproj 92 | *.log 93 | *.vspscc 94 | *.vssscc 95 | .builds 96 | *.pidb 97 | *.svclog 98 | *.scc 99 | 100 | # Chutzpah Test files 101 | _Chutzpah* 102 | 103 | # Visual C++ cache files 104 | ipch/ 105 | *.aps 106 | *.ncb 107 | *.opendb 108 | *.opensdf 109 | *.sdf 110 | *.cachefile 111 | *.VC.db 112 | *.VC.VC.opendb 113 | 114 | # Visual Studio profiler 115 | *.psess 116 | *.vsp 117 | *.vspx 118 | *.sap 119 | 120 | # Visual Studio Trace Files 121 | *.e2e 122 | 123 | # TFS 2012 Local Workspace 124 | $tf/ 125 | 126 | # Guidance Automation Toolkit 127 | *.gpState 128 | 129 | # ReSharper is a .NET coding add-in 130 | _ReSharper*/ 131 | *.[Rr]e[Ss]harper 132 | *.DotSettings.user 133 | 134 | # TeamCity is a build add-in 135 | _TeamCity* 136 | 137 | # DotCover is a Code Coverage Tool 138 | *.dotCover 139 | 140 | # AxoCover is a Code Coverage Tool 141 | .axoCover/* 142 | !.axoCover/settings.json 143 | 144 | # Coverlet is a free, cross platform Code Coverage Tool 145 | coverage*.json 146 | coverage*.xml 147 | coverage*.info 148 | 149 | # Visual Studio code coverage results 150 | *.coverage 151 | *.coveragexml 152 | 153 | # NCrunch 154 | _NCrunch_* 155 | .*crunch*.local.xml 156 | nCrunchTemp_* 157 | 158 | # MightyMoose 159 | *.mm.* 160 | AutoTest.Net/ 161 | 162 | # Web workbench (sass) 163 | .sass-cache/ 164 | 165 | # Installshield output folder 166 | [Ee]xpress/ 167 | 168 | # DocProject is a documentation generator add-in 169 | DocProject/buildhelp/ 170 | DocProject/Help/*.HxT 171 | DocProject/Help/*.HxC 172 | DocProject/Help/*.hhc 173 | DocProject/Help/*.hhk 174 | DocProject/Help/*.hhp 175 | DocProject/Help/Html2 176 | DocProject/Help/html 177 | 178 | # Click-Once directory 179 | publish/ 180 | 181 | # Publish Web Output 182 | *.[Pp]ublish.xml 183 | *.azurePubxml 184 | # Note: Comment the next line if you want to checkin your web deploy settings, 185 | # but database connection strings (with potential passwords) will be 
unencrypted 186 | *.pubxml 187 | *.publishproj 188 | 189 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 190 | # checkin your Azure Web App publish settings, but sensitive information contained 191 | # in these scripts will be unencrypted 192 | PublishScripts/ 193 | 194 | # NuGet Packages 195 | *.nupkg 196 | # NuGet Symbol Packages 197 | *.snupkg 198 | # The packages folder can be ignored because of Package Restore 199 | **/[Pp]ackages/* 200 | # except build/, which is used as an MSBuild target. 201 | !**/[Pp]ackages/build/ 202 | # Uncomment if necessary however generally it will be regenerated when needed 203 | #!**/[Pp]ackages/repositories.config 204 | # NuGet v3's project.json files produces more ignorable files 205 | *.nuget.props 206 | *.nuget.targets 207 | 208 | # Microsoft Azure Build Output 209 | csx/ 210 | *.build.csdef 211 | 212 | # Microsoft Azure Emulator 213 | ecf/ 214 | rcf/ 215 | 216 | # Windows Store app package directories and files 217 | AppPackages/ 218 | BundleArtifacts/ 219 | Package.StoreAssociation.xml 220 | _pkginfo.txt 221 | *.appx 222 | *.appxbundle 223 | *.appxupload 224 | 225 | # Visual Studio cache files 226 | # files ending in .cache can be ignored 227 | *.[Cc]ache 228 | # but keep track of directories ending in .cache 229 | !?*.[Cc]ache/ 230 | 231 | # Others 232 | ClientBin/ 233 | ~$* 234 | *~ 235 | *.dbmdl 236 | *.dbproj.schemaview 237 | *.jfm 238 | *.pfx 239 | *.publishsettings 240 | orleans.codegen.cs 241 | 242 | # Including strong name files can present a security risk 243 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 244 | #*.snk 245 | 246 | # Since there are multiple workflows, uncomment next line to ignore bower_components 247 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 248 | #bower_components/ 249 | 250 | # RIA/Silverlight projects 251 | Generated_Code/ 252 | 253 | # Backup & report files from converting an old project file 254 | # to a 
newer Visual Studio version. Backup files are not needed, 255 | # because we have git ;-) 256 | _UpgradeReport_Files/ 257 | Backup*/ 258 | UpgradeLog*.XML 259 | UpgradeLog*.htm 260 | ServiceFabricBackup/ 261 | *.rptproj.bak 262 | 263 | # SQL Server files 264 | *.mdf 265 | *.ldf 266 | *.ndf 267 | 268 | # Business Intelligence projects 269 | *.rdl.data 270 | *.bim.layout 271 | *.bim_*.settings 272 | *.rptproj.rsuser 273 | *- [Bb]ackup.rdl 274 | *- [Bb]ackup ([0-9]).rdl 275 | *- [Bb]ackup ([0-9][0-9]).rdl 276 | 277 | # Microsoft Fakes 278 | FakesAssemblies/ 279 | 280 | # GhostDoc plugin setting file 281 | *.GhostDoc.xml 282 | 283 | # Node.js Tools for Visual Studio 284 | .ntvs_analysis.dat 285 | node_modules/ 286 | 287 | # Visual Studio 6 build log 288 | *.plg 289 | 290 | # Visual Studio 6 workspace options file 291 | *.opt 292 | 293 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 294 | *.vbw 295 | 296 | # Visual Studio LightSwitch build output 297 | **/*.HTMLClient/GeneratedArtifacts 298 | **/*.DesktopClient/GeneratedArtifacts 299 | **/*.DesktopClient/ModelManifest.xml 300 | **/*.Server/GeneratedArtifacts 301 | **/*.Server/ModelManifest.xml 302 | _Pvt_Extensions 303 | 304 | # Paket dependency manager 305 | .paket/paket.exe 306 | paket-files/ 307 | 308 | # FAKE - F# Make 309 | .fake/ 310 | 311 | # CodeRush personal settings 312 | .cr/personal 313 | 314 | # Python Tools for Visual Studio (PTVS) 315 | __pycache__/ 316 | *.pyc 317 | 318 | # Cake - Uncomment if you are using it 319 | # tools/** 320 | # !tools/packages.config 321 | 322 | # Tabs Studio 323 | *.tss 324 | 325 | # Telerik's JustMock configuration file 326 | *.jmconfig 327 | 328 | # BizTalk build output 329 | *.btp.cs 330 | *.btm.cs 331 | *.odx.cs 332 | *.xsd.cs 333 | 334 | # OpenCover UI analysis results 335 | OpenCover/ 336 | 337 | # Azure Stream Analytics local run output 338 | ASALocalRun/ 339 | 340 | # MSBuild Binary and Structured Log 341 | *.binlog 342 | 343 | # 
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;

namespace BuildTablesFromPdf.Engine.Tables
{
    /// <summary>
    /// A table found on a page: its bounding box, its rows and columns and the text of every cell.
    /// The content grid has one extra column on each side (index 0 and index Columns.Count + 1)
    /// that collects the text lying on the left and on the right of the table.
    /// </summary>
    [DebuggerDisplay("{DebuggerDisplay}")]
    public class Table : IPageContent, IFormattable
    {
        public Table()
        {
            Rows = new List<Row>();
            Columns = new List<Column>();
        }

        public Point TopLeftPoint { get; set; }
        public Point BottomRightPoint { get; set; }

        public List<Row> Rows { get; private set; }
        public List<Column> Columns { get; private set; }

        public double Width { get { return BottomRightPoint.X - TopLeftPoint.X; } }

        // The misspelled name is part of the public interface and is kept for compatibility.
        public double Heigth { get { return BottomRightPoint.Y - TopLeftPoint.Y; } }

        // Cell texts indexed by [row, column]; allocated by CreateContent.
        private string[,] _Content;

        /// <summary>
        /// Gets or sets the text of a cell by row and column index of the content grid.
        /// </summary>
        public string this[int row, int column]
        {
            get { return _Content[row, column]; }
            set { _Content[row, column] = value; }
        }

        /// <summary>
        /// Gets or sets the text of a cell by row index and column name.
        /// </summary>
        /// <exception cref="ArgumentException">The column does not exist.</exception>
        public string this[int row, string columnName]
        {
            get { return _Content[row, GetColumnIndex(columnName)]; }
            set { _Content[row, GetColumnIndex(columnName)] = value; }
        }

        /// <summary>
        /// Checks whether a column exists. "&lt;" and "&gt;" always exist (they address the
        /// extra columns on the left and on the right of the table); any other name is compared,
        /// ignoring case, with the trimmed cells of the first content row.
        /// </summary>
        public bool ColumnExists(string columnName)
        {
            if (columnName == "<" || columnName == ">")
                return true;

            return FindNamedColumnIndex(columnName) >= 0;
        }

        /// <summary>
        /// Returns the text of the cell, or null if the column does not exist.
        /// </summary>
        public string GetValueOrNull(int row, string columnName)
        {
            if (!ColumnExists(columnName))
                return null;
            return this[row, GetColumnIndex(columnName)];
        }

        private int GetColumnIndex(string columnName)
        {
            if (columnName == "<")
                return 0;

            if (columnName == ">")
                return _Content.GetLength(1) - 1;

            int columnIndex = FindNamedColumnIndex(columnName);
            if (columnIndex >= 0)
                return columnIndex;

            throw new ArgumentException(string.Format("Column '{0}' not found", columnName), "columnName");
        }

        /// <summary>
        /// Searches the first content row (the extra left and right columns excluded) for a header equal to columnName.
        /// </summary>
        /// <returns>The index in the content grid, or -1 if there is no such header</returns>
        private int FindNamedColumnIndex(string columnName)
        {
            if (_Content == null || _Content.GetLength(0) == 0)
                return -1;

            for (int i = 1; i < _Content.GetLength(1) - 1; i++)
            {
                // Cells stay null until AddText puts some text in them: a null header simply does not match
                string header = _Content[0, i];
                if (header != null && String.Equals(header.Trim(), columnName, StringComparison.CurrentCultureIgnoreCase))
                    return i;
            }

            return -1;
        }


        public bool Contains(Line line)
        {
            return
                TopLeftPoint.Y - ContentExtractor.Tolerance <= line.StartPoint.Y &&
                line.EndPoint.Y <= BottomRightPoint.Y + ContentExtractor.Tolerance
                &&
                TopLeftPoint.X - ContentExtractor.Tolerance <= line.StartPoint.X &&
                line.EndPoint.X <= BottomRightPoint.X + ContentExtractor.Tolerance;
        }

        public bool Contains(double y)
        {
            return
                TopLeftPoint.Y - ContentExtractor.Tolerance <= y &&
                y <= BottomRightPoint.Y + ContentExtractor.Tolerance;
        }

        public bool Contains(Point point)
        {
            // The tolerance widens the box on every side, as in the other Contains overloads
            // (it was subtracted from the bottom and right bounds, so points on those edges were rejected)
            return
                TopLeftPoint.Y - ContentExtractor.Tolerance <= point.Y &&
                point.Y <= BottomRightPoint.Y + ContentExtractor.Tolerance
                &&
                TopLeftPoint.X - ContentExtractor.Tolerance <= point.X &&
                point.X <= BottomRightPoint.X + ContentExtractor.Tolerance;
        }

        /// <summary>
        /// Allocates the content grid: one row per table row, one column per table column
        /// plus the two extra columns for the text outside the table.
        /// </summary>
        internal void CreateContent()
        {
            _Content = new string[Rows.Count, Columns.Count + 2];
        }

        /// <summary>
        /// Appends the text to the cell that contains the point.
        /// Texts of the same cell are separated by a single space.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// CreateContent has not been called or no row contains point.Y
        /// </exception>
        public void AddText(Point point, string content)
        {
            if (_Content == null)
                throw new InvalidOperationException("Content is not initialized. Please call CreateContent first");

            // The text can be also on the left or on the right of the table
            Row row = FindRow(point.Y);
            if (row == null)
                throw new InvalidOperationException("The point is outside the table");

            int columnIndex = FindColumnIndex(point.X);
            // Row indexes run opposite to the rows of the content grid
            int rowIndex = Rows.Count - row.Index - 1;

            if (string.IsNullOrEmpty(_Content[rowIndex, columnIndex]))
                _Content[rowIndex, columnIndex] = content;
            else if (_Content[rowIndex, columnIndex].EndsWith(" "))
                _Content[rowIndex, columnIndex] += content;
            else
                _Content[rowIndex, columnIndex] += " " + content;
        }

        /// <summary>
        /// Finds the index (in the content grid) of the column of the x coordinate.
        /// If x is on the left of the table, 0 is returned
        /// If x is on the right of the table, Count + 1 is returned
        /// </summary>
        /// <param name="x">The x coordinate.</param>
        /// <returns>The column index in the content grid</returns>
        private int FindColumnIndex(double x)
        {
            if (x < TopLeftPoint.X)
                return 0;

            if (BottomRightPoint.X < x)
                return Columns.Count + 1;

            Column column = Columns.SingleOrDefault(_ => _.BeginX <= x && x <= _.EndX);

            // x is in the gap between two columns: take the first column that ends after x
            // (Last always selected the rightmost column, whatever the gap)
            if (column == null)
                column = Columns.OrderBy(_ => _.Index).First(_ => x <= _.EndX);

            return column.Index + 1;
        }

        /// <summary>
        /// Finds the row corresponding to the y coordinate.
        /// An exact match is preferred; otherwise the rows are enlarged by the extractor tolerance.
        /// </summary>
        /// <param name="y">The y.</param>
        /// <returns>The row or null if y is outside the table</returns>
        private Row FindRow(double y)
        {
            Row row = Rows.FirstOrDefault(_ => _.BeginY <= y && y <= _.EndY);
            if (row == null)
                row = Rows.FirstOrDefault(_ => _.BeginY - ContentExtractor.Tolerance <= y && y <= _.EndY + ContentExtractor.Tolerance);
            return row;
        }

        double IPageContent.Y { get { return TopLeftPoint.Y; } }

        #region IFormattable

        private string DebuggerDisplay { get { return ToString("d"); } }

        public override string ToString()
        {
            return ToString("");
        }

        /// <summary>
        /// Formats the table.
        /// "s", "" or null: the cells, " | " between the columns and one line per row.
        /// "d": the bounding box and the number of rows and columns.
        /// </summary>
        /// <exception cref="FormatException">The format is not supported.</exception>
        public string ToString(string format)
        {
            switch (format)
            {
                case "s":
                case "":
                case null:
                    if (_Content == null)
                        return "";
                    StringBuilder content = new StringBuilder();
                    for (int i = 0; i < _Content.GetLength(0); i++)
                    {
                        for (int j = 0; j < _Content.GetLength(1); j++)
                        {
                            if (j > 0)
                                content.Append(" | ");
                            content.Append(_Content[i, j]);
                        }
                        content.Append("\r\n");
                    }
                    return content.ToString();
                case "d":
                    return string.Format("{0} - {1}; Rows = {2}, Columns = {3}", TopLeftPoint, BottomRightPoint, Rows.Count, Columns.Count);
                default:
                    throw new FormatException();
            }
        }

        public string ToString(string format, IFormatProvider formatProvider)
        {
            return ToString(format);
        }


        #endregion

    }
}
using System;
using System.Globalization;
using System.Text;
using BuildTablesFromPdf.Engine.CMap;
using BuildTablesFromPdf.Engine.Statements;
using iTextSharp.text.pdf;

namespace BuildTablesFromPdf.Engine
{
    /// <summary>
    /// The output of this class should be very similar to SimpleTextExtractor missing in LGPL version of iTextSharp
    /// </summary>
    public class SimpleTextExtractor
    {
        // Two texts whose Y differ less than this value are on the same output line
        private const double SameLineTolerance = 1;

        /// <summary>
        /// Reads the text of all the pages of a pdf file.
        /// </summary>
        /// <param name="filename">The pdf file.</param>
        /// <returns>The texts of the pages, one after the other</returns>
        public static string ReadPdfFile(string filename)
        {
            PdfReader pdfReader = new PdfReader(filename);
            try
            {
                StringBuilder text = new StringBuilder();

                for (int page = 1; page <= pdfReader.NumberOfPages; page++)
                    text.Append(GetTextFromPage(pdfReader, page));

                return text.ToString();
            }
            finally
            {
                // The reader must be released also when the extraction fails
                pdfReader.Close();
            }
        }

        /// <summary>
        /// Reads the text of one page of a pdf file.
        /// </summary>
        /// <param name="filename">The pdf file.</param>
        /// <param name="page">The page number; ReadPdfFile numbers the pages from 1.</param>
        /// <returns>The text of the page</returns>
        public static string ReadPdfFilePage(string filename, int page)
        {
            PdfReader pdfReader = new PdfReader(filename);
            try
            {
                return GetTextFromPage(pdfReader, page);
            }
            finally
            {
                pdfReader.Close();
            }
        }

        /// <summary>
        /// Parses the content stream of a page statement by statement, following the text positioning
        /// operators (Tm, Td, TD, TL, T*) to know the Y of every text shown by TJ and Tj.
        /// Texts on the same Y are joined with a space; a different Y starts a new line.
        /// </summary>
        private static string GetTextFromPage(PdfReader pdfReader, int pageNumber)
        {
            StringBuilder sb = new StringBuilder();

            Matrix transformMatrix = Matrix.Identity;
            float leadingParameter = 0;
            CMapToUnicode cMapToUnicode = null;
            EncodingDifferenceToUnicode encodingDifferenceToUnicode = null;

            double oldY = 0;
            string lineContent = null;

            string rawPdfContent = Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, pdfReader.GetPageContent(pageNumber)));
            int pointer = 0;

            string statement = Statement.GetNextStatement(rawPdfContent, ref pointer);
            while (statement != null)
            {
                // Operators are plain ASCII tokens: compare them ordinally

                // Embedded image
                if (statement.EndsWith("BI", StringComparison.Ordinal))
                {
                    int endOfImage = rawPdfContent.IndexOf("\nEI", pointer, StringComparison.Ordinal);
                    // Unterminated inline image: the rest of the stream is image data,
                    // and -1 must not be used as the next parsing position
                    if (endOfImage < 0)
                        break;
                    pointer = endOfImage;
                }
                else if (statement.EndsWith("Tm", StringComparison.Ordinal))
                {
                    Matrix matrix;
                    if (Matrix.TryParse(statement, out matrix))
                        transformMatrix = matrix;
                }
                else if (statement.EndsWith("Tf", StringComparison.Ordinal))
                {
                    // "<font name> <size> Tf": the font name is the third token from the end
                    string[] fontParameters = statement.Split(' ');
                    if (fontParameters.Length >= 3)
                    {
                        string fontName = fontParameters[fontParameters.Length - 3];
                        cMapToUnicode = PdfFontHelper.GetFontCMapToUnicode(pdfReader, pageNumber, fontName);
                        encodingDifferenceToUnicode = EncodingDifferenceToUnicode.Parse(PdfFontHelper.GetFont(pdfReader, pageNumber, fontName));
                    }
                }
                else if (statement.EndsWith("Td", StringComparison.Ordinal))
                {
                    float tx;
                    float ty;
                    // NOTE(review): the translation replaces the matrix while TD composes it;
                    // left as it is because BT does not reset the matrix in this parser - confirm
                    if (TryParseTranslation(statement, out tx, out ty))
                        transformMatrix = new Matrix(1, 0, 0, 1, tx, ty);
                }
                else if (statement.EndsWith("TD", StringComparison.Ordinal))
                {
                    float tx;
                    float ty;
                    if (TryParseTranslation(statement, out tx, out ty))
                    {
                        transformMatrix = new Matrix(1, 0, 0, 1, tx, ty) * transformMatrix;
                        leadingParameter = -ty;
                    }
                }
                else if (statement.EndsWith("TL", StringComparison.Ordinal))
                {
                    float tl;
                    string[] parameters = statement.Split(' ');
                    if (float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tl))
                        leadingParameter = tl;
                }
                else if (statement.EndsWith("T*", StringComparison.Ordinal))
                {
                    transformMatrix = new Matrix(1, 0, 0, 1, 0, -leadingParameter) * transformMatrix;
                }
                else if (statement.EndsWith("TJ", StringComparison.Ordinal))
                {
                    string content = TextObjectStatement.GetTJContent(statement, cMapToUnicode, encodingDifferenceToUnicode);
                    if (!string.IsNullOrWhiteSpace(content))
                    {
                        content = content.Trim();

                        Point position = new Point(transformMatrix.TransformX(Point.Origin.X, Point.Origin.Y), transformMatrix.TransformY(Point.Origin.X, Point.Origin.Y));
                        AppendContent(sb, ref lineContent, ref oldY, position.Y, content);
                    }
                }
                else if (statement.Trim().EndsWith("Tj", StringComparison.Ordinal))
                {
                    string escapedContent = statement.Trim();
                    // Remove the operator
                    escapedContent = escapedContent.Remove(escapedContent.Length - 2);
                    string content = PdfHexStringDataType.IsStartChar(escapedContent) ? PdfHexStringDataType.GetContent(escapedContent) : PdfStringDataType.GetContentFromEscapedContent(escapedContent);
                    content = content.Trim();
                    content = PdfFontHelper.ToUnicode(content, cMapToUnicode, encodingDifferenceToUnicode);

                    Point position = new Point(transformMatrix.TransformX(Point.Origin.X, Point.Origin.Y), transformMatrix.TransformY(Point.Origin.X, Point.Origin.Y));
                    AppendContent(sb, ref lineContent, ref oldY, position.Y, content);
                }


                statement = Statement.GetNextStatement(rawPdfContent, ref pointer);

            }

            // The last line has no line terminator
            if (!string.IsNullOrWhiteSpace(lineContent))
                sb.Append(lineContent);

            return sb.ToString();
        }

        /// <summary>
        /// Reads the two leading numbers (tx and ty) of a Td or TD statement.
        /// </summary>
        /// <returns>false if the statement does not begin with two numbers</returns>
        private static bool TryParseTranslation(string statement, out float tx, out float ty)
        {
            tx = 0;
            ty = 0;
            string[] parameters = statement.Split(' ');
            return
                parameters.Length >= 2 &&
                float.TryParse(parameters[0], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out tx) &&
                float.TryParse(parameters[1], NumberStyles.Any, NumberFormatInfo.InvariantInfo, out ty);
        }

        /// <summary>
        /// Adds a text to the current line if y is (almost) the Y of the current line;
        /// otherwise writes the current line to the output and starts a new line with the text.
        /// TJ and Tj share this rule (TJ used to compare the two Y for exact equality).
        /// </summary>
        private static void AppendContent(StringBuilder sb, ref string lineContent, ref double oldY, double y, string content)
        {
            if (Math.Abs(oldY - y) < SameLineTolerance)
            {
                if (!string.IsNullOrWhiteSpace(lineContent))
                    lineContent += " " + content;
                else
                    lineContent = content;
            }
            else
            {
                if (!string.IsNullOrWhiteSpace(lineContent))
                    sb.AppendLine(lineContent);
                lineContent = content;
                oldY = y;
            }
        }
    }
}
using System;
using System.Diagnostics;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using BuildTablesFromPdf.Engine;
using BuildTablesFromPdf.Engine.Statements;
using BuildTablesFromPdf.Engine.Tables;

namespace BuildTablesFromPdf.Renderer
{
    /// <summary>
    /// Shows, page by page, what the engine found in a pdf: lines, tables, paragraphs and texts.
    /// </summary>
    public partial class frmRenderer : Form
    {
        // The pages of the current document; null until a document is read
        private PageCollection _pages;

        public Page CurrentPage { get; set; }



        public frmRenderer()
        {
            InitializeComponent();
        }

        protected override void OnLoad(EventArgs e)
        {
            base.OnLoad(e);

            fileOpen.Value = Properties.Settings.Default.FileName;

            splitContainer.Panel2.MouseWheel += splitContainer_Panel2_MouseWheel;
        }

        /// <summary>
        /// Reads the document with the options selected on the form and shows its first page.
        /// </summary>
        private void ShowDocument(string fileName)
        {
            ContentExtractor.ShowParserInfo = chkShowParserInfo.Checked;
            ContentExtractor.IgnoreWhiteLines = chkIgnoreWhiteLines.Checked;
            _pages = ContentExtractor.Read(fileName);
            lblPages.Text = _pages.Count.ToString();
            DrawPage(0);
        }

        /// <summary>
        /// Determines tables, paragraphs and content of a page, if not already done.
        /// </summary>
        private static void RefreshPage(Page page)
        {
            if (page.IsRefreshed)
                return;

            page.DetermineTableStructures();
            page.DetermineParagraphs();
            page.FillContent();
        }

        /// <summary>
        /// Makes the page the current page and shows it.
        /// Nothing happens if no document is loaded or the index is not a page of the document.
        /// </summary>
        /// <param name="pageIndex">The zero based index of the page.</param>
        private void DrawPage(int pageIndex)
        {
            if (_pages == null || pageIndex < 0 || pageIndex >= _pages.Count)
                return;

            RefreshPage(_pages[pageIndex]);

            txtPage.Text = (pageIndex + 1).ToString();
            CurrentPage = _pages[pageIndex];
            txtPageContent.Text = _pages[pageIndex].ToString();
            RedrawLines();

        }

        /// <summary>
        /// Draws the current page on the right panel, according to the check boxes.
        /// </summary>
        private void RedrawLines()
        {

            if (CurrentPage == null)
                return;

            // NOTE(review): this is called by the Paint handler too, where e.Graphics is available
            using (var g = splitContainer.Panel2.CreateGraphics())
            {

                g.Clear(splitContainer.Panel2.BackColor);

                if (chkLines.Checked)
                {
                    foreach (Line line in CurrentPage.AllLines)
                        g.DrawLine(Pens.DarkGray, (float)line.StartPoint.X, (float)line.StartPoint.Y, (float)line.EndPoint.X, (float)line.EndPoint.Y);

                    // The joined lines are shifted to keep the original lines visible
                    foreach (Line line in CurrentPage.JoinedHorizontalLines)
                        g.DrawLine(Pens.Blue, (float)line.StartPoint.X + 2, (float)line.StartPoint.Y + 2, (float)line.EndPoint.X + 2, (float)line.EndPoint.Y + 2);

                    foreach (Line line in CurrentPage.JoinedVerticalLines)
                        g.DrawLine(Pens.Blue, (float)line.StartPoint.X + 2, (float)line.StartPoint.Y + 2, (float)line.EndPoint.X + 2, (float)line.EndPoint.Y + 2);
                }

                if (chkTables.Checked)
                {
                    foreach (Table tableStructure in CurrentPage.Tables)
                    {
                        g.DrawRectangle(Pens.OrangeRed, (float)tableStructure.TopLeftPoint.X + 4, (float)tableStructure.TopLeftPoint.Y + 4, (float)tableStructure.Width, (float)tableStructure.Heigth);

                        if (chkLines.Checked)
                        {
                            // To avoid too many lines
                            foreach (Row row in tableStructure.Rows)
                                g.FillRectangle(Brushes.OrangeRed, (float)tableStructure.TopLeftPoint.X + 5, (float)row.EndY + 5, 4, 4);

                            foreach (Column column in tableStructure.Columns)
                                g.FillRectangle(Brushes.OrangeRed, (float)column.BeginX + 5, (float)tableStructure.BottomRightPoint.Y + 5, 4, 4);

                        }
                        else
                        {
                            for (int i = 0; i < tableStructure.Rows.Count - 1; i++)
                            {
                                Row row = tableStructure.Rows[i];
                                g.DrawLine(Pens.OrangeRed, (float)tableStructure.TopLeftPoint.X + 5, (float)row.EndY + 5, (float)tableStructure.BottomRightPoint.X + 5, (float)row.EndY + 5);
                            }

                            for (int i = 1; i < tableStructure.Columns.Count; i++)
                            {
                                Column column = tableStructure.Columns[i];
                                g.DrawLine(Pens.OrangeRed, (float)column.BeginX + 5, (float)tableStructure.BottomRightPoint.Y + 5, (float)column.BeginX + 5, (float)tableStructure.TopLeftPoint.Y + 5);
                            }
                        }
                    }
                }

                if (chkParagraphs.Checked)
                {
                    foreach (Paragraph paragraph in CurrentPage.Paragraphs)
                        g.FillRectangle(Brushes.OrangeRed, 0, (float)paragraph.Y + 5, 10, 4);
                }

                if (chkText.Checked)
                {
                    var textLines = CurrentPage.Statements.OfType<TextObjectStatement>().SelectMany(_ => _.Lines);

                    if (chkTextRealSize.Checked)
                    {
                        foreach (var line in textLines.Where(_ => _.FontHeight > 0))
                        {
                            // A Font wraps a GDI handle: release it as soon as the text is drawn
                            using (Font font = new Font("Arial", (float)line.FontHeight * 0.7f))
                                g.DrawString(line.Content, font, Brushes.Black, (float)line.Position.X + 4, (float)line.Position.Y + 4);
                        }
                    }
                    else
                    {
                        foreach (var line in textLines)
                            g.DrawString(line.Content, this.Font, Brushes.Black, (float)line.Position.X + 4, (float)line.Position.Y + 4);
                    }
                }
            }
        }

        private void splitContainer_Panel2_Paint(object sender, PaintEventArgs e)
        {
            RedrawLines();
        }

        private void chk_CheckedChanged(object sender, EventArgs e)
        {
            RedrawLines();
        }

        private void btnFirst_Click(object sender, EventArgs e)
        {
            DrawPage(0);
        }

        private void btnPrevious_Click(object sender, EventArgs e)
        {
            MovePreviousPage();
        }

        private void btnNext_Click(object sender, EventArgs e)
        {
            MoveNextPage();
        }

        private void splitContainer_Panel2_MouseWheel(object sender, MouseEventArgs e)
        {
            if (e.Delta < 0)
                MoveNextPage();
            else if (e.Delta > 0)
                MovePreviousPage();
        }

        private void MoveNextPage()
        {
            if (CurrentPage != null && CurrentPage.Index < _pages.Count - 1)
                DrawPage(CurrentPage.Index + 1);
        }

        private void MovePreviousPage()
        {
            if (CurrentPage != null && CurrentPage.Index > 0)
                DrawPage(CurrentPage.Index - 1);
        }

        private void btnLast_Click(object sender, EventArgs e)
        {
            if (_pages == null)
                return;

            DrawPage(_pages.Count - 1);
        }

        private void btnGo_Click(object sender, EventArgs e)
        {
            if (_pages == null)
                return;

            int page;
            if (int.TryParse(txtPage.Text, out page) && page > 0 && page <= _pages.Count)
                DrawPage(page - 1);

        }

        private void btnRead_Click(object sender, EventArgs e)
        {
            if (string.IsNullOrWhiteSpace(fileOpen.Text))
                return;

            ShowDocument(fileOpen.Text);

            Properties.Settings.Default.FileName = fileOpen.Text;
            Properties.Settings.Default.Save();
        }

        private void btnViewRawContent_Click(object sender, EventArgs e)
        {
            if (_pages == null)
                return;

            // Leave if the text is not a number or is not a page of the document.
            // (The test was "!TryParse && page > 0 && ...": never true, so a wrong page number reached the reader)
            int page;
            if (!int.TryParse(txtPage.Text, out page) || page < 1 || page > _pages.Count)
                return;

            string textFromPage = Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, _pages.PdfReader.GetPageContent(page)));
            textFromPage = textFromPage.Replace("\n", "\r\n");
            new frmNotepad().Start(textFromPage);

        }

        private void btnCheckAllPages_Click(object sender, EventArgs e)
        {
            if (_pages == null)
                return;

            foreach (Page page in _pages)
                RefreshPage(page);
        }

        private void btnHtmlExport_Click(object sender, EventArgs e)
        {
            if (_pages == null)
                return;

            foreach (Page page in _pages)
                RefreshPage(page);

            // The html file is written next to the pdf and opened with the default application
            string htmlFileName = fileOpen.Text + ".html";
            File.WriteAllText(htmlFileName, HtmlConverter.Convert(_pages));
            Process.Start(htmlFileName);
        }


    }
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using BuildTablesFromPdf.Engine.Statements;
using BuildTablesFromPdf.Engine.Tables;

namespace BuildTablesFromPdf.Engine
{
    /// <summary>
    /// A Pdf page
    /// </summary>
    public class Page
    {
        public Page()
        {
            Statements = new List<Statement>();
            Tables = new List<Table>();
            AllLines = new List<Line>();
        }

        public int Index { get; set; }

        /// <summary>
        /// Gets the statements.
        /// </summary>
        /// <value>
        /// The statements.
        /// </value>
        public List<Statement> Statements { get; private set; }

        public List<Line> AllLines { get; private set; }

        public List<Line> JoinedHorizontalLines { get; set; }

        public List<Line> JoinedVerticalLines { get; set; }

        public List<Line> JoinedLines { get; set; }

        public int Rotation { get; set; }

        /// <summary>
        /// Gets or sets the table structures.
        /// </summary>
        /// <value>
        /// The table structures.
        /// </value>
        public List<Table> Tables { get; set; }

        public List<Paragraph> Paragraphs { get; set; }

        public List<IPageContent> Contents { get; set; }

        /// <summary>
        /// True once DetermineTableStructures has joined the lines of the page.
        /// </summary>
        public bool IsRefreshed { get { return JoinedLines != null; } }


        /// <summary>
        /// Removes the lines with an invalid start point or end point.
        /// </summary>
        public void DeleteWrongLines()
        {
            // ReSharper disable ImpureMethodCallOnReadonlyValueField
            AllLines = AllLines.Where(_ => _.StartPoint.IsValid() && _.EndPoint.IsValid()).ToList();
            // ReSharper restore ImpureMethodCallOnReadonlyValueField
        }


        /// <summary>
        /// Determines the table structures: joins the lines, finds the tables and then
        /// the rows and the columns of every table.
        /// The tables found are added to Tables, which is never emptied here:
        /// call it once per page (see IsRefreshed).
        /// </summary>
        public void DetermineTableStructures()
        {
            JoinedLines = JoinLines(AllLines);

            // Find table borders
            foreach (Line horizontalLine in JoinedHorizontalLines.OrderBy(_ => _.StartPoint.Y))
            {
                // We consider that this line is a top line of a table if
                // 1. There is not a table with this line inside
                // 2. There is a vertical line starting from this line

                if (Tables.Any(_ => _.Contains(horizontalLine.StartPoint.Y)))
                    continue;

                // The longest vertical line starting from one end of the horizontal line
                Line? tableLine = JoinedVerticalLines
                    .Where(_ => _.StartPoint == horizontalLine.StartPoint || _.StartPoint == horizontalLine.EndPoint)
                    .OrderByDescending(_ => _.EndPoint.Y - _.StartPoint.Y)
                    .Cast<Line?>()
                    .FirstOrDefault();

                if (tableLine == null)
                    continue;

                Table tableStructure = new Table()
                {
                    TopLeftPoint = horizontalLine.StartPoint,
                    BottomRightPoint = new Point(horizontalLine.EndPoint.X, tableLine.Value.EndPoint.Y)
                };

                Tables.Add(tableStructure);
            }

            // Add the first row and the first column to all tables
            foreach (Table tableStructure in Tables)
            {
                tableStructure.Rows.Add(new Row() { BeginY = tableStructure.TopLeftPoint.Y });
                tableStructure.Columns.Add(new Column() { BeginX = tableStructure.TopLeftPoint.X });
            }

            // Find rows
            foreach (Line horizontalLine in JoinedHorizontalLines.OrderBy(_ => _.StartPoint.Y))
            {
                var tableStructure = Tables.FirstOrDefault(_ => _.Contains(horizontalLine));
                // No table contains this line
                if (tableStructure == null)
                    continue;

                // Check if the row already belongs to the table
                if (tableStructure.Rows.Any(_ => Math.Abs(_.BeginY - horizontalLine.StartPoint.Y) < ContentExtractor.Tolerance))
                    continue;

                // Check if the line is the bottom edge of the table
                if (tableStructure.BottomRightPoint.Y - horizontalLine.StartPoint.Y < ContentExtractor.Tolerance)
                    continue;

                tableStructure.Rows.Add(new Row() { BeginY = horizontalLine.StartPoint.Y });
            }

            // Find columns
            foreach (Line verticalLine in JoinedVerticalLines.OrderBy(_ => _.StartPoint.X))
            {
                var tableStructure = Tables.FirstOrDefault(_ => _.Contains(verticalLine));
                // No table contains this line
                if (tableStructure == null)
                    continue;

                // Check if the column already belongs to the table
                if (tableStructure.Columns.Any(_ => Math.Abs(_.BeginX - verticalLine.StartPoint.X) < ContentExtractor.Tolerance))
                    continue;

                // Check if the line is the right edge of the table
                if (tableStructure.BottomRightPoint.X - verticalLine.StartPoint.X < ContentExtractor.Tolerance)
                    continue;


                tableStructure.Columns.Add(new Column() { BeginX = verticalLine.StartPoint.X });
            }


            // Fix EndX and EndY and indexes
            foreach (Table tableStructure in Tables)
            {
                // Fix EndYs: a row ends just before the beginning of the next row, the last row at the bottom of the table
                for (int i = 0; i < tableStructure.Rows.Count - 1; i++)
                    tableStructure.Rows[i].EndY = tableStructure.Rows[i + 1].BeginY - ContentExtractor.Tolerance * 0.1f;

                tableStructure.Rows[tableStructure.Rows.Count - 1].EndY = tableStructure.BottomRightPoint.Y;


                // Fix EndXs: same rule, the last column ends at the right of the table
                for (int i = 0; i < tableStructure.Columns.Count - 1; i++)
                    tableStructure.Columns[i].EndX = tableStructure.Columns[i + 1].BeginX - ContentExtractor.Tolerance * 0.1f;

                tableStructure.Columns[tableStructure.Columns.Count - 1].EndX = tableStructure.BottomRightPoint.X;

                int index;

                // Columns are numbered by increasing X
                index = 0;
                foreach (var column in tableStructure.Columns.OrderBy(_ => _.BeginX))
                {
                    column.Index = index;
                    index++;
                }

                // Rows are numbered by decreasing Y
                index = 0;
                foreach (var row in tableStructure.Rows.OrderByDescending(_ => _.BeginY))
                {
                    row.Index = index;
                    index++;
                }

                tableStructure.CreateContent();

            }

        }


        /// <summary>
        /// Joins the horizontal and vertical lines.
        /// The joined lines are also stored in JoinedHorizontalLines and JoinedVerticalLines.
        /// </summary>
        /// <param name="allLines">All the lines.</param>
        /// <returns>The horizontal and the vertical lines (eventually joined)</returns>
        private List<Line> JoinLines(List<Line> allLines)
        {
            JoinedVerticalLines = JoinVerticalLines(allLines);
            JoinedHorizontalLines = JoinHorizontalLines(allLines);

            return JoinedHorizontalLines.Union(JoinedVerticalLines).ToList();
        }

        /// <summary>
        /// Joins the vertical lines.
        /// The lines are sorted by X then Y; a line coincident with the last kept line is dropped,
        /// a line overlapped with the last kept line is joined to it.
        /// </summary>
        /// <param name="allLines">All lines.</param>
        /// <returns>The vertical lines (eventually joined)</returns>
        private static List<Line> JoinVerticalLines(List<Line> allLines)
        {
            var lines = new List<Line>();

            var verticalLines = allLines.Where(_ => _.IsVertical()).OrderBy(_ => _.StartPoint.X).ThenBy(_ => _.StartPoint.Y).ToList();

            foreach (Line verticalLine in verticalLines)
            {
                if (lines.Count == 0)
                    lines.Add(verticalLine);
                else if (verticalLine.IsCoincident(lines[lines.Count - 1]))
                    continue;
                else if (verticalLine.IsOverlapped(lines[lines.Count - 1]))
                {
                    var joinedLine = lines[lines.Count - 1].Join(verticalLine);
                    lines.RemoveAt(lines.Count - 1);
                    lines.Add(joinedLine);
                }
                else
                    lines.Add(verticalLine);
            }

            return lines;
        }

        /// <summary>
        /// Joins the horizontal lines.
        /// The lines are sorted by Y then X; a line coincident with the last kept line is dropped,
        /// a line overlapped with the last kept line is joined to it.
        /// </summary>
        /// <param name="allLines">All lines.</param>
        /// <returns>The horizontal lines (eventually joined)</returns>
        private static List<Line> JoinHorizontalLines(List<Line> allLines)
        {
            var lines = new List<Line>();

            var horizontalLines = allLines.Where(_ => _.IsHorizontal()).OrderBy(_ => _.StartPoint.Y).ThenBy(_ => _.StartPoint.X).ToList();

            foreach (Line horizontalLine in horizontalLines)
            {
                if (lines.Count == 0)
                    lines.Add(horizontalLine);
                else if (horizontalLine.IsCoincident(lines[lines.Count - 1]))
                    continue;
                else if (horizontalLine.IsOverlapped(lines[lines.Count - 1]))
                {
                    // NOTE(review): the operands of Join are swapped compared to JoinVerticalLines - confirm Join is symmetric
                    var joinedLine = horizontalLine.Join(lines[lines.Count - 1]);
                    lines.RemoveAt(lines.Count - 1);
                    lines.Add(joinedLine);
                }
                else
                    lines.Add(horizontalLine);
            }

            return lines;
        }


        /// <summary>
        /// Creates a paragraph for every non empty text outside the tables
        /// whose position is not contained in an already created paragraph.
        /// </summary>
        public void DetermineParagraphs()
        {
            Paragraphs = new List<Paragraph>();

            var textObjectStatementLines = Statements.OfType<TextObjectStatement>().SelectMany(_ => _.Lines)
                .Where(_ => !string.IsNullOrWhiteSpace(_.Content))
                .Where(_ => !Tables.Any(t => t.Contains(_.Position.Y)))
                .OrderBy(_ => _.Position.Y);

            foreach (var line in textObjectStatementLines)
            {
                if (!Paragraphs.Any(t => t.Contains(line.Position)))
                    Paragraphs.Add(new Paragraph(line.Position.Y));
            }
        }


        /// <summary>
        /// Builds Contents (paragraphs and tables sorted by Y) and adds every non empty text
        /// with a valid position to the first content that contains the Y of the text.
        /// </summary>
        /// <exception cref="InvalidOperationException">A text is not contained in any paragraph or table.</exception>
        public void FillContent()
        {
            Contents = new List<IPageContent>();
            Contents.AddRange(Paragraphs.Cast<IPageContent>().Union(Tables).OrderBy(_ => _.Y));

            var textObjectStatementsLines = Statements.OfType<TextObjectStatement>().SelectMany(_ => _.Lines)
                .Where(_ => !string.IsNullOrWhiteSpace(_.Content))
                .OrderBy(_ => _.Position.Y).ThenBy(_ => _.Position.X);

            foreach (var line in textObjectStatementsLines.Where(_ => _.Position.IsValid()))
            {
                IPageContent targetPageContent = Contents.FirstOrDefault(_ => _.Contains(line.Position.Y));
                // Same exception type thrown by First, but with the position of the orphan text
                if (targetPageContent == null)
                    throw new InvalidOperationException(string.Format("No paragraph or table contains the text at {0}", line.Position));

                targetPageContent.AddText(line.Position, line.Content);
            }
        }

        /// <summary>
        /// Returns the contents of the page, one after the other, each followed by a line break.
        /// Empty if FillContent has not been called.
        /// </summary>
        public override string ToString()
        {
            StringBuilder pageContent = new StringBuilder();
            if (Contents != null)
            {
                foreach (IPageContent content in Contents)
                    pageContent.AppendFormat("{0}\r\n", content);
            }
            return pageContent.ToString();
        }

    }
}