├── .deployment ├── .gitattributes ├── .gitignore ├── .vscode ├── launch.json └── tasks.json ├── docs ├── architecture.md ├── artigo.md ├── blocks.md ├── goodnews.md ├── hierarquia.md ├── history.md ├── images │ ├── 01blocks.png │ ├── 02lines.png │ ├── 03blocksets.png │ ├── 04followline.png │ ├── 05tables.png │ ├── 06ordering.png │ ├── 10align.png │ ├── 11linedist.png │ ├── 12paragraph.png │ ├── 13centered.png │ └── 14notcentered.png └── todo.md ├── readme.md ├── src ├── ParserConsole │ ├── ParserConsole.csproj │ └── Program.cs ├── ParserConsoleWeb │ ├── AzureFS.cs │ ├── ParserConsoleWeb.csproj │ ├── PdfProcessor.cs │ └── Program.cs ├── ParserFrontend │ ├── .gitignore │ ├── Backend │ │ └── JobManagerHostedService.cs │ ├── Configuration │ │ └── CopyFilesConfig.cs │ ├── Controllers │ │ ├── ArticlesController.cs │ │ ├── BrowserController.cs │ │ ├── DocumentsController.cs │ │ ├── ImagesController.cs │ │ ├── JobController.cs │ │ ├── ListController.cs │ │ └── ProcessController.cs │ ├── Infra │ │ ├── AzureFS.cs │ │ ├── IVirtualFS2.cs │ │ ├── WebVirtualFS.cs │ │ └── ZipCompression.cs │ ├── Logic │ │ ├── AccessManager.cs │ │ ├── CopyFiles.cs │ │ ├── DeleteFiles.cs │ │ ├── DownloadFolder.cs │ │ ├── InputFiles.cs │ │ ├── JobManager.cs │ │ ├── JobProcess.cs │ │ ├── JobProcessHttp.cs │ │ ├── OutputFiles.cs │ │ ├── OutputTreeInfo.cs │ │ ├── PdfHandler.cs │ │ └── PrettyTextFile.cs │ ├── Pages │ │ ├── About.cshtml │ │ ├── About.cshtml.cs │ │ ├── AtosGraph.cshtml │ │ ├── AtosGraph.cshtml.cs │ │ ├── Error.cshtml │ │ ├── Error.cshtml.cs │ │ ├── Index.cshtml │ │ ├── Index.cshtml.cs │ │ ├── Jobs.cshtml │ │ ├── Jobs.cshtml.cs │ │ ├── Upload.cshtml │ │ ├── Upload.cshtml.cs │ │ ├── _ViewImports.cshtml │ │ └── _ViewStart.cshtml │ ├── ParserFrontend.csproj │ ├── Program.cs │ ├── Properties │ │ └── launchSettings.json │ ├── Startup.cs │ ├── TagHelpers │ │ ├── ArticleHtmlTagHelper.cs │ │ └── DocumentOutputTreeTagHelper.cs │ ├── Views │ │ ├── Articles │ │ │ └── ShowHtml.cshtml │ │ ├── Browser 
│ │ │ ├── Index.cshtml │ │ │ └── Year.cshtml │ │ ├── Documents │ │ │ ├── Log.cshtml │ │ │ └── Show.cshtml │ │ ├── List │ │ │ ├── Index.cshtml │ │ │ └── IndexFiltro.cshtml │ │ ├── Shared │ │ │ ├── _Layout.cshtml │ │ │ └── _ValidationScriptsPartial.cshtml │ │ ├── _ViewImports.cshtml │ │ └── _ViewStart.cshtml │ ├── appsettings.Development.json │ ├── appsettings.json │ ├── bundleconfig.json │ └── wwwroot │ │ ├── css │ │ ├── site.css │ │ └── site.min.css │ │ ├── favicon.ico │ │ ├── images │ │ ├── atosgraph.JPG │ │ ├── pdf01.jpg │ │ ├── pdf02.jpg │ │ ├── pdf03.jpg │ │ ├── pdf04.jpg │ │ ├── pdf05.jpg │ │ ├── pdf06.jpg │ │ └── wait.gif │ │ ├── js │ │ ├── site.js │ │ └── site.min.js │ │ └── lib │ │ ├── bootstrap │ │ ├── .bower.json │ │ ├── LICENSE │ │ └── dist │ │ │ ├── css │ │ │ ├── bootstrap-theme.css │ │ │ ├── bootstrap-theme.css.map │ │ │ ├── bootstrap-theme.min.css.map │ │ │ ├── bootstrap.css │ │ │ ├── bootstrap.css.map │ │ │ └── bootstrap.min.css.map │ │ │ ├── fonts │ │ │ ├── glyphicons-halflings-regular.eot │ │ │ ├── glyphicons-halflings-regular.svg │ │ │ ├── glyphicons-halflings-regular.ttf │ │ │ ├── glyphicons-halflings-regular.woff │ │ │ └── glyphicons-halflings-regular.woff2 │ │ │ └── js │ │ │ ├── bootstrap.js │ │ │ └── npm.js │ │ ├── calendar-heatmap │ │ ├── calendar-heatmap.css │ │ ├── calendar-heatmap.js │ │ └── calendar.html │ │ ├── jquery-validation-unobtrusive │ │ ├── .bower.json │ │ ├── jquery.validate.unobtrusive.js │ │ └── jquery.validate.unobtrusive.min.js │ │ ├── jquery-validation │ │ ├── .bower.json │ │ ├── LICENSE.md │ │ └── dist │ │ │ ├── additional-methods.js │ │ │ └── jquery.validate.js │ │ └── jquery │ │ ├── .bower.json │ │ ├── LICENSE.txt │ │ └── dist │ │ ├── jquery.js │ │ └── jquery.min.map ├── ParserFunctions │ ├── .gitignore │ ├── AzureBlob.cs │ ├── AzureFS.cs │ ├── DurableFunctions.cs │ ├── Functions.cs │ ├── Model │ │ ├── Pdf.cs │ │ └── PdfStats.cs │ ├── ParserFunctions.csproj │ ├── TestAzureBlob.cs │ └── host.json ├── ParserRun │ ├── 
AzureBlob.cs │ ├── AzureFS.cs │ ├── Config.cs │ ├── ParserRun.csproj │ ├── Program.cs │ ├── TestAzureBlob.cs │ └── TestFS.cs ├── PdfTextReader.Azure │ ├── AzureBlobFileSystem.cs │ ├── Blob │ │ ├── AzureBlobAccount.cs │ │ ├── AzureBlobContainer.cs │ │ ├── AzureBlobFS.cs │ │ ├── AzureBlobFileBlock.cs │ │ ├── AzureBlobFileGeneric.cs │ │ ├── AzureBlobFolder.cs │ │ └── AzureBlobRef.cs │ ├── DevNul │ │ ├── DevNulFS.cs │ │ ├── DevNulFile.cs │ │ └── DevNulFolder.cs │ ├── IAzureBlob.cs │ ├── IAzureBlobFile.cs │ ├── IAzureBlobFolder.cs │ ├── IPdfConverter.cs │ ├── PdfImageConverter.cs │ ├── PdfTextReader.Azure.csproj │ └── Queue │ │ ├── AzureMessage.cs │ │ ├── AzureQueue.cs │ │ ├── IQueueMessage.cs │ │ └── IStorageQueue.cs ├── PdfTextReader.sln ├── PdfTextReader │ ├── Base │ │ ├── Block.cs │ │ ├── BlockArea.cs │ │ ├── BlockColumn.cs │ │ ├── BlockHidden.cs │ │ ├── BlockImage.cs │ │ ├── BlockLine.cs │ │ ├── BlockMerge.cs │ │ ├── BlockPage.cs │ │ ├── BlockPage2.cs │ │ ├── BlockPageSegment.cs │ │ ├── BlockSet.cs │ │ ├── BlockSet2.cs │ │ ├── BlockTable.cs │ │ ├── BlockText.cs │ │ ├── IAggregateStructure.cs │ │ ├── IBlock.cs │ │ ├── IBlockSet.cs │ │ ├── ICalculateStats.cs │ │ ├── IConfigurationStore.cs │ │ ├── IConvertBlock.cs │ │ ├── IExecutionConfiguration.cs │ │ ├── ILogStructure.cs │ │ ├── ILogStructurePdf.cs │ │ ├── IPipelineDebug.cs │ │ ├── IProcessBlock.cs │ │ ├── IProcessBlockData.cs │ │ ├── IProcessStructure.cs │ │ ├── IRetrieveStatistics.cs │ │ ├── ITransformIndexTree.cs │ │ ├── IValidateBlock.cs │ │ ├── IValidateMark.cs │ │ ├── ImageBlock.cs │ │ ├── MarkLine.cs │ │ ├── PdfReaderException.cs │ │ ├── StatsBlocksOverlapped.cs │ │ ├── StatsExceptionHandled.cs │ │ ├── StatsPageFooter.cs │ │ ├── StatsPageLayout.cs │ │ ├── TableCell.cs │ │ ├── TableSet.cs │ │ ├── TextAlignment.cs │ │ ├── TextLine.cs │ │ ├── TextLine2.cs │ │ ├── TextPage.cs │ │ ├── TextSegment.cs │ │ ├── TextSegmentText.cs │ │ ├── TextStructure.cs │ │ ├── TextStructureAgg.cs │ │ ├── TextTaggedSegment.cs │ │ └── 
TitleWithHiddenIdMateria.cs │ ├── Compatibility.cs │ ├── Configuration │ │ ├── ConfigurationFile.cs │ │ └── ParserTreeConfig.cs │ ├── ExampleStages.cs │ ├── ExamplesAzure.cs │ ├── ExamplesWeb.cs │ ├── Execution │ │ ├── IPipelineContext.cs │ │ ├── Pipeline.cs │ │ ├── PipelineDebug.cs │ │ ├── PipelineDebugContext.cs │ │ ├── PipelineDisposeHelper.cs │ │ ├── PipelineDocumentStats.cs │ │ ├── PipelineFactory.cs │ │ ├── PipelineGlobalStats.cs │ │ ├── PipelineInputCache.cs │ │ ├── PipelineInputPdf.cs │ │ ├── PipelinePage.cs │ │ ├── PipelinePageStats.cs │ │ ├── PipelinePdfLog.cs │ │ ├── PipelineResult.cs │ │ ├── PipelineStats.cs │ │ └── PipelineText.cs │ ├── ExecutionStats │ │ ├── PrintAnalytics.cs │ │ ├── PrintAnalyticsExtensions.cs │ │ ├── PrintDebugCount.cs │ │ ├── PrintDebugExtensions.cs │ │ ├── PrintDebugPrint.cs │ │ ├── ShowParserWarnings.cs │ │ ├── TextInfo.cs │ │ ├── ValidateFooter.cs │ │ ├── ValidateLayout.cs │ │ ├── ValidateOverlap.cs │ │ └── ValidateUnhandledExceptions.cs │ ├── IVirtualFS.cs │ ├── PDFCore │ │ ├── AddImageSpace.cs │ │ ├── AddTableHorizontalLines.cs │ │ ├── AddTableHorizontalLines2.cs │ │ ├── AddTableLines.cs │ │ ├── AddTableSpace.cs │ │ ├── BasicFirstPageStats.cs │ │ ├── BlocksetData.cs │ │ ├── BreakColumns.cs │ │ ├── BreakColumnsLight.cs │ │ ├── BreakColumnsRewrite.cs │ │ ├── BreakInlineElements.cs │ │ ├── CheckOverlap.cs │ │ ├── DetectImplicitTable.cs │ │ ├── DouIgnoreLongDotSequence.cs │ │ ├── ExtractDouHeaderInfo.cs │ │ ├── FilterHeaderFooter.cs │ │ ├── FinalBlockResultData.cs │ │ ├── FindDouHeaderFooter.cs │ │ ├── FindDouIdMateria.cs │ │ ├── FindInitialBlockset.cs │ │ ├── FindInitialBlocksetWithBlockInfo.cs │ │ ├── FindInitialBlocksetWithRewind.cs │ │ ├── GroupLines.cs │ │ ├── HeaderFooterData.cs │ │ ├── HideSmallFonts.cs │ │ ├── HighlightTextTable.cs │ │ ├── IdentifyTables.cs │ │ ├── IdentifyTablesData.cs │ │ ├── MergeBlockLines.cs │ │ ├── MergeInlineTexts.cs │ │ ├── MergeSequentialLayout.cs │ │ ├── MergeTableText.cs │ │ ├── 
OrderBlocksets.cs │ │ ├── OrderBlocksets2.cs │ │ ├── OrderBlocksetsWithBlockInfo.cs │ │ ├── OrganizePageLayout.cs │ │ ├── PageInfoStats.cs │ │ ├── ProcessImageData.cs │ │ ├── ProcessPdfTextData.cs │ │ ├── RemoveBackgroundNonText.cs │ │ ├── RemoveBlockHidden.cs │ │ ├── RemoveFooter.cs │ │ ├── RemoveHeader.cs │ │ ├── RemoveHeaderImage.cs │ │ ├── RemoveHeaderImg.cs │ │ ├── RemoveImageLineFromHeaderFooter.cs │ │ ├── RemoveImageTexts.cs │ │ ├── RemoveOverlapedImages.cs │ │ ├── RemoveOverlapedImages2.cs │ │ ├── RemoveSmallFonts.cs │ │ ├── RemoveTableDotChar.cs │ │ ├── RemoveTableOverImage.cs │ │ ├── RemoveTableText.cs │ │ ├── ReplaceCharacters.cs │ │ ├── ResizeBlocksetMagins.cs │ │ ├── ResizeBlocksets.cs │ │ ├── ResizeBlocksetsColumn.cs │ │ ├── ResizeBlocksetsWithBlockInfo.cs │ │ ├── ResizeBlocksetsWithStats.cs │ │ ├── ResizeSequentialLayout.cs │ │ ├── SetIdentifyTablesCompatibility.cs │ │ ├── SetProcessImageCompatibility.cs │ │ ├── ShowBlocksets.cs │ │ ├── ShowTextHeaderFooter.cs │ │ └── ValidatePositiveCoordinates.cs │ ├── PDFText │ │ ├── PreProcessImages.cs │ │ ├── PreProcessRenderPath.cs │ │ ├── PreProcessTables.cs │ │ ├── ProcessPdfText.cs │ │ └── ProcessPdfValidation.cs │ ├── Parser │ │ ├── AggregateAnexo.cs │ │ ├── AggregateSingularBody.cs │ │ ├── Anexo.cs │ │ ├── Artigo.cs │ │ ├── Autor.cs │ │ ├── Content.cs │ │ ├── Conteudo.cs │ │ ├── Converter2GN.cs │ │ ├── ConverterGN.cs │ │ ├── CreateTaggedSegments.cs │ │ ├── HifenUtil.cs │ │ ├── InjectFilename.cs │ │ ├── Jornal.cs │ │ ├── Metadados.cs │ │ ├── ProcessParser.cs │ │ ├── ProcessParser2.cs │ │ ├── ProcessParserJson.cs │ │ ├── ProcessParserOriginal.cs │ │ ├── TipoDoConteudo.cs │ │ ├── TransformArtigo.cs │ │ ├── TransformArtigo2.cs │ │ ├── TransformConteudo.cs │ │ ├── TransformConteudo2.cs │ │ ├── TransformConteudo3.cs │ │ ├── TransformConteudo4.cs │ │ └── TransformExemplo.cs │ ├── ParserStages │ │ ├── StageBlocksets.cs │ │ ├── StageContext.cs │ │ ├── StageConvertArtigoGN.cs │ │ ├── StageConvertContent.cs │ │ ├── 
StageConvertStructText.cs │ │ ├── StageConvertStructure.cs │ │ ├── StageConvertText.cs │ │ ├── StageConvertTree.cs │ │ ├── StageDbgFlow.cs │ │ ├── StageExtractHeaderDOU.cs │ │ ├── StageFullV1.cs │ │ ├── StagePageMargins.cs │ │ ├── StagePdfInput.cs │ │ └── StageRetrieveBlocks.cs │ ├── PdfTextReader.csproj │ ├── Program.cs │ ├── Program3.cs │ ├── TextStructures │ │ ├── AfterFilterTextSegments.cs │ │ ├── AggregateStructures.cs │ │ ├── AnalyzeLines.cs │ │ ├── AnalyzeLinesCenterRight.cs │ │ ├── AnalyzePageInfo.cs │ │ ├── AnalyzeSegmentStats.cs │ │ ├── AnalyzeSegmentTextVersion.cs │ │ ├── AnalyzeSegmentTitles.cs │ │ ├── AnalyzeSegments.cs │ │ ├── AnalyzeSegments2.cs │ │ ├── AnalyzeStructures.cs │ │ ├── AnalyzeStructuresCentral.cs │ │ ├── AnalyzeTreeStructure.cs │ │ ├── AnalyzeTreeStructure2.cs │ │ ├── AnalyzeTreeStructureFontSize2.cs │ │ ├── CreateContent.cs │ │ ├── CreateStructText.cs │ │ ├── CreateStructures.cs │ │ ├── CreateStructures2.cs │ │ ├── CreateStructures3.cs │ │ ├── CreateStructuresV2.cs │ │ ├── CreateTextLineIndex.cs │ │ ├── CreateTextLines.cs │ │ ├── CreateTextSegments.cs │ │ ├── CreateTextSegmentsWithConfigData.cs │ │ ├── CreateTreeSegments.cs │ │ ├── FilterTextSegments.cs │ │ ├── FilterTextWithFontsSegments.cs │ │ ├── GenerateArtigoGN4.cs │ │ ├── GenerateArtigoTmp.cs │ │ ├── PreCreateStructures.cs │ │ ├── PreCreateTextSegments.cs │ │ ├── ProcessStructure2.cs │ │ ├── ShowStructureCentral.cs │ │ ├── ShowTitleSegment.cs │ │ ├── TransformIndex.cs │ │ ├── TransformIndexEntry.cs │ │ ├── TransformIndexTree.cs │ │ └── TransformText.cs │ ├── VirtualFS.Static.cs │ └── VirtualFS.cs ├── PdfToImageFunction │ ├── .gitignore │ ├── Function1.cs │ ├── PdfToImageFunction.csproj │ ├── Properties │ │ └── PublishProfiles │ │ │ └── FunctionApp20180412035249 - Web Deploy.pubxml │ ├── host.json │ ├── pdf │ │ └── D141.pdf │ └── temp │ │ └── readme.txt ├── QueueConsole │ ├── Config.cs │ ├── MainConsole.cs │ ├── MainPdfToImage.cs │ ├── Program.cs │ ├── QueueConsole.csproj │ └── 
appsettings.json ├── Validator │ ├── File.cs │ ├── FileList.cs │ ├── GeneralProcess.cs │ ├── IRunner.cs │ ├── Process2010.cs │ ├── Process2012.cs │ ├── Process2016.cs │ ├── ProcessDefault.cs │ ├── ProcessXml.cs │ ├── Program.cs │ ├── ProgramValidator │ │ ├── ProgramValidator.cs │ │ ├── ProgramValidator2010.cs │ │ ├── ProgramValidator2012.cs │ │ ├── ProgramValidator2016.cs │ │ ├── ProgramValidatorDefault.cs │ │ ├── ProgramValidatorXML.cs │ │ └── ValidatorPipeline.cs │ ├── Runner.cs │ ├── Validate2010.cs │ ├── Validator.csproj │ └── VirtualFS.Static.cs └── WebFrontendImages │ ├── Controllers │ └── ImagesController.cs │ ├── Logic │ ├── ImageProcessing.cs │ └── ImageSource.cs │ ├── Program.cs │ ├── Properties │ └── launchSettings.json │ ├── Startup.cs │ ├── WebFrontendImages.csproj │ ├── appsettings.Development.json │ ├── appsettings.json │ └── wwwroot │ └── test.html └── test └── PdfTextReader.Test ├── PdfTextReader.Tests.csproj └── UnitTest1.cs /.deployment: -------------------------------------------------------------------------------- 1 | [config] 2 | project = src/ParserFunctions/ParserFunctions.csproj -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to find out which attributes exist for C# debugging 3 | // Use hover for the description of the existing attributes 4 | // For further information visit https://github.com/OmniSharp/omnisharp-vscode/blob/master/debugger-launchjson.md 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": ".NET Core Launch (console)", 9 | "type": "coreclr", 10 | "request": "launch", 11 | "preLaunchTask": "build", 12 | // If you have changed target frameworks, make sure to update the program path. 
13 | "program": "${workspaceRoot}/src/ParserConsole/bin/Debug/netcoreapp2.0/ParserConsole.dll", 14 | "args": [], 15 | "cwd": "${workspaceRoot}/src/ParserConsole", 16 | // For more information about the 'console' field, see https://github.com/OmniSharp/omnisharp-vscode/blob/master/debugger-launchjson.md#console-terminal-window 17 | "console": "internalConsole", 18 | "stopAtEntry": false, 19 | "internalConsoleOptions": "openOnSessionStart" 20 | }, 21 | { 22 | "name": ".NET Core Attach", 23 | "type": "coreclr", 24 | "request": "attach", 25 | "processId": "${command:pickProcess}" 26 | } 27 | ] 28 | } -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.1.0", 3 | "command": "dotnet", 4 | "isShellCommand": true, 5 | "args": [], 6 | "tasks": [ 7 | { 8 | "taskName": "build", 9 | "args": [ 10 | "${workspaceRoot}/src/ParserConsole/ParserConsole.csproj" 11 | ], 12 | "isBuildCommand": true, 13 | "problemMatcher": "$msCompile" 14 | } 15 | ] 16 | } -------------------------------------------------------------------------------- /docs/blocks.md: -------------------------------------------------------------------------------- 1 | 2 | BlockPage 3 | - BlockColumn 4 | - BlockImage 5 | - BlockTable 6 | - BlockText 7 | - BlockLine 8 | - Block -------------------------------------------------------------------------------- /docs/goodnews.md: -------------------------------------------------------------------------------- 1 |

2 |

3 |

4 |

5 |

6 |

7 | -------------------------------------------------------------------------------- /docs/hierarquia.md: -------------------------------------------------------------------------------- 1 | Ministério de Minas e Energia (OttawaV, 11.96, Regular) 2 | (-4.002998) 3 | 4 | AGÊNCIA NACIONAL DO PETRÓLEO, GÁS 5 | NATURAL E BIOCOMBUSTÍVEIS (Times-Bold, 9.96, Bold) 6 | (0.2620096) 7 | 8 | DIRETORIA I (Times-Roman, 9.96, Regular) 9 | (6.871202) 10 | 11 | SUPERINTENDÊNCIA DE BIOCOMBUSTÍVEIS E QUALIDADE DE PRODUTOS (Times-Roman, 9.96, Regular) 12 | (21.27935) 13 | 14 | DESPACHOS DA SUPERINTENDENTE (Times-Bold, 7.97, Bold) 15 | (6.870232) 16 | 17 | Em 19 de dezembro de 2016 (Times-Roman, 7.97, Regular) 18 | (14.84112) 19 | 20 | -------------------------------------------------------------------------------- /docs/images/01blocks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/docs/images/01blocks.png -------------------------------------------------------------------------------- /docs/images/02lines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/docs/images/02lines.png -------------------------------------------------------------------------------- /docs/images/03blocksets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/docs/images/03blocksets.png -------------------------------------------------------------------------------- /docs/images/04followline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/docs/images/04followline.png 
-------------------------------------------------------------------------------- /docs/images/05tables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/docs/images/05tables.png -------------------------------------------------------------------------------- /docs/images/06ordering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/docs/images/06ordering.png -------------------------------------------------------------------------------- /docs/images/10align.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/docs/images/10align.png -------------------------------------------------------------------------------- /docs/images/11linedist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/docs/images/11linedist.png -------------------------------------------------------------------------------- /docs/images/12paragraph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/docs/images/12paragraph.png -------------------------------------------------------------------------------- /docs/images/13centered.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/docs/images/13centered.png -------------------------------------------------------------------------------- /docs/images/14notcentered.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/docs/images/14notcentered.png -------------------------------------------------------------------------------- /docs/todo.md: -------------------------------------------------------------------------------- 1 | - Gap 2009-2017 2 | - Jaider: jornal 3 | - Gerar ID de artigo 4 | - Gerar ID de imagem 5 | - Gerar Imagens Resized 6 | - Obter informações do cabeçalho 7 | - write diario.txt 8 | - rewrite artigoGN body 9 | - rewrite do framework de escrita 10 | - rewrite segment body (usando a diferença de espaçamento) 11 | - Corrigir erros de assinatura e hifenização (fazer releitura do JSON) 12 | - Criação de artigos intermediários (JSON?) + diario.txt + 13 | - Reconhecedor de datas, ementas e assinaturas - concentrado em um único 14 | - Quebrar assinatura 15 | - rewrite assinatura () - caso específico 16 | - Download diario.txt 17 | - Download zip 18 | 19 | Formatter txt 20 | 21 | 22 | 23 | Portal segregado por ano 24 | Árvore de todos os ministérios: necessário um banco de dados 25 | Lista das portarias 26 | Lista dos diários (jan fev mar abr) 27 | 28 | Integrar o lexml 29 | Índice reverso sobre os nomes (Frontend: identificar assinaturas) 30 | 31 | ML: determinar hierarquias (pois aparecem múltiplas vezes) 32 | 33 | Usar marcações HTML para e 34 | 35 | 36 | - rewrite headers (h1, h2, h3...) 37 | : funciona??? 
-------------------------------------------------------------------------------- /src/ParserConsole/ParserConsole.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | netcoreapp2.0 6 | Debug;Release;DebugCore;CORE 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /src/ParserConsole/Program.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | 3 | namespace ParserConsole 4 | { 5 | /* Console host: Main hands its arguments unchanged to PdfTextReader.Program.Main. */ class Program 6 | { 7 | static void Main(string[] args) 8 | { 9 | PdfTextReader.Program.Main(args); 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /src/ParserConsoleWeb/ParserConsoleWeb.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | netcoreapp2.0 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /src/ParserConsoleWeb/PdfProcessor.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | using PdfTextReader; 5 | 6 | namespace ParserConsoleWeb 7 | { 8 | /* Runs the PDF parser over the IVirtualFS supplied at construction. Process passes the given basename to ExampleStages.RunParserPDF together with the fixed folder names input and output. */ class PdfProcessor 9 | { 10 | IVirtualFS _vfs; 11 | 12 | public PdfProcessor(IVirtualFS vfs) 13 | { 14 | _vfs = vfs; 15 | } 16 | 17 | public void Process(string basename) 18 | { 19 | ExampleStages.RunParserPDF(_vfs, basename, "input", "output"); 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/ParserConsoleWeb/Program.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | 3 | namespace ParserConsoleWeb 4 | { 5 | /* Command-line entry point: builds an AzureFS and a PdfProcessor and processes one input file. */ class Program 6 | { 7 | static void Main(string[] args) 8 | { 9 | /* No arguments: print the usage text and exit. */ if( args.Length == 0 ) 10 | { 11 | Console.WriteLine("Syntax: 
ParserConsoleWeb [storage_account]"); 12 | Console.WriteLine(" - inputfile: does not include the extension"); 13 | Console.WriteLine(" - storage_account: also configurable in Environment: STORAGE_ACCOUNT"); 14 | return; 15 | } 16 | 17 | string inputFile = args[0]; 18 | /* The second argument wins; otherwise fall back to the STORAGE_ACCOUNT environment variable, which is null when unset. */ string storageAccount = (args.Length > 1) ? args[1] : Environment.GetEnvironmentVariable("STORAGE_ACCOUNT"); 19 | 20 | var azureFS = new AzureFS(storageAccount); 21 | 22 | var pdf = new PdfProcessor(azureFS); 23 | 24 | string basename = GetBasename(inputFile); 25 | 26 | Console.WriteLine($"Basename = {basename}"); 27 | 28 | pdf.Process(basename); 29 | } 30 | 31 | /* Returns the last slash-separated component of filename, without a trailing .pdf extension. The extension is matched with an ordinal, case-insensitive comparison, the same way InputFiles.RemoveExtension does it. */ static string GetBasename(string filename) 32 | { 33 | string[] components = filename.Split("/"); 34 | 35 | string basename = components[components.Length-1]; 36 | 37 | if (basename.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase)) 38 | return basename.Substring(0, basename.Length - 4); 39 | 40 | return basename; 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/ParserFrontend/.gitignore: -------------------------------------------------------------------------------- 1 | wwwroot/files/ -------------------------------------------------------------------------------- /src/ParserFrontend/Backend/JobManagerHostedService.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.Extensions.Hosting; 2 | using ParserFrontend.Logic; 3 | using System; 4 | using System.Collections.Generic; 5 | using System.Linq; 6 | using System.Threading; 7 | using System.Threading.Tasks; 8 | 9 | namespace ParserFrontend.Backend 10 | { 11 | /* Hosted service that polls JobManager from four concurrent worker loops. NOTE(review): StartAsync returns Task.WhenAll over loops that only finish on cancellation, so its task stays incomplete while the workers run; confirm the host tolerates that. */ public class JobManagerHostedService : IHostedService 12 | { 13 | private JobManager _jobManager; 14 | 15 | public JobManagerHostedService(JobManager jobManager) 16 | { 17 | this._jobManager = jobManager; 18 | } 19 | 20 | public Task StartAsync(CancellationToken cancellationToken) 21 | { 22 | var tasks = Enumerable 23 | .Range(0, 4) 24 | 
.Select(async numWorkers => 25 | { 26 | /* numWorkers is the index produced by Range and is not used; every worker pauses 10000 ms between polling rounds. */ int timeout = 10000; 27 | await RunAsync(timeout, cancellationToken).ConfigureAwait(false); 28 | }) 29 | .ToArray(); 30 | 31 | /* NOTE(review): this task completes only after every worker loop has exited. */ return Task.WhenAll(tasks); 32 | } 33 | 34 | /* No-op: returns an already completed task. The worker loops watch the token that was passed to StartAsync, not this one. */ public Task StopAsync(CancellationToken cancellationToken) 35 | { 36 | return Task.CompletedTask; 37 | } 38 | 39 | /* Worker loop: runs JobManager.MessageLoopAsync, then waits timeout milliseconds, until cancellation is requested. Any exception from the message loop is swallowed without logging. NOTE(review): Task.Delay is not given the token, so a stop request is noticed only after the current delay ends. */ public async Task RunAsync(int timeout, CancellationToken cancellationToken) 40 | { 41 | while(!cancellationToken.IsCancellationRequested) 42 | { 43 | try 44 | { 45 | await _jobManager.MessageLoopAsync(); 46 | } 47 | catch 48 | { 49 | 50 | } 51 | await Task.Delay(timeout); 52 | } 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/ParserFrontend/Configuration/CopyFilesConfig.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Threading.Tasks; 5 | 6 | namespace ParserFrontend.Configuration 7 | { 8 | /* Configuration holder with a single StorageAccount string property. */ public class CopyFilesConfig 9 | { 10 | public string StorageAccount { get; set; } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /src/ParserFrontend/Controllers/ImagesController.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Threading.Tasks; 5 | using Microsoft.AspNetCore.Mvc; 6 | using Microsoft.Extensions.Options; 7 | using ParserFrontend.Logic; 8 | 9 | namespace ParserFrontend.Controllers 10 | { 11 | [Route("[controller]")] 12 | /* Redirects image requests to another site. The target site is read once, in the constructor, from the RedirectSite value of the injected options. */ public class ImagesController : Controller 13 | { 14 | public class Config 15 | { 16 | public string RedirectSite { get; set; } 17 | } 18 | 19 | string _site; 20 | 21 | public ImagesController(IOptions options) 22 | { 23 | _site = options.Value.RedirectSite; 24 | } 25 | 26 | [Route("{name}/{*image}")] 27 | public IActionResult Get(string name, string 
image) 28 | { 29 | /* Target is the configured site followed by api, images, the document name, parser and the image path. NOTE(review): name and image are inserted without URL escaping. */ string url = $"{_site}/api/images/{name}/parser/{image}"; 30 | 31 | return Redirect(url); 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /src/ParserFrontend/Controllers/JobController.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Threading.Tasks; 5 | using Microsoft.AspNetCore.Mvc; 6 | using ParserFrontend.Logic; 7 | 8 | namespace ParserFrontend.Controllers 9 | { 10 | [Route("[controller]")] 11 | /* Starts a parsing job for the name captured by the catch-all route segment. Start writes the name to the console, calls JobProcess.Process(name) directly, and then always returns true. */ public class JobController : Controller 12 | { 13 | private JobProcess _job; 14 | 15 | public JobController(JobProcess job) 16 | { 17 | this._job = job; 18 | } 19 | 20 | [Route("{*name}")] 21 | public bool Start(string name) 22 | { 23 | Console.WriteLine($"ParserFrontend: JobController [{name}]"); 24 | 25 | _job.Process(name); 26 | 27 | return true; 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /src/ParserFrontend/Infra/IVirtualFS2.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Threading.Tasks; 6 | 7 | namespace ParserFrontend 8 | { 9 | /* Extends IVirtualFS with two listing members and two delete members. NOTE(review): their exact semantics are inferred from the member names only; no implementation is visible here. */ public interface IVirtualFS2 : IVirtualFS 10 | { 11 | string[] ListFileExtension(string extension); 12 | string[] ListFolderContent(string folder); 13 | void Delete(string filename); 14 | void DeleteFolder(string folder); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/ParserFrontend/Infra/ZipCompression.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Threading.Tasks; 5 | using System.IO.Compression; 6 | using System.IO; 7 | 8 | namespace 
ParserFrontend.Infra 9 | { 10 | /* Builds a zip archive in memory. Entries are added with Add; DownloadStream closes the archive and returns the underlying MemoryStream rewound to the start. After DownloadStream or Dispose, Add and DownloadStream throw InvalidOperationException. */ public class ZipCompression : IDisposable 11 | { 12 | MemoryStream _memoryStream; 13 | ZipArchive _zipArchive; 14 | 15 | public ZipCompression() 16 | { 17 | _memoryStream = new MemoryStream(); 18 | /* The third argument (leaveOpen) is true so the MemoryStream stays usable after the archive is disposed. */ _zipArchive = new ZipArchive(_memoryStream, ZipArchiveMode.Create, true); 19 | } 20 | 21 | /* Creates an entry named filename and copies the remaining content of stream into it. */ public void Add(string filename, Stream stream) 22 | { 23 | if (_zipArchive == null) 24 | throw new InvalidOperationException("Object disposed"); 25 | 26 | var entry = _zipArchive.CreateEntry(filename); 27 | 28 | using (var destStream = entry.Open()) 29 | { 30 | stream.CopyTo(destStream); 31 | } 32 | } 33 | 34 | /* Disposes the archive first, which finishes writing it, then rewinds and returns the MemoryStream. The caller receives the same stream instance this object holds. */ public Stream DownloadStream() 35 | { 36 | if (_zipArchive == null) 37 | throw new InvalidOperationException("Object disposed"); 38 | 39 | Dispose(); 40 | 41 | _memoryStream.Seek(0, SeekOrigin.Begin); 42 | 43 | return _memoryStream; 44 | } 45 | 46 | /* Disposes only the ZipArchive and clears the field; the MemoryStream is not disposed here. Safe to call more than once. */ public void Dispose() 47 | { 48 | if(_zipArchive != null) 49 | { 50 | _zipArchive.Dispose(); 51 | _zipArchive = null; 52 | } 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/ParserFrontend/Logic/AccessManager.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Threading.Tasks; 6 | 7 | namespace ParserFrontend.Logic 8 | { 9 | /* Pairs an IVirtualFS2 with a full-access flag and hands the file system out through two accessors. NOTE(review): GetReadOnlyFileSystem returns the very same instance as GetFullAccessFileSystem; nothing in this class restricts it to reads. */ public class AccessManager 10 | { 11 | IVirtualFS2 _virtualFS; 12 | bool _hasFullAccess; 13 | 14 | public AccessManager(IVirtualFS2 virtualFS, bool hasFullAccess) 15 | { 16 | _virtualFS = virtualFS; 17 | _hasFullAccess = hasFullAccess; 18 | } 19 | 20 | public bool HasFullAccess => _hasFullAccess; 21 | 22 | public IVirtualFS2 GetReadOnlyFileSystem() 23 | { 24 | return _virtualFS; 25 | } 26 | 27 | /* Throws InvalidOperationException when the full-access flag is false. */ public IVirtualFS2 GetFullAccessFileSystem() 28 | { 29 | if (!_hasFullAccess) 30 | throw new InvalidOperationException("no full access"); 31 | 32 | return _virtualFS; 33 | 
} 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/ParserFrontend/Logic/DeleteFiles.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Threading.Tasks; 6 | 7 | namespace ParserFrontend.Logic 8 | { 9 | /* Deletes parser input and output for a named document. The constructor asks AccessManager for the full-access file system, so construction throws InvalidOperationException when full access is not granted. */ public class DeleteFiles 10 | { 11 | private readonly IVirtualFS2 _webFS; 12 | 13 | public DeleteFiles(AccessManager amgr) 14 | { 15 | this._webFS = amgr.GetFullAccessFileSystem(); 16 | } 17 | 18 | /* Deletes the output folder of the document. Names whose trimmed length is under 3 are rejected with InvalidOperationException; a null name fails on the Trim call. */ public void DeleteOutput(string name) 19 | { 20 | if (name.Trim().Length < 3) 21 | throw new InvalidOperationException(); 22 | 23 | _webFS.DeleteFolder("output/" + name); 24 | } 25 | 26 | /* Deletes both the input PDF and the output folder of the document, with the same name check as DeleteOutput. */ public void DestroyAll(string name) 27 | { 28 | if (name.Trim().Length < 3) 29 | throw new InvalidOperationException(); 30 | 31 | _webFS.Delete("input/" + name + ".pdf"); 32 | _webFS.DeleteFolder("output/" + name); 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/ParserFrontend/Logic/InputFiles.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Threading.Tasks; 6 | 7 | namespace ParserFrontend.Logic 8 | { 9 | /* Lists the available input documents: List asks the file system for entries with the .pdf extension and returns each name with that extension removed. */ public class InputFiles 10 | { 11 | const string PDF_EXTENSION = ".pdf"; 12 | 13 | private readonly IVirtualFS2 _webFS; 14 | 15 | public InputFiles(IVirtualFS2 webFS) 16 | { 17 | this._webFS = webFS; 18 | } 19 | 20 | public IEnumerable List() 21 | { 22 | var filenames = _webFS.ListFileExtension(PDF_EXTENSION); 23 | 24 | return filenames.Select(n => RemoveExtension(n)); 25 | } 26 | 27 | /* Strips a trailing .pdf using an ordinal, case-insensitive match; other names are returned unchanged. */ string RemoveExtension(string name) 28 | { 29 | if( name.EndsWith(PDF_EXTENSION, StringComparison.OrdinalIgnoreCase) ) 30 | return name.Substring(0, name.Length - 
PDF_EXTENSION.Length); 31 | 32 | return name; 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/ParserFrontend/Logic/JobManager.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Azure.Queue; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Threading.Tasks; 6 | 7 | namespace ParserFrontend.Logic 8 | { 9 | /* Drains the job queue. MessageLoopAsync keeps calling TryGetMessageAsync and returns as soon as it yields null. For each message it logs the content, runs JobProcess.Process on it and then calls Done on the message. NOTE(review): exceptions from Process are swallowed and Done is still called, so a failed job is acknowledged like a successful one; confirm this is intended. */ public class JobManager 10 | { 11 | AzureQueue _queue; 12 | JobProcess _job; 13 | //JobProcessHttp _jobHttp = new JobProcessHttp(); 14 | 15 | public JobManager(AzureQueue queue, JobProcess job) 16 | { 17 | _queue = queue; 18 | _job = job; 19 | } 20 | 21 | public async Task MessageLoopAsync() 22 | { 23 | while(true) 24 | { 25 | var msg = await _queue.TryGetMessageAsync(); 26 | 27 | if (msg == null) 28 | return; 29 | 30 | Console.WriteLine("MessageLoopAsync: " + msg.Content); 31 | 32 | //try 33 | //{ 34 | // _jobHttp.Process(msg.Content); 35 | //} 36 | //catch { } 37 | 38 | try { _job.Process(msg.Content); } 39 | catch { } 40 | 41 | msg.Done(); 42 | } 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/ParserFrontend/Logic/JobProcessHttp.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Net.Http; 5 | using System.Threading.Tasks; 6 | 7 | namespace ParserFrontend.Logic 8 | { 9 | /* Triggers a job over HTTP. The HttpClient is created lazily on first use; at that moment the site name is read from the APPSETTING_WEBSITE_SITE_NAME environment variable with any trailing slash removed. */ public class JobProcessHttp 10 | { 11 | HttpClient _client; 12 | string _sitename; 13 | 14 | HttpClient GetClient() 15 | { 16 | if (_client == null) 17 | { 18 | _client = new HttpClient(); 19 | _sitename = Environment.GetEnvironmentVariable("APPSETTING_WEBSITE_SITE_NAME")?.TrimEnd('/'); 20 | Console.WriteLine("JobProcessHttp: URL = " + _sitename); 21 | } 22 | 23 | return _client; 24 | } 25 | 26 | public void Process(string name) 27 | { 28 | 
Console.WriteLine($"JobProcessHttp: Process Start [{name}]"); 29 | /* Sends a POST with an empty body to the job endpoint of the site and blocks until the call finishes; the response is not inspected. NOTE(review): when the environment variable is unset the site name is null and the host part of the URL is left empty. */ GetClient().PostAsync($"http://{_sitename}.azurewebsites.net/job/{name}", new StringContent("")).Wait(); 30 | Console.WriteLine($"JobProcessHttp: Process End [{name}]"); 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/ParserFrontend/Pages/About.cshtml: -------------------------------------------------------------------------------- 1 | @page 2 | @model AboutModel 3 | @{ 4 | ViewData["Title"] = "About"; 5 | } 6 |

@ViewData["Title"]

7 |

@Model.Message

8 | 9 |

Use this area to provide additional information.

// ---- src/ParserFrontend/Pages/About.cshtml.cs ----
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.AspNetCore.Mvc.RazorPages;

namespace ParserFrontend.Pages
{
    /// <summary>Page model backing the About page.</summary>
    public class AboutModel : PageModel
    {
        const string Description = "Your application description page.";

        /// <summary>Text exposed to the view; assigned on every GET.</summary>
        public string Message { get; set; }

        /// <summary>Handles GET requests by filling in <see cref="Message"/>.</summary>
        public void OnGet() => Message = Description;
    }
}

AtosGraph

// ---- src/ParserFrontend/Pages/AtosGraph.cshtml.cs ----
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.AspNetCore.Mvc;
using Microsoft.AspNetCore.Mvc.RazorPages;

namespace ParserFrontend.Pages
{
    /// <summary>Page model backing the AtosGraph page; it holds no state.</summary>
    public class AtosGraphModel : PageModel
    {
        /// <summary>Handles GET requests. Nothing is prepared server-side.</summary>
        public void OnGet() { }
    }
}

Error.

8 |

An error occurred while processing your request.

9 | 10 | @if (Model.ShowRequestId) 11 | { 12 |

13 | Request ID: @Model.RequestId 14 |

15 | } 16 | 17 |

Development Mode

18 |

19 | Swapping to Development environment will display more detailed information about the error that occurred. 20 |

21 |

22 | Development environment should not be enabled in deployed applications, as it can result in sensitive information from exceptions being displayed to end users. For local debugging, development environment can be enabled by setting the ASPNETCORE_ENVIRONMENT environment variable to Development, and restarting the application. 23 |

// ---- src/ParserFrontend/Pages/Error.cshtml.cs and src/ParserFrontend/Pages/Index.cshtml.cs ----
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.AspNetCore.Mvc;
using Microsoft.AspNetCore.Mvc.RazorPages;
using ParserFrontend.Logic;

namespace ParserFrontend.Pages
{
    /// <summary>Page model backing the error page.</summary>
    public class ErrorModel : PageModel
    {
        /// <summary>Identifier of the failing request, assigned in <see cref="OnGet"/>.</summary>
        public string RequestId { get; set; }

        /// <summary>True when <see cref="RequestId"/> is neither null nor empty.</summary>
        public bool ShowRequestId
        {
            get { return !string.IsNullOrEmpty(RequestId); }
        }

        /// <summary>
        /// Uses the current <see cref="Activity"/> id when one is available and
        /// falls back to the HTTP context's trace identifier otherwise.
        /// </summary>
        public void OnGet()
        {
            var activity = Activity.Current;
            string id = activity == null ? null : activity.Id;

            if (id == null)
            {
                id = HttpContext.TraceIdentifier;
            }

            RequestId = id;
        }
    }

    /// <summary>Page model backing the index page, which lists the input files.</summary>
    public class IndexModel : PageModel
    {
        // NOTE(review): the generic type argument was missing in the text this was
        // restored from; string is assumed to match what InputFiles.List() yields -
        // confirm against InputFiles.
        /// <summary>Files returned by <c>InputFiles.List()</c>; populated on GET.</summary>
        public IEnumerable<string> CurrentFiles { get; private set; }

        readonly IVirtualFS2 _vfs;

        /// <summary>Obtains the read-only file system from the access manager.</summary>
        /// <param name="amgr">Access manager supplying the file system.</param>
        public IndexModel(AccessManager amgr)
        {
            _vfs = amgr.GetReadOnlyFileSystem();
        }

        /// <summary>Handles GET requests by listing the input files.</summary>
        public void OnGet()
        {
            var inputFiles = new Logic.InputFiles(_vfs);

            CurrentFiles = inputFiles.List();
        }
    }
}

@* Render the page title that the ViewData block at the top of this page sets. *@
@ViewData["Title"]

7 |

@Model.Message

8 | 9 |

Use this area to provide additional information.

// ---- src/ParserFrontend/Pages/Jobs.cshtml.cs ----
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.AspNetCore.Mvc.RazorPages;
using ParserFrontend.Logic;

namespace ParserFrontend.Pages
{
    /// <summary>Page model backing the Jobs page.</summary>
    public class JobsModel : PageModel
    {
        // Kept for later use; nothing in this class reads it yet.
        private readonly JobManager _jobMgr;

        /// <summary>Text exposed to the view. This class never assigns it.</summary>
        public string Message { get; set; }

        /// <summary>Stores the injected job manager.</summary>
        /// <param name="jobMgr">Job manager supplied by the caller.</param>
        public JobsModel(JobManager jobMgr)
        {
            _jobMgr = jobMgr;
        }

        /// <summary>Handles GET requests. Nothing is prepared server-side.</summary>
        public void OnGet() { }
    }
}

@ViewData["Title"]

7 | 8 | @{ 9 | if (Model.HasFullAccess) 10 | { 11 |
12 |
13 |
14 |
15 |
16 |

Envie arquivos

17 | 18 |

Arquivos enviados serão processados pelo PdfTextReader

19 |
20 |
21 | 22 |
23 |
24 |
25 |
26 |
27 | 28 |
29 |
30 | } 31 | else 32 | { 33 |

Página desabilitada

// ---- src/ParserFrontend/Pages/Upload.cshtml.cs ----
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.AspNetCore.Mvc;
using Microsoft.AspNetCore.Mvc.RazorPages;
using ParserFrontend.Logic;

namespace ParserFrontend.Pages
{
    /// <summary>Page model backing the Upload page.</summary>
    public class UploadModel : PageModel
    {
        readonly AccessManager _amgr;

        /// <summary>
        /// Mirrors <c>AccessManager.HasFullAccess</c>; the view uses it to choose
        /// between the upload form and the "page disabled" notice.
        /// </summary>
        public bool HasFullAccess
        {
            get { return _amgr.HasFullAccess; }
        }

        /// <summary>Stores the injected access manager.</summary>
        /// <param name="amgr">Access manager queried by <see cref="HasFullAccess"/>.</param>
        public UploadModel(AccessManager amgr)
        {
            _amgr = amgr;
        }

        /// <summary>Handles GET requests. Nothing is prepared server-side.</summary>
        public void OnGet() { }
    }
}
// ---- src/ParserFrontend/Program.cs and src/ParserFrontend/TagHelpers/ArticleHtmlTagHelper.cs ----
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.AspNetCore;
using Microsoft.AspNetCore.Hosting;
using Microsoft.AspNetCore.Razor.TagHelpers;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Logging;
using ParserFrontend.Logic;

namespace ParserFrontend
{
    /// <summary>Application entry point for the ParserFrontend web host.</summary>
    public class Program
    {
        /// <summary>Builds the web host and runs it.</summary>
        /// <param name="args">Command-line arguments forwarded to the host builder.</param>
        public static void Main(string[] args)
        {
            BuildWebHost(args).Run();
        }

        /// <summary>
        /// Creates the web host from the default builder, using
        /// <see cref="Startup"/> as the startup class.
        /// </summary>
        /// <param name="args">Command-line arguments forwarded to the host builder.</param>
        /// <returns>The built, not yet running, web host.</returns>
        public static IWebHost BuildWebHost(string[] args) =>
            WebHost.CreateDefaultBuilder(args)
                // The startup type must be supplied as a type argument.
                .UseStartup<Startup>()
                .Build();
    }
}

namespace ParserFrontend.TagHelpers
{
    /// <summary>
    /// Tag helper that renders its <see cref="Text"/> inside a <c>div</c> element.
    /// </summary>
    public class ArticleHtmlTagHelper : TagHelper
    {
        /// <summary>Markup appended to the element's content.</summary>
        public string Text { get; set; }

        /// <summary>Rewrites the element as a <c>div</c> and appends <see cref="Text"/>.</summary>
        /// <param name="context">Information about the element being processed.</param>
        /// <param name="output">Element output that is rewritten.</param>
        public override void Process(TagHelperContext context, TagHelperOutput output)
        {
            output.TagName = "div";

            // AppendHtml writes the string without HTML-encoding it.
            // NOTE(review): this assumes Text is trusted, already-sanitised markup -
            // confirm where the article HTML comes from.
            output.Content.AppendHtml(Text);
        }
    }
}

Documentos Disponíveis

10 | 11 |
12 |

Seção 1

13 |

Leis, decretos, resoluções, instruções normativas, portarias e outros atos normativos

14 | @{ 15 | for (int year = 2017; year >= 2002; year--) 16 | { 17 | 18 | } 19 | } 20 |
21 | 22 |
23 |

Seção 2

24 |

Portarias de nomeação, exoneração, aposentadoria e outros atos relativos a servidores da administração

25 | 26 | @{ 27 | for (int year = 2017; year >= 2002; year--) 28 | { 29 | 30 | } 31 | } 32 |
33 | 34 |
35 |

Seção 3

36 |

Contratos, editais, avisos, balanços de empresas e outros

37 | 38 | @{ 39 | for (int year = 2017; year >= 2002; year--) 40 | { 41 | 42 | } 43 | } 44 |
-------------------------------------------------------------------------------- /src/ParserFrontend/Views/Documents/Log.cshtml: -------------------------------------------------------------------------------- 1 | @{ 2 | Layout = "_Layout"; 3 | } 4 | 5 | @{ 6 | ViewData["Title"] = "View"; 7 | string doc_name = @ViewBag.Name; 8 | string log_name = @ViewBag.LogName; 9 | } 10 | 11 |
12 |

@doc_name

13 |
14 | 15 |
16 |
17 | 18 |
19 |
20 | -------------------------------------------------------------------------------- /src/ParserFrontend/Views/List/Index.cshtml: -------------------------------------------------------------------------------- 1 | @{ 2 | Layout = "_Layout"; 3 | } 4 | 5 | @{ 6 | ViewData["Title"] = "Lista de Arquivos"; 7 | } 8 |

Arquivos Disponíveis

9 | 10 |
11 |
12 |

Seção 1

13 | @{ 14 | foreach (var filename in ViewBag.DO1) 15 | { 16 |

@filename

17 | } 18 | } 19 |
20 |
21 |

Seção 2

22 | @{ 23 | foreach (var filename in ViewBag.DO2) 24 | { 25 |

@filename

26 | } 27 | } 28 |
29 |
30 |

Seção 3

31 | @{ 32 | foreach (var filename in ViewBag.DO3) 33 | { 34 |

@filename

35 | } 36 | } 37 |
38 |
39 |

Outros

40 | @{ 41 | foreach (var filename in ViewBag.Outros) 42 | { 43 |

@filename

44 | } 45 | } 46 |
47 |
48 | -------------------------------------------------------------------------------- /src/ParserFrontend/Views/List/IndexFiltro.cshtml: -------------------------------------------------------------------------------- 1 | @{ 2 | Layout = "_Layout"; 3 | } 4 | 5 | @{ 6 | ViewData["Title"] = "Lista de Arquivos"; 7 | } 8 |

Arquivos Disponíveis

9 | 10 |
11 |
12 |

Seção 1

13 | @{ 14 | foreach (var filename in ViewBag.DO1) 15 | { 16 |

@filename

17 | } 18 | } 19 |
20 |
21 |

Seção 2

22 | @{ 23 | foreach (var filename in ViewBag.DO2) 24 | { 25 |

@filename

26 | } 27 | } 28 |
29 |
30 |

Seção 3

31 | @{ 32 | foreach (var filename in ViewBag.DO3) 33 | { 34 |

@filename

35 | } 36 | } 37 |
38 |
39 |

Outros

40 | @{ 41 | foreach (var filename in ViewBag.Outros) 42 | { 43 |

@filename

44 | } 45 | } 46 |
47 |
48 | -------------------------------------------------------------------------------- /src/ParserFrontend/Views/Shared/_ValidationScriptsPartial.cshtml: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 12 | 18 | 19 | -------------------------------------------------------------------------------- /src/ParserFrontend/Views/_ViewImports.cshtml: -------------------------------------------------------------------------------- 1 | @using ParserFrontend 2 | @namespace ParserFrontend.Views 3 | @addTagHelper *, Microsoft.AspNetCore.Mvc.TagHelpers 4 | @addTagHelper *, ParserFrontend -------------------------------------------------------------------------------- /src/ParserFrontend/Views/_ViewStart.cshtml: -------------------------------------------------------------------------------- 1 | @namespace ParserFrontend.Views 2 | @{ 3 | Layout = "_Layout"; 4 | } 5 | -------------------------------------------------------------------------------- /src/ParserFrontend/appsettings.Development.json: -------------------------------------------------------------------------------- 1 | { 2 | "Logging": { 3 | "IncludeScopes": false, 4 | "LogLevel": { 5 | "Default": "Debug", 6 | "System": "Information", 7 | "Microsoft": "Information" 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /src/ParserFrontend/appsettings.json: -------------------------------------------------------------------------------- 1 | { 2 | "Logging": { 3 | "IncludeScopes": false, 4 | "LogLevel": { 5 | "Default": "Warning" 6 | } 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /src/ParserFrontend/bundleconfig.json: -------------------------------------------------------------------------------- 1 | // Configure bundling and minification for the project. 
2 | // More info at https://go.microsoft.com/fwlink/?LinkId=808241 3 | [ 4 | { 5 | "outputFileName": "wwwroot/css/site.min.css", 6 | // An array of relative input file paths. Globbing patterns supported 7 | "inputFiles": [ 8 | "wwwroot/css/site.css" 9 | ] 10 | }, 11 | { 12 | "outputFileName": "wwwroot/js/site.min.js", 13 | "inputFiles": [ 14 | "wwwroot/js/site.js" 15 | ], 16 | // Optionally specify minification options 17 | "minify": { 18 | "enabled": true, 19 | "renameLocals": true 20 | }, 21 | // Optionally generate .map file 22 | "sourceMap": false 23 | } 24 | ] 25 | -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/css/site.css: -------------------------------------------------------------------------------- 1 | body { 2 | padding-top: 50px; 3 | padding-bottom: 20px; 4 | } 5 | 6 | /* Wrapping element */ 7 | /* Set some basic padding to keep content from hitting the edges */ 8 | .body-content { 9 | padding-left: 15px; 10 | padding-right: 15px; 11 | } 12 | 13 | /* Carousel */ 14 | .carousel-caption p { 15 | font-size: 20px; 16 | line-height: 1.4; 17 | } 18 | 19 | /* Make .svg files in the carousel display properly in older browsers */ 20 | .carousel-inner .item img[src$=".svg"] { 21 | width: 100%; 22 | } 23 | 24 | /* QR code generator */ 25 | #qrCode { 26 | margin: 15px; 27 | } 28 | 29 | /* Hide/rearrange for smaller screens */ 30 | @media screen and (max-width: 767px) { 31 | /* Hide captions */ 32 | .carousel-caption { 33 | display: none; 34 | } 35 | } 36 | 37 | .bg-dark { 38 | background-color: #101010; 39 | opacity: 0.8; 40 | } -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/css/site.min.css: -------------------------------------------------------------------------------- 1 | body{padding-top:50px;padding-bottom:20px}.body-content{padding-left:15px;padding-right:15px}.carousel-caption p{font-size:20px;line-height:1.4}.carousel-inner .item 
img[src$=".svg"]{width:100%}#qrCode{margin:15px}@media screen and (max-width:767px){.carousel-caption{display:none}}.bg-dark{background-color:#101010;opacity:.8} -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/src/ParserFrontend/wwwroot/favicon.ico -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/images/atosgraph.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/src/ParserFrontend/wwwroot/images/atosgraph.JPG -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/images/pdf01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/src/ParserFrontend/wwwroot/images/pdf01.jpg -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/images/pdf02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/src/ParserFrontend/wwwroot/images/pdf02.jpg -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/images/pdf03.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/src/ParserFrontend/wwwroot/images/pdf03.jpg -------------------------------------------------------------------------------- 
/src/ParserFrontend/wwwroot/images/pdf04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/src/ParserFrontend/wwwroot/images/pdf04.jpg -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/images/pdf05.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/src/ParserFrontend/wwwroot/images/pdf05.jpg -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/images/pdf06.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/src/ParserFrontend/wwwroot/images/pdf06.jpg -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/images/wait.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/src/ParserFrontend/wwwroot/images/wait.gif -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/js/site.js: -------------------------------------------------------------------------------- 1 | // Write your Javascript code. 
$(function () {
});

// Upload Page
$(".upload-form").submit(uploadFormSubmit);

/**
 * Submit handler for the upload form: hides the submit button and shows the
 * wait image, then lets the submission go ahead.
 *
 * @param {Event} ev - submit event (unused).
 * @returns {boolean} always true, so the default form submission is not cancelled.
 */
function uploadFormSubmit(ev) {
    // The inputs are deliberately left enabled; only the submit button is hidden.
    $(".upload-form input[type='submit']").hide();
    $(".upload-form .wait-image").show();

    return true;
}

$(".show-form .button-reprocess").click(showReprocessDocument);

/**
 * Click handler for the "reprocess" button. Intentionally empty for now.
 *
 * @param {Event} ev - click event (unused).
 */
function showReprocessDocument(ev) {
}
"_source": "https://github.com/twbs/bootstrap.git", 42 | "_target": "v3.3.7", 43 | "_originalSource": "bootstrap", 44 | "_direct": true 45 | } -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/lib/bootstrap/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2011-2016 Twitter, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/lib/bootstrap/dist/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/src/ParserFrontend/wwwroot/lib/bootstrap/dist/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/lib/bootstrap/dist/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/src/ParserFrontend/wwwroot/lib/bootstrap/dist/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/lib/bootstrap/dist/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/src/ParserFrontend/wwwroot/lib/bootstrap/dist/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/lib/bootstrap/dist/fonts/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/src/ParserFrontend/wwwroot/lib/bootstrap/dist/fonts/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/lib/bootstrap/dist/js/npm.js: -------------------------------------------------------------------------------- 1 | // This file is autogenerated via the 
`commonjs` Grunt task. You can require() this file in a CommonJS environment. 2 | require('../../js/transition.js') 3 | require('../../js/alert.js') 4 | require('../../js/button.js') 5 | require('../../js/carousel.js') 6 | require('../../js/collapse.js') 7 | require('../../js/dropdown.js') 8 | require('../../js/modal.js') 9 | require('../../js/tooltip.js') 10 | require('../../js/popover.js') 11 | require('../../js/scrollspy.js') 12 | require('../../js/tab.js') 13 | require('../../js/affix.js') -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/lib/calendar-heatmap/calendar-heatmap.css: -------------------------------------------------------------------------------- 1 | text.month-name, 2 | text.calendar-heatmap-legend-text, 3 | text.day-initial { 4 | font-size: 10px; 5 | fill: inherit; 6 | font-family: Helvetica, arial, 'Open Sans', sans-serif; 7 | } 8 | rect.day-cell:hover { 9 | stroke: #555555; 10 | stroke-width: 1px; 11 | } 12 | .day-cell-tooltip { 13 | position: absolute; 14 | z-index: 9999; 15 | padding: 5px 9px; 16 | color: #bbbbbb; 17 | font-size: 12px; 18 | background: rgba(0, 0, 0, 0.85); 19 | border-radius: 3px; 20 | text-align: center; 21 | } 22 | .day-cell-tooltip > span { 23 | font-family: Helvetica, arial, 'Open Sans', sans-serif 24 | } 25 | .calendar-heatmap { 26 | box-sizing: initial; 27 | } -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/lib/calendar-heatmap/calendar.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | D3.js v4 Calendar Heatmap 6 | 7 | 8 | 9 | 10 |
11 | 12 | 13 | 14 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/lib/jquery-validation-unobtrusive/.bower.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "jquery-validation-unobtrusive", 3 | "version": "3.2.6", 4 | "homepage": "https://github.com/aspnet/jquery-validation-unobtrusive", 5 | "description": "Add-on to jQuery Validation to enable unobtrusive validation options in data-* attributes.", 6 | "main": [ 7 | "jquery.validate.unobtrusive.js" 8 | ], 9 | "ignore": [ 10 | "**/.*", 11 | "*.json", 12 | "*.md", 13 | "*.txt", 14 | "gulpfile.js" 15 | ], 16 | "keywords": [ 17 | "jquery", 18 | "asp.net", 19 | "mvc", 20 | "validation", 21 | "unobtrusive" 22 | ], 23 | "authors": [ 24 | "Microsoft" 25 | ], 26 | "license": "http://www.microsoft.com/web/webpi/eula/net_library_eula_enu.htm", 27 | "repository": { 28 | "type": "git", 29 | "url": "git://github.com/aspnet/jquery-validation-unobtrusive.git" 30 | }, 31 | "dependencies": { 32 | "jquery-validation": ">=1.8", 33 | "jquery": ">=1.8" 34 | }, 35 | "_release": "3.2.6", 36 | "_resolution": { 37 | "type": "version", 38 | "tag": "v3.2.6", 39 | "commit": "13386cd1b5947d8a5d23a12b531ce3960be1eba7" 40 | }, 41 | "_source": "git://github.com/aspnet/jquery-validation-unobtrusive.git", 42 | "_target": "3.2.6", 43 | "_originalSource": "jquery-validation-unobtrusive" 44 | } -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/lib/jquery-validation/.bower.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "jquery-validation", 3 | "homepage": "http://jqueryvalidation.org/", 4 | "repository": { 5 | "type": "git", 6 | "url": "git://github.com/jzaefferer/jquery-validation.git" 7 | }, 8 | "authors": [ 9 | "Jörn Zaefferer " 10 | ], 11 | "description": "Form validation made easy", 12 | "main": 
"dist/jquery.validate.js", 13 | "keywords": [ 14 | "forms", 15 | "validation", 16 | "validate" 17 | ], 18 | "license": "MIT", 19 | "ignore": [ 20 | "**/.*", 21 | "node_modules", 22 | "bower_components", 23 | "test", 24 | "demo", 25 | "lib" 26 | ], 27 | "dependencies": { 28 | "jquery": ">= 1.7.2" 29 | }, 30 | "version": "1.14.0", 31 | "_release": "1.14.0", 32 | "_resolution": { 33 | "type": "version", 34 | "tag": "1.14.0", 35 | "commit": "c1343fb9823392aa9acbe1c3ffd337b8c92fed48" 36 | }, 37 | "_source": "git://github.com/jzaefferer/jquery-validation.git", 38 | "_target": ">=1.8", 39 | "_originalSource": "jquery-validation" 40 | } -------------------------------------------------------------------------------- /src/ParserFrontend/wwwroot/lib/jquery-validation/LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | ===================== 3 | 4 | Copyright Jörn Zaefferer 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
namespace ParserFunctions
{
    /// <summary>
    /// <see cref="IVirtualFS"/> adapter over Azure blob storage. One
    /// <see cref="AzureBlobFileSystem"/> instance serves both reading and writing;
    /// the two storage accounts are registered on it under the names
    /// "input" and "output".
    /// </summary>
    public class AzureFS : IVirtualFS
    {
        AzureBlobFileSystem _azure = new AzureBlobFileSystem();
        AzureBlobFileSystem _inputFS;
        AzureBlobFileSystem _outputFS;

        /// <summary>Registers the input and output storage accounts.</summary>
        /// <param name="inputConnectionString">Connection string registered as "input".</param>
        /// <param name="outputConnectionString">Connection string registered as "output".</param>
        /// <exception cref="ArgumentNullException">Either connection string is null or empty.</exception>
        public AzureFS(string inputConnectionString, string outputConnectionString)
        {
            if (String.IsNullOrEmpty(inputConnectionString))
            {
                throw new ArgumentNullException(nameof(inputConnectionString));
            }

            if (String.IsNullOrEmpty(outputConnectionString))
            {
                throw new ArgumentNullException(nameof(outputConnectionString));
            }

            // Both roles alias the same file system instance.
            _inputFS = _azure;
            _outputFS = _azure;

            _inputFS.AddStorageAccount("input", inputConnectionString);
            _outputFS.AddStorageAccount("output", outputConnectionString);
        }

        /// <summary>Opens a read stream for <paramref name="filename"/>.</summary>
        public Stream OpenReader(string filename)
        {
            var file = _inputFS.GetFile(filename);
            return file.GetStreamReader();
        }

        /// <summary>Opens a write stream for <paramref name="filename"/>.</summary>
        public Stream OpenWriter(string filename)
        {
            var file = _outputFS.GetFile(filename);
            return file.GetStreamWriter();
        }

        /// <summary>Returns the folder called <paramref name="name"/>.</summary>
        public IAzureBlobFolder GetFolder(string name) => _inputFS.GetFolder(name);
    }
}
namespace ParserRun
{
    /// <summary>
    /// <see cref="IVirtualFS"/> adapter that reads from one
    /// <see cref="AzureBlobFileSystem"/> and writes to another. The input account
    /// is registered as "input" on the reader side and the output account as
    /// "output" on the writer side.
    /// </summary>
    public class AzureFS : IVirtualFS
    {
        AzureBlobFileSystem _inputFS = new AzureBlobFileSystem();
        AzureBlobFileSystem _outputFS = new AzureBlobFileSystem();

        /// <summary>Registers the input and output storage accounts.</summary>
        /// <param name="inputConnectionString">Connection string registered as "input".</param>
        /// <param name="outputConnectionString">Connection string registered as "output".</param>
        /// <exception cref="ArgumentNullException">Either connection string is null or empty.</exception>
        public AzureFS(string inputConnectionString, string outputConnectionString)
        {
            if (String.IsNullOrEmpty(inputConnectionString))
            {
                throw new ArgumentNullException(nameof(inputConnectionString));
            }

            if (String.IsNullOrEmpty(outputConnectionString))
            {
                throw new ArgumentNullException(nameof(outputConnectionString));
            }

            _inputFS.AddStorageAccount("input", inputConnectionString);
            _outputFS.AddStorageAccount("output", outputConnectionString);
        }

        /// <summary>Opens a read stream for <paramref name="filename"/> on the input file system.</summary>
        public Stream OpenReader(string filename)
        {
            var source = _inputFS.GetFile(filename);
            return source.GetStreamReader();
        }

        /// <summary>Opens a write stream for <paramref name="filename"/> on the output file system.</summary>
        public Stream OpenWriter(string filename)
        {
            var target = _outputFS.GetFile(filename);
            return target.GetStreamWriter();
        }
    }
}
namespace ParserRun
{
    /// <summary>
    /// Console entry point: reads the two storage connection strings from
    /// configuration, builds an <see cref="AzureFS"/> and runs the article
    /// creation example against it.
    /// </summary>
    class Program
    {
        /// <summary>Runs the parser.</summary>
        /// <param name="args">Command-line arguments, forwarded to <see cref="Config"/>.</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Parser Run");

            var config = new Config(args);

            // DEV: configure the secrets
            //
            //   dotnet user-secrets set AZURE_STORAGE_PDF <_connection_string_from_portal_>
            //   dotnet user-secrets set AZURE_STORAGE_OUTPUT <_connection_string_from_portal_>
            //
            // The connection strings are credentials, so they are read here but
            // deliberately never written to the console (or any log).
            string inputConnectionString = config.Get("AZURE_STORAGE_PDF");
            string outputConnectionString = config.Get("AZURE_STORAGE_OUTPUT");

            // Test azure connection
            //TestAzureBlob.V2(connectionString, storageContainer);
            //TestAzureBlob.Run(connectionString, storageContainer);
            //TestAzureBlob.Enum(connectionString, storageContainer);

            // create the AzureFS
            var azureBlobs = new AzureFS(inputConnectionString, outputConnectionString);

            //ExamplesAzure.FollowText(virtualFileSystem, "example");
            //ExamplesAzure.RunParserPDF(azureBlobs, "DO1_2010_01_04", "wasb://input/pdf/2010/2010_01_04", "wasb://output/test");
            ExamplesAzure.RunCreateArtigos(azureBlobs, "DO1_2010_01_04", "wasb://input/pdf/2010/2010_01_04", "wasb://output/test/logs", "wasb://output/test/artigos");
        }
    }
}
namespace PdfTextReader.Azure.Blob
{
    /// <summary>
    /// <see cref="IAzureBlobFile"/> backed by a <see cref="CloudBlockBlob"/>.
    /// Every operation blocks the calling thread until the underlying
    /// asynchronous storage call has finished.
    /// </summary>
    class AzureBlobFileBlock : AzureBlobRef, IAzureBlobFile
    {
        CloudBlockBlob _blob;

        public AzureBlobFileBlock(IAzureBlob parent, string name, CloudBlockBlob blob) : base(parent, name, blob.Uri)
        {
            _blob = blob;
        }

        /// <summary>
        /// Text after the last '.' in <c>Name</c> (dot excluded), or the empty
        /// string when the name contains no dot.
        /// </summary>
        public string Extension
        {
            get
            {
                int dot = Name.LastIndexOf('.');
                return dot < 0 ? string.Empty : Name.Substring(dot + 1);
            }
        }

        /// <summary>The blob URI rendered as a string.</summary>
        string IAzureBlobFile.Uri
        {
            get { return Uri.ToString(); }
        }

        /// <summary>Opens the blob for writing and waits for the stream.</summary>
        public Stream GetStreamWriter()
        {
            return _blob.OpenWriteAsync().Result;
        }

        /// <summary>Opens the blob for reading and waits for the stream.</summary>
        public Stream GetStreamReader()
        {
            return _blob.OpenReadAsync().Result;
        }

        /// <summary>Deletes the blob if it exists and waits for completion.</summary>
        public void Delete()
        {
            _blob.DeleteIfExistsAsync().Wait();
        }
    }
}
namespace PdfTextReader.Azure.Blob
{
    /// <summary>
    /// Base class for blob references: stores the item's name, its path
    /// (the parent's path, a '/', then the name) and its URI.
    /// </summary>
    abstract class AzureBlobRef : IAzureBlob
    {
        public string Name { get; }
        public string Path { get; }
        public readonly Uri Uri;

        /// <summary>Creates a reference located under <paramref name="parent"/>.</summary>
        /// <param name="parent">Containing blob; its <c>Path</c> prefixes this item's path.</param>
        /// <param name="name">Item name; must not contain '/' or '\'.</param>
        /// <param name="uri">URI stored as-is.</param>
        /// <exception cref="ArgumentNullException"><paramref name="parent"/> or <paramref name="name"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="name"/> contains a path separator.</exception>
        public AzureBlobRef(IAzureBlob parent, string name, Uri uri)
        {
            if (parent == null)
            {
                throw new ArgumentNullException(nameof(parent));
            }

            EnsureValidName(name);

            Path = parent.Path + "/" + name;
            Name = name;
            Uri = uri;
        }

        // Rejects null names and names that contain either kind of path separator.
        [DebuggerHidden]
        void EnsureValidName(string name)
        {
            if (name == null)
            {
                throw new ArgumentNullException(nameof(name));
            }

            bool hasSeparator = name.IndexOfAny(new[] { '/', '\\' }) >= 0;
            if (hasSeparator)
            {
                throw new ArgumentException("'Name' contains invalid characters");
            }
        }
    }
}
namespace PdfTextReader.Azure.DevNul
{
    /// <summary>
    /// No-op <see cref="IAzureBlobFolder"/>: it always reports that it exists,
    /// lists no items, ignores deletion, and returns DevNul children for any
    /// requested name.
    /// </summary>
    class DevNulFolder : IAzureBlobFolder
    {
        /// <summary>Creates a folder with the given path and name (stored as-is).</summary>
        public DevNulFolder(string path, string name)
        {
            this.Path = path;
            this.Name = name;
        }

        public string Name { get; private set; }
        public string Path { get; private set; }

        /// <summary>Always null.</summary>
        public string Extension { get { return null; } }

        /// <summary>Never assigned, so always null.</summary>
        public Uri Uri { get; }

        /// <summary>Does nothing.</summary>
        public void Delete()
        {
        }

        /// <summary>Returns an empty sequence.</summary>
        public IEnumerable<IAzureBlob> EnumItems()
        {
            // Shared empty array: avoids allocating a new one on every call.
            return Array.Empty<IAzureBlob>();
        }

        /// <summary>Always true.</summary>
        public bool Exists()
        {
            return true;
        }

        /// <summary>
        /// Returns a <see cref="DevNulFile"/> named <paramref name="name"/>. The
        /// child receives this folder's <see cref="Path"/> unchanged.
        /// </summary>
        public IAzureBlobFile GetFile(string name)
        {
            return new DevNulFile(this.Path, name);
        }

        /// <summary>
        /// Returns a <see cref="DevNulFolder"/> named <paramref name="name"/>. The
        /// child receives this folder's <see cref="Path"/> unchanged.
        /// </summary>
        public IAzureBlobFolder GetFolder(string name)
        {
            return new DevNulFolder(this.Path, name);
        }
    }
}
namespace PdfTextReader.Azure
{
    /// <summary>
    /// A blob that is handled like a file: it can be deleted and opened for
    /// reading or writing. Implementations are not required to support every
    /// member (AzureBlobFileGeneric throws <see cref="NotImplementedException"/>
    /// from the stream and delete members).
    /// </summary>
    public interface IAzureBlobFile : IAzureBlob
    {
        /// <summary>Deletes the file.</summary>
        void Delete();

        /// <summary>Returns a stream for writing the file.</summary>
        Stream GetStreamWriter();

        /// <summary>Returns a stream for reading the file.</summary>
        Stream GetStreamReader();

        /// <summary>
        /// File extension. The implementations in this project return the text
        /// after the last '.' of the name (dot excluded), or the empty string
        /// when the name has no dot.
        /// </summary>
        string Extension { get; }

        /// <summary>URI of the file as a string; may be null (DevNulFile returns null).</summary>
        string Uri { get; }
    }
}
namespace PdfTextReader.Azure.Queue
{
    /// <summary>
    /// <see cref="IQueueMessage"/> wrapper around a <see cref="CloudQueueMessage"/>
    /// that remembers the <see cref="AzureQueue"/> it is associated with.
    /// </summary>
    public class AzureQueueMessage : IQueueMessage
    {
        private readonly CloudQueueMessage _internalMessage;
        private readonly AzureQueue _azureQueue;

        /// <summary>Wraps <paramref name="message"/> for <paramref name="queue"/>.</summary>
        /// <exception cref="ArgumentNullException">Either argument is null.</exception>
        public AzureQueueMessage(AzureQueue queue, CloudQueueMessage message)
        {
            // Checked in the same order as the parameters: queue first, then message.
            _azureQueue = queue ?? throw new ArgumentNullException(nameof(queue));
            _internalMessage = message ?? throw new ArgumentNullException(nameof(message));
        }

        /// <summary>The wrapped storage message.</summary>
        public CloudQueueMessage InternalMessage
        {
            get { return _internalMessage; }
        }

        /// <summary>The wrapped message's content as a string.</summary>
        public string Content
        {
            get { return _internalMessage.AsString; }
        }

        /// <summary>
        /// Passes this message to the queue's <c>DequeueMessageAsync</c> and blocks
        /// until that call completes.
        /// </summary>
        public void Done()
        {
            var pending = _azureQueue.DequeueMessageAsync(this);
            pending.Wait();
        }
    }
}
namespace PdfTextReader.Base
{
    /// <summary>
    /// Block set describing one column: a column type code, a start position
    /// <see cref="X1"/>, a width <see cref="W"/> and the derived end position
    /// <see cref="X2"/> (X1 + W).
    /// </summary>
    // NOTE(review): the base type argument is inferred from Add(IBlock) -
    // confirm against the BlockSet declaration.
    class BlockColumn : BlockSet<IBlock>
    {
        private int _columnType;
        public int X1;
        public int X2;
        public int W;
        //public int Y1;
        //public int Y2;
        //public int H;

        /// <summary>Creates a column of the given type starting at <paramref name="x"/> with width <paramref name="w"/>.</summary>
        public BlockColumn(BlockPage page, int columnType, int x, int w) : base(page)
        {
            _columnType = columnType;
            X1 = x;
            W = w;
            X2 = x + w;
        }

        /// <summary>The column type code passed to the constructor.</summary>
        public int ColumnType
        {
            get { return _columnType; }
        }

        /// <summary>Adds <paramref name="block"/> to this column.</summary>
        public void AddBlock(IBlock block) => Add(block);

        /// <summary>
        /// Returns X1 + 1 as text, with an "X" suffix when the column type is 3
        /// and the width is greater than 3.
        /// </summary>
        public string GetColumnName()
        {
            var name = (X1 + 1).ToString();
            bool needsSuffix = _columnType == 3 && W > 3;

            return needsSuffix ? name + "X" : name;
        }
    }
}
17 | } 18 | 19 | public BlockPage() 20 | { 21 | _blocks = new BlockSet(this); 22 | } 23 | 24 | public void Add(IBlock block) 25 | { 26 | if (block == null) 27 | throw new ArgumentNullException(nameof(block)); 28 | 29 | _blocks.Add(block); 30 | } 31 | public void AddRange(IEnumerable blockList) 32 | { 33 | foreach(var block in blockList) 34 | { 35 | if (block == null) 36 | throw new ArgumentNullException(nameof(block)); 37 | 38 | Add(block); 39 | } 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/PdfTextReader/Base/BlockPage2.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | 6 | namespace PdfTextReader.Base 7 | { 8 | class BlockPage2 : BlockPage 9 | { 10 | List _segments = new List(); 11 | 12 | public IEnumerable Segments => _segments; 13 | 14 | public void AddSegment(BlockPageSegment segment) 15 | { 16 | _segments.Add(segment); 17 | } 18 | 19 | public override bool IsEmpty() 20 | { 21 | if (_segments.Count == 0) 22 | return true; 23 | 24 | return false; 25 | } 26 | 27 | public override BlockSet AllBlocks 28 | { 29 | get 30 | { 31 | var blocks = new BlockSet(this); 32 | 33 | foreach(var segment in _segments) 34 | { 35 | foreach(var column in segment) 36 | { 37 | blocks.AddRange(column); 38 | } 39 | 40 | } 41 | 42 | return blocks; 43 | } 44 | } 45 | 46 | public override string ToString() 47 | { 48 | var names = _segments.Select(s => s.GetName()); 49 | return String.Join("", names); 50 | } 51 | 52 | internal object SelectMany(Func p) 53 | { 54 | throw new NotImplementedException(); 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/PdfTextReader/Base/BlockTable.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | 
namespace PdfTextReader.Base
{
    /// <summary>
    /// Common read-only accessors shared by the page elements (for example
    /// ImageBlock, MarkLine and TableCell implement it).
    /// </summary>
    interface IBlock
    {
        /// <summary>
        /// Text of the block. Not every implementer supports it: ImageBlock and
        /// MarkLine throw instead of returning a value.
        /// </summary>
        string GetText();

        /// <summary>The block's X value.</summary>
        float GetX();

        /// <summary>The block's H value.</summary>
        // NOTE(review): presumably a vertical coordinate - confirm with the producers of these blocks.
        float GetH();

        /// <summary>The block's width.</summary>
        float GetWidth();

        /// <summary>The block's height.</summary>
        float GetHeight();

        /// <summary>
        /// Word spacing of the block. Not every implementer supports it:
        /// ImageBlock and MarkLine throw instead of returning a value.
        /// </summary>
        float GetWordSpacing();
    }
}
-------------------------------------------------------------------------------- /src/PdfTextReader/Base/ICalculateStats.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace PdfTextReader.Base 6 | { 7 | interface ICalculateStats 8 | { 9 | object Calculate(IEnumerable stats); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/PdfTextReader/Base/IConfigurationStore.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace PdfTextReader.Base 6 | { 7 | interface IConfigurationStore 8 | { 9 | string Get(string filename); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/PdfTextReader/Base/IConvertBlock.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace PdfTextReader.Base 6 | { 7 | interface IConvertBlock 8 | { 9 | IEnumerable ProcessPage(int pageNumber, BlockPage page); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/PdfTextReader/Base/IExecutionConfiguration.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace PdfTextReader.Base 6 | { 7 | interface IExecutionConfiguration 8 | { 9 | void Init(string filename); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/PdfTextReader/Base/ILogStructure.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Execution; 2 | using System; 3 | using 
System.Collections.Generic; 4 | using System.IO; 5 | using System.Text; 6 | 7 | namespace PdfTextReader.Base 8 | { 9 | interface ILogMultipleStructure 10 | { 11 | string CreateId(T data); 12 | void Log(string id, Stream input, T data); 13 | } 14 | 15 | interface ILogStructure 16 | { 17 | void StartLog(TextWriter input); 18 | void Log(TextWriter input, T data); 19 | void EndLog(TextWriter input); 20 | } 21 | 22 | interface ILogStructure2 : ILogStructure 23 | { 24 | void Init(ITransformIndexTree index); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/PdfTextReader/Base/ILogStructurePdf.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace PdfTextReader.Base 6 | { 7 | interface ILogStructurePdf 8 | { 9 | void StartLogPdf(IPipelineDebug pipeline); 10 | void LogPdf(IPipelineDebug pipeline, T data); 11 | void EndLogPdf(IPipelineDebug pipeline); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/PdfTextReader/Base/IPipelineDebug.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace PdfTextReader.Base 6 | { 7 | interface IPipelineDebug 8 | { 9 | void ShowLine(TextLine line, System.Drawing.Color color); 10 | void ShowLine(IEnumerable lines, System.Drawing.Color color); 11 | void ShowText(string text, TextLine line, System.Drawing.Color color); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/PdfTextReader/Base/IProcessBlock.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace PdfTextReader.Base 6 | { 7 | interface IProcessBlock 8 | 
{ 9 | BlockPage Process(BlockPage page); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/PdfTextReader/Base/IProcessBlockData.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace PdfTextReader.Base 6 | { 7 | interface IProcessBlockData : IProcessBlock 8 | { 9 | void UpdateInstance(object cache); 10 | BlockPage LastResult { get; } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /src/PdfTextReader/Base/IProcessStructure.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace PdfTextReader.Base 6 | { 7 | interface IProcessStructure 8 | { 9 | T Process(T structure); 10 | } 11 | 12 | interface IProcessStructure2 13 | { 14 | IEnumerable Process(IEnumerable input); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/PdfTextReader/Base/IRetrieveStatistics.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace PdfTextReader.Base 6 | { 7 | interface IRetrieveStatistics 8 | { 9 | object RetrieveStatistics(); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/PdfTextReader/Base/ITransformIndexTree.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace PdfTextReader.Base 6 | { 7 | interface ITransformIndexTree 8 | { 9 | int FindPageStart(T instance); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- 
namespace PdfTextReader.Base
{
    /// <summary>
    /// <see cref="IBlock"/> for an image: carries a position, a size and a
    /// resource name. It has neither text nor word spacing, so those two
    /// accessors throw <see cref="InvalidOperationException"/>.
    /// </summary>
    class ImageBlock : IBlock
    {
        public float X;
        public float H;
        public float Width;
        public float Height;
        public string ResourceName;

        /// <summary>Not supported for images.</summary>
        /// <exception cref="InvalidOperationException">Always.</exception>
        public string GetText()
        {
            throw new InvalidOperationException();
        }

        public float GetX()
        {
            return X;
        }

        public float GetH()
        {
            return H;
        }

        public float GetWidth()
        {
            return Width;
        }

        public float GetHeight()
        {
            return Height;
        }

        /// <summary>Not supported for images.</summary>
        /// <exception cref="InvalidOperationException">Always.</exception>
        public float GetWordSpacing()
        {
            throw new InvalidOperationException();
        }
    }
}
const int PURPLE = 10101; 12 | 13 | public float X { get; set; } 14 | public float B { get; set; } 15 | public float H { get; set; } 16 | public float Width { get; set; } 17 | public float Height { get; set; } 18 | public string GetText() => throw new NotImplementedException(); 19 | public float GetX() => X; 20 | public float GetH() => H; 21 | public float GetWidth() => Width; 22 | public float GetHeight() => Height; 23 | public float GetWordSpacing() => throw new NotImplementedException(); 24 | public float LineWidth { get; set; } 25 | public int Color { get; set; } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/PdfTextReader/Base/StatsBlocksOverlapped.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace PdfTextReader.Base 6 | { 7 | class StatsBlocksOverlapped 8 | { 9 | public static StatsBlocksOverlapped Empty = new StatsBlocksOverlapped(); 10 | 11 | public IBlock[] Blocks; 12 | public int[] BlockIds; 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/PdfTextReader/Base/StatsExceptionHandled.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace PdfTextReader.Base 6 | { 7 | class StatsExceptionHandled 8 | { 9 | private readonly int _pageNumber; 10 | private readonly Exception _exception; 11 | 12 | public string Error => _exception.ToString(); 13 | public int PageNumber => _pageNumber; 14 | 15 | public StatsExceptionHandled(int pageNumber, Exception ex) 16 | { 17 | this._pageNumber = pageNumber; 18 | this._exception = ex; 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/PdfTextReader/Base/StatsPageFooter.cs: 
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Text;

namespace PdfTextReader.Base
{
    /// <summary>
    /// Per-page footer statistic: whether a footer exists and, optionally, its height.
    /// </summary>
    class StatsPageFooter
    {
        public bool HasFooter;
        public float? FooterHeight;
    }
}

--------------------------------------------------------------------------------
/src/PdfTextReader/Base/StatsPageLayout.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Text;

namespace PdfTextReader.Base
{
    /// <summary>
    /// Per-page layout statistic; its string form is the layout text itself.
    /// </summary>
    class StatsPageLayout
    {
        public string Layout;

        /// <summary>Returns <see cref="Layout"/> unchanged (may be null).</summary>
        public override string ToString() => Layout;
    }
}

--------------------------------------------------------------------------------
/src/PdfTextReader/Base/TableCell.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Text;

namespace PdfTextReader.Base
{
    /// <summary>
    /// Block describing a table cell: position, size, line width and a
    /// background color value.
    /// </summary>
    class TableCell : IBlock
    {
        // Background values strictly below this limit count as "dark".
        const float DarkColorThreshold = 0.5f;

        /// <summary>True when the cell background is below the dark threshold.</summary>
        public static bool HasDarkColor(TableCell t)
        {
            return t.BgColor < DarkColorThreshold;
        }

        /// <summary>True when the cell background is exactly 1.</summary>
        public static bool HasWhiteColor(TableCell t)
        {
            return t.BgColor == 1;
        }

        public string Text { get; set; }
        public float X { get; set; }
        public float B { get; set; }
        public float H { get; set; }
        public float Width { get; set; }
        public float Height { get; set; }
        public int Op { get; set; }

        /// <summary>Always returns the fixed placeholder text, never <see cref="Text"/>.</summary>
        public string GetText() { return "<>"; }
        public float GetX() { return X; }
        public float GetH() { return H; }
        public float GetWidth() { return Width; }
        public float GetHeight() { return Height; }

        // NOTE(review): returns X (the horizontal position) as the word spacing - confirm this is intended.
        public float GetWordSpacing() { return X; }
        public float LineWidth { get; set; }
        public float BgColor { get; set; }
    }
}

--------------------------------------------------------------------------------
/src/PdfTextReader/Base/TextAlignment.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Text;

namespace PdfTextReader.Base
{
    /// <summary>Horizontal alignment of a piece of text.</summary>
    enum TextAlignment
    {
        CENTER,
        LEFT,
        RIGHT,
        JUSTIFY,
        UNKNOWN
    }
}

--------------------------------------------------------------------------------
/src/PdfTextReader/Base/TextLine.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Text;

namespace PdfTextReader.Base
{
    /// <summary>
    /// A line of text with font, margin and spacing attributes. It wraps an
    /// underlying <see cref="IBlock"/> and forwards every <see cref="IBlock"/>
    /// member to it, so <see cref="Block"/> must be set before those are called.
    /// </summary>
    class TextLine : IBlock
    {
        public string FontName { get; set; }
        public float FontSize { get; set; }
        public string Text { get; set; }
        public float MarginRight { get; set; }
        public float MarginLeft { get; set; }
        public float CenteredAt { get; set; }
        public float? AfterSpace { get; set; }
        public float? BeforeSpace { get; set; }
        public bool HasLargeSpace { get; set; }
        public string FontStyle { get; set; }
        public bool HasBackColor { get; set; }
        public TextPageInfo PageInfo { get; set; }

        public IBlock Block { get; set; }

        public TextLine()
        {
        }

        public string GetText() => Block.GetText();

        public float GetX() => Block.GetX();
        public float GetH() => Block.GetH();

        public float GetWidth() => Block.GetWidth();

        public float GetHeight() => Block.GetHeight();

        public float GetWordSpacing() => Block.GetWordSpacing();
    }
}

--------------------------------------------------------------------------------
/src/PdfTextReader/Base/TextLine2.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Text;

namespace PdfTextReader.Base
{
    /// <summary>
    /// <see cref="TextLine"/> extended with two flags: centered alignment and continuation.
    /// </summary>
    class TextLine2 : TextLine
    {
        public bool AlignedCenter;
        public bool HasContinuation;
    }
}

--------------------------------------------------------------------------------
/src/PdfTextReader/Base/TextPage.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Text;

namespace PdfTextReader.Base
{
    /// <summary>Location of a piece of text: a page number and a block id.</summary>
    class TextPageInfo
    {
        public int PageNumber { get; set; }
        public int BlockId { get; set; }
    }
}

--------------------------------------------------------------------------------
/src/PdfTextReader/Base/TextSegment.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace PdfTextReader.Base
{
    /// <summary>
    /// A segment made of title structures and body structures, plus their
    /// flattened text forms.
    /// </summary>
    class TextSegment
    {
        public TextStructure[] OriginalTitle { get; set; }
        public TextStructure[] Title { get; set; }
        public TextStructure[] Body { get; set; }
        public string TitleText { get; set; }
        public string BodyText { get; set; }

        /// <summary>
        /// Renders the segment for logs: a banner, the title lines, a divider
        /// and the body lines. A null <see cref="Title"/> or <see cref="Body"/>
        /// is rendered as empty rather than throwing, because ToString is
        /// called on segments from debug printing.
        /// </summary>
        public override string ToString()
        {
            var sb = new StringBuilder();

            sb.AppendLine("==========================================================================");
            sb.AppendLine();
            sb.AppendLine(JoinText(Title));
            sb.AppendLine();
            sb.AppendLine();
            sb.AppendLine("--------------------------------------------------------------------------");
            sb.AppendLine(JoinText(Body));
            sb.AppendLine();

            return sb.ToString();
        }

        // Joins the Text of each structure with CRLF; a null array yields an empty string.
        private static string JoinText(TextStructure[] parts)
        {
            if (parts == null)
                return String.Empty;

            return String.Join("\r\n", parts.Select(t => t.Text));
        }
    }
}

--------------------------------------------------------------------------------
/src/PdfTextReader/Base/TextSegmentText.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace PdfTextReader.Base
{
    /// <summary>
    /// Title and body structures paired with a single text value and a
    /// reference to the <see cref="TextSegment"/> held in OriginalTitle.
    /// </summary>
    class TextSegmentText
    {
        public TextSegment OriginalTitle { get; set; }
        public TextStructure[] Title { get; set; }
        public TextStructure[] Body { get; set; }

        public string Text { get; set; }
    }
}

--------------------------------------------------------------------------------
/src/PdfTextReader/Base/TextStructureAgg.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Text;

namespace PdfTextReader.Base
{
    /// <summary>
    /// A <see cref="TextStructure"/> together with comparison flags (font,
    /// spacing, tab stop, continuation) and a vertical spacing value.
    /// </summary>
    class TextStructureAgg
    {
        public TextStructure TextStruct;
        public bool SameFont;
        public bool SameSpacing;
        public bool AlignedTabStop;
        public bool HasContinuation;
        public float VerticalSpacing;
    }
}

--------------------------------------------------------------------------------
/src/PdfTextReader/Base/TextTaggedSegment.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Text;

namespace PdfTextReader.Base
{
    /// <summary>Tags that can be attached to a structure inside a segment.</summary>
    enum TaggedSegmentEnum
    {
        None, Hierarquia, Titulo, Subtitulo, Ementa, Assinatura, Cargo, Data, Image, Table
    };

    /// <summary>
    /// Tagged title and body structures plus the <see cref="TextSegment"/> they refer to.
    /// </summary>
    class TextTaggedSegment
    {
        public TextTaggedStructure[] Title { get; set; }
        public TextTaggedStructure[] Body { get; set; }
        public TextSegment OriginalSegment { get; set; }
    }

    /// <summary>A <see cref="TextStructure"/> with its tag and text alignment.</summary>
    class TextTaggedStructure
    {
        public TextStructure TextStructure;
        public TaggedSegmentEnum Tag;
        public TextAlignment TextAlignment;
    }
}

--------------------------------------------------------------------------------
/src/PdfTextReader/Compatibility.cs:
--------------------------------------------------------------------------------
using PdfTextReader.Base;
using System;
using System.Collections.Generic;
using System.Text;

namespace PdfTextReader
{
    static class Compatibility
    {
        /// <summary>
        /// Returns the last <paramref name="end"/> elements of <paramref name="list"/>
        /// as a new list, in their original order.
        /// </summary>
        /// <param name="list">Source list; it is not modified.</param>
        /// <param name="end">
        /// Number of trailing elements wanted. Values larger than the list
        /// return the whole list and values of zero or less return an empty
        /// result, instead of failing with an out-of-range error.
        /// </param>
        /// <returns>A copy of the requested tail of the list.</returns>
        public static IEnumerable<T> TakeLast<T>(this List<T> list, int end)
        {
            // Clamp so that the computed start index is never negative.
            int count = Math.Max(0, Math.Min(end, list.Count));
            int start = list.Count - count;

            return list.GetRange(start, count);
        }
    }
}

--------------------------------------------------------------------------------
/src/PdfTextReader/Configuration/ConfigurationFile.cs:
--------------------------------------------------------------------------------
using PdfTextReader.Base;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;

namespace PdfTextReader.Configuration
{
    /// <summary>Configuration store that reads its content from a file on disk.</summary>
    class ConfigurationFile : IConfigurationStore
    {
        /// <summary>
        /// Reads the whole file as text.
        /// </summary>
        /// <param name="filename">Path of the file to read.</param>
        /// <returns>The file content, or null when the file cannot be read.</returns>
        public string Get(string filename)
        {
            string content = null;
            try
            {
                content = File.ReadAllText(filename);
            }
            catch
            {
                // Deliberate best effort: any read failure is reported to the
                // caller as a null content rather than as an exception.
            }

            return content;
        }
    }
}

--------------------------------------------------------------------------------
/src/PdfTextReader/Configuration/ParserTreeConfig.cs:
--------------------------------------------------------------------------------
using PdfTextReader.Base;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace PdfTextReader.Configuration
{
    /// <summary>
    /// Configuration holding a list of titles, one per non-blank line of the
    /// configuration text. A line containing both "((" and "))" is cut at the
    /// first "((" so that the trailing annotation is dropped.
    /// </summary>
    class ParserTreeConfig : IExecutionConfiguration
    {
        List<string> _titles;

        /// <summary>False after <see cref="Init"/> received a null content, true after a successful load.</summary>
        public bool IsValid { get; private set; }

        /// <summary>
        /// Loads the titles from <paramref name="content"/>. A null content
        /// marks the configuration invalid and leaves the titles untouched.
        /// </summary>
        public void Init(string content)
        {
            if (content == null)
            {
                IsValid = false;
                return;
            }

            var loaded = new List<string>();

            foreach (string rawLine in content.Split(new char[] { '\r', '\n' }))
            {
                string title = RemoveComment(rawLine);

                // Blank lines (including the empty pieces between "\r" and "\n") are skipped.
                if (!String.IsNullOrWhiteSpace(title))
                {
                    loaded.Add(title);
                }
            }

            _titles = loaded;
            IsValid = true;
        }

        /// <summary>Titles loaded by the last successful <see cref="Init"/>; null before that.</summary>
        public IList<string> Titles => _titles;

        // Returns the trimmed line, without the part starting at the first "(("
        // when the line holds both "((" and "))".
        string RemoveComment(string line)
        {
            bool hasComment = line.Contains("((") && line.Contains("))");

            if (!hasComment)
            {
                return line.Trim();
            }

            int commentStart = line.IndexOf("((", StringComparison.Ordinal);

            return line.Substring(0, commentStart).Trim();
        }
    }
}

--------------------------------------------------------------------------------
/src/PdfTextReader/Execution/IPipelineContext.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Text;
using PdfTextReader.Base;

namespace PdfTextReader.Execution
{
    /// <summary>Marker interface for a pipeline context; it declares no members.</summary>
    interface IPipelineContext
    {
    }

    /// <summary>
    /// Pipeline context for PDF processing: exposes the current page and a
    /// per-page cache of <see cref="IProcessBlockData"/> results.
    /// </summary>
    interface IPipelinePdfContext : IPipelineContext
    {
        PipelineInputPdf.PipelineInputPdfPage CurrentPage { get; }

        IProcessBlockData FromCache(int pageNumber);

        void StoreCache(int pageNumber, IProcessBlockData result);
    }
}

--------------------------------------------------------------------------------
/src/PdfTextReader/Execution/PipelineDisposeHelper.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Text;

namespace PdfTextReader.Execution
{
    /// <summary>
    /// Collects disposable objects and disposes all of them when the helper
    /// itself is disposed. Calling <see cref="Dispose"/> more than once is safe.
    /// </summary>
    class PipelineDisposeHelper : IDisposable
    {
        // Dedicated lock target. The list cannot be used for locking because it
        // is set to null on disposal, and lock(null) throws.
        private readonly object _gate = new object();

        // Null once the helper has been disposed.
        private List<IDisposable> _disposableObjects = new List<IDisposable>();

        /// <summary>
        /// Remembers <paramref name="instance"/> for later disposal when it
        /// implements <see cref="IDisposable"/>; other objects are ignored.
        /// </summary>
        /// <exception cref="ObjectDisposedException">The helper was already disposed.</exception>
        public void TrackInstance(object instance)
        {
            var disposableObj = instance as IDisposable;
            if (disposableObj == null)
            {
                return;
            }

            lock (_gate)
            {
                if (_disposableObjects == null)
                {
                    throw new ObjectDisposedException(nameof(PipelineDisposeHelper));
                }

                _disposableObjects.Add(disposableObj);
            }
        }

        /// <summary>
        /// Disposes <paramref name="instance"/> immediately when it implements
        /// <see cref="IDisposable"/>; does nothing otherwise.
        /// </summary>
        public void FreeObject(object instance)
        {
            var disposable = instance as IDisposable;
            if (disposable != null)
            {
                disposable.Dispose();
            }
        }

        /// <summary>
        /// Disposes every tracked object, in the order they were tracked.
        /// Later calls are no-ops.
        /// </summary>
        public void Dispose()
        {
            List<IDisposable> pending;

            // Detach the list under the lock so that only one caller ever
            // gets to dispose the tracked objects.
            lock (_gate)
            {
                pending = _disposableObjects;
                _disposableObjects = null;
            }

            if (pending == null)
            {
                return;
            }

            foreach (var obj in pending)
            {
                FreeObject(obj);
            }
        }
    }
}

--------------------------------------------------------------------------------
/src/PdfTextReader/Execution/PipelineDocumentStats.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Text;

namespace PdfTextReader.Execution
{
    /// <summary>Document-level statistic holding a position (X, H) and a size (Width, Height).</summary>
    class PipelineDocumentStats
    {
        public float X;
        public float H;
        public float Width;
        public float Height;
    }
}

-------------------------------------------------------------------------------- /src/PdfTextReader/Execution/PipelineGlobalStats.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace PdfTextReader.Execution 6 | { 7 | class PipelineGlobalStats 8 | { 9 | public T 
Instance { get; set; } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/PdfTextReader/Execution/PipelineInputCache.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | 6 | namespace PdfTextReader.Execution 7 | { 8 | class PipelineInputCache where TD: class 9 | { 10 | Dictionary _dictDocuments = new Dictionary(); 11 | int _numberOfPages = -1; 12 | 13 | public void SetSize(int size) 14 | { 15 | if (size <= 0) 16 | PdfReaderException.AlwaysThrow("Invalid size"); 17 | 18 | _numberOfPages = size; 19 | } 20 | 21 | Document GetCache() 22 | { 23 | _dictDocuments.TryGetValue(typeof(T), out Document cache); 24 | 25 | if(cache == null) 26 | { 27 | cache = new Document(_numberOfPages); 28 | _dictDocuments[typeof(T)] = cache; 29 | } 30 | 31 | return cache; 32 | } 33 | 34 | public TD FromCache(int pageNumber) 35 | { 36 | var cache = GetCache(); 37 | return cache[pageNumber]; 38 | } 39 | 40 | public void StoreCache(int pageNumber, TD result) 41 | { 42 | var cache = GetCache(); 43 | cache[pageNumber] = result; 44 | } 45 | 46 | class Document : List 47 | { 48 | public Document(int size) : base(new TD[size]) {} 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/PdfTextReader/Execution/PipelinePageStats.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace PdfTextReader.Execution 6 | { 7 | class PipelinePageStats 8 | { 9 | T _internalProperty; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/PdfTextReader/Execution/PipelinePdfLog.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using 
System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | 6 | namespace PdfTextReader.Execution 7 | { 8 | class PipelinePdfLog 9 | { 10 | List _log = new List(); 11 | 12 | class PipelinePdfLogEntry 13 | { 14 | public int PageNumber; 15 | public Type Component; 16 | public string Message; 17 | } 18 | 19 | public void LogCheck(int pageNumber, Type component, string message) 20 | { 21 | _log.Add(new PipelinePdfLogEntry() 22 | { 23 | PageNumber = pageNumber, 24 | Component = component, 25 | Message = message 26 | }); 27 | } 28 | 29 | public void SaveErrors(string inputfile, string outputfile) 30 | { 31 | 32 | } 33 | 34 | public IEnumerable GetErrors() 35 | { 36 | return _log.Select(t => t.PageNumber).Distinct().OrderBy(t => t); 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/PdfTextReader/Execution/PipelineResult.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | using PdfTextReader.Base; 5 | 6 | namespace PdfTextReader.Execution 7 | { 8 | interface IPipelineResults 9 | { 10 | T GetResults(); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /src/PdfTextReader/ExecutionStats/PrintAnalyticsExtensions.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using PdfTextReader.Execution; 3 | using PdfTextReader.Parser; 4 | using System; 5 | using System.Collections.Generic; 6 | using System.Text; 7 | 8 | namespace PdfTextReader.ExecutionStats 9 | { 10 | static class PrintAnalyticsExtensions 11 | { 12 | public static PipelineText PrintAnalytics(this PipelineText pipelineText, string filename) 13 | { 14 | return pipelineText.Log(filename); 15 | } 16 | 17 | public static PipelineText PrintAnalytics(this PipelineText pipelineText, string filename) 18 | { 19 | return 
pipelineText.Log(filename); 20 | } 21 | 22 | public static PipelineText PrintAnalytics(this PipelineText pipelineText, string filename) 23 | { 24 | return pipelineText.Log(filename); 25 | } 26 | 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/PdfTextReader/ExecutionStats/PrintDebugCount.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | using System.IO; 6 | 7 | namespace PdfTextReader.ExecutionStats 8 | { 9 | class PrintDebugCount : ILogStructure 10 | { 11 | string _message = typeof(T).Name; 12 | int _count = 0; 13 | 14 | public void StartLog(TextWriter input) 15 | { 16 | } 17 | 18 | public void EndLog(TextWriter input) 19 | { 20 | input.WriteLine(_message + ": " + _count); 21 | } 22 | 23 | public void Log(TextWriter input, T data) 24 | { 25 | _count++; 26 | } 27 | 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/PdfTextReader/ExecutionStats/PrintDebugExtensions.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Execution; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | 6 | namespace PdfTextReader.ExecutionStats 7 | { 8 | static class PrintDebugExtensions 9 | { 10 | public static PipelineText DebugCount(this PipelineText pipelineText) 11 | { 12 | return pipelineText.Log>(Console.Out); 13 | } 14 | 15 | public static PipelineText DebugPrint(this PipelineText pipelineText) 16 | { 17 | return pipelineText.Log>(Console.Out); 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/PdfTextReader/ExecutionStats/PrintDebugPrint.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using System; 3 | using 
System.Collections.Generic; 4 | using System.Text; 5 | using System.IO; 6 | 7 | namespace PdfTextReader.ExecutionStats 8 | { 9 | class PrintDebugPrint : ILogStructure 10 | { 11 | string _message = typeof(T).Name; 12 | 13 | public void StartLog(TextWriter input) 14 | { 15 | input.WriteLine("DebugPrint: " + _message); 16 | } 17 | 18 | public void EndLog(TextWriter input) 19 | { 20 | } 21 | 22 | public void Log(TextWriter input, T data) 23 | { 24 | input.WriteLine(data.ToString()); 25 | input.WriteLine(""); 26 | } 27 | 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/PdfTextReader/ExecutionStats/ShowParserWarnings.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using PdfTextReader.Execution; 3 | using System; 4 | using System.Collections.Generic; 5 | using System.Linq; 6 | using System.Text; 7 | 8 | namespace PdfTextReader.ExecutionStats 9 | { 10 | class ShowParserWarnings 11 | { 12 | public IEnumerable GetPages(PipelineStats statistics) 13 | { 14 | var layout = (ValidateLayout)statistics.Calculate(); 15 | var overlap = (ValidateOverlap)statistics.Calculate(); 16 | var unhandled = (ValidateUnhandledExceptions)statistics.Calculate(); 17 | 18 | var pagesLayout = layout.GetPageErrors().ToList(); 19 | var pagesOverlap = overlap.GetPageErrors().ToList(); 20 | var pagesUnhandled = unhandled.GetPageErrors().ToList(); 21 | 22 | var pages = pagesLayout 23 | .Concat(pagesOverlap) 24 | .Concat(pagesUnhandled) 25 | .Distinct().OrderBy(t => t).ToList(); 26 | 27 | return pages; 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/PdfTextReader/ExecutionStats/TextInfo.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | using PdfTextReader.Base; 5 | 6 | namespace 
PdfTextReader.ExecutionStats 7 | { 8 | class TextInfo 9 | { 10 | public string FontName; 11 | public float FontSize; 12 | public string FontStyle; 13 | public string Text; 14 | 15 | public TextInfo(TextLine line) 16 | { 17 | this.FontName = line.FontName; 18 | this.FontSize = line.FontSize; 19 | this.FontStyle = line.FontStyle; 20 | this.Text = line.Text; 21 | } 22 | 23 | public TextInfo(string fontName, string fontStyle, float fontSize) 24 | { 25 | this.FontName = fontName; 26 | this.FontStyle = fontStyle; 27 | this.FontSize = fontSize; 28 | this.Text = ""; 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/PdfTextReader/ExecutionStats/ValidateFooter.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using PdfTextReader.PDFCore; 3 | using System; 4 | using System.Collections.Generic; 5 | using System.Text; 6 | 7 | namespace PdfTextReader.ExecutionStats 8 | { 9 | class ValidateFooter : ICalculateStats 10 | { 11 | const float statRegionTooLarge = 200f; 12 | 13 | public object Calculate(IEnumerable stats) 14 | { 15 | float total = 0; 16 | int count = 0; 17 | int missingFooter = 0; 18 | 19 | foreach(var stat in stats) 20 | { 21 | if (stat == null) 22 | continue; 23 | 24 | if( stat.HasFooter ) 25 | { 26 | float height = (float)stat.FooterHeight; 27 | 28 | if (height > statRegionTooLarge) 29 | { 30 | PdfReaderException.AlwaysThrow("height > statRegionTooLarge"); 31 | } 32 | 33 | total += height; 34 | count++; 35 | } 36 | else 37 | { 38 | missingFooter++; 39 | } 40 | } 41 | 42 | return new 43 | { 44 | PagesWithoutFooter = missingFooter, 45 | AverageFooterHeight = total / count 46 | }; 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/PdfTextReader/ExecutionStats/ValidateOverlap.cs: -------------------------------------------------------------------------------- 1 | using 
PdfTextReader.Base; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | 6 | namespace PdfTextReader.ExecutionStats 7 | { 8 | class ValidateOverlap : ICalculateStats 9 | { 10 | public IList Results { get; private set; } 11 | 12 | public IEnumerable GetPageErrors() 13 | { 14 | for (int i = 0; i < Results.Count; i++) 15 | { 16 | if ((Results[i] != null) && (Results[i] != StatsBlocksOverlapped.Empty)) 17 | yield return i+1; 18 | } 19 | } 20 | 21 | public object Calculate(IEnumerable stats) 22 | { 23 | var result = new List(); 24 | 25 | foreach (var s in stats) 26 | { 27 | var r = (s == StatsBlocksOverlapped.Empty) ? null : s; 28 | 29 | result.Add(r); 30 | } 31 | 32 | Results = result; 33 | 34 | return this; 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/PdfTextReader/ExecutionStats/ValidateUnhandledExceptions.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | 6 | namespace PdfTextReader.ExecutionStats 7 | { 8 | class ValidateUnhandledExceptions : ICalculateStats 9 | { 10 | public IList Results { get; private set; } 11 | 12 | public IEnumerable GetPageErrors() 13 | { 14 | for (int i = 0; i < Results.Count; i++) 15 | { 16 | if (Results[i] == null) 17 | continue; 18 | 19 | yield return Results[i].PageNumber; 20 | } 21 | } 22 | 23 | public object Calculate(IEnumerable stats) 24 | { 25 | var result = new List(); 26 | 27 | foreach (var s in stats) 28 | { 29 | result.Add(s); 30 | } 31 | 32 | Results = result; 33 | 34 | return this; 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/PdfTextReader/IVirtualFS.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using 
System.Text; 5 | 6 | namespace PdfTextReader 7 | { 8 | public interface IVirtualFS 9 | { 10 | Stream OpenReader(string filename); 11 | Stream OpenWriter(string filename); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/PdfTextReader/PDFCore/AddImageSpace.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Execution; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Text; 6 | using PdfTextReader.PDFText; 7 | using PdfTextReader.Base; 8 | 9 | namespace PdfTextReader.PDFCore 10 | { 11 | class AddImageSpace : IProcessBlock 12 | { 13 | private List _images; 14 | 15 | public AddImageSpace(PreProcessImages parserImage) 16 | { 17 | var page = parserImage.Images; 18 | 19 | if (page == null) 20 | { 21 | PdfReaderException.AlwaysThrow("AddImageSpace requires PreProcessImages"); 22 | } 23 | 24 | this._images = page.AllBlocks.ToList(); 25 | } 26 | 27 | public BlockPage Process(BlockPage page) 28 | { 29 | if(this._images == null) 30 | { 31 | PdfReaderException.AlwaysThrow("AddImageSpace requires PreProcessImages"); 32 | } 33 | 34 | var result = new BlockPage(); 35 | 36 | foreach (var block in page.AllBlocks) 37 | { 38 | result.Add(block); 39 | } 40 | foreach (var block in _images) 41 | { 42 | result.Add(block); 43 | } 44 | 45 | return result; 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/PdfTextReader/PDFCore/AddTableSpace.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Execution; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Text; 6 | using PdfTextReader.Base; 7 | 8 | namespace PdfTextReader.PDFCore 9 | { 10 | class AddTableSpace : IProcessBlock 11 | { 12 | private List _tables; 13 | 14 | public AddTableSpace(PDFCore.IdentifyTables parserTable) 15 
| { 16 | var page = parserTable.PageTables; 17 | 18 | if (page == null) 19 | { 20 | PdfReaderException.AlwaysThrow("AddTableSpace requires IdentifyTables"); 21 | } 22 | 23 | this._tables = page.AllBlocks.ToList(); 24 | } 25 | 26 | public BlockPage Process(BlockPage page) 27 | { 28 | if(this._tables == null) 29 | { 30 | PdfReaderException.AlwaysThrow("AddTableSpace requires IdentifyTables"); 31 | } 32 | 33 | var result = new BlockPage(); 34 | 35 | foreach (var block in page.AllBlocks) 36 | { 37 | result.Add(block); 38 | } 39 | foreach (var block in _tables) 40 | { 41 | result.Add(block); 42 | } 43 | 44 | return result; 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/PdfTextReader/PDFCore/BasicFirstPageStats.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Text; 6 | 7 | namespace PdfTextReader.PDFCore 8 | { 9 | class BasicFirstPageStats : IProcessBlock 10 | { 11 | public float MinX { get; private set; } 12 | public float MaxX { get; private set; } 13 | public float PageWidth { get; private set; } 14 | public float TabStop { get; private set; } 15 | 16 | static BasicFirstPageStats Global = null; 17 | 18 | public BasicFirstPageStats Stats { get 19 | { 20 | return Global; 21 | } 22 | } 23 | 24 | [Obsolete] 25 | public static void Reset() 26 | { 27 | Global = null; 28 | } 29 | 30 | public void SetTabStop(float tabstop) 31 | { 32 | TabStop = tabstop; 33 | } 34 | 35 | void SetupPage(BlockPage page) 36 | { 37 | if (Global != null) 38 | return; 39 | 40 | Global = this; 41 | 42 | var blocks = page.AllBlocks; 43 | 44 | MinX = blocks.Min(b => b.GetX()); 45 | MaxX = blocks.Max(b => b.GetX() + b.GetWidth()); 46 | PageWidth = MaxX - MinX; 47 | } 48 | 49 | public BlockPage Process(BlockPage page) 50 | { 51 | SetupPage(page); 52 | 53 | return page; 54 | } 55 | 
} 56 | } 57 | -------------------------------------------------------------------------------- /src/PdfTextReader/PDFCore/DouIgnoreLongDotSequence.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | using PdfTextReader.Base; 5 | using System.Linq; 6 | 7 | namespace PdfTextReader.PDFCore 8 | { 9 | // bug #37: "omisses" can span multiple columns 10 | // proposed fix: decrease the width 11 | // should be run after GroupLines, to prevent cases where omisses are broken 12 | class DouIgnoreLongDotSequence : IProcessBlock 13 | { 14 | public BlockPage Process(BlockPage page) 15 | { 16 | var result = new BlockPage(); 17 | 18 | foreach (var block in page.AllBlocks) 19 | { 20 | var blockLine = (BlockLine)block; 21 | 22 | // divide by 4 23 | if(blockLine.GetText().Contains("....................")) 24 | { 25 | blockLine.Width /= 4; 26 | } 27 | 28 | result.Add(blockLine); 29 | } 30 | 31 | return result; 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/PdfTextReader/PDFCore/FilterHeaderFooter.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | using System.Linq; 6 | 7 | namespace PdfTextReader.PDFCore 8 | { 9 | class FilterHeaderFooter : IProcessBlock, IValidateBlock 10 | { 11 | private float _headerH = float.NaN; 12 | private float _footerH = float.NaN; 13 | 14 | public FilterHeaderFooter(HeaderFooterData data) 15 | { 16 | _headerH = data.HeaderH; 17 | _footerH = data.FooterH; 18 | 19 | if( float.IsNaN(_headerH) || float.IsNaN(_footerH) ) 20 | PdfReaderException.AlwaysThrow("FilterHeaderFooter requires HeaderFooterData"); 21 | } 22 | 23 | public BlockPage Process(BlockPage page) 24 | { 25 | var content = new BlockPage(); 26 | 27 | foreach(var b in 
page.AllBlocks) 28 | { 29 | if( b.GetH() > _footerH && b.GetH() < _headerH ) 30 | { 31 | content.Add(b); 32 | } 33 | } 34 | 35 | return content; 36 | } 37 | 38 | public BlockPage Validate(BlockPage page) 39 | { 40 | var headerfooter = new BlockPage(); 41 | 42 | foreach (var b in page.AllBlocks) 43 | { 44 | if (b.GetH() <= _footerH || b.GetH() >= _headerH) 45 | { 46 | headerfooter.Add(b); 47 | } 48 | } 49 | 50 | return headerfooter; 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/PdfTextReader/PDFCore/FinalBlockResultData.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | 6 | namespace PdfTextReader.PDFCore 7 | { 8 | class FinalBlockResultData : IProcessBlockData 9 | { 10 | public BlockPage LastResult { get; private set; } 11 | 12 | public BlockPage Process(BlockPage page) 13 | { 14 | LastResult = page; 15 | return page; 16 | } 17 | 18 | public void UpdateInstance(object cache) 19 | { 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/PdfTextReader/PDFCore/HeaderFooterData.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | using System.Linq; 6 | 7 | namespace PdfTextReader.PDFCore 8 | { 9 | class HeaderFooterData : IProcessBlockData 10 | { 11 | public float HeaderH = float.NaN; 12 | public float FooterH = float.NaN; 13 | 14 | public BlockPage LastResult { get; private set; } 15 | 16 | public BlockPage Process(BlockPage page) 17 | { 18 | LastResult = page; 19 | return page; 20 | } 21 | 22 | public void UpdateInstance(object cache) 23 | { 24 | var instance = (HeaderFooterData)cache; 25 | 26 | this.HeaderH = instance.HeaderH; 27 | this.FooterH = 
instance.FooterH; 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/PdfTextReader/PDFCore/IdentifyTablesData.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using PdfTextReader.Execution; 3 | using System; 4 | using System.Collections.Generic; 5 | using System.Linq; 6 | using System.Text; 7 | 8 | namespace PdfTextReader.PDFCore 9 | { 10 | class IdentifyTablesData : IProcessBlockData 11 | { 12 | public bool Ready { get; set; } 13 | public BlockPage PageTables { get; set; } 14 | public BlockPage PageLines { get; set; } 15 | public BlockPage PageBackground { get; set; } 16 | public TableCell PageFooterLine { get; set; } 17 | 18 | public BlockPage LastResult { get; set; } 19 | 20 | public BlockPage Process(BlockPage page) 21 | { 22 | LastResult = page; 23 | return page; 24 | } 25 | 26 | public void UpdateInstance(object cache) 27 | { 28 | var instance = (IdentifyTablesData)cache; 29 | this.LastResult = instance.LastResult; 30 | this.Ready = instance.Ready; 31 | this.PageTables = instance.PageTables; 32 | this.PageLines = instance.PageLines; 33 | this.PageBackground = instance.PageBackground; 34 | this.PageFooterLine = instance.PageFooterLine; 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/PdfTextReader/PDFCore/MergeBlockLines.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using PdfTextReader.Base; 6 | 7 | namespace PdfTextReader.PDFCore 8 | { 9 | class MergeBlockLines : IProcessBlock 10 | { 11 | public BlockPage Process(BlockPage page) 12 | { 13 | var result = new BlockPage(); 14 | BlockSet last = null; 15 | 16 | foreach (var block in page.AllBlocks) 17 | { 18 | var blockset = (BlockSet)block; 19 | 20 | if ((last == null) || 
(!CanBeMerged(last, blockset))) 21 | { 22 | var b = new BlockSet(); 23 | b.AddRange(blockset); 24 | 25 | result.Add(b); 26 | 27 | last = b; 28 | } 29 | else 30 | { 31 | // merge blocks 32 | last.AddRange(blockset); 33 | } 34 | } 35 | 36 | return result; 37 | } 38 | 39 | bool CanBeMerged(BlockSet a, BlockSet b) 40 | { 41 | var lastLine = a.Last(); 42 | var firstLine = b.First(); 43 | 44 | return Block.HasOverlap(lastLine, firstLine); 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/PdfTextReader/PDFCore/PageInfoStats.cs: -------------------------------------------------------------------------------- 1 | using Newtonsoft.Json; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | 6 | namespace PdfTextReader.PDFCore 7 | { 8 | class PageInfoStats 9 | { 10 | public class HeaderInfo 11 | { 12 | public string ISSN; 13 | public string Local; 14 | public string DataDia; 15 | public string DataYMD; 16 | public string JornalAnoSupl; 17 | public string JornalEdicao; 18 | } 19 | 20 | public HeaderInfo Header { get; private set; } 21 | 22 | public void SetInfo(HeaderInfo headerInfo ) 23 | { 24 | this.Header = headerInfo; 25 | } 26 | 27 | public override string ToString() 28 | { 29 | if (this.Header == null) 30 | return ""; 31 | 32 | return JsonConvert.SerializeObject(this.Header, Formatting.Indented); 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/PdfTextReader/PDFCore/ProcessPdfTextData.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | 6 | namespace PdfTextReader.PDFCore 7 | { 8 | class ProcessPdfTextData : IProcessBlockData 9 | { 10 | public BlockPage LastResult { get; private set; } 11 | 12 | public BlockPage Process(BlockPage page) 13 | { 14 | LastResult = page; 15 | return 
page; 16 | } 17 | 18 | public void UpdateInstance(object cache) 19 | { 20 | var instance = (ProcessPdfTextData)cache; 21 | 22 | if (instance == null) 23 | PdfReaderException.AlwaysThrow("Null cache value"); 24 | 25 | this.LastResult = instance.LastResult; 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/PdfTextReader/PDFCore/RemoveBlockHidden.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | 6 | namespace PdfTextReader.PDFCore 7 | { 8 | class RemoveBlockHidden : IProcessBlock, IValidateBlock 9 | { 10 | public BlockPage Process(BlockPage page) 11 | { 12 | var newpage = new BlockPage(); 13 | 14 | foreach(var block in page.AllBlocks) 15 | { 16 | if (block is BlockHidden) 17 | continue; 18 | 19 | newpage.Add(block); 20 | } 21 | 22 | return newpage; 23 | } 24 | 25 | public BlockPage Validate(BlockPage page) 26 | { 27 | var newpage = new BlockPage(); 28 | 29 | foreach (var block in page.AllBlocks) 30 | { 31 | if (block is BlockHidden) 32 | { 33 | newpage.Add(block); 34 | } 35 | } 36 | 37 | return newpage; 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/PdfTextReader/PDFCore/RemoveHeader.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Text; 6 | 7 | namespace PdfTextReader.PDFCore 8 | { 9 | class RemoveHeader : IProcessBlock, IValidateBlock 10 | { 11 | const float statRegionTooLarge = 200f; 12 | 13 | public BlockPage Process(BlockPage page) 14 | { 15 | if (page.AllBlocks.Count() == 0) 16 | return page; 17 | 18 | float err = 1f; 19 | float maxH = page.AllBlocks.Max(b => b.GetH()) - err; 20 | 21 | var blocksAtHeader = page.AllBlocks.Where(b => 
b.GetH() < maxH); 22 | 23 | var result = new BlockPage(); 24 | 25 | result.AddRange(blocksAtHeader); 26 | 27 | return result; 28 | } 29 | 30 | public BlockPage Validate(BlockPage page) 31 | { 32 | if (page.AllBlocks.Count() == 0) 33 | return page; 34 | 35 | float err = 1f; 36 | float maxH = page.AllBlocks.Max(b => b.GetH()) - err; 37 | 38 | var blocksAtHeader = page.AllBlocks.Where(b => b.GetH() >= maxH); 39 | 40 | var result = new BlockPage(); 41 | 42 | result.AddRange(blocksAtHeader); 43 | 44 | float height = result.AllBlocks.GetHeight(); 45 | if (height > statRegionTooLarge) 46 | PdfReaderException.AlwaysThrow("height > statRegionTooLarge"); 47 | 48 | return result; 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/PdfTextReader/PDFCore/RemoveImageTexts.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Execution; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Text; 6 | using PdfTextReader.PDFText; 7 | using PdfTextReader.Base; 8 | 9 | namespace PdfTextReader.PDFCore 10 | { 11 | class RemoveImageTexts : IProcessBlock 12 | { 13 | private List _images; 14 | 15 | public RemoveImageTexts(PreProcessImages parseImage) 16 | { 17 | var page = parseImage.Images; 18 | 19 | if (page == null) 20 | { 21 | PdfReaderException.AlwaysThrow("RemoveImageTexts requires PreProcessImages"); 22 | } 23 | 24 | this._images = page.AllBlocks.ToList(); 25 | } 26 | 27 | public BlockPage Process(BlockPage page) 28 | { 29 | if (this._images == null) 30 | { 31 | PdfReaderException.AlwaysThrow("RemoveImageTexts requires PreProcessImages"); 32 | } 33 | 34 | var result = new BlockPage(); 35 | 36 | foreach (var block in page.AllBlocks) 37 | { 38 | bool insideImage = false; 39 | 40 | foreach (var table in _images) 41 | { 42 | if (Block.HasOverlap(table, block)) 43 | { 44 | insideImage = true; 45 | break; 46 | } 47 | } 48 | 49 | if 
(!insideImage) 50 | { 51 | result.Add(block); 52 | } 53 | } 54 | 55 | return result; 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/PdfTextReader/PDFCore/RemoveTableDotChar.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | 6 | namespace PdfTextReader.PDFCore 7 | { 8 | class RemoveTableDotChar : IProcessBlock, IValidateBlock 9 | { 10 | public BlockPage Process(BlockPage page) 11 | { 12 | var result = new BlockPage(); 13 | 14 | foreach (var block in page.AllBlocks) 15 | { 16 | if (block.GetText() != ".") 17 | result.Add(block); 18 | } 19 | 20 | return result; 21 | } 22 | 23 | public BlockPage Validate(BlockPage page) 24 | { 25 | var result = new BlockPage(); 26 | 27 | foreach(var block in page.AllBlocks) 28 | { 29 | if (block.GetText() == ".") 30 | result.Add(block); 31 | } 32 | 33 | return result; 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/PdfTextReader/PDFCore/RemoveTableText.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Execution; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Text; 6 | using PdfTextReader.Base; 7 | 8 | namespace PdfTextReader.PDFCore 9 | { 10 | class RemoveTableText : IProcessBlock 11 | { 12 | private List _tables; 13 | 14 | public RemoveTableText(PDFCore.IdentifyTables parserTable) 15 | { 16 | var page = parserTable.PageTables; 17 | 18 | if (page == null) 19 | { 20 | PdfReaderException.AlwaysThrow("RemoveTableText requires IdentifyTables"); 21 | } 22 | 23 | this._tables = page.AllBlocks.ToList(); 24 | } 25 | 26 | public BlockPage Process(BlockPage page) 27 | { 28 | if(this._tables == null) 29 | { 30 | PdfReaderException.AlwaysThrow("RemoveTableText requires 
IdentifyTables"); 31 | } 32 | 33 | var result = new BlockPage(); 34 | 35 | foreach(var block in page.AllBlocks) 36 | { 37 | bool insideTable = false; 38 | 39 | foreach(var table in _tables) 40 | { 41 | if( Block.HasOverlap(table, block) ) 42 | { 43 | insideTable = true; 44 | break; 45 | } 46 | } 47 | 48 | if( !insideTable ) 49 | { 50 | result.Add(block); 51 | } 52 | } 53 | 54 | return result; 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/PdfTextReader/PDFCore/SetIdentifyTablesCompatibility.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using PdfTextReader.PDFText; 3 | using System; 4 | using System.Collections.Generic; 5 | using System.Text; 6 | 7 | namespace PdfTextReader.PDFCore 8 | { 9 | class SetIdentifyTablesCompatibility : IProcessBlock 10 | { 11 | private readonly IdentifyTables _pre; 12 | private readonly IdentifyTablesData _data; 13 | 14 | public SetIdentifyTablesCompatibility(IdentifyTables pre, IdentifyTablesData data) 15 | { 16 | this._pre = pre; 17 | this._data = data; 18 | } 19 | 20 | public void SetCompatibility(IdentifyTables pre, IdentifyTablesData data) 21 | { 22 | if (data.Ready == false) 23 | { 24 | if (pre.PageTables == null && pre.PageLines == null && pre.PageBackground == null) 25 | PdfReaderException.AlwaysThrow("there is no data available"); 26 | 27 | data.PageFooterLine = pre.PageFooterLine; 28 | data.PageTables = pre.PageTables; 29 | data.PageLines = pre.PageLines; 30 | data.PageBackground = pre.PageBackground; 31 | data.Ready = true; 32 | } 33 | 34 | // set the compatibility between IdentifyTables and IdentifyTablesData 35 | pre.SetCompatibility(data); 36 | } 37 | 38 | public BlockPage Process(BlockPage page) 39 | { 40 | SetCompatibility(_pre, _data); 41 | 42 | // do nothing 43 | return page; 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- 
/src/PdfTextReader/PDFCore/SetProcessImageCompatibility.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using PdfTextReader.PDFText; 3 | using System; 4 | using System.Collections.Generic; 5 | using System.Text; 6 | 7 | namespace PdfTextReader.PDFCore 8 | { 9 | class SetProcessImageCompatibility : IProcessBlock 10 | { 11 | private readonly PreProcessImages _pre; 12 | private readonly ProcessImageData _data; 13 | 14 | public SetProcessImageCompatibility(PreProcessImages pre, ProcessImageData data) 15 | { 16 | this._pre = pre; 17 | this._data = data; 18 | } 19 | 20 | public void SetCompatibility(PreProcessImages pre, ProcessImageData data) 21 | { 22 | if (data.Images == null) 23 | PdfReaderException.AlwaysThrow("Null image"); 24 | 25 | // set the compatibility between PreProcessImages and ProcessImageData 26 | pre.SetCompatibility(data); 27 | } 28 | 29 | public BlockPage Process(BlockPage page) 30 | { 31 | SetCompatibility(_pre, _data); 32 | 33 | // do nothing 34 | return page; 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/PdfTextReader/PDFCore/ShowBlocksets.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | 6 | namespace PdfTextReader.PDFCore 7 | { 8 | class ShowBlocksets : IProcessBlock 9 | { 10 | public BlockPage Process(BlockPage page2) 11 | { 12 | var page = page2 as BlockPage2; 13 | 14 | if (page == null) 15 | PdfReaderException.AlwaysThrow("ShowBlocksets must execute AFTER OrganizePageLayout"); 16 | 17 | var blocksets = new BlockPage(); 18 | 19 | foreach(var seg in page.Segments) 20 | { 21 | blocksets.AddRange(seg.Columns); 22 | } 23 | 24 | return blocksets; 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- 
/src/PdfTextReader/Parser/AggregateAnexo.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.TextStructures; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Text; 6 | using PdfTextReader.Base; 7 | 8 | namespace PdfTextReader.Parser 9 | { 10 | class AggregateAnexo : IAggregateStructure 11 | { 12 | public bool Aggregate(Conteudo line) 13 | { 14 | if (line.Titulo.ToLower().Contains("anexo")) 15 | { 16 | return true; 17 | } 18 | else 19 | { 20 | return false; 21 | } 22 | } 23 | 24 | public Conteudo Create(List conteudos) 25 | { 26 | Conteudo newConteudo = conteudos[0]; 27 | newConteudo.Anexos = new List(); 28 | if (conteudos.Count() > 1) 29 | { 30 | for (int i = 1; i < conteudos.Count; i++) 31 | { 32 | Anexo a = new Anexo() 33 | { 34 | Titulo = conteudos[i].Titulo, 35 | Texto = conteudos[i].Corpo 36 | }; 37 | newConteudo.Anexos.Add(a); 38 | } 39 | 40 | } 41 | 42 | return newConteudo; 43 | 44 | } 45 | 46 | public void Init(Conteudo line) 47 | { 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/PdfTextReader/Parser/AggregateSingularBody.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.TextStructures; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Text; 6 | using PdfTextReader.Base; 7 | 8 | namespace PdfTextReader.Parser 9 | { 10 | class AggregateSingularBody : IAggregateStructure 11 | { 12 | public bool Aggregate(Conteudo line) 13 | { 14 | if (line.Titulo.ToLower().Contains("seção") || line.Titulo.ToLower().Contains("capítulo")) 15 | { 16 | return true; 17 | } 18 | else 19 | { 20 | return false; 21 | } 22 | } 23 | 24 | public Conteudo Create(List conteudos) 25 | { 26 | Conteudo newConteudo = conteudos[0]; 27 | if (conteudos.Count() > 1) 28 | { 29 | for (int i = 1; i < conteudos.Count; i++) 30 | { 31 
| //Check whether the law's chapter heading (CAPÍTULO) ended up in the hierarchy; if so, prepend it to the title 32 | var titleParts = conteudos[i].Hierarquia.Split(':'); 33 | foreach (string title in titleParts) 34 | { 35 | if (title.Contains("CAPÍTULO")) 36 | conteudos[i].Titulo = title + "\n" + conteudos[i].Titulo; 37 | } 38 | newConteudo.Corpo = newConteudo.Corpo + "\n" + conteudos[i].Titulo + "\n" + conteudos[i].Corpo; 39 | } 40 | } 41 | return newConteudo; 42 | } 43 | 44 | public void Init(Conteudo line) 45 | { 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/PdfTextReader/Parser/Artigo.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace PdfTextReader.Parser 6 | { 7 | class Artigo 8 | { 9 | public Metadados Metadados { get; set; } 10 | public Conteudo Conteudo { get; set; } 11 | public List Anexos { get; set; } 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/PdfTextReader/Parser/Autor.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace PdfTextReader.Parser 6 | { 7 | class Autor 8 | { 9 | public string Assinatura { get; set; } 10 | public string Cargo { get; set; } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /src/PdfTextReader/Parser/Content.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | using PdfTextReader.Base; 5 | 6 | namespace PdfTextReader.Parser 7 | { 8 | class Content : TextStructure 9 | { 10 | public TipoDoConteudo ContentType { get; set; } 11 | 12 | public Content() { } 13 | 14 | public Content(TextStructure structure, TipoDoConteudo type) 15 | { 16 
| this.FontName = structure.FontName; 17 | this.FontSize = structure.FontSize; 18 | this.FontStyle = structure.FontStyle; 19 | this.Text = structure.Text; 20 | this.TextAlignment = structure.TextAlignment; 21 | this.ContentType = type; 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/PdfTextReader/Parser/Conteudo.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | using PdfTextReader.Base; 5 | 6 | namespace PdfTextReader.Parser 7 | { 8 | class Conteudo 9 | { 10 | //For internal use (sic: "IntenalId" is misspelled; NOTE(review): renaming would change callers and the serialized JSON property name) 11 | public int IntenalId { get; set; } 12 | public int Page { get; set; } 13 | public string PID { get; set; } 14 | 15 | public string Hierarquia { get; set; } 16 | public string Titulo { get; set; } 17 | public string Corpo { get; set; } 18 | public List Autor { get; set; } 19 | public string Caput { get; set; } 20 | public string Grade { get; set; } 21 | public string Data { get; set; } 22 | public string Setor { get; set; } 23 | public string Departamento { get; set; } 24 | 25 | public string[] HierarquiaTitulo { get; set; } 26 | public string Texto { get; set; } 27 | 28 | //Temporary: kept here only for the time being 29 | public List Anexos { get; set; } 30 | 31 | public override string ToString() => Titulo; 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/PdfTextReader/Parser/HifenUtil.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | using System.Text.RegularExpressions; 5 | 6 | namespace PdfTextReader.Parser 7 | { 8 | class HifenUtil 9 | { 10 | static Regex _pattern = new Regex(@"(-[mst])?(.)-\n([^ ]+( )?)"); 11 | 12 | public static string ExtrairHifen(string texto) 13 | { 14 | string replace = _pattern.Replace(texto, m => { 15 | var g = m.Groups; 16 | bool keep 
= false; 17 | 18 | string corpo = g[0].Value; 19 | bool isMesoclise = g[1].Success; 20 | char charMesoclise = isMesoclise ? g[1].Value[1] : '\0'; 21 | char charAntes = g[2].Value[0]; 22 | char charDepois = g[3].Value[0]; 23 | string afterMatch = g[3].Value; 24 | 25 | if(isMesoclise) 26 | { 27 | keep = true; 28 | } 29 | 30 | if(IsNumber(charAntes) || IsNumber(charDepois)) 31 | { 32 | keep = true; 33 | } 34 | 35 | string keepHifen = (keep) ? "-" : ""; 36 | 37 | return corpo.Replace("-\n", keepHifen).TrimEnd() + "\n"; 38 | }); 39 | 40 | return replace; 41 | } 42 | 43 | static bool IsNumber(char ch) 44 | { 45 | return (ch >= '0' && ch <= '9'); 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/PdfTextReader/Parser/InjectFilename.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.PDFCore; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | 6 | namespace PdfTextReader.Parser 7 | { 8 | class InjectFilename 9 | { 10 | public string Filename { get; set; } 11 | public PageInfoStats InfoStats { get; set; } 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/PdfTextReader/Parser/Metadados.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace PdfTextReader.Parser 6 | { 7 | class Metadados 8 | { 9 | public string Nome { get; set; } 10 | public string SecaoDoDiario { get; set; } 11 | public string DataPublicacao { get; set; } 12 | public string TipoDoArtigo { get; set; } 13 | public string Grade { get; set; } 14 | public int NumeroDaPagina { get; set; } 15 | public string PdfLink { get; set; } 16 | public string IdMateria { get; set; } 17 | public string NumeroDaEdicao { get; set; } 18 | public string NumeroDoJornal { get; set; } 19 | public string Titulo 
{ get; set; } 20 | 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/PdfTextReader/Parser/ProcessParserJson.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Linq; 5 | using System.Text; 6 | using PdfTextReader.Base; 7 | using Newtonsoft.Json; 8 | 9 | namespace PdfTextReader.Parser 10 | { 11 | class ProcessParserJson 12 | { 13 | public void Write(Artigo artigo, string doc) 14 | { 15 | // TODO: fix it 16 | // Rollback to previous name 17 | //string finalURL = ProcessName(artigos.FirstOrDefault(), doc); 18 | string finalURL = doc; 19 | 20 | JsonSerializerSettings settings = new JsonSerializerSettings() { Formatting = Formatting.Indented }; 21 | using (Stream virtualStream = VirtualFS.OpenWrite($"{finalURL}.json")) 22 | { 23 | string content = JsonConvert.SerializeObject(artigo, settings); 24 | 25 | using (var writer = new StreamWriter(virtualStream)) 26 | { 27 | writer.Write(content); 28 | } 29 | } 30 | } 31 | 32 | public void WriteJson(IEnumerable artigos, string doc) 33 | { 34 | int i = 1; 35 | foreach(var artigo in artigos) 36 | { 37 | string doc_i = doc + (i++); 38 | this.Write(artigo, doc_i); 39 | } 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/PdfTextReader/Parser/TipoDoConteudo.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | using PdfTextReader.Base; 5 | 6 | namespace PdfTextReader.Parser 7 | { 8 | public enum TipoDoConteudo 9 | { 10 | Título, 11 | Grade, 12 | Corpo, 13 | Assinatura, 14 | Data, 15 | Caput, 16 | Cargo, 17 | Setor, 18 | Departamento 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/PdfTextReader/Parser/TransformExemplo.cs: 
-------------------------------------------------------------------------------- 1 | using PdfTextReader.TextStructures; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Text; 6 | using PdfTextReader.Base; 7 | 8 | namespace PdfTextReader.Parser 9 | { 10 | // 11 | // 12 | // TODO: rewrite using IFilterStructure 13 | // 14 | // 15 | class TransformExemplo : IAggregateStructure 16 | { 17 | public bool Aggregate(TextStructure line) 18 | { 19 | // never aggregate multiple lines 20 | return false; 21 | } 22 | 23 | public TextStructure Create(List textStructureList) 24 | { 25 | if (textStructureList.Count != 1) 26 | throw new InvalidOperationException("impossible"); 27 | 28 | var textStruct = textStructureList[0]; 29 | 30 | // filter password out 31 | if (textStruct.Text.Contains("password")) 32 | return null; 33 | 34 | return textStruct; 35 | } 36 | 37 | public void Init(TextStructure line) 38 | { 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/PdfTextReader/ParserStages/StageConvertContent.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using PdfTextReader.Parser; 3 | using PdfTextReader.TextStructures; 4 | using System; 5 | using System.Collections.Generic; 6 | using System.Text; 7 | 8 | namespace PdfTextReader.ParserStages 9 | { 10 | class StageConvertContent 11 | { 12 | private readonly string _input; 13 | private readonly string _output; 14 | private readonly StageContext _context; 15 | 16 | public StageConvertContent(StageContext context) 17 | { 18 | this._input = context.InputFolder; 19 | this._output = context.OutputFolder; 20 | this._context = context; 21 | } 22 | 23 | public void Process() 24 | { 25 | var pipelineText = _context.GetPipelineText(); 26 | 27 | var resultPipeline = pipelineText 28 | .ConvertText(); 29 | 30 | _context.SetPipelineText(resultPipeline); 31 | } 32 | } 33 | } 
34 | -------------------------------------------------------------------------------- /src/PdfTextReader/ParserStages/StageConvertStructText.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using PdfTextReader.Parser; 3 | using PdfTextReader.TextStructures; 4 | using System; 5 | using System.Collections.Generic; 6 | using System.Text; 7 | 8 | namespace PdfTextReader.ParserStages 9 | { 10 | class StageConvertStructText 11 | { 12 | private readonly string _input; 13 | private readonly string _output; 14 | private readonly StageContext _context; 15 | 16 | public StageConvertStructText(StageContext context) 17 | { 18 | this._input = context.InputFolder; 19 | this._output = context.OutputFolder; 20 | this._context = context; 21 | } 22 | 23 | public void Process() 24 | { 25 | var pipelineText = _context.GetPipelineText(); 26 | 27 | var resultPipeline = pipelineText 28 | .ConvertText(true) 29 | .Log($"{_context.OutputFilePrefix}-text-version.txt"); 30 | 31 | _context.SetPipelineText(resultPipeline); 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/PdfTextReader/ParserStages/StageDbgFlow.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Execution; 2 | using PdfTextReader.PDFCore; 3 | using PdfTextReader.PDFText; 4 | using System; 5 | using System.Collections.Generic; 6 | using System.Drawing; 7 | using System.Text; 8 | 9 | namespace PdfTextReader.ParserStages 10 | { 11 | class StageDbgFlow 12 | { 13 | private readonly string _input; 14 | private readonly string _output; 15 | private readonly StageContext _context; 16 | 17 | public StageDbgFlow(StageContext context) 18 | { 19 | this._input = context.InputFolder; 20 | this._output = context.OutputFolder; 21 | this._context = context; 22 | } 23 | 24 | public void Process() 25 | { 26 | string basename = _context.Basename; 27 | Pipeline 
pipeline = _context.GetPipeline(); 28 | 29 | pipeline.Input($"{_context.InputFilePrefix}.pdf") 30 | .Output($"{_context.OutputFilePrefix}-dbg0-flow.pdf") 31 | .StageProcess(Flow); 32 | } 33 | 34 | void Flow(PipelineInputPdf.PipelineInputPdfPage page) 35 | { 36 | page.ParsePdf() 37 | .Show(Color.Blue) 38 | .ParsePdf() 39 | .Show(Color.Orange) 40 | .ParsePdf() 41 | .Show(Color.Yellow) 42 | .ShowLine(Color.Black) ; 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/PdfTextReader/ParserStages/StageExtractHeaderDOU.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Execution; 2 | using PdfTextReader.Parser; 3 | using PdfTextReader.PDFCore; 4 | using PdfTextReader.PDFText; 5 | using System; 6 | using System.Collections.Generic; 7 | using System.Drawing; 8 | using System.Text; 9 | 10 | namespace PdfTextReader.ParserStages 11 | { 12 | class StageExtractHeaderDOU 13 | { 14 | private readonly string _input; 15 | private readonly string _output; 16 | private readonly StageContext _context; 17 | 18 | public StageExtractHeaderDOU(StageContext context) 19 | { 20 | this._input = context.InputFolder; 21 | this._output = context.OutputFolder; 22 | this._context = context; 23 | } 24 | 25 | public void Process() 26 | { 27 | string basename = _context.Basename; 28 | Pipeline pipeline = _context.GetPipeline(); 29 | 30 | var page = pipeline.Input($"{_context.InputFilePrefix}.pdf") 31 | .Page(1) 32 | .ParsePdf() 33 | .ParseBlock(); 34 | 35 | var extract = page.CreateInstance(); 36 | var infoStats = extract.InfoStats; 37 | string content = infoStats.ToString(); 38 | 39 | var filename = _context.CreateGlobalInstance(); 40 | filename.Filename = _context.Basename; 41 | filename.InfoStats = infoStats; 42 | 43 | _context.WriteFile("header", $"{_context.OutputFilePrefix}-header.txt", content); 44 | } 45 | } 46 | } 47 | 
-------------------------------------------------------------------------------- /src/PdfTextReader/PdfTextReader.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | netstandard2.0 5 | Debug;Release;CORE 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /src/PdfTextReader/Program.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.IO; 3 | using System.Diagnostics; 4 | 5 | namespace PdfTextReader 6 | { 7 | public class Program 8 | { 9 | public static void Main(string[] args) 10 | { 11 | if (args.Length == 2 && args[0] == "extract") 12 | { 13 | ExampleStages.ExtractHeader(args[1]); 14 | return; 15 | } 16 | 17 | Console.WriteLine("PDF Text Reader"); 18 | var watch = Stopwatch.StartNew(); 19 | 20 | Program3.ProcessStage("2010_04_19_p_anvisa", 1); 21 | 22 | watch.Stop(); 23 | var elapsedMs = watch.ElapsedMilliseconds; 24 | 25 | Console.WriteLine($"Elapsed time was: {elapsedMs}"); 26 | 27 | Console.ReadKey(); 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/PdfTextReader/TextStructures/AfterFilterTextSegments.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.TextStructures; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Text; 6 | using PdfTextReader.Base; 7 | 8 | namespace PdfTextReader.Parser 9 | { 10 | class AfterFilterTextSegments : IAggregateStructure 11 | { 12 | public bool Aggregate(TextSegment line) 13 | { 14 | return (line.Title.Length == 0); 15 | } 16 | 17 | public TextSegment Create(List _structures) 18 | { 19 | if( _structures.Count == 1 ) 20 | { 21 | return new TextSegment() 22 | { 23 | Title = _structures[0].Title, 24 | Body = _structures[0].Body 25 | }; 26 | } 27 | 28 | var title = 
/// <summary>
/// Writes a diagnostic dump of one text line: its margins, its text, the font
/// name/size/style, and the spacing after and before the line.
/// </summary>
/// <param name="input">Writer that receives the log output.</param>
/// <param name="line">Line being described.</param>
public void Log(TextWriter input, TextLine line)
{
    input.WriteLine("-----------------------------------");

    float? afterSpace = line.AfterSpace;
    float? beforeSpace = line.BeforeSpace;

    input.WriteLine($"Margins: (LEFT: {line.MarginLeft}, RIGHT: {line.MarginRight})");

    input.Write($"TEXT: {line.Text}");
    input.WriteLine($" ({line.FontName}, {line.FontSize.ToString("0.00")}, {line.FontStyle})");
    input.WriteLine($" (AfterSpace: {afterSpace})");
    // The BeforeSpace entry must report line.BeforeSpace, not the AfterSpace value.
    input.WriteLine($" (BeforeSpace: {beforeSpace})");
    input.WriteLine();

    input.WriteLine("");
}
/// <summary>
/// Stores the index tree that Log later queries (via FindPageStart) to resolve
/// the page of each logged instance.
/// </summary>
/// <param name="index">Index tree to use; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="index"/> is null.</exception>
public void Init(ITransformIndexTree index)
{
    // Name the offending parameter so the exception message is actionable.
    if (index == null)
        throw new ArgumentNullException(nameof(index));

    _index = index;
}
PdfTextReader.Base; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.IO; 5 | using System.Linq; 6 | using System.Text; 7 | 8 | namespace PdfTextReader.TextStructures 9 | { 10 | class AnalyzeSegmentTextVersion : ILogStructure 11 | { 12 | public void EndLog(TextWriter input) 13 | { 14 | } 15 | 16 | public void Log(TextWriter input, TextSegment data) 17 | { 18 | input.WriteLine(data.TitleText); 19 | input.WriteLine(); 20 | input.WriteLine(data.BodyText); 21 | input.WriteLine(); 22 | input.WriteLine(); 23 | } 24 | 25 | public void StartLog(TextWriter input) 26 | { 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/PdfTextReader/TextStructures/AnalyzeSegmentTitles.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | using System.IO; 6 | 7 | namespace PdfTextReader.TextStructures 8 | { 9 | class AnalyzeSegmentTitles : ILogStructure 10 | { 11 | public void StartLog(TextWriter input) 12 | { 13 | } 14 | 15 | public void Log(TextWriter input, TextSegment segment) 16 | { 17 | input.WriteLine("-----------------------------------"); 18 | 19 | foreach(var title in segment.Title) 20 | { 21 | float? 
/// <summary>
/// Dumps a short summary of a segment: the number of title and body entries,
/// followed by one line per body entry showing its text and text alignment.
/// </summary>
/// <param name="input">Writer that receives the log output.</param>
/// <param name="data">Segment being summarized.</param>
public void Log(TextWriter input, TextSegment data)
{
    const string separator = "-----------------------------------";

    input.WriteLine(separator);

    var titleCount = data.Title.Length;
    input.WriteLine($"Title Count: {titleCount}");

    var bodyCount = data.Body.Length;
    input.WriteLine($"Body Count: {bodyCount}");
    input.WriteLine();

    input.WriteLine("Body Alignments: ");

    foreach (var bodyLine in data.Body)
    {
        var text = bodyLine.Text;
        var alignment = bodyLine.TextAlignment;

        input.WriteLine($"Text: {text} ==>> {alignment}");
    }

    input.WriteLine(string.Empty);
}
TextSegment data) 17 | { 18 | input.WriteLine("Text,FontName,FontSize,FontStyle,MarginLeft,MarginRight,TextAlignment,AfterSpace"); 19 | if (data.Title.Length > 0) 20 | { 21 | input.WriteLine(data.Title.LastOrDefault().Text); 22 | } 23 | foreach (var item in data.Body) 24 | { 25 | input.WriteLine($"{item.Text.Replace(",",";")},{item.FontName},{item.FontSize},{item.FontStyle},{item.MarginLeft},{item.MarginRight},{item.TextAlignment},{item.AfterSpace}"); 26 | } 27 | 28 | input.WriteLine(""); 29 | input.WriteLine("--,--,--,--,--,--,--,--,"); 30 | input.WriteLine("--,--,--,--,--,--,--,--,"); 31 | input.WriteLine(""); 32 | } 33 | 34 | public void StartLog(TextWriter input) 35 | { 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/PdfTextReader/TextStructures/AnalyzeStructures.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | using System.IO; 6 | 7 | namespace PdfTextReader.TextStructures 8 | { 9 | class AnalyzeStructures : ILogStructure 10 | { 11 | public void StartLog(TextWriter input) 12 | { 13 | } 14 | 15 | public void Log(TextWriter input, TextStructure structure) 16 | { 17 | input.WriteLine("-----------------------------------"); 18 | 19 | float? 
afterSpace = structure.AfterSpace; 20 | 21 | input.WriteLine($"Aligment: {structure.TextAlignment}"); 22 | 23 | input.Write(structure.Text); 24 | input.WriteLine($" ({structure.FontName}, {structure.FontSize.ToString("0.00")}, {structure.FontStyle} - {structure.HasBackColor})"); 25 | input.WriteLine($" ({afterSpace})"); 26 | input.WriteLine(); 27 | 28 | input.WriteLine(""); 29 | } 30 | 31 | public void EndLog(TextWriter input) 32 | { 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/PdfTextReader/TextStructures/CreateContent.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.Base; 2 | using PdfTextReader.Execution; 3 | using PdfTextReader.Parser; 4 | using PdfTextReader.PDFCore; 5 | using System; 6 | using System.Collections.Generic; 7 | using System.Text; 8 | 9 | namespace PdfTextReader.TextStructures 10 | { 11 | class CreateContent : IAggregateStructure 12 | { 13 | public CreateContent(BasicFirstPageStats basicFirstPageStats, PipelinePageStats teste, PipelineDocumentStats docstats) 14 | { 15 | } 16 | 17 | public bool Aggregate(TextSegment line) 18 | { 19 | return false; 20 | } 21 | 22 | public TextSegment Create(List input) 23 | { 24 | return input[0]; 25 | } 26 | 27 | public void Init(TextSegment line) 28 | { 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/PdfTextReader/TextStructures/CreateTextLineIndex.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader.TextStructures; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Text; 6 | using PdfTextReader.Base; 7 | 8 | namespace PdfTextReader.Parser 9 | { 10 | class CreateTextLineIndex : IAggregateStructure 11 | { 12 | // this class does nothing 13 | // however, it indirectly creates an index for TextLine 14 | public bool 
/// <summary>
/// Serializes the article to XML in memory, converts that XML with the converter
/// (passing the PDF name, the article id and the journal edition) and writes the
/// converted text to the destination stream.
/// </summary>
/// <param name="id">Article identifier forwarded to the converter.</param>
/// <param name="input">Destination stream; it is closed when the writer is disposed.</param>
/// <param name="data">Article to serialize.</param>
public void Log(string id, Stream input, Artigo data)
{
    string pdfname;
    string article;

    // Dispose the temporary buffer and its reader once the XML text has been read.
    using (var memstream = new MemoryStream())
    {
        _procParser.XMLWriter(data, memstream);

        // Rewind so the reader sees what XMLWriter just produced.
        memstream.Seek(0, SeekOrigin.Begin);

        using (var reader = new StreamReader(memstream))
        {
            pdfname = _filename.Filename;
            article = reader.ReadToEnd();
        }
    }

    // _filename was already dereferenced above, so only the nested members can be null here.
    string edition = _filename.InfoStats?.Header?.JornalEdicao ?? "";
    string result = _convert.Convert(pdfname, id, article, edition);

    using (var writer = new StreamWriter(input))
    {
        writer.Write(result);
    }
}
/// <summary>
/// Replaces the file system implementation that the static OpenRead/OpenWrite
/// helpers of this class delegate to.
/// </summary>
/// <param name="virtualFS">Implementation to install; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="virtualFS"/> is null.</exception>
[DebuggerHidden]
public static void ConfigureFileSystem(IVirtualFS virtualFS)
{
    // Report the parameter name (not the interface type name) as the offending argument.
    if (virtualFS == null)
        throw new ArgumentNullException(nameof(virtualFS));

    g_vfs = virtualFS;
}
/// <summary>
/// Opens <paramref name="filename"/> for writing, creating its containing folder
/// first when the path has one. The file is created, or truncated if it already
/// exists (FileMode.Create).
/// </summary>
/// <param name="filename">Relative or absolute path of the file to write.</param>
/// <returns>A writable stream positioned at the start of the new file.</returns>
public Stream OpenWriter(string filename)
{
    System.Diagnostics.Debug.WriteLine($"WRITE: {filename}");

    string folderName = Path.GetDirectoryName(filename);

    // A bare file name yields an empty directory name: nothing to create in that case.
    // Directory.CreateDirectory handles relative and rooted paths alike, creates any
    // missing intermediate folders, and does nothing when the folder already exists.
    if (!String.IsNullOrEmpty(folderName))
    {
        Directory.CreateDirectory(folderName);
    }

    return new FileStream(filename, FileMode.Create);
}
10 | 11 | 12 | 13 | 14 | 15 | 16 | PreserveNewest 17 | 18 | 19 | PreserveNewest 20 | 21 | 22 | PreserveNewest 23 | 24 | 25 | PreserveNewest 26 | 27 | 28 | PreserveNewest 29 | 30 | 31 | PreserveNewest 32 | Never 33 | 34 | 35 | PreserveNewest 36 | 37 | 38 | -------------------------------------------------------------------------------- /src/PdfToImageFunction/Properties/PublishProfiles/FunctionApp20180412035249 - Web Deploy.pubxml: -------------------------------------------------------------------------------- 1 |  2 | 6 | 7 | 8 | MSDeploy 9 | AzureWebSite 10 | Release 11 | Any CPU 12 | http://functionapp20180412035249.azurewebsites.net 13 | False 14 | False 15 | functionapp20180412035249.scm.azurewebsites.net:443 16 | /subscriptions/eb6659ac-634f-4460-8e5c-c92db0afcabb/resourcegroups/casa-civil-br/providers/Microsoft.Web/sites/FunctionApp20180412035249 17 | FunctionApp20180412035249 18 | True 19 | WMSVC 20 | True 21 | $FunctionApp20180412035249 22 | <_SavePWD>True 23 | False 24 | 25 | -------------------------------------------------------------------------------- /src/PdfToImageFunction/host.json: -------------------------------------------------------------------------------- 1 | { 2 | // Value indicating the timeout duration for all functions. 3 | // In Dynamic SKUs, the valid range is from 1 second to 10 minutes and the default value is 5 minutes. 4 | // In Paid SKUs there is no limit and the default is no timeout. 5 | "functionTimeout": "00:05:00", 6 | "queues": { 7 | // The maximum interval in milliseconds between 8 | // queue polls. The default is 1 minute. 9 | "maxPollingInterval": 2000, 10 | 11 | // The visibility timeout that will be applied to messages that fail processing 12 | // (i.e. the time interval between retries). The default is zero. 13 | "visibilityTimeout": "00:00:30", 14 | 15 | // The number of queue messages to retrieve and process in 16 | // parallel (per job function). The default is 16 and the maximum is 32. 
17 | "batchSize": 4, 18 | 19 | // The number of times to try processing a message before 20 | // moving it to the poison queue. The default is 5. 21 | "maxDequeueCount": 5 22 | } 23 | } -------------------------------------------------------------------------------- /src/PdfToImageFunction/pdf/D141.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fcatae/PdfTextReader/d1bfabef8425321dd5dbd3a672722c8e6a61f3ec/src/PdfToImageFunction/pdf/D141.pdf -------------------------------------------------------------------------------- /src/PdfToImageFunction/temp/readme.txt: -------------------------------------------------------------------------------- 1 | This folder will be used by ghostScript to write image from the pdf file -------------------------------------------------------------------------------- /src/QueueConsole/Config.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.Extensions.Configuration; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Diagnostics; 5 | 6 | namespace QueueConsole 7 | { 8 | class Config 9 | { 10 | private readonly IConfigurationRoot _config; 11 | 12 | public Config(string[] args) 13 | { 14 | _config = new ConfigurationBuilder() 15 | .AddJsonFile("appsettings.json",true,true) 16 | .AddCommandLine(args) 17 | .Build(); 18 | } 19 | 20 | [DebuggerHidden] 21 | public string Get(string configName) 22 | { 23 | var value = _config[configName]; 24 | 25 | if (value == null) 26 | throw new NotConfigured(configName); 27 | 28 | return value; 29 | } 30 | 31 | [DebuggerHidden] 32 | public string TryGet(string configName) 33 | { 34 | var value = _config[configName]; 35 | return value; 36 | } 37 | 38 | class NotConfigured : Exception 39 | { 40 | public readonly string Name; 41 | 42 | public NotConfigured(string name) : base($"configuration '{name}' not found") 43 | { 44 | Name = name; 45 | } 46 | } 47 | } 48 | } 
// Interactive console driver for a queue: first sends typed lines as messages,
// then reads messages back and prints them.
class MainConsole
{
    // Runs the writer loop to completion, then prints a header and runs the reader loop.
    // Both async loops are waited on synchronously.
    public void Run(string queueSas)
    {
        RunWriterAsync(queueSas).Wait();

        Console.WriteLine("Read messages:");
        RunReaderAsync(queueSas).Wait();
    }

    // Opens the queue and adds one message per line read from the console.
    // Stops at the first null or empty line.
    public async Task RunWriterAsync(string queueSas)
    {
        var azQueue = await AzureQueue.OpenAsync(queueSas);

        while(true)
        {
            string text = Console.ReadLine();

            // Null (end of input) or an empty line ends the writer loop.
            if (String.IsNullOrEmpty(text))
                break;

            await azQueue.AddMessageAsync(text);
        }
    }

    // Opens the queue and prints the content of each message it can get.
    // Stops when TryGetMessageAsync returns null.
    public async Task RunReaderAsync(string queueSas)
    {
        var azQueue = await AzureQueue.OpenAsync(queueSas);

        while (true)
        {
            var message = await azQueue.TryGetMessageAsync();

            // No message returned: end the reader loop.
            if (message == null)
                break;

            Console.WriteLine($"message: {message.Content}");

            // NOTE(review): presumably marks the message as processed; if Done() returns
            // a Task it is not awaited here - confirm against the AzureQueue message type.
            message.Done();
        }
    }
}
QUEUE_STORAGE_ACCOUNT = _config.TryGet("QUEUE_STORAGE_ACCOUNT"); 22 | string QUEUE_NAME = _config.TryGet("QUEUE_NAME"); 23 | 24 | string QUEUE_SAS = _config.TryGet("QUEUE_SAS"); 25 | 26 | if( !String.IsNullOrWhiteSpace(INPUT_STORAGE_ACCOUNT) ) 27 | { 28 | (new MainPdfToImage()).Run(INPUT_STORAGE_ACCOUNT, QUEUE_STORAGE_ACCOUNT, QUEUE_NAME); 29 | } 30 | else 31 | { 32 | (new MainConsole()).Run(QUEUE_SAS); 33 | } 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/QueueConsole/QueueConsole.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | netcoreapp2.0 6 | Debug;Release;CORE 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | PreserveNewest 25 | 26 | 27 | PreserveNewest 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /src/QueueConsole/appsettings.json: -------------------------------------------------------------------------------- 1 | { 2 | "INPUT_STORAGE_ACCOUNT": "", 3 | "QUEUE_STORAGE_ACCOUNT": "", 4 | "QUEUE_NAME": "" 5 | } 6 | -------------------------------------------------------------------------------- /src/Validator/File.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace Validator 6 | { 7 | class File 8 | { 9 | public File(string folder, string filename) 10 | { 11 | Folder = folder; 12 | Filename = filename; 13 | } 14 | 15 | public readonly string Folder; 16 | public readonly string Filename; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/Validator/GeneralProcess.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace Validator 6 | { 7 
| class GeneralProcess : IRunner 8 | { 9 | public string FilePattern => "*.pdf"; 10 | 11 | public void Run(File file, string outputname) 12 | { 13 | string inputFolder = file.Folder; 14 | string basename = file.Filename; 15 | 16 | PdfTextReader.ProgramValidator.Process(basename, inputFolder, outputname); 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/Validator/IRunner.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace Validator 6 | { 7 | interface IRunner 8 | { 9 | string FilePattern { get; } 10 | void Run(File file, string outputname); 11 | } 12 | 13 | interface IRunner2 : IRunner 14 | { 15 | void Close(string outputfolder); 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/Validator/Process2010.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace Validator 6 | { 7 | class Process2010 : IRunner 8 | { 9 | int _totalProcessed = 0; 10 | 11 | //public string FilePattern => "DO1_2010_0?_10.pdf"; //6 12 | //public string FilePattern => "*.pdf"; 13 | public string FilePattern => "DO1_2010_12_??.pdf"; 14 | 15 | public void Run(File file, string outputname) 16 | { 17 | string inputFolder = file.Folder; 18 | string basename = file.Filename; 19 | 20 | //if (!basename.Contains("DO1_2010_02_02")) 21 | // return; 22 | 23 | // CMD C:\PDF\2010\ c:\pdf\output 2010 24 | PdfTextReader.ProgramValidator2010.Process(basename, inputFolder, outputname); 25 | _totalProcessed++; 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/Validator/Process2012.cs: -------------------------------------------------------------------------------- 1 | using System; 
2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace Validator 6 | { 7 | class Process2012 : IRunner 8 | { 9 | int _totalProcessed = 0; 10 | 11 | public string FilePattern => "*.pdf"; 12 | 13 | public void Run(File file, string outputname) 14 | { 15 | string inputFolder = file.Folder; 16 | string basename = file.Filename; 17 | 18 | string folderOutput = FileList.CreateOutputFolder(outputname, basename); 19 | 20 | PdfTextReader.ProgramValidator2012.Process(basename, inputFolder, folderOutput); 21 | _totalProcessed++; 22 | 23 | 24 | } 25 | 26 | 27 | 28 | 29 | 30 | 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/Validator/Process2016.cs: -------------------------------------------------------------------------------- 1 | using PdfTextReader; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | 6 | namespace Validator 7 | { 8 | class Process2016 : IRunner2 9 | { 10 | int _totalProcessed = 0; 11 | 12 | public string FilePattern => "*.pdf"; 13 | 14 | public void Run(File file, string outputname) 15 | { 16 | string inputFolder = file.Folder; 17 | string basename = file.Filename; 18 | 19 | string folderOutput = FileList.CreateOutputFolder(outputname, basename); 20 | 21 | PdfTextReader.ProgramValidator2016.Process(basename, inputFolder, folderOutput); 22 | _totalProcessed++; 23 | } 24 | 25 | public void Close(string outputfolder) 26 | { 27 | ProgramValidatorXML.CreateFinalStats($"{outputfolder}/GlobalArticlePrecision.txt"); 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/Validator/ProcessDefault.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace Validator 6 | { 7 | class ProcessDefault : IRunner 8 | { 9 | int _totalProcessed = 0; 10 | 11 | public string FilePattern => 
"*.pdf"; 12 | 13 | public void Run(File file, string outputname) 14 | { 15 | string inputFolder = file.Folder; 16 | string basename = file.Filename; 17 | 18 | string folderOutput = FileList.CreateOutputFolder(outputname, basename); 19 | 20 | PdfTextReader.ProgramValidatorDefault.Process(basename, inputFolder, folderOutput); 21 | _totalProcessed++; 22 | } 23 | 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/Validator/Validate2010.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace Validator 6 | { 7 | class Validate2010 : IRunner 8 | { 9 | int _totalProcessed = 0; 10 | int _totalErrors = 0; 11 | 12 | //public string FilePattern => "DO1_2010_??_10.pdf"; //6 13 | public string FilePattern => "*.pdf"; 14 | 15 | public void Run(File file, string outputname) 16 | { 17 | string inputFolder = file.Folder; 18 | string basename = file.Filename; 19 | 20 | // CMD c:\pdf\output_6 c:\pdf\valid valid2010 21 | int errors = PdfTextReader.ValidatorPipeline.Process(basename, inputFolder, outputname); 22 | 23 | _totalProcessed++; 24 | _totalErrors += errors; 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/Validator/Validator.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | netcoreapp2.0 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /src/WebFrontendImages/Logic/ImageProcessing.cs: -------------------------------------------------------------------------------- 1 | using SixLabors.ImageSharp; 2 | using SixLabors.ImageSharp.Formats.Jpeg; 3 | using SixLabors.ImageSharp.PixelFormats; 4 | using SixLabors.ImageSharp.Processing; 5 | using SixLabors.ImageSharp.Processing.Transforms; 6 | using SixLabors.Primitives; 7 
using Microsoft.WindowsAzure.Storage.Blob;
using SixLabors.ImageSharp;
using SixLabors.ImageSharp.Formats.Jpeg;
using SixLabors.ImageSharp.PixelFormats;
using SixLabors.ImageSharp.Processing;
using SixLabors.ImageSharp.Processing.Transforms;
using SixLabors.Primitives;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Threading.Tasks;

// src/WebFrontendImages/Logic/ImageProcessing.cs
namespace WebFrontendImages.Logic
{
    /// <summary>
    /// JPEG helper that cuts a rectangular region out of an image stream.
    /// </summary>
    public class ImageProcessing
    {
        // Single decoder instance reused by every Crop call.
        static readonly JpegDecoder JPEG = new JpegDecoder();

        /// <summary>
        /// Decodes <paramref name="stream"/> as JPEG, crops it and returns the
        /// re-encoded JPEG in a new in-memory stream positioned at offset 0.
        /// </summary>
        /// <param name="stream">Source stream, decoded with the JPEG decoder.</param>
        /// <param name="tx">Left edge of the crop, multiplied by the image width.</param>
        /// <param name="ty">Top edge of the crop, multiplied by the image height.</param>
        /// <param name="tw">Crop width, multiplied by the image width.</param>
        /// <param name="th">Crop height, multiplied by the image height.</param>
        /// <returns>A <see cref="MemoryStream"/> holding the cropped JPEG; the caller owns it.</returns>
        public static Stream Crop(Stream stream, float tx, float ty, float tw, float th)
        {
            Stream output = new MemoryStream();

            // The concrete pixel type is spelled out: the file imports
            // SixLabors.ImageSharp.PixelFormats and uses it nowhere else.
            using (Image<Rgba32> image = Image.Load(stream, JPEG))
            {
                // The (int) casts truncate the scaled values toward zero.
                int x1 = (int)(image.Width * tx);
                int y1 = (int)(image.Height * ty);
                int dx = (int)(image.Width * tw);
                int dy = (int)(image.Height * th);

                image.Mutate(x => x
                    .Crop(new Rectangle(x1, y1, dx, dy))
                    );

                image.Save(output, new JpegEncoder());
            }

            // Rewind so the caller can read the result from the beginning.
            output.Seek(0, SeekOrigin.Begin);

            return output;
        }
    }
}

// src/WebFrontendImages/Logic/ImageSource.cs
namespace WebFrontendImages.Logic
{
    /// <summary>
    /// Reads blobs from the container addressed by a storage URL.
    /// </summary>
    public class ImageSource
    {
        // Assigned once in the constructor.
        readonly CloudBlobContainer _container;

        /// <summary>
        /// Creates a source bound to the container at <paramref name="storageUrl"/>.
        /// </summary>
        /// <param name="storageUrl">Absolute URL passed to <see cref="Uri"/> and then to <see cref="CloudBlobContainer"/>.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="storageUrl"/> is null or empty.
        /// </exception>
        public ImageSource(string storageUrl)
        {
            if (String.IsNullOrEmpty(storageUrl))
                throw new ArgumentNullException(nameof(storageUrl));

            var container = new CloudBlobContainer(new Uri(storageUrl));

            _container = container;
        }

        /// <summary>
        /// Opens the blob named <paramref name="filename"/> for reading.
        /// </summary>
        /// <param name="filename">Blob name, passed to <c>GetBlobReference</c>.</param>
        /// <returns>The stream produced by <c>OpenReadAsync</c>; the caller owns it.</returns>
        public async Task<Stream> GetAsync(string filename)
        {
            var blob = _container.GetBlobReference(filename);

            // The method returns a value, so the return type must be
            // Task<Stream>; a bare Task here is compile error CS1997.
            return await blob.OpenReadAsync();
        }
    }
}
// src/WebFrontendImages/Program.cs
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.AspNetCore;
using Microsoft.AspNetCore.Hosting;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Logging;

namespace WebFrontendImages
{
    /// <summary>
    /// Entry point of the WebFrontendImages web application.
    /// </summary>
    public class Program
    {
        /// <summary>
        /// Builds the web host and runs it.
        /// </summary>
        /// <param name="args">Command-line arguments, forwarded to <see cref="BuildWebHost"/>.</param>
        public static void Main(string[] args)
        {
            BuildWebHost(args).Run();
        }

        /// <summary>
        /// Creates the host from the default builder with the startup class registered.
        /// </summary>
        /// <param name="args">Command-line arguments handed to <c>WebHost.CreateDefaultBuilder</c>.</param>
        /// <returns>The built <see cref="IWebHost"/>.</returns>
        public static IWebHost BuildWebHost(string[] args) =>
            WebHost.CreateDefaultBuilder(args)
                // UseStartup needs its type argument to compile.
                // NOTE(review): assumes the project's startup class is named
                // Startup and lives in this namespace; confirm.
                .UseStartup<Startup>()
                .Build();
    }
}
// test/PdfTextReader.Test/UnitTest1.cs
using PdfTextReader.Azure;
using System;
using System.IO;
using Xunit;

namespace PdfTextReader.Test
{
    /// <summary>
    /// Checks that <see cref="PdfImageConverter"/> yields one image stream per
    /// page of a known PDF.
    /// </summary>
    public class UnitTest1
    {
        /// <summary>
        /// Converts a fixed PDF and asserts that the number of generated image
        /// streams equals the expected page count.
        /// </summary>
        /// <remarks>
        /// Declared as a plain synchronous test: the body awaits nothing, and
        /// an <c>async void</c> method hides its exceptions from the caller.
        /// </remarks>
        [Fact]
        public void Test1()
        {
            // NOTE(review): these absolute paths exist only on the original
            // author's machine; the test cannot pass elsewhere as written.
            var pdfFile = @"C:\Users\visouza\Repos\DOU-OCR\data\pdf\D141.pdf";
            int pdfPages = 48;
            var gs = @"C:\Program Files\gs\gs9.23\bin\gswin64.exe";
            var tempFolder = @"C:\temp\dou";

            Stream[] pdfPageImageList = null;

            try
            {
                // Dispose the input file handle once the conversion is done.
                using (var pdfInput = File.OpenRead(pdfFile))
                {
                    PdfImageConverter imageConverter = new PdfImageConverter(gs, tempFolder, "102.4");

                    // The array of streams follows page number - 1: page 1 is index 0.
                    imageConverter.GenerateImage(pdfInput, ref pdfPageImageList);
                }

                // Fail with a clear message instead of a NullReferenceException.
                Assert.NotNull(pdfPageImageList);
                Assert.Equal(pdfPages, pdfPageImageList.Length);
            }
            finally
            {
                // Release every page stream handed back by the converter.
                if (pdfPageImageList != null)
                {
                    foreach (Stream page in pdfPageImageList)
                    {
                        page?.Dispose();
                    }
                }
            }
        }
    }
}