├── Data
│   ├── agencies.xlsx
│   ├── country_list.xlsx
│   ├── names_A60-72.xlsx
│   ├── un_entities_20191017.xlsx
│   ├── Download GoogleNews-vectors-negative300
│   ├── key_words_un_org_list.txt
│   ├── preambular_verb_list.txt
│   ├── operative_verb_list.txt
│   └── key_words_not_un_org_list.txt
├── requirements.txt
├── CODE_OF_CONDUCT.md
├── tools
│   └── document-processor
│       ├── DSnA.WebJob.DocumentParser
│       │   ├── Code
│       │   │   ├── IStorageClientFactory.cs
│       │   │   ├── DefaultStorageClientFactory.cs
│       │   │   ├── Exceptions.cs
│       │   │   ├── InteropWordUtils.cs
│       │   │   ├── Constants.cs
│       │   │   ├── Logger.cs
│       │   │   ├── ParserClasses.cs
│       │   │   ├── DocumentParser.cs
│       │   │   ├── ParseHelper.cs
│       │   │   └── Utils.cs
│       │   ├── Interface
│       │   │   ├── IDocumentParser.cs
│       │   │   └── IStorageClient.cs
│       │   ├── LocalStorageClient.cs
│       │   ├── BlobStorageClient.cs
│       │   ├── Properties
│       │   │   └── AssemblyInfo.cs
│       │   ├── README.md
│       │   ├── App.config
│       │   ├── packages.config
│       │   ├── Program.cs
│       │   └── DSnA.WebJob.DocumentParser.csproj
│       └── DSnA.WebJob.DocumentParser.sln
├── SUPPORT.md
├── SECURITY.md
├── knowledge_extraction_resolution_level.py
├── README.md
├── .gitignore
├── LICENSE
└── knowledge_extraction_paragraph_level.py
--------------------------------------------------------------------------------
/Data/agencies.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/un-knowledge-extraction/HEAD/Data/agencies.xlsx
--------------------------------------------------------------------------------
/Data/country_list.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/un-knowledge-extraction/HEAD/Data/country_list.xlsx
--------------------------------------------------------------------------------
/Data/names_A60-72.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/un-knowledge-extraction/HEAD/Data/names_A60-72.xlsx
--------------------------------------------------------------------------------
/Data/un_entities_20191017.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/un-knowledge-extraction/HEAD/Data/un_entities_20191017.xlsx
--------------------------------------------------------------------------------
/Data/Download GoogleNews-vectors-negative300:
--------------------------------------------------------------------------------
It can be downloaded at https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit and then saved under the "Data" folder.
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
pandas==1.1.5
nltk==3.6.6
numpy==1.22.0
spacy==3.0.1
gensim==3.8.3
scipy==1.5.3
matplotlib==3.3.4
# re, string, and collections are part of the Python standard library
# and are not pip-installable packages, so they are not listed here.
--------------------------------------------------------------------------------
/Data/key_words_un_org_list.txt:
--------------------------------------------------------------------------------
Committee
Council
Conference
Fund
Organization
Entity
Department
Commission
Court
Board
Community
Office
Association
Government
Group
Summit
Subcommittee
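The keyword list above, together with key_words_not_un_org_list.txt further down, supports the "Organization Names" deliverable described in the README. A minimal sketch of how such a trigger-word list can be applied; it assumes only that the list is used for keyword matching (the actual logic lives in knowledge_extraction_paragraph_level.py and may differ):

```python
# Hedged illustration: flag phrases that contain a UN-organization trigger word.
with open("Data/key_words_un_org_list.txt", encoding="utf-8") as f:
    un_org_keywords = {line.strip() for line in f if line.strip()}

def looks_like_un_org(phrase):
    # "Economic and Social Council" matches on the trigger word "Council".
    return any(word in un_org_keywords for word in phrase.split())

print(looks_like_un_org("Economic and Social Council"))  # True
```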
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
# Microsoft Open Source Code of Conduct

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).

Resources:

- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
--------------------------------------------------------------------------------
/tools/document-processor/DSnA.WebJob.DocumentParser/Code/IStorageClientFactory.cs:
--------------------------------------------------------------------------------
//Copyright (c) Microsoft Corporation. All rights reserved.
//Licensed under the MIT License.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace DSnA.WebJob.DocumentParser
{
    public interface IStorageClientFactory
    {
        IStorageClient Create(string id, Dictionary<string, string> parameters, IUtils utils);
    }
}
--------------------------------------------------------------------------------
/tools/document-processor/DSnA.WebJob.DocumentParser/Interface/IDocumentParser.cs:
--------------------------------------------------------------------------------
//Copyright (c) Microsoft Corporation. All rights reserved.
//Licensed under the MIT License.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Microsoft.Office.Interop.Word;

namespace DSnA.WebJob.DocumentParser
{
    public interface IDocumentParser
    {
        string ParseDocuments(string uri, IStorageClient storageClient, Application wordApp, string outputFileFormat);
    }
}
--------------------------------------------------------------------------------
/tools/document-processor/DSnA.WebJob.DocumentParser/Interface/IStorageClient.cs:
--------------------------------------------------------------------------------
//Copyright (c) Microsoft Corporation. All rights reserved.
//Licensed under the MIT License.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace DSnA.WebJob.DocumentParser
{
    public class StorageObjectDescriptor
    {
        public string FileName { get; set; }
        public Uri Uri { get; set; }
    }

    public interface IStorageClient
    {
        string GetFile(StorageObjectDescriptor descriptor, string destinationFilePath);
        void SaveFile(string sourceUri, string destinationUri);
    }
}
--------------------------------------------------------------------------------
/tools/document-processor/DSnA.WebJob.DocumentParser/Code/DefaultStorageClientFactory.cs:
--------------------------------------------------------------------------------
//Copyright (c) Microsoft Corporation. All rights reserved.
//Licensed under the MIT License.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace DSnA.WebJob.DocumentParser
{
    public class DefaultStorageClientFactory : IStorageClientFactory
    {
        public const string BlobContainerNameKey = "container";

        public IStorageClient Create(string id, Dictionary<string, string> parameters, IUtils utils)
        {
            switch (id)
            {
                case "blob":
                    return new BlobStorageClient(parameters[BlobContainerNameKey], utils);

                default:
                    return new LocalStorageClient();
            }
        }
    }
}
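A minimal usage sketch for the factory above, mirroring how Program.cs (at the end of this dump) wires it together. The wrapper class and container name are hypothetical; `Utils` and `ConsoleLogger` are the repository's own types, constructed as in Program.cs:

```csharp
using System.Collections.Generic;

namespace DSnA.WebJob.DocumentParser
{
    internal static class FactoryUsageSketch // hypothetical helper, not part of the repository
    {
        internal static IStorageClient CreateClient(string storageType, string containerName)
        {
            var utils = new Utils(ConsoleLogger.Instance); // as constructed in Program.cs

            IStorageClientFactory factory = new DefaultStorageClientFactory();

            // "blob" selects BlobStorageClient; any other id falls back to LocalStorageClient.
            return factory.Create(storageType, new Dictionary<string, string>
            {
                { DefaultStorageClientFactory.BlobContainerNameKey, containerName }
            }, utils);
        }
    }
}
```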
--------------------------------------------------------------------------------
/tools/document-processor/DSnA.WebJob.DocumentParser/Code/Exceptions.cs:
--------------------------------------------------------------------------------
//Copyright (c) Microsoft Corporation. All rights reserved.
//Licensed under the MIT License.

using System;

namespace DSnA.WebJob.DocumentParser
{
    public class UnableToDeleteFileException : Exception
    {
        public UnableToDeleteFileException()
        {
        }

        public UnableToDeleteFileException(string message)
            : base(message)
        {
        }

        public UnableToDeleteFileException(string message, Exception inner)
            : base(message, inner)
        {
        }
    }

    public class LoggerException : Exception
    {
        public LoggerException()
        {
        }

        public LoggerException(string message)
            : base(message)
        {
        }

        public LoggerException(string message, Exception inner)
            : base(message, inner)
        {
        }
    }
}
--------------------------------------------------------------------------------
/tools/document-processor/DSnA.WebJob.DocumentParser/LocalStorageClient.cs:
--------------------------------------------------------------------------------
//Copyright (c) Microsoft Corporation. All rights reserved.
//Licensed under the MIT License.

using Microsoft.Azure;
using Microsoft.WindowsAzure.Storage;
using Microsoft.WindowsAzure.Storage.Blob;
using System;
using System.Diagnostics;
using System.Linq;

namespace DSnA.WebJob.DocumentParser
{
    public class LocalStorageClient : IStorageClient
    {
        public LocalStorageClient()
        {
        }

        public string GetFile(StorageObjectDescriptor descriptor, string destinationFilePath)
        {
            return System.Net.WebUtility.UrlDecode(descriptor.Uri.AbsolutePath).Replace("/", "\\");
        }

        public void SaveFile(string sourceUri, string destinationUri)
        {
            string sourceUriPath = System.IO.Path.GetDirectoryName(sourceUri);
            string sourceUriFileName = System.IO.Path.GetFileName(sourceUri);

            // Fall back to an "out_<name>" file next to the source when no destination is given.
            System.IO.File.Copy(sourceUri, destinationUri ?? $@"{sourceUriPath}\out_{sourceUriFileName}", true);
        }
    }
}
--------------------------------------------------------------------------------
/tools/document-processor/DSnA.WebJob.DocumentParser.sln:
--------------------------------------------------------------------------------

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.30804.86
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DSnA.WebJob.DocumentParser", "DSnA.WebJob.DocumentParser\DSnA.WebJob.DocumentParser.csproj", "{9E4D7884-0C36-429B-A4C9-1217D3CA7D4E}"
EndProject
Global
	GlobalSection(SolutionConfigurationPlatforms) = preSolution
		Debug|Any CPU = Debug|Any CPU
		Release|Any CPU = Release|Any CPU
	EndGlobalSection
	GlobalSection(ProjectConfigurationPlatforms) = postSolution
		{9E4D7884-0C36-429B-A4C9-1217D3CA7D4E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
		{9E4D7884-0C36-429B-A4C9-1217D3CA7D4E}.Debug|Any CPU.Build.0 = Debug|Any CPU
		{9E4D7884-0C36-429B-A4C9-1217D3CA7D4E}.Release|Any CPU.ActiveCfg = Release|Any CPU
		{9E4D7884-0C36-429B-A4C9-1217D3CA7D4E}.Release|Any CPU.Build.0 = Release|Any CPU
	EndGlobalSection
	GlobalSection(SolutionProperties) = preSolution
		HideSolutionNode = FALSE
	EndGlobalSection
	GlobalSection(ExtensibilityGlobals) = postSolution
		SolutionGuid = {202AC18B-E457-427C-B60C-2D7A1D2E6319}
	EndGlobalSection
EndGlobal
--------------------------------------------------------------------------------
/Data/preambular_verb_list.txt:
--------------------------------------------------------------------------------
acknowledging
acting
adhering
affirming
agreeing
alarmed
taking
anxious
appreciating
asserting
attaching
aware
bearing
being
believing
cognizant
commemorating
commending
concerned
concluding
concurring
confident
confirming
conscious
considering
continuing
convinced
deeming
deploring
disturbed
grieved
perturbed
regretting
shocked
desiring
desirous
determined
distressed
emphasizing
encouraged
endorsing
expressing
faithful
fearing
noting
recalling
gratified
guided
having
indignant
holding
hopeful
conformity
pursuance
inspired
invoking
opinion
keeping
mindful
observing
outraged
paying
pending
persuaded
realizing
recognizing
recollecting
referring
reiterating
restating
seeking
sharing
stressing
striving
condemning
trusting
underlining
urging
viewing
warning
welcoming
wishing
preventing
--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
# TODO: The maintainer of this repo has not yet edited this file

**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?

- **No CSS support:** Fill out this template with information about how to file issues and get help.
6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/BlobStorageClient.cs: -------------------------------------------------------------------------------- 1 | //Copyright(c) Microsoft Corporation.All rights reserved. 2 | //Licensed under the MIT License. 3 | 4 | using Microsoft.Azure; 5 | using Microsoft.WindowsAzure.Storage; 6 | using Microsoft.WindowsAzure.Storage.Blob; 7 | using System; 8 | using System.Diagnostics; 9 | using System.Linq; 10 | 11 | namespace DSnA.WebJob.DocumentParser 12 | { 13 | public class BlobStorageClient : IStorageClient 14 | { 15 | private static readonly CloudStorageAccount StorageAccount = CloudStorageAccount.Parse(CloudConfigurationManager.GetSetting("StorageConnectionString")); 16 | 17 | private readonly IUtils _utils; 18 | 19 | public CloudBlobClient Client; 20 | public CloudBlobContainer Container; 21 | 22 | public BlobStorageClient(string containerName, IUtils utils) 23 | { 24 | _utils = utils; 25 | 26 | Client = utils.CreateCloudBlobClient(StorageAccount); 27 | Container = Client.GetContainerReference(containerName); 28 | } 29 | 30 | public string GetFile(StorageObjectDescriptor descriptor, string destinationFilePath) 31 | { 32 | return _utils.DownloadBlobFile(descriptor.Uri.AbsoluteUri, Constants.FileConfigs.WorkingDirectoryPath, Client); 33 | } 34 | 35 | public void SaveFile(string sourceUri, string destinationUri) 36 | { 37 | _utils.UploadFileToBlob(sourceUri, Client); 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // General Information about an assembly is controlled through the following 6 | // set of attributes. Change these attribute values to modify the information 7 | // associated with an assembly. 
[assembly: AssemblyTitle("DSnA.WebJob.DocumentParser")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("")]
[assembly: AssemblyProduct("DSnA.WebJob.DocumentParser")]
[assembly: AssemblyCopyright("Copyright © 2018")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]

// Setting ComVisible to false makes the types in this assembly not visible
// to COM components. If you need to access a type in this assembly from
// COM, set the ComVisible attribute to true on that type.
[assembly: ComVisible(false)]

// The following GUID is for the ID of the typelib if this project is exposed to COM
[assembly: Guid("9e4d7884-0c36-429b-a4c9-1217d3ca7d4e")]

// Version information for an assembly consists of the following four values:
//
//      Major Version
//      Minor Version
//      Build Number
//      Revision
//
// You can specify all the values or you can default the Build and Revision Numbers
// by using the '*' as shown below:
// [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("1.0.0.0")]
[assembly: AssemblyFileVersion("1.0.0.0")]
--------------------------------------------------------------------------------
/tools/document-processor/DSnA.WebJob.DocumentParser/README.md:
--------------------------------------------------------------------------------
# Document Parser

## Prerequisites

[Visual Studio 2019 Community](https://visualstudio.microsoft.com/vs/community/)

## Setup

Build the solution file **DSnA.WebJob.DocumentParser.sln** with Visual Studio. This generates the executable *DSnA.WebJob.DocumentParser.exe*.

## Configuration

In App.config, set values for the following keys:

StorageConnectionString (blob storage mode only): the storage connection string used to access the blob storage containers.

OutputFileFormat: "csv" or "json" output file format

StorageType: "blob" or "localstorage"

## How it works

The document parser extracts the content of a document file (PDF, Word) and creates a CSV or JSON output file in which the document data is classified into text, paragraphs, headings, sections, clauses, heading clauses, and additional information.

You can run it in two modes:

**Azure blob storage**: upload the documents to be processed to the input blob storage container, open a command prompt in the folder that contains *DSnA.WebJob.DocumentParser.exe*, and run it with the required arguments:

>DSnA.WebJob.DocumentParser.exe arg1 arg2 arg3

Options:

>arg1: **Required** - blob input container name

>arg2: **Required** - blob virtual directory name/path (/ root level)

>arg3: Optional - file name filter (if not present, all documents within the source folder will be processed)

The output files will be located in the blob storage *docparseoutput* container.
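For example, a hypothetical invocation (the container name, virtual directory, and file name below are placeholders, not values the tool requires):

>DSnA.WebJob.DocumentParser.exe documents resolutions/2018 A_RES_73_1.pdf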
**Local storage**: place the documents to be processed in a local folder in your file system, open a command prompt in the folder that contains *DSnA.WebJob.DocumentParser.exe*, and run it with the required arguments:

>DSnA.WebJob.DocumentParser.exe arg1 arg2 arg3

Options:

>arg1: **Required** - local storage source folder path

>arg2: **Required** - local storage output folder path

>arg3: Optional - file name filter (if not present, all documents within the source folder will be processed)

The output files will be located in the local output folder.

--------------------------------------------------------------------------------
/tools/document-processor/DSnA.WebJob.DocumentParser/Code/InteropWordUtils.cs:
--------------------------------------------------------------------------------
//Copyright (c) Microsoft Corporation. All rights reserved.
//Licensed under the MIT License.

using Microsoft.Office.Interop.Word;
using System;

namespace DSnA.WebJob.DocumentParser
{
    public interface IInteropWordUtils
    {
        Application CreateWordAppInstance();
        Document OpenDocument(string file, Application wordApp);
        void DisposeIneropObject(Application wordApp, bool saveChanges = false);
    }

    class InteropWordUtils : IInteropWordUtils
    {
        /// <summary>
        /// Creates a Word application instance.
        /// </summary>
        public Application CreateWordAppInstance()
        {
            return new Application
            {
                DisplayAlerts = WdAlertLevel.wdAlertsNone,
                Visible = false,
                Options = { SavePropertiesPrompt = false, SaveNormalPrompt = false, DisplayPasteOptions = false, DoNotPromptForConvert = true }
            };
        }

        /// <summary>
        /// Opens a Word document.
        /// </summary>
        public Document OpenDocument(string file, Application wordApp)
        {
            return wordApp.Documents.Open(file, ReadOnly: false);
        }

        /// <summary>
        /// Quits Word and releases all COM objects.
        /// </summary>
        public void DisposeIneropObject(Application wordApp, bool saveChanges = false)
        {
            try
            {
                wordApp.Quit(SaveChanges: saveChanges);
                System.Runtime.InteropServices.Marshal.ReleaseComObject(wordApp);
                GC.Collect();
                GC.WaitForPendingFinalizers();
            }
            catch (Exception)
            {
                // If quitting with the requested save option fails, retry without saving.
                wordApp.Quit(SaveChanges: false);
                System.Runtime.InteropServices.Marshal.ReleaseComObject(wordApp);
                GC.Collect();
                GC.WaitForPendingFinalizers();
            }
        }
    }
}
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------

## Security

Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).

If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.
8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 
40 | 41 | -------------------------------------------------------------------------------- /Data/operative_verb_list.txt: -------------------------------------------------------------------------------- 1 | accept 2 | accepts 3 | recommend 4 | recommends 5 | acknowledge 6 | acknowledges 7 | address 8 | addresses 9 | adopt 10 | adopts 11 | proclaim 12 | proclaims 13 | affirm 14 | affirms 15 | appeal 16 | appeals 17 | call 18 | calls 19 | draw 20 | draws 21 | pledge 22 | pledges 23 | reiterate 24 | reiterates 25 | request 26 | requests 27 | agree 28 | agrees 29 | decide 30 | decides 31 | endorse 32 | endorses 33 | invite 34 | invites 35 | note 36 | notes 37 | welcome 38 | welcomes 39 | amend 40 | amends 41 | applaud 42 | applauds 43 | appoint 44 | appoints 45 | approve 46 | approves 47 | assert 48 | asserts 49 | assure 50 | assures 51 | authorize 52 | authorizes 53 | await 54 | awaits 55 | believe 56 | believes 57 | condemn 58 | condemns 59 | censure 60 | censures 61 | commend 62 | commends 63 | commission 64 | commissions 65 | compliment 66 | compliments 67 | concur 68 | concurs 69 | confirm 70 | confirms 71 | congratulate 72 | congratulates 73 | consider 74 | considers 75 | convey 76 | conveys 77 | declare 78 | declares 79 | deem 80 | deems 81 | appreciate 82 | appreciates 83 | deplore 84 | deplores 85 | defer 86 | defers 87 | demand 88 | demands 89 | denounce 90 | denounces 91 | deprecate 92 | deprecates 93 | designate 94 | designates 95 | desire 96 | desires 97 | determine 98 | determines 99 | direct 100 | directs 101 | dissolve 102 | dissolves 103 | draw 104 | draws 105 | emphasize 106 | emphasizes 107 | empower 108 | empowers 109 | encourage 110 | encourages 111 | entrust 112 | entrusts 113 | envisage 114 | envisages 115 | establish 116 | establishes 117 | exhort 118 | exhorts 119 | expect 120 | expects 121 | express 122 | expresses 123 | extend 124 | extends 125 | maintain 126 | maintains 127 | support 128 | supports 129 | formulate 130 | formulates 131 | share 132 | shares 133 | reaffirm 134 | reaffirms 135 | insist 136 | insists 137 | instruct 138 | instructs 139 | invite 140 | invites 141 | look 142 | looks 143 | make 144 | makes 145 | mandate 146 | mandates 147 | offer 148 | offers 149 | pay 150 | pays 151 | propose 152 | proposes 153 | realize 154 | realizes 155 | reassert 156 | reasserts 157 | reassure 158 | reassures 159 | recall 160 | recalls 161 | recognize 162 | recognizes 163 | re-emphasize 164 | re-emphasizes 165 | refer 166 | refers 167 | regard 168 | regards 169 | register 170 | registers 171 | regret 172 | regrets 173 | reject 174 | rejects 175 | remind 176 | reminds 177 | renew 178 | renews 179 | resolve 180 | resolves 181 | seize 182 | seizes 183 | set 184 | sets 185 | warn 186 | warns 187 | state 188 | states 189 | stress 190 | stresses 191 | suggest 192 | suggests 193 | take 194 | takes 195 | transmit 196 | transmits 197 | trust 198 | trusts 199 | underline 200 | underlines 201 | urge 202 | urges -------------------------------------------------------------------------------- /Data/key_words_not_un_org_list.txt: -------------------------------------------------------------------------------- 1 | Goal 2 | Goals 3 | Agenda 4 | Outcome 5 | Headquarters 6 | Declaration 7 | Account 8 | Implementation 9 | Territory 10 | Territories 11 | Act 12 | Action 13 | Actions 14 | Programme 15 | Agreement 16 | Partnership 17 | Protection of Civilian Persons 18 | Time of War 19 | Framework 20 | Frameworks 21 | Consensus 22 | Convention 23 | Conventions 24 | Related 25 | Resolution 26 
Resolutions
Forum
Meeting
Strategy
Eradicate
General Service
Document
Deconstruction
Status
Statute
Protocol
Protocols
Illicit
Session
A/RES/
Movement
Chair
Treatment
Platform
Platforms
Plan
Weapons
National Food Security
Rules
Budget
Principle
Principles
System
Systems
Mechanism
Report
Pact
Compact
Trade
Consequences
United Nations Global Compact
Facility
Covenant
Covenants
Responsible
Treaty
Decade
Wider United Nations
Their
Expert
Personnel
Conservation
Field Service
Information
International Migration and Development
Coordinator
Armistice Line
Further
Day
Week
Month
Year
Criteria
El Niño
Fellowship
Safety of Maritime Navigation
Library
Doha Development Round
Journal
Review
Aid for Trade
Sea
Zone
International Health Regulations
International Mother
Goodwill Ambassadors
Chronicle
Involuntary Disappearances
Impact
Rapporteur
Rapporteurs
Record
Records
Ministers
Panel
University
Yearbook
Messengers
Terrorism
Dialogue
Officer
Target
Targets
Elimination
Council established
Repair and Assembly
Countries and Peoples
Model Strategies and Practical Measures
Ways and means
Challenge
Network
Safety and Security of Radioactive Sources
Guideline
Guidelines
Parties
Unregulated Fishing
Discrimination
Armed Robbery against Ships
Regular
International Search
Process
Branch
Context
Orthodox Good Friday
Seascape
Regional Security
Cooperation for
Application
Volunteer
Volunteers
Fishing Vessels
Alternative
Green Paper
Holy See
Need of Assistance
Olympic Truce
Mutual Understanding
Tapta
Census
Sport for Development and Peace
Campaign
Protection of Child Victims of Trafficking
Approach
Service
Commercial Shipping
Reduction of Underwater Noise
Chairs
Co-Chairs
--------------------------------------------------------------------------------
/tools/document-processor/DSnA.WebJob.DocumentParser/App.config:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/knowledge_extraction_resolution_level.py:
--------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
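"""Extract resolution-level metadata from parsed UN General Assembly resolutions.

Reads UN_RES_DOCS_2009_2018.csv (columns: SourceFile, Index, Content, Type) and,
for each resolution file, derives the session, agenda item, resolution number,
title, and adoption day/month/year, then writes the result to an Excel file
under the output directory.
"""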

#%% Imports
import pandas as pd
import re

current_dir = './UN_Knowledge_Extraction/'
data_dir = current_dir + "data/"
output_dir = current_dir + "output/"


UN_DOCS = pd.read_csv(data_dir + "UN_RES_DOCS_2009_2018.csv")
UN_DOCS_resolution_level = UN_DOCS[['SourceFile']].drop_duplicates().reset_index(drop=True)
UN_DOCS_resolution_level['Resolution_Session'] = ''
UN_DOCS_resolution_level['Resolution_Agenda_item'] = ''
UN_DOCS_resolution_level['Resolution_Number'] = ''
UN_DOCS_resolution_level['Resolution_Title'] = ''
UN_DOCS_resolution_level['Resolution_Adoption_DateMonthYear'] = ''
UN_DOCS_resolution_level['Resolution_Adoption_Day'] = ''
UN_DOCS_resolution_level['Resolution_Adoption_Month'] = ''
UN_DOCS_resolution_level['Resolution_Adoption_Year'] = ''

# Patterns for "<session>/<number>. <title>" headings, "... on <d Month yyyy>"
# adoption lines, and the day/month/year parts of the adoption date.
number_title_re = re.compile(r'(\d+/\d+)\s{0,1}\.\s{0,1}(.*)')
adoption_date_re = re.compile(r'(.*)on (\d{1,2}\s\w+\s\d{4})$')
date_parts_re = re.compile(r'(\d{1,2})\s(\w+)\s(\d{4})')

for index, row in UN_DOCS_resolution_level.iterrows():
    Resolution_Info = [''] * 5
    SourceFile = row['SourceFile']
    SourceFile_info_paragraphs = UN_DOCS.loc[UN_DOCS['SourceFile'] == SourceFile].sort_values(by=['Index']).fillna('').reset_index(drop=True)
    for i in range(len(SourceFile_info_paragraphs)):
        Content = SourceFile_info_paragraphs.loc[i, 'Content']
        Type = SourceFile_info_paragraphs.loc[i, 'Type']
        if Resolution_Info[0] == '' and Type == 'Session':
            Resolution_Info[0] = Content
        elif Resolution_Info[1] == '' and Type == 'AgendaItem':
            Resolution_Info[1] = Content
        elif Resolution_Info[2] == '' and Resolution_Info[3] == '' and number_title_re.match(Content):
            match = number_title_re.match(Content)
            Resolution_Info[2] = match.groups()[0]
            Resolution_Info[3] = match.groups()[1]
        elif Resolution_Info[4] == '' and adoption_date_re.match(Content):
            Resolution_Info[4] = adoption_date_re.match(Content).groups()[1]
    # Assign via .loc (chained .iloc[index][...] indexing writes to a copy and is lost).
    UN_DOCS_resolution_level.loc[index, 'Resolution_Session'] = Resolution_Info[0]
    UN_DOCS_resolution_level.loc[index, 'Resolution_Agenda_item'] = Resolution_Info[1]
    UN_DOCS_resolution_level.loc[index, 'Resolution_Number'] = Resolution_Info[2]
    UN_DOCS_resolution_level.loc[index, 'Resolution_Title'] = Resolution_Info[3]
    UN_DOCS_resolution_level.loc[index, 'Resolution_Adoption_DateMonthYear'] = Resolution_Info[4]
    if Resolution_Info[4] != '':
        date_parts = date_parts_re.match(Resolution_Info[4]).groups()
        UN_DOCS_resolution_level.loc[index, 'Resolution_Adoption_Day'] = date_parts[0]
        UN_DOCS_resolution_level.loc[index, 'Resolution_Adoption_Month'] = date_parts[1]
        UN_DOCS_resolution_level.loc[index, 'Resolution_Adoption_Year'] = date_parts[2]

UN_DOCS_resolution_level.to_excel(output_dir + 'output_UN_DOCS_resolution_level.xlsx')
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Automatic Information Extraction and Knowledge Elicitation for United Nations Documents

#### Context:
The processing of the considerable and rapidly growing amount of information within the UN system is left to very limited human capacities. The UN system produces a substantial amount of information that, if effectively mobilized, could greatly enhance the effectiveness and efficiency of the UN system.

#### Goal:
The goal is to pilot Microsoft Cognitive Services to unlock the strategic value of UN unstructured content by building on AI and semantic technologies. The idea is to showcase innovative smart services that use natural language processing and machine learning to effectively support policy and decision making, coordination, synergies, and accountability.

#### Data:
UN General Assembly resolutions (English only) between 2009 and 2018. In total, 3,138 resolution files in PDF format.

#### Data Reference:
Pre-trained word2vec embeddings trained on part of the Google News dataset (about 100 billion words): https://code.google.com/archive/p/word2vec/
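The repository expects these embeddings under the `Data` folder (see the download note earlier in this repository). A minimal loading sketch using the pinned gensim 3.8.3; the `.bin` file name is an assumption about the downloaded archive:

```python
from gensim.models import KeyedVectors

# Load the pre-trained 300-dimensional GoogleNews vectors (binary word2vec format).
vectors = KeyedVectors.load_word2vec_format(
    "Data/GoogleNews-vectors-negative300.bin", binary=True)

print(vectors.most_similar("resolution", topn=3))
```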
#### Deliverables:
##### Resolution Level:
- Resolution File Name
- Resolution Session
- Resolution Agenda Item
- Resolution Number
- Resolution Title
- Resolution Adoption Date/Month/Year

##### Paragraph Level:
- Paragraph Type
- First Action Verb
- Key Terms
- Referenced Resolutions
- Referenced Resolution Dates
- Sustainable Development Goals (SDG), Targets, and Indicators
- Country
- Organization Names


# Setup

1. Install requirements

   This code uses Python 3.7.

   ```
   pip install -r requirements.txt
   ```

2. Run Scripts

   a. Run the following file to extract resolution-level information: [knowledge_extraction_resolution_level.py](https://github.com/microsoft/UN-Knowledge-Extraction/blob/main/knowledge_extraction_resolution_level.py)

   ```
   python knowledge_extraction_resolution_level.py
   ```

   b. Run the following file to extract paragraph-level information: [knowledge_extraction_paragraph_level.py](https://github.com/microsoft/UN-Knowledge-Extraction/blob/main/knowledge_extraction_paragraph_level.py)

   ```
   python knowledge_extraction_paragraph_level.py
   ```

## Contributing

This project welcomes contributions and suggestions. Most contributions require you to agree to a
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.

When you submit a pull request, a CLA bot will automatically determine whether you need to provide
a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
provided by the bot. You will only need to do this once across all repos using our CLA.

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

## Trademarks

This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
trademarks or logos is subject to and must follow
81 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 82 | Any use of third-party trademarks or logos are subject to those third-party's policies. 83 | -------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/packages.config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/Code/Constants.cs: -------------------------------------------------------------------------------- 1 | //Copyright(c) Microsoft Corporation.All rights reserved. 2 | //Licensed under the MIT License. 3 | 4 | namespace DSnA.WebJob.DocumentParser 5 | { 6 | using System; 7 | using System.Text.RegularExpressions; 8 | 9 | public static class Constants 10 | { 11 | public static string DateTimeFormat => "MM-dd-yyyy_HH-mm-ss"; 12 | public static class FileConfigs 13 | { 14 | private static string _sourceDirectoryPath; 15 | private static string _outputDirectoryPath; 16 | 17 | public static string SourceDirectoryPath 18 | { 19 | get 20 | { 21 | return _sourceDirectoryPath; 22 | } 23 | 24 | set { _sourceDirectoryPath = string.Format(@"{0}", value); } 25 | } 26 | 27 | public static string OutputDirectoryPath 28 | { 29 | get 30 | { 31 | return string.IsNullOrEmpty(_outputDirectoryPath) ? @"\DocumentParser" : _outputDirectoryPath; 32 | } 33 | 34 | set { _outputDirectoryPath = string.Format(@"{0}", value); } 35 | } 36 | 37 | public static string WorkingDirectoryPath 38 | { 39 | get 40 | { 41 | return OutputDirectoryPath + @"\Temp"; 42 | } 43 | } 44 | 45 | public static string LogFileName = "log_" + DateTime.UtcNow.ToString("MM-dd-yyyy") + ".log"; 46 | public static string TempFileName => "JsonByExtractionProgram"; 47 | } 48 | 49 | public static class RegexExp 50 | { 51 | public static string NoSpecialCharRegex => "[\\W]+"; 52 | // match only strings with combination of numbers and spaces 53 | public static string OnlyNumericWithSpaces => "^([0-9\\s]+)$"; 54 | // to match company names in reports 55 | public static string CompanyNameRegex => "^(^[a-zA-Z\\d\\s]+[a-zA-Z\\d]+[a-zA-Z\\d\\W]*)$"; 56 | // regex to match dates like MMMM dd,YYYY (January 23, 2017) and its combinations 57 | public static string DateRegex => "^(\\s*\\w{3,9}?\\s*?\\d{1,2}?\\s*?,\\s*?\\d{4}?)"; 58 | public static string NoEscapeSequences => @"[\a\b\t\r\v\f\e]"; 59 | public static string OnlyAsciiChar => @"[^\u0000-\u007F]+"; 60 | public static string OnlyWhiteSpaces => @"\s+"; 61 | public static string OnlyHyperlinks => @"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"; 62 | public static string HyperlinkAppProtocols => @"(http|https|ftp)"; 63 | public static string HasNumbers => @"^(?=.*[0-9])"; 64 | 65 | public static string HasBulletPoint => @"^(|\u2022|\u2023|\u25E6|\u2043|\u2219|-|[a-z]\)|[a-z]\.)"; 66 | public static readonly Regex SessionRegEx = new Regex(@"(\w)*(-)?(\w)*(\s)*(session)", RegexOptions.IgnoreCase | RegexOptions.Compiled); 67 | public static readonly Regex AgendaItemRegEx = new 
Regex(@"(agenda)(\s)+(item)+(s)?(\s)+(\d)*(\s)*(and)*(\s)*([a-fA-F0-9\(\)])*", RegexOptions.IgnoreCase | RegexOptions.Compiled); 68 | } 69 | 70 | public static class ParserConfig 71 | { 72 | public static string MessageQueueRef => "QueueName"; 73 | public static string ConnectionUriRef => "KeyVaultUriForConnectionString"; 74 | public static int MaxDequeueCount => 5; 75 | public static string LogsContainerNameRef => "LogsContainerName"; 76 | public static string LogPrefix => "LogPrefix"; 77 | public static string OutputContainerNameRef => "OutputContainerName"; 78 | } 79 | 80 | public static class CsvFileConfig 81 | { 82 | public const string CsvFileFormat = "csv"; 83 | public const string JsonFileFormat = "json"; 84 | public static string Headers => "SourceFile,Index,Content,Type"; 85 | public static string ContentTypeBlobUri => "BlobUri"; 86 | public static string ContentTypeAgreementNumber => "AgreementNumber"; 87 | public static string ContentTypeFileType => "FileType"; 88 | public static string ContentTypeExtractionTimeStamp => "ExtractionTimeStamp"; 89 | public static string ContentTypeText => "Text"; 90 | public static string ContentTypeParagraph => "Paragraph"; 91 | public static string ContentTypeHeader => "Header"; 92 | public static string ContentTypeSection => "Section"; 93 | public static string ContentTypeClause => "Clause"; 94 | public static string ContentTypeHeaderClause => "HeaderClause"; 95 | public static string ContentTypeAdditionalInformation => "AdditionalInformation"; 96 | } 97 | } 98 | } -------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/Code/Logger.cs: -------------------------------------------------------------------------------- 1 | //Copyright(c) Microsoft Corporation.All rights reserved. 2 | //Licensed under the MIT License. 
3 | 4 | using System; 5 | 6 | namespace DSnA.WebJob.DocumentParser 7 | { 8 | using System.Diagnostics; 9 | using Microsoft.Azure; 10 | using Microsoft.WindowsAzure.Storage; 11 | using Microsoft.WindowsAzure.Storage.Blob; 12 | 13 | public interface ILogger 14 | { 15 | void Info(string message); 16 | void Error(string message, Exception exp); 17 | } 18 | public class Logger : ILogger 19 | { 20 | private static CloudStorageAccount StorageAccount = CloudStorageAccount.Parse(CloudConfigurationManager.GetSetting("StorageConnectionString")); 21 | 22 | private static CloudBlobClient blobClient = StorageAccount.CreateCloudBlobClient(); 23 | 24 | private static string logContainerName = CloudConfigurationManager.GetSetting(Constants.ParserConfig.LogsContainerNameRef); 25 | 26 | private static string logPrefix = CloudConfigurationManager.GetSetting(Constants.ParserConfig.LogPrefix); 27 | 28 | private static Logger LoggerInstance; 29 | private Logger() { } 30 | 31 | public static Logger Instance 32 | { 33 | get 34 | { 35 | if (LoggerInstance == null) 36 | { 37 | LoggerInstance = new Logger(); 38 | } 39 | 40 | return LoggerInstance; 41 | } 42 | } 43 | 44 | /// 45 | /// Write log text (info/error) to Azure Blob 46 | /// 47 | /// 48 | /// 49 | /// 50 | private void Write(string message, EventLogEntryType category, Exception exp = null) 51 | { 52 | try 53 | { 54 | // create blob client and container(if not exists) to store the logs in Azure Storage Account 55 | CloudBlobContainer logsContainer = blobClient.GetContainerReference(logContainerName); 56 | logsContainer.CreateIfNotExists(); 57 | // append information to blob - create logs for every day if not exists 58 | CloudAppendBlob appendBlob = logsContainer.GetAppendBlobReference($"log_{logPrefix}_{DateTime.UtcNow.ToString("MM-dd-yyyy")}.log"); 59 | if (!appendBlob.Exists()) 60 | appendBlob.CreateOrReplace(); 61 | 62 | if (exp != null) 63 | appendBlob.AppendText(String.Format("{0:u}\t[{1}]\t[{2}]\tMessage:{3}{4}{5}{6}", 64 | DateTime.UtcNow, Environment.MachineName, category.ToString().ToUpper(), message, Environment.NewLine, exp, Environment.NewLine)); 65 | else 66 | appendBlob.AppendText(String.Format("{0:u}\t[{1}]\t[{2}]\tMessage:{3}{4}", 67 | DateTime.UtcNow, Environment.MachineName, category.ToString().ToUpper(), message, Environment.NewLine)); 68 | } 69 | catch (Exception exception) 70 | { 71 | throw new LoggerException("Exception in Logging information/error", exception); 72 | } 73 | } 74 | 75 | /// 76 | /// Logs information text 77 | /// 78 | /// 79 | public void Info(string message) 80 | { 81 | Write(message, EventLogEntryType.Information); 82 | } 83 | 84 | /// 85 | /// Logs Exception text 86 | /// 87 | /// 88 | /// 89 | public void Error(string message, Exception exp) 90 | { 91 | Write(message, EventLogEntryType.Error, exp); 92 | } 93 | } 94 | public class ConsoleLogger : ILogger 95 | { 96 | private static ConsoleLogger LoggerInstance; 97 | private ConsoleLogger() { } 98 | 99 | 100 | public static ConsoleLogger Instance 101 | { 102 | get 103 | { 104 | if (LoggerInstance == null) 105 | { 106 | LoggerInstance = new ConsoleLogger(); 107 | } 108 | return LoggerInstance; 109 | } 110 | } 111 | 112 | /// 113 | /// Write log text (info/error) to Azure Blob 114 | /// 115 | /// 116 | /// 117 | /// 118 | private void Write(string message, EventLogEntryType category, Exception exp = null) 119 | { 120 | Console.WriteLine($"{category}: {message}"); 121 | 122 | if (exp != null) 123 | { 124 | Console.WriteLine(exp.Message); 125 | 
Console.WriteLine(exp.StackTrace); 126 | } 127 | } 128 | 129 | /// 130 | /// Logs information text 131 | /// 132 | /// 133 | public void Info(string message) 134 | { 135 | Write(message, EventLogEntryType.Information); 136 | } 137 | 138 | /// 139 | /// Logs Exception text 140 | /// 141 | /// 142 | /// 143 | public void Error(string message, Exception exp) 144 | { 145 | Write(message, EventLogEntryType.Error, exp); 146 | } 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/Code/ParserClasses.cs: -------------------------------------------------------------------------------- 1 | //Copyright(c) Microsoft Corporation.All rights reserved. 2 | //Licensed under the MIT License. 3 | 4 | namespace DSnA.WebJob.DocumentParser 5 | { 6 | using Newtonsoft.Json; 7 | using System.Collections.Generic; 8 | 9 | /// 10 | /// Represents output JSON document structure 11 | /// 12 | public class JsonDocumentStruct 13 | { 14 | public DocumentContent DocumentContent { get; set; } 15 | public FileMetaData FileProperties { get; set; } 16 | public Error Errors { get; set; } 17 | } 18 | 19 | public class JsonDocumentStructFlat 20 | { 21 | [JsonProperty(PropertyName = "agreementNumber")] 22 | public string AgreementNumber { get; set; } 23 | 24 | [JsonProperty(PropertyName = "fileName")] 25 | public string FileName { get; set; } 26 | 27 | [JsonProperty(PropertyName = "fileType")] 28 | public string FileType { get; set; } 29 | 30 | [JsonProperty(PropertyName = "imageStoreUri")] 31 | public string ImageStoreUri { get; set; } 32 | 33 | [JsonProperty(PropertyName = "extractionTimeStamp")] 34 | public string ExtractionTimeStamp { get; set; } 35 | 36 | [JsonProperty(PropertyName = "text")] 37 | public string Text { get; set; } 38 | 39 | [JsonProperty(PropertyName = "headers")] 40 | public Dictionary Headers { get; set; } 41 | 42 | [JsonProperty(PropertyName = "paragraphs")] 43 | public Dictionary Paragraphs { get; set; } 44 | 45 | [JsonProperty(PropertyName = "sections")] 46 | public Dictionary Sections { get; set; } 47 | 48 | [JsonProperty(PropertyName = "clauses")] 49 | public List Clauses { get; set; } 50 | 51 | [JsonProperty(PropertyName = "headerClauses")] 52 | public List HeaderClauses { get; set; } 53 | 54 | [JsonProperty(PropertyName = "additionalInformation")] 55 | public List AdditionalInformation { get; set; } 56 | } 57 | 58 | public class ReportExtractionResponse 59 | { 60 | public string location { get; set; } 61 | public string contentJson { get; set; } 62 | } 63 | 64 | public class Clauses 65 | { 66 | public Clauses() 67 | { 68 | this.Title = ""; 69 | this.Content = ""; 70 | this.Start = -1; 71 | this.End = -1; 72 | } 73 | 74 | public string Title { get; set; } 75 | public string Content { get; set; } 76 | public int Start { get; set; } 77 | public int End { get; set; } 78 | } 79 | 80 | /// 81 | /// Represents file meta data in Json output 82 | /// 83 | public class FileMetaData 84 | { 85 | [JsonProperty(PropertyName = "agreementNumber")] 86 | public string AgreementNumber { get; set; } 87 | 88 | [JsonProperty(PropertyName = "fileName")] 89 | public string FileName { get; set; } 90 | 91 | [JsonProperty(PropertyName = "fileType")] 92 | public string FileType { get; set; } 93 | 94 | [JsonProperty(PropertyName = "extractionTimeStamp")] 95 | public string ExtractionTimeStamp { get; set; } 96 | } 97 | 98 | /// 99 | /// Represents higher structure of red flag document content in Json output 100 | /// 101 | public 
class DocumentContent 102 | { 103 | [JsonProperty(PropertyName = "text")] 104 | public string Text { get; set; } 105 | [JsonProperty(PropertyName = "paragraphs")] 106 | public Dictionary Paragraphs { get; set; } 107 | 108 | [JsonProperty(PropertyName = "headers")] 109 | public Dictionary Headers { get; set; } 110 | 111 | [JsonProperty(PropertyName = "sections")] 112 | public Dictionary Sections { get; set; } 113 | 114 | [JsonProperty(PropertyName = "clauses")] 115 | public List Clauses { get; set; } 116 | 117 | [JsonProperty(PropertyName = "headerClauses")] 118 | public List HeaderClauses { get; set; } 119 | 120 | [JsonProperty(PropertyName = "additionalInformation")] 121 | public List AdditionalInformation { get; set; } 122 | } 123 | 124 | /// 125 | /// Represents structure of errors presented in Json output 126 | /// 127 | public class Error 128 | { 129 | public Error() 130 | { 131 | this.IsError = false; 132 | this.Description = ""; 133 | } 134 | 135 | public bool IsError { get; set; } 136 | public string Description { get; set; } 137 | } 138 | 139 | /// 140 | /// Represents structure of input message from OneVet Queue 141 | /// 142 | public class QueueMessage 143 | { 144 | public string DocumentId { get; set; } 145 | public string FileInputUri { get; set; } 146 | public string FileOutputUri { get; set; } 147 | public string RequestCreationDateTimeUtc { get; set; } 148 | public string DocumentTypeId { get; set; } 149 | } 150 | 151 | /// 152 | /// Represents structure for csv output files 153 | /// 154 | public class CsvDocumentFile 155 | { 156 | public CsvDocumentFile() 157 | { 158 | this.CsvOutputLines = new List() { Constants.CsvFileConfig.Headers }; 159 | } 160 | 161 | private List CsvOutputLines { get; set; } 162 | 163 | public void AddCsvLine(string sourceFile, string content, string type) 164 | { 165 | this.CsvOutputLines.Add(string.Format("{0},{1},\"{2}\",{3}", sourceFile, GetCurrentLineCount(), content, type)); 166 | } 167 | 168 | public List GetCsvOutputLines() 169 | { 170 | return this.CsvOutputLines; 171 | } 172 | 173 | private int GetCurrentLineCount() 174 | { 175 | return this.CsvOutputLines.Count > 0 ? this.CsvOutputLines.Count - 1 : this.CsvOutputLines.Count; 176 | } 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | .DS_Store 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Aa][Rr][Mm]/ 27 | [Aa][Rr][Mm]64/ 28 | bld/ 29 | [Bb]in/ 30 | [Oo]bj/ 31 | [Ll]og/ 32 | [Ll]ogs/ 33 | 34 | # Visual Studio 2015/2017 cache/options directory 35 | .vs/ 36 | # Uncomment if you have tasks that create the project's static files in wwwroot 37 | #wwwroot/ 38 | 39 | # Visual Studio 2017 auto generated files 40 | Generated\ Files/ 41 | 42 | # MSTest test Results 43 | [Tt]est[Rr]esult*/ 44 | [Bb]uild[Ll]og.* 45 | 46 | # NUnit 47 | *.VisualState.xml 48 | TestResult.xml 49 | nunit-*.xml 50 | 51 | # Build Results of an ATL Project 52 | [Dd]ebugPS/ 53 | [Rr]eleasePS/ 54 | dlldata.c 55 | 56 | # Benchmark Results 57 | BenchmarkDotNet.Artifacts/ 58 | 59 | # .NET Core 60 | project.lock.json 61 | project.fragment.lock.json 62 | artifacts/ 63 | 64 | # StyleCop 65 | StyleCopReport.xml 66 | 67 | # Files built by Visual Studio 68 | *_i.c 69 | *_p.c 70 | *_h.h 71 | *.ilk 72 | *.meta 73 | *.obj 74 | *.iobj 75 | *.pch 76 | *.pdb 77 | *.ipdb 78 | *.pgc 79 | *.pgd 80 | *.rsp 81 | *.sbr 82 | *.tlb 83 | *.tli 84 | *.tlh 85 | *.tmp 86 | *.tmp_proj 87 | *_wpftmp.csproj 88 | *.log 89 | *.vspscc 90 | *.vssscc 91 | .builds 92 | *.pidb 93 | *.svclog 94 | *.scc 95 | 96 | # Chutzpah Test files 97 | _Chutzpah* 98 | 99 | # Visual C++ cache files 100 | ipch/ 101 | *.aps 102 | *.ncb 103 | *.opendb 104 | *.opensdf 105 | *.sdf 106 | *.cachefile 107 | *.VC.db 108 | *.VC.VC.opendb 109 | 110 | # Visual Studio profiler 111 | *.psess 112 | *.vsp 113 | *.vspx 114 | *.sap 115 | 116 | # Visual Studio Trace Files 117 | *.e2e 118 | 119 | # TFS 2012 Local Workspace 120 | $tf/ 121 | 122 | # Guidance Automation Toolkit 123 | *.gpState 124 | 125 | # ReSharper is a .NET coding add-in 126 | _ReSharper*/ 127 | *.[Rr]e[Ss]harper 128 | *.DotSettings.user 129 | 130 | # TeamCity is a build add-in 131 | _TeamCity* 132 | 133 | # DotCover is a Code Coverage Tool 134 | *.dotCover 135 | 136 | # AxoCover is a Code Coverage Tool 137 | .axoCover/* 138 | !.axoCover/settings.json 139 | 140 | # Visual Studio code coverage results 141 | *.coverage 142 | *.coveragexml 143 | 144 | # NCrunch 145 | _NCrunch_* 146 | .*crunch*.local.xml 147 | nCrunchTemp_* 148 | 149 | # MightyMoose 150 | *.mm.* 151 | AutoTest.Net/ 152 | 153 | # Web workbench (sass) 154 | .sass-cache/ 155 | 156 | # Installshield output folder 157 | [Ee]xpress/ 158 | 159 | # DocProject is a documentation generator add-in 160 | DocProject/buildhelp/ 161 | DocProject/Help/*.HxT 162 | DocProject/Help/*.HxC 163 | DocProject/Help/*.hhc 164 | DocProject/Help/*.hhk 165 | DocProject/Help/*.hhp 166 | DocProject/Help/Html2 167 | DocProject/Help/html 168 | 169 | # Click-Once directory 170 | publish/ 171 | 172 | # Publish Web Output 173 | *.[Pp]ublish.xml 174 | *.azurePubxml 175 | # Note: Comment the next line if you want to checkin your web deploy settings, 176 | # but database connection strings (with potential passwords) will be unencrypted 177 | *.pubxml 178 | *.publishproj 179 | 180 | # Microsoft Azure Web App publish settings. 
Comment the next line if you want to 181 | # checkin your Azure Web App publish settings, but sensitive information contained 182 | # in these scripts will be unencrypted 183 | PublishScripts/ 184 | 185 | # NuGet Packages 186 | *.nupkg 187 | # NuGet Symbol Packages 188 | *.snupkg 189 | # The packages folder can be ignored because of Package Restore 190 | **/[Pp]ackages/* 191 | # except build/, which is used as an MSBuild target. 192 | !**/[Pp]ackages/build/ 193 | # Uncomment if necessary however generally it will be regenerated when needed 194 | #!**/[Pp]ackages/repositories.config 195 | # NuGet v3's project.json files produces more ignorable files 196 | *.nuget.props 197 | *.nuget.targets 198 | 199 | # Microsoft Azure Build Output 200 | csx/ 201 | *.build.csdef 202 | 203 | # Microsoft Azure Emulator 204 | ecf/ 205 | rcf/ 206 | 207 | # Windows Store app package directories and files 208 | AppPackages/ 209 | BundleArtifacts/ 210 | Package.StoreAssociation.xml 211 | _pkginfo.txt 212 | *.appx 213 | *.appxbundle 214 | *.appxupload 215 | 216 | # Visual Studio cache files 217 | # files ending in .cache can be ignored 218 | *.[Cc]ache 219 | # but keep track of directories ending in .cache 220 | !?*.[Cc]ache/ 221 | 222 | # Others 223 | ClientBin/ 224 | ~$* 225 | *~ 226 | *.dbmdl 227 | *.dbproj.schemaview 228 | *.jfm 229 | *.pfx 230 | *.publishsettings 231 | orleans.codegen.cs 232 | 233 | # Including strong name files can present a security risk 234 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 235 | #*.snk 236 | 237 | # Since there are multiple workflows, uncomment next line to ignore bower_components 238 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 239 | #bower_components/ 240 | 241 | # RIA/Silverlight projects 242 | Generated_Code/ 243 | 244 | # Backup & report files from converting an old project file 245 | # to a newer Visual Studio version. Backup files are not needed, 246 | # because we have git ;-) 247 | _UpgradeReport_Files/ 248 | Backup*/ 249 | UpgradeLog*.XML 250 | UpgradeLog*.htm 251 | ServiceFabricBackup/ 252 | *.rptproj.bak 253 | 254 | # SQL Server files 255 | *.mdf 256 | *.ldf 257 | *.ndf 258 | 259 | # Business Intelligence projects 260 | *.rdl.data 261 | *.bim.layout 262 | *.bim_*.settings 263 | *.rptproj.rsuser 264 | *- [Bb]ackup.rdl 265 | *- [Bb]ackup ([0-9]).rdl 266 | *- [Bb]ackup ([0-9][0-9]).rdl 267 | 268 | # Microsoft Fakes 269 | FakesAssemblies/ 270 | 271 | # GhostDoc plugin setting file 272 | *.GhostDoc.xml 273 | 274 | # Node.js Tools for Visual Studio 275 | .ntvs_analysis.dat 276 | node_modules/ 277 | 278 | # Visual Studio 6 build log 279 | *.plg 280 | 281 | # Visual Studio 6 workspace options file 282 | *.opt 283 | 284 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
285 | *.vbw
286 |
287 | # Visual Studio LightSwitch build output
288 | **/*.HTMLClient/GeneratedArtifacts
289 | **/*.DesktopClient/GeneratedArtifacts
290 | **/*.DesktopClient/ModelManifest.xml
291 | **/*.Server/GeneratedArtifacts
292 | **/*.Server/ModelManifest.xml
293 | _Pvt_Extensions
294 |
295 | # Paket dependency manager
296 | .paket/paket.exe
297 | paket-files/
298 |
299 | # FAKE - F# Make
300 | .fake/
301 |
302 | # CodeRush personal settings
303 | .cr/personal
304 |
305 | # Python Tools for Visual Studio (PTVS)
306 | __pycache__/
307 | *.pyc
308 |
309 | # Cake - Uncomment if you are using it
310 | # tools/**
311 | # !tools/packages.config
312 |
313 | # Tabs Studio
314 | *.tss
315 |
316 | # Telerik's JustMock configuration file
317 | *.jmconfig
318 |
319 | # BizTalk build output
320 | *.btp.cs
321 | *.btm.cs
322 | *.odx.cs
323 | *.xsd.cs
324 |
325 | # OpenCover UI analysis results
326 | OpenCover/
327 |
328 | # Azure Stream Analytics local run output
329 | ASALocalRun/
330 |
331 | # MSBuild Binary and Structured Log
332 | *.binlog
333 |
334 | # NVidia Nsight GPU debugger configuration file
335 | *.nvuser
336 |
337 | # MFractors (Xamarin productivity tool) working folder
338 | .mfractor/
339 |
340 | # Local History for Visual Studio
341 | .localhistory/
342 |
343 | # BeatPulse healthcheck temp database
344 | healthchecksdb
345 |
346 | # Backup folder for Package Reference Convert tool in Visual Studio 2017
347 | MigrationBackup/
348 |
349 | # Ionide (cross platform F# VS Code tools) working folder
350 | .ionide/
351 |
-------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/Program.cs: --------------------------------------------------------------------------------
1 | //Copyright(c) Microsoft Corporation.All rights reserved.
2 | //Licensed under the MIT License.
3 |
4 | using Microsoft.Azure;
5 | using Microsoft.Office.Interop.Word;
6 | using Microsoft.WindowsAzure.Storage;
7 | using Microsoft.WindowsAzure.Storage.Blob;
8 | using System;
9 | using System.Diagnostics;
10 | using System.Linq;
11 |
12 | namespace DSnA.WebJob.DocumentParser
13 | {
14 | class Program
15 | {
16 | private static string _storageType = CloudConfigurationManager.GetSetting("StorageType");
17 |
18 | static void Main(string[] args)
19 | {
20 | if (!ValidateArgs(args, _storageType))
21 | return;
22 |
23 | var logger = ConsoleLogger.Instance;
24 | var util = new Utils(logger);
25 |
26 | var args0 = _storageType == "blob" ? args[0] : Constants.FileConfigs.SourceDirectoryPath = args[0];
27 | var args1 = _storageType == "blob" ? args[1] : Constants.FileConfigs.OutputDirectoryPath = args[1];
28 | var args2 = args.Count() > 2 ? args[2] : null;
29 |
30 | IStorageClientFactory clientFactory = new DefaultStorageClientFactory();
31 | IStorageClient client = clientFactory.Create(_storageType, new System.Collections.Generic.Dictionary<string, string>() {
32 | { DefaultStorageClientFactory.BlobContainerNameKey, args0 }
33 | }, util);
34 |
35 | string[] uris = GetUris(client, _storageType == "blob" ?
args1 : Constants.FileConfigs.SourceDirectoryPath, args2, util); 36 | 37 | IDocumentParser parser = new DocumentParser(logger, util, new ParseHelper(logger, util)); 38 | 39 | var total = uris.Count(); 40 | if (total == 0) 41 | { 42 | Console.WriteLine("No files to process..."); 43 | } 44 | 45 | var counter = 0; 46 | 47 | var outputFileFormat = CloudConfigurationManager.GetSetting("OutputFileFormat"); 48 | 49 | Stopwatch stopWatch = new Stopwatch(); 50 | 51 | InteropWordUtils iInteropWordUtils = new InteropWordUtils(); 52 | 53 | // fire up word instance 54 | Application wordApp = iInteropWordUtils.CreateWordAppInstance(); 55 | 56 | int maxFailures = 3; 57 | int currentFailures = 0; 58 | 59 | try 60 | { 61 | foreach (var uri in uris) 62 | { 63 | stopWatch.Start(); 64 | counter++; 65 | Console.WriteLine($"Processing: {counter} out of {total}"); 66 | Console.WriteLine($"Processing: {uri}"); 67 | 68 | string result = null; 69 | 70 | try 71 | { 72 | result = parser.ParseDocuments(uri, client, wordApp, outputFileFormat); 73 | } 74 | catch (Exception ex) 75 | { 76 | Console.WriteLine(ex.Message); 77 | Console.WriteLine(ex.StackTrace); 78 | 79 | currentFailures++; 80 | 81 | if (currentFailures >= maxFailures) 82 | { 83 | throw new Exception("Max failure count reached."); 84 | } 85 | } 86 | 87 | stopWatch.Stop(); 88 | 89 | TimeSpan ts = stopWatch.Elapsed; 90 | 91 | string elapsedTime = String.Format("{0:00}:{1:00}:{2:00}.{3:00}", 92 | ts.Hours, ts.Minutes, ts.Seconds, 93 | ts.Milliseconds / 10); 94 | 95 | Console.WriteLine("RunTime " + elapsedTime); 96 | Console.WriteLine(result); 97 | stopWatch.Reset(); 98 | } 99 | } 100 | finally 101 | { 102 | iInteropWordUtils.DisposeIneropObject(wordApp); 103 | } 104 | 105 | Console.WriteLine("Press any key to exit..."); 106 | Console.ReadKey(); 107 | } 108 | 109 | private static string[] GetUris(IStorageClient client, string prefix, string filter, IUtils utils) 110 | { 111 | if (client is BlobStorageClient) return GetBlobUris(client as BlobStorageClient, prefix, filter, utils); 112 | else if (client is LocalStorageClient) return GetLocalUris(client as LocalStorageClient, prefix, filter, utils); 113 | else return null; 114 | } 115 | 116 | private static string[] GetBlobUris(BlobStorageClient client, string sourcePath, string filter, IUtils utils) 117 | { 118 | Console.WriteLine($"Listing Blobs in container {client.Container.Name} in folder {sourcePath}"); 119 | 120 | string blobPrefix = sourcePath == "null" || sourcePath == "/" ? null : sourcePath; 121 | 122 | var outputBlobList = utils.GetBlobListFromOutputContainer(client.Client); 123 | 124 | var blobList = client.Container.ListBlobs(prefix: blobPrefix, useFlatBlobListing: true); 125 | 126 | var filteredBlobList = blobList.Where(s => !outputBlobList.Contains(utils.CleanNonSupportedSparkChar(s.Uri.Segments[s.Uri.Segments.Length - 1]))).ToList(); 127 | 128 | if (filter != null) 129 | { 130 | filteredBlobList = filteredBlobList.Where(s => s.Uri.PathAndQuery.Contains(filter)).ToList(); 131 | } 132 | 133 | return filteredBlobList 134 | .Select(x => x.Uri.AbsoluteUri) 135 | .ToArray(); 136 | } 137 | 138 | private static string[] GetLocalUris(LocalStorageClient client, string sourcePath, string filter, IUtils utils) 139 | { 140 | string[] files = System.IO.Directory.GetFiles(sourcePath, "*", System.IO.SearchOption.AllDirectories); 141 | 142 | return (!string.IsNullOrEmpty(filter) 143 | ? 
files.Where(x => x.Contains(filter)).ToArray() 144 | : files); 145 | } 146 | 147 | private static bool ValidateArgs(string[] args, string storageType) 148 | { 149 | bool validArgs = true; 150 | 151 | switch (storageType) 152 | { 153 | case "blob": 154 | if (!args?.Any() ?? true) 155 | { 156 | Console.WriteLine(string.Format(" No arguments passed. \n\n DSnA.WebJob.DocumentParser.exe arg1 arg2 arg3 \n\n Options: \n\t arg1: Required - blob container name \n\t arg2: Required - virtual directory name (/ root level) \n\t arg3: Optional - document file name filter")); 157 | validArgs = false; 158 | } 159 | else if (!(args.Length >= 2 && args.Length < 4)) 160 | { 161 | Console.WriteLine(string.Format(" Incorrect number of arguments. \n\n DSnA.WebJob.DocumentParser.exe arg1 arg2 arg3 \n\n Options: \n\t arg1: Required - blob container name \n\t arg2: Required - virtual directory name (/ root level) \n\t arg3: Optional - document file name filter")); 162 | validArgs = false; 163 | } 164 | break; 165 | 166 | case "localstorage": 167 | if (!args?.Any() ?? true) 168 | { 169 | Console.WriteLine(string.Format(" No arguments passed. \n\n DSnA.WebJob.DocumentParser.exe arg1 arg2 arg3 \n\n Options: \n\t arg1: Required - local storage source folder path \n\t arg2: Required - local storage output folder path \n\t arg3: Optional - document file name filter")); 170 | validArgs = false; 171 | } 172 | else if (!(args.Length >= 2 && args.Length < 4)) 173 | { 174 | Console.WriteLine(string.Format(" Incorrect number of arguments. \n\n DSnA.WebJob.DocumentParser.exe arg1 arg2 arg3 \n\n Options: \n\t arg1: Required - local storage source folder path \n\t arg2: Required - local storage output folder path \n\t arg3: Optional - document file name filter")); 175 | validArgs = false; 176 | } 177 | break; 178 | 179 | default: 180 | return validArgs = false; 181 | } 182 | 183 | return validArgs; 184 | } 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/Code/DocumentParser.cs: -------------------------------------------------------------------------------- 1 | //Copyright(c) Microsoft Corporation.All rights reserved. 2 | //Licensed under the MIT License. 
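// How this class is driven end to end (an illustrative sketch, not the exact
// API surface): Program.cs builds the collaborators, obtains a Word Application
// instance through InteropWordUtils, and calls ParseDocuments once per URI.
// `storageClient` below stands in for whatever DefaultStorageClientFactory
// returns ("blob" or "localstorage"), and `inputUri` is a placeholder.
//
//   var logger = ConsoleLogger.Instance;
//   var utils = new Utils(logger);
//   IDocumentParser parser = new DocumentParser(logger, utils, new ParseHelper(logger, utils));
//   var wordUtils = new InteropWordUtils();
//   Application wordApp = wordUtils.CreateWordAppInstance();
//   try
//   {
//       // returns a status string such as "Finished Processing: <output path>"
//       string status = parser.ParseDocuments(inputUri, storageClient, wordApp,
//           Constants.CsvFileConfig.JsonFileFormat);
//       Console.WriteLine(status);
//   }
//   finally
//   {
//       wordUtils.DisposeIneropObject(wordApp);
//   }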
3 |
4 | using System;
5 | using System.IO;
6 | using Microsoft.WindowsAzure.Storage.Blob;
7 | using System.Collections.Generic;
8 | using Newtonsoft.Json;
9 | using Microsoft.Office.Interop.Word;
10 |
11 | namespace DSnA.WebJob.DocumentParser
12 | {
13 | public class DocumentParser : IDocumentParser
14 | {
15 | private readonly ILogger iLogger;
16 | private readonly IUtils iUtils;
17 | private readonly IParseHelper iparseHelper;
18 |
19 | public DocumentParser(ILogger iLogger, IUtils iUtils, IParseHelper iparseHelper)
20 | {
21 | this.iLogger = iLogger;
22 | this.iUtils = iUtils;
23 | this.iparseHelper = iparseHelper;
24 | }
25 |
26 | /// <summary>
27 | /// Main API for document extraction
28 | /// </summary>
29 | public string ParseDocuments(string uri, IStorageClient storageClient, Application wordApp, string outputFileFormat)
30 | {
31 | try
32 | {
33 | string fileLocation = "";
34 | var output = "";
35 | try
36 | {
37 | fileLocation = storageClient.GetFile(new StorageObjectDescriptor() { Uri = new Uri(uri) }, Constants.FileConfigs.WorkingDirectoryPath);
38 | var result = ExtractContentFromReports(fileLocation, outputFileFormat, uri, wordApp);
39 | storageClient.SaveFile(result.location, null);
40 | output = $"Finished Processing: {result.location}";
41 | iUtils.DeleteInputFiles(new List<string> { result.location });
42 | }
43 | catch (Exception exp)
44 | {
45 | iLogger.Error($"Error Processing: {uri}", exp);
46 | output = $"error processing: {uri}";
47 | }
48 |
49 | return output;
50 | }
51 | catch (Exception exp)
52 | {
53 | iLogger.Error("{" + nameof(ParseDocuments) + "} - exception occurred-Level 2", exp);
54 | throw;
55 | }
56 | finally
57 | {
58 | // force garbage collection to collect leftover COM objects
59 | GC.Collect();
60 | }
61 | }
62 |
63 | /// <summary>
64 | /// Extracts document content - Initial function encapsulating different extraction helper methods
65 | /// </summary>
66 | /// <param name="fileLocation"></param>
67 | /// <param name="outputFileFormat"></param>
68 | /// <returns>saved JSON output file location</returns>
69 | private ReportExtractionResponse ExtractContentFromReports(string fileLocation, string outputFileFormat, string originalFileLocation = null, Application wordApp = null)
70 | {
71 | var docFile = "";
72 | try
73 | {
74 | docFile = iUtils.ConvertPdfToWord(fileLocation, Constants.FileConfigs.WorkingDirectoryPath, wordApp);
75 | var documentContent = iparseHelper.ExtractDocumentContent(docFile, wordApp);
76 | ReportExtractionResponse reportExtractionResponse = null;
77 | switch (outputFileFormat)
78 | {
79 | case Constants.CsvFileConfig.JsonFileFormat:
80 | JsonDocumentStructFlat jsonDoc;
81 | string jsonOutputFileLocation;
82 | ExtractAsJsonFormat(fileLocation, originalFileLocation, documentContent, out jsonDoc, out jsonOutputFileLocation);
83 | reportExtractionResponse = new ReportExtractionResponse()
84 | {
85 | location = jsonOutputFileLocation,
86 | contentJson = JsonConvert.SerializeObject(jsonDoc, Formatting.Indented)
87 | };
88 | break;
89 |
90 | case Constants.CsvFileConfig.CsvFileFormat:
91 | default:
92 | string csvOutputFileLocation;
93 | ExtractAsCsvFormat(fileLocation, originalFileLocation, documentContent, out csvOutputFileLocation);
94 | reportExtractionResponse = new ReportExtractionResponse()
95 | {
96 | location = csvOutputFileLocation
97 | };
98 | break;
99 | }
100 |
101 | return reportExtractionResponse;
102 | }
103 | finally
104 | {
105 | iUtils.DeleteInputFiles(new List<string> { fileLocation, docFile });
106 | }
107 | }
108 |
109 |
110 | private void ExtractAsJsonFormat(string fileLocation, string originalFileLocation, DocumentContent documentContent, out
JsonDocumentStructFlat jsonDoc, out string jsonOutputFileLocation) 111 | { 112 | jsonDoc = new JsonDocumentStructFlat(); 113 | jsonDoc.ImageStoreUri = originalFileLocation; 114 | jsonDoc.Text = documentContent.Text; 115 | jsonDoc.Paragraphs = documentContent.Paragraphs; 116 | jsonDoc.Headers = documentContent.Headers; 117 | jsonDoc.Sections = documentContent.Sections; 118 | jsonDoc.Clauses = documentContent.Clauses; 119 | jsonDoc.HeaderClauses = documentContent.HeaderClauses; 120 | jsonDoc.AdditionalInformation = documentContent.AdditionalInformation; 121 | 122 | var fileProperties = iUtils.ExtractFileMetadata(fileLocation); 123 | jsonDoc.FileName = fileProperties.FileName; 124 | jsonDoc.FileType = fileProperties.FileType; 125 | jsonDoc.AgreementNumber = fileProperties.AgreementNumber; 126 | jsonDoc.ExtractionTimeStamp = fileProperties.ExtractionTimeStamp; 127 | jsonOutputFileLocation = iUtils.SerializeAndSaveJson(jsonDoc, Path.GetFileName(fileLocation)); 128 | } 129 | 130 | private void ExtractAsCsvFormat(string fileLocation, string originalFileLocation, DocumentContent documentContent, out string csvOutputFileLocation) 131 | { 132 | CsvDocumentFile csvDocumentFile = new CsvDocumentFile(); 133 | var fileProperties = iUtils.ExtractFileMetadata(fileLocation); 134 | csvDocumentFile.AddCsvLine(fileProperties.FileName, originalFileLocation, Constants.CsvFileConfig.ContentTypeBlobUri); 135 | csvDocumentFile.AddCsvLine(fileProperties.FileName, fileProperties.AgreementNumber, Constants.CsvFileConfig.ContentTypeAgreementNumber); 136 | csvDocumentFile.AddCsvLine(fileProperties.FileName, fileProperties.FileType, Constants.CsvFileConfig.ContentTypeFileType); 137 | csvDocumentFile.AddCsvLine(fileProperties.FileName, fileProperties.ExtractionTimeStamp, Constants.CsvFileConfig.ContentTypeExtractionTimeStamp); 138 | csvDocumentFile.AddCsvLine(fileProperties.FileName, iUtils.CleanTextFromNonAsciiChar(documentContent.Text), Constants.CsvFileConfig.ContentTypeText); 139 | 140 | foreach (var paragraph in documentContent.Paragraphs) 141 | { 142 | var paragraphCleanText = iUtils.CleanTextFromNonAsciiChar(paragraph.Value); 143 | if (!string.IsNullOrEmpty(paragraphCleanText)) 144 | csvDocumentFile.AddCsvLine(fileProperties.FileName, paragraphCleanText, Constants.CsvFileConfig.ContentTypeParagraph); 145 | } 146 | 147 | foreach (var header in documentContent.Headers) 148 | { 149 | var headerCleanText = iUtils.CleanTextFromNonAsciiChar(header.Value); 150 | if (!string.IsNullOrEmpty(headerCleanText)) 151 | csvDocumentFile.AddCsvLine(fileProperties.FileName, headerCleanText, Constants.CsvFileConfig.ContentTypeHeader); 152 | } 153 | 154 | foreach (var section in documentContent.Sections) 155 | { 156 | var sectionCleanText = iUtils.CleanTextFromNonAsciiChar(section.Value); 157 | if (!string.IsNullOrEmpty(sectionCleanText)) 158 | csvDocumentFile.AddCsvLine(fileProperties.FileName, sectionCleanText, Constants.CsvFileConfig.ContentTypeSection); 159 | } 160 | 161 | foreach (var clause in documentContent.Clauses) 162 | { 163 | var clauseCleanText = iUtils.CleanTextFromNonAsciiChar(clause.Content); 164 | if (!string.IsNullOrEmpty(clauseCleanText)) 165 | csvDocumentFile.AddCsvLine(fileProperties.FileName, clauseCleanText, Constants.CsvFileConfig.ContentTypeClause); 166 | } 167 | 168 | foreach (var headerClause in documentContent.HeaderClauses) 169 | { 170 | var headerClauseCleanText = iUtils.CleanTextFromNonAsciiChar(headerClause.Content); 171 | if (!string.IsNullOrEmpty(headerClauseCleanText)) 172 | 
csvDocumentFile.AddCsvLine(fileProperties.FileName, headerClauseCleanText, Constants.CsvFileConfig.ContentTypeHeaderClause); 173 | } 174 | 175 | foreach (var additionalInformation in documentContent.AdditionalInformation) 176 | { 177 | var additionalInformationCleanText = iUtils.CleanTextFromNonAsciiChar(additionalInformation); 178 | if (!string.IsNullOrEmpty(additionalInformationCleanText)) 179 | csvDocumentFile.AddCsvLine(fileProperties.FileName, additionalInformationCleanText, Constants.CsvFileConfig.ContentTypeAdditionalInformation); 180 | } 181 | 182 | csvOutputFileLocation = iUtils.SaveToCsvFile(csvDocumentFile.GetCsvOutputLines(), Path.GetFileNameWithoutExtension(fileLocation)); 183 | } 184 | } 185 | } -------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/DSnA.WebJob.DocumentParser.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | AnyCPU 7 | {9E4D7884-0C36-429B-A4C9-1217D3CA7D4E} 8 | Exe 9 | DSnA.WebJob.DocumentParser 10 | DSnA.WebJob.DocumentParser 11 | v4.6.2 12 | 512 13 | true 14 | 15 | 16 | AnyCPU 17 | true 18 | full 19 | false 20 | bin\Debug\ 21 | DEBUG;TRACE 22 | prompt 23 | 4 24 | 25 | 26 | AnyCPU 27 | pdbonly 28 | true 29 | bin\Release\ 30 | TRACE 31 | prompt 32 | 4 33 | 34 | 35 | 36 | ..\packages\DocumentFormat.OpenXml.2.8.1\lib\net46\DocumentFormat.OpenXml.dll 37 | 38 | 39 | ..\packages\Microsoft.Azure.KeyVault.Core.1.0.0\lib\net40\Microsoft.Azure.KeyVault.Core.dll 40 | 41 | 42 | True 43 | ..\packages\Microsoft.Office.Interop.Word.15.0.4797.1003\lib\net20\Microsoft.Office.Interop.Word.dll 44 | True 45 | 46 | 47 | ..\packages\Microsoft.Win32.Primitives.4.3.0\lib\net46\Microsoft.Win32.Primitives.dll 48 | 49 | 50 | ..\packages\Microsoft.WindowsAzure.ConfigurationManager.3.2.3\lib\net40\Microsoft.WindowsAzure.Configuration.dll 51 | 52 | 53 | ..\packages\WindowsAzure.Storage.9.3.2\lib\net45\Microsoft.WindowsAzure.Storage.dll 54 | True 55 | 56 | 57 | ..\packages\Newtonsoft.Json.11.0.2\lib\net45\Newtonsoft.Json.dll 58 | 59 | 60 | 61 | ..\packages\System.AppContext.4.3.0\lib\net46\System.AppContext.dll 62 | True 63 | 64 | 65 | 66 | 67 | ..\packages\System.Console.4.3.0\lib\net46\System.Console.dll 68 | 69 | 70 | 71 | ..\packages\System.Diagnostics.DiagnosticSource.4.3.0\lib\net46\System.Diagnostics.DiagnosticSource.dll 72 | 73 | 74 | ..\packages\System.Diagnostics.Tracing.4.3.0\lib\net462\System.Diagnostics.Tracing.dll 75 | 76 | 77 | ..\packages\System.Globalization.Calendars.4.3.0\lib\net46\System.Globalization.Calendars.dll 78 | 79 | 80 | ..\packages\System.IO.4.3.0\lib\net462\System.IO.dll 81 | 82 | 83 | ..\packages\System.IO.Abstractions.2.1.0.247\lib\net40\System.IO.Abstractions.dll 84 | 85 | 86 | ..\packages\System.IO.Compression.4.3.0\lib\net46\System.IO.Compression.dll 87 | True 88 | 89 | 90 | 91 | ..\packages\System.IO.Compression.ZipFile.4.3.0\lib\net46\System.IO.Compression.ZipFile.dll 92 | 93 | 94 | ..\packages\System.IO.FileSystem.4.3.0\lib\net46\System.IO.FileSystem.dll 95 | 96 | 97 | ..\packages\System.IO.FileSystem.Primitives.4.3.0\lib\net46\System.IO.FileSystem.Primitives.dll 98 | 99 | 100 | ..\packages\System.IO.Packaging.4.4.0\lib\net46\System.IO.Packaging.dll 101 | 102 | 103 | ..\packages\System.Net.Http.4.3.4\lib\net46\System.Net.Http.dll 104 | 105 | 106 | ..\packages\System.Net.Sockets.4.3.0\lib\net46\System.Net.Sockets.dll 107 | 108 | 109 | 110 | 
..\packages\System.Reflection.4.3.0\lib\net462\System.Reflection.dll 111 | 112 | 113 | ..\packages\System.Runtime.4.3.0\lib\net462\System.Runtime.dll 114 | 115 | 116 | ..\packages\System.Runtime.Extensions.4.3.0\lib\net462\System.Runtime.Extensions.dll 117 | 118 | 119 | ..\packages\System.Runtime.InteropServices.4.3.0\lib\net462\System.Runtime.InteropServices.dll 120 | 121 | 122 | ..\packages\System.Runtime.InteropServices.RuntimeInformation.4.3.0\lib\net45\System.Runtime.InteropServices.RuntimeInformation.dll 123 | True 124 | 125 | 126 | 127 | ..\packages\System.Security.Cryptography.Algorithms.4.3.0\lib\net461\System.Security.Cryptography.Algorithms.dll 128 | 129 | 130 | ..\packages\System.Security.Cryptography.Encoding.4.3.0\lib\net46\System.Security.Cryptography.Encoding.dll 131 | 132 | 133 | ..\packages\System.Security.Cryptography.Primitives.4.3.0\lib\net46\System.Security.Cryptography.Primitives.dll 134 | 135 | 136 | ..\packages\System.Security.Cryptography.X509Certificates.4.3.0\lib\net461\System.Security.Cryptography.X509Certificates.dll 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | ..\packages\System.Xml.ReaderWriter.4.3.0\lib\net46\System.Xml.ReaderWriter.dll 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | Designer 170 | 171 | 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | 23 | 24 | 25 | Apache License 26 | Version 2.0, January 2004 27 | http://www.apache.org/licenses/ 28 | 29 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 30 | 31 | 1. Definitions. 32 | 33 | "License" shall mean the terms and conditions for use, reproduction, 34 | and distribution as defined by Sections 1 through 9 of this document. 35 | 36 | "Licensor" shall mean the copyright owner or entity authorized by 37 | the copyright owner that is granting the License. 38 | 39 | "Legal Entity" shall mean the union of the acting entity and all 40 | other entities that control, are controlled by, or are under common 41 | control with that entity. 
For the purposes of this definition, 42 | "control" means (i) the power, direct or indirect, to cause the 43 | direction or management of such entity, whether by contract or 44 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 45 | outstanding shares, or (iii) beneficial ownership of such entity. 46 | 47 | "You" (or "Your") shall mean an individual or Legal Entity 48 | exercising permissions granted by this License. 49 | 50 | "Source" form shall mean the preferred form for making modifications, 51 | including but not limited to software source code, documentation 52 | source, and configuration files. 53 | 54 | "Object" form shall mean any form resulting from mechanical 55 | transformation or translation of a Source form, including but 56 | not limited to compiled object code, generated documentation, 57 | and conversions to other media types. 58 | 59 | "Work" shall mean the work of authorship, whether in Source or 60 | Object form, made available under the License, as indicated by a 61 | copyright notice that is included in or attached to the work 62 | (an example is provided in the Appendix below). 63 | 64 | "Derivative Works" shall mean any work, whether in Source or Object 65 | form, that is based on (or derived from) the Work and for which the 66 | editorial revisions, annotations, elaborations, or other modifications 67 | represent, as a whole, an original work of authorship. For the purposes 68 | of this License, Derivative Works shall not include works that remain 69 | separable from, or merely link (or bind by name) to the interfaces of, 70 | the Work and Derivative Works thereof. 71 | 72 | "Contribution" shall mean any work of authorship, including 73 | the original version of the Work and any modifications or additions 74 | to that Work or Derivative Works thereof, that is intentionally 75 | submitted to Licensor for inclusion in the Work by the copyright owner 76 | or by an individual or Legal Entity authorized to submit on behalf of 77 | the copyright owner. For the purposes of this definition, "submitted" 78 | means any form of electronic, verbal, or written communication sent 79 | to the Licensor or its representatives, including but not limited to 80 | communication on electronic mailing lists, source code control systems, 81 | and issue tracking systems that are managed by, or on behalf of, the 82 | Licensor for the purpose of discussing and improving the Work, but 83 | excluding communication that is conspicuously marked or otherwise 84 | designated in writing by the copyright owner as "Not a Contribution." 85 | 86 | "Contributor" shall mean Licensor and any individual or Legal Entity 87 | on behalf of whom a Contribution has been received by Licensor and 88 | subsequently incorporated within the Work. 89 | 90 | 2. Grant of Copyright License. Subject to the terms and conditions of 91 | this License, each Contributor hereby grants to You a perpetual, 92 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 93 | copyright license to reproduce, prepare Derivative Works of, 94 | publicly display, publicly perform, sublicense, and distribute the 95 | Work and such Derivative Works in Source or Object form. 96 | 97 | 3. Grant of Patent License. 
Subject to the terms and conditions of 98 | this License, each Contributor hereby grants to You a perpetual, 99 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 100 | (except as stated in this section) patent license to make, have made, 101 | use, offer to sell, sell, import, and otherwise transfer the Work, 102 | where such license applies only to those patent claims licensable 103 | by such Contributor that are necessarily infringed by their 104 | Contribution(s) alone or by combination of their Contribution(s) 105 | with the Work to which such Contribution(s) was submitted. If You 106 | institute patent litigation against any entity (including a 107 | cross-claim or counterclaim in a lawsuit) alleging that the Work 108 | or a Contribution incorporated within the Work constitutes direct 109 | or contributory patent infringement, then any patent licenses 110 | granted to You under this License for that Work shall terminate 111 | as of the date such litigation is filed. 112 | 113 | 4. Redistribution. You may reproduce and distribute copies of the 114 | Work or Derivative Works thereof in any medium, with or without 115 | modifications, and in Source or Object form, provided that You 116 | meet the following conditions: 117 | 118 | (a) You must give any other recipients of the Work or 119 | Derivative Works a copy of this License; and 120 | 121 | (b) You must cause any modified files to carry prominent notices 122 | stating that You changed the files; and 123 | 124 | (c) You must retain, in the Source form of any Derivative Works 125 | that You distribute, all copyright, patent, trademark, and 126 | attribution notices from the Source form of the Work, 127 | excluding those notices that do not pertain to any part of 128 | the Derivative Works; and 129 | 130 | (d) If the Work includes a "NOTICE" text file as part of its 131 | distribution, then any Derivative Works that You distribute must 132 | include a readable copy of the attribution notices contained 133 | within such NOTICE file, excluding those notices that do not 134 | pertain to any part of the Derivative Works, in at least one 135 | of the following places: within a NOTICE text file distributed 136 | as part of the Derivative Works; within the Source form or 137 | documentation, if provided along with the Derivative Works; or, 138 | within a display generated by the Derivative Works, if and 139 | wherever such third-party notices normally appear. The contents 140 | of the NOTICE file are for informational purposes only and 141 | do not modify the License. You may add Your own attribution 142 | notices within Derivative Works that You distribute, alongside 143 | or as an addendum to the NOTICE text from the Work, provided 144 | that such additional attribution notices cannot be construed 145 | as modifying the License. 146 | 147 | You may add Your own copyright statement to Your modifications and 148 | may provide additional or different license terms and conditions 149 | for use, reproduction, or distribution of Your modifications, or 150 | for any such Derivative Works as a whole, provided Your use, 151 | reproduction, and distribution of the Work otherwise complies with 152 | the conditions stated in this License. 153 | 154 | 5. Submission of Contributions. Unless You explicitly state otherwise, 155 | any Contribution intentionally submitted for inclusion in the Work 156 | by You to the Licensor shall be under the terms and conditions of 157 | this License, without any additional terms or conditions. 
158 | Notwithstanding the above, nothing herein shall supersede or modify 159 | the terms of any separate license agreement you may have executed 160 | with Licensor regarding such Contributions. 161 | 162 | 6. Trademarks. This License does not grant permission to use the trade 163 | names, trademarks, service marks, or product names of the Licensor, 164 | except as required for reasonable and customary use in describing the 165 | origin of the Work and reproducing the content of the NOTICE file. 166 | 167 | 7. Disclaimer of Warranty. Unless required by applicable law or 168 | agreed to in writing, Licensor provides the Work (and each 169 | Contributor provides its Contributions) on an "AS IS" BASIS, 170 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 171 | implied, including, without limitation, any warranties or conditions 172 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 173 | PARTICULAR PURPOSE. You are solely responsible for determining the 174 | appropriateness of using or redistributing the Work and assume any 175 | risks associated with Your exercise of permissions under this License. 176 | 177 | 8. Limitation of Liability. In no event and under no legal theory, 178 | whether in tort (including negligence), contract, or otherwise, 179 | unless required by applicable law (such as deliberate and grossly 180 | negligent acts) or agreed to in writing, shall any Contributor be 181 | liable to You for damages, including any direct, indirect, special, 182 | incidental, or consequential damages of any character arising as a 183 | result of this License or out of the use or inability to use the 184 | Work (including but not limited to damages for loss of goodwill, 185 | work stoppage, computer failure or malfunction, or any and all 186 | other commercial damages or losses), even if such Contributor 187 | has been advised of the possibility of such damages. 188 | 189 | 9. Accepting Warranty or Additional Liability. While redistributing 190 | the Work or Derivative Works thereof, You may choose to offer, 191 | and charge a fee for, acceptance of support, warranty, indemnity, 192 | or other liability obligations and/or rights consistent with this 193 | License. However, in accepting such obligations, You may act only 194 | on Your own behalf and on Your sole responsibility, not on behalf 195 | of any other Contributor, and only if You agree to indemnify, 196 | defend, and hold each Contributor harmless for any liability 197 | incurred by, or claims asserted against, such Contributor by reason 198 | of your accepting any such warranty or additional liability. 199 | 200 | END OF TERMS AND CONDITIONS 201 | 202 | APPENDIX: How to apply the Apache License to your work. 203 | 204 | To apply the Apache License to your work, attach the following 205 | boilerplate notice, with the fields enclosed by brackets "[]" 206 | replaced with your own identifying information. (Don't include 207 | the brackets!) The text should be enclosed in the appropriate 208 | comment syntax for the file format. We also recommend that a 209 | file or class name and description of purpose be included on the 210 | same "printed page" as the copyright notice for easier 211 | identification within third-party archives. 212 | 213 | Copyright [yyyy] [name of copyright owner] 214 | 215 | Licensed under the Apache License, Version 2.0 (the "License"); 216 | you may not use this file except in compliance with the License. 
217 | You may obtain a copy of the License at
218 |
219 | http://www.apache.org/licenses/LICENSE-2.0
220 |
221 | Unless required by applicable law or agreed to in writing, software
222 | distributed under the License is distributed on an "AS IS" BASIS,
223 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
224 | See the License for the specific language governing permissions and
225 | limitations under the License.
226 |
-------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/Code/ParseHelper.cs: --------------------------------------------------------------------------------
1 | //Copyright(c) Microsoft Corporation.All rights reserved.
2 | //Licensed under the MIT License.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Linq;
7 | using System.Text;
8 | using System.Text.RegularExpressions;
9 | using Microsoft.Office.Interop.Word;
10 |
11 | namespace DSnA.WebJob.DocumentParser
12 | {
13 | public interface IParseHelper
14 | {
15 | DocumentContent ExtractDocumentContent(string docFile, Application wordApp);
16 | }
17 |
18 | public class ParseHelper : IParseHelper
19 | {
20 | internal static string WordHeading1 = "Heading 1";
21 | internal static string WordHeading2 = "Heading 2";
22 | internal static string WordHeading3 = "Heading 3";
23 | internal static string WordHeading4 = "Heading 4";
24 | private readonly ILogger iLogger;
25 | private readonly IUtils iUtils;
26 | private readonly IInteropWordUtils iInteropWordUtils;
27 |
28 | public ParseHelper(ILogger iLogger, IUtils iUtils)
29 | {
30 | this.iLogger = iLogger;
31 | this.iUtils = iUtils;
32 | iInteropWordUtils = new InteropWordUtils();
33 | }
34 |
35 | public ParseHelper(ILogger iLogger, IUtils iUtils, IInteropWordUtils iInteropWordUtils)
36 | {
37 | this.iLogger = iLogger;
38 | this.iUtils = iUtils;
39 | this.iInteropWordUtils = iInteropWordUtils;
40 | }
41 |
42 | /// <summary>
43 | /// Extract all paragraphs
44 | /// </summary>
45 | /// <param name="wordDocToExtract"></param>
46 | /// <returns></returns>
47 | private DocumentContent ExtractAllParagraphs(Document wordDocToExtract, Dictionary<int, string> headers, List<string> tableContent, Dictionary<int, string> listParagraphs)
48 | {
49 | try
50 | {
51 | var fullContent = string.Empty;
52 | var paragraphs = new Dictionary<int, string>();
53 | var sections = new Dictionary<int, string>();
54 |
55 | List<Clauses> clauses = new List<Clauses>();
56 | List<Clauses> headerClauses = new List<Clauses>();
57 | List<string> additionalInformation = new List<string>();
58 |
59 | foreach (Paragraph para in wordDocToExtract.Paragraphs)
60 | {
61 | var text = para.Range.Text;
62 | var cleanText = iUtils.CleanTextFromNonAsciiChar(text);
63 | var textLastSentence = para.Range.Sentences.Last.Text;
64 | var textStart = para.Range.Start;
65 | fullContent += text;
66 | var listNumber = para.Range.ListFormat.ListString;
67 |
68 | if (!string.IsNullOrEmpty(listNumber))
69 | {
70 | text = $"{listNumber.Trim()} {text}";
71 | }
72 |
73 | if (textStart > 250 && headers.ContainsKey(textStart))
74 | {
75 | if (headerClauses.Count > 0)
76 | {
77 | headerClauses.Last().End = textStart - 1;
78 | }
79 |
80 | headerClauses.Add(new Clauses
81 | {
82 | Title = headers[textStart],
83 | Content = text,
84 | Start = textStart
85 | });
86 |
87 | if (headers.Keys.Max() == textStart)
88 | {
89 | headerClauses.Last().End = para.Range.End;
90 |
91 | if (!string.IsNullOrEmpty(listNumber))
92 | {
93 | headerClauses.Last().Content = para.Range.ListFormat.List.Range.Text;
94 | }
95 | }
96 | }
97 | else if (headerClauses.Count >= 1 && textStart <= headers.Keys.Max() && tableContent.Contains(cleanText) == false)
98 | {
99 | headerClauses.Last().Content += text;
100 | }
101 |
102 | if (listParagraphs.Count > 0 && listParagraphs.ContainsKey(textStart))
103 | {
104 | if (!string.IsNullOrEmpty(cleanText))
105 | {
106 | sections.Add(textStart, text);
107 | }
108 |
109 | if (clauses.Count > 0)
110 | {
111 | clauses.Last().End = textStart - 1;
112 | }
113 |
114 | clauses.Add(new Clauses
115 | {
116 | Title = listParagraphs[textStart],
117 | Content = text,
118 | Start = textStart
119 | });
120 |
121 | if (listParagraphs.Keys.Max() == textStart)
122 | {
123 | var nextPara = para.Next();
124 |
125 | if (nextPara != null)
126 | {
127 | clauses.Last().End = nextPara.Range.End;
128 | clauses.Last().Content += nextPara?.Range?.Text ?? string.Empty;
129 | }
130 | else
131 | {
132 | clauses.Last().End = para.Range.End;
133 | }
134 | }
135 | }
136 | else if (clauses.Count >= 1 && textStart <= listParagraphs.Keys.Max() && tableContent.Contains(cleanText) == false)
137 | {
138 | if (!string.IsNullOrEmpty(cleanText))
139 | {
140 | sections.Add(textStart, text);
141 | }
142 |
143 | clauses.Last().Content += text;
144 | }
145 |
146 | paragraphs.Add(textStart, text);
147 | }
148 |
149 | if (headerClauses.Count == 0)
150 | {
151 | headerClauses.Add(new Clauses());
152 | }
153 |
154 | if (clauses.Count == 0)
155 | {
156 | clauses.Add(new Clauses());
157 | }
158 |
159 | if (sections.Count == 0)
160 | {
161 | sections.Add(-1, string.Empty);
162 | }
163 |
164 | if (headers.Count == 0)
165 | {
166 | headers.Add(-1, string.Empty);
167 | }
168 |
169 | StringBuilder rangesContent = new StringBuilder();
170 | List<Tuple<int, int>> ranges = new List<Tuple<int, int>>();
171 | List<WdStoryType> rangeTypes = new List<WdStoryType>();
172 |
173 | foreach (Range range in wordDocToExtract.StoryRanges)
174 | {
175 | Range currentRange = range;
176 |
177 | do
178 | {
179 | if (RangeStoryTypeIsHeaderOrFooter(currentRange) &&
180 | CurrentRangeHaveShapeRanges(currentRange))
181 | {
182 | foreach (Shape shape in currentRange.ShapeRange)
183 | {
184 | if (shape.TextFrame.HasText == 0)
185 | {
186 | continue;
187 | }
188 |
189 | Range shapeRange = shape.TextFrame.TextRange;
190 |
191 | rangesContent.Append(RemoveLineBreaks(shapeRange.Text));
192 | ranges.Add(new Tuple<int, int>(shapeRange.Start, shapeRange.End));
193 | rangeTypes.Add(currentRange.StoryType);
194 | }
195 | }
196 | else
197 | {
198 | rangesContent.Append(RemoveLineBreaks(currentRange.Text));
199 | ranges.Add(new Tuple<int, int>(currentRange.Start, currentRange.End));
200 | rangeTypes.Add(currentRange.StoryType);
201 | }
202 |
203 | bool hasMatch = false;
204 | MatchCollection matches = Constants.RegexExp.SessionRegEx.Matches(rangesContent.ToString());
205 |
206 | foreach (Match match in matches)
207 | {
208 | additionalInformation.Add($"{string.Join("\t", rangeTypes.Select(x => x.ToString()))}|{string.Join("\t", ranges.Select(x => $"{{{x.Item1},{x.Item2}}}"))}|{match.Index}|{match.Value}");
209 | hasMatch = true;
210 | }
211 |
212 | matches = Constants.RegexExp.AgendaItemRegEx.Matches(rangesContent.ToString());
213 |
214 | foreach (Match match in matches)
215 | {
216 | additionalInformation.Add($"{string.Join("\t", rangeTypes.Select(x => x.ToString()))}|{string.Join("\t", ranges.Select(x => $"{{{x.Item1},{x.Item2}}}"))}|{match.Index}|{match.Value}");
217 | hasMatch = true;
218 | }
219 |
220 | if (hasMatch)
221 | {
222 | rangesContent.Clear();
223 | ranges.Clear();
224 | rangeTypes.Clear();
225 | }
226 |
227 | currentRange = currentRange.NextStoryRange;
228 | } while (currentRange != null);
229 | }
230 |
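// At this point the single pass over the document body is complete:
// `paragraphs` maps each paragraph's Range.Start offset to its text,
// `headers` and `sections` map offsets to heading/section text, and
// `clauses`/`headerClauses` carry Start/End offsets with their accumulated
// content. `additionalInformation` holds the session and agenda-item regex
// matches collected from story ranges (headers, footers and text-frame
// shapes). The placeholder entries added above keep downstream consumers
// from operating on empty collections.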
231 | return new DocumentContent()
232 | {
233 | Text = fullContent,
234 | Paragraphs = paragraphs,
235 | Sections = sections,
236 | Clauses = clauses,
237 | Headers = headers,
238 | HeaderClauses = headerClauses,
239 | AdditionalInformation = additionalInformation
240 | };
241 | }
242 | catch (Exception exception)
243 | {
244 | throw new Exception("Exception in extracting data\n", exception);
245 | }
246 | }
247 |
248 | public static string RemoveLineBreaks(string text)
249 | {
250 | if (text == "\n"
251 | || text == "\r\n")
252 | {
253 | return " ";
254 | }
255 |
256 | return text
257 | .Replace("\r", string.Empty)
258 | .Replace("\n", string.Empty);
259 | }
260 |
261 | private static bool RangeStoryTypeIsHeaderOrFooter(Range range)
262 | {
263 | return (range.StoryType == WdStoryType.wdEvenPagesHeaderStory ||
264 | range.StoryType == WdStoryType.wdPrimaryHeaderStory ||
265 | range.StoryType == WdStoryType.wdEvenPagesFooterStory ||
266 | range.StoryType == WdStoryType.wdPrimaryFooterStory ||
267 | range.StoryType == WdStoryType.wdFirstPageHeaderStory ||
268 | range.StoryType == WdStoryType.wdFirstPageFooterStory);
269 | }
270 |
271 | private static bool CurrentRangeHaveShapeRanges(Range range)
272 | {
273 | return range.ShapeRange.Count > 0;
274 | }
275 |
276 | /// <summary>
277 | /// Extracts content (red flags, company name, report date) from document
278 | /// </summary>
279 | /// <param name="docFile"></param>
280 | /// <returns>document content</returns>
281 | public DocumentContent ExtractDocumentContent(string docFile, Application wordApp)
282 | {
283 | Document wordDocToExtract = null;
284 |
285 | try
286 | {
287 | DocumentContent docContent = new DocumentContent();
288 | // open the document only in read only mode - so that no edits are made on the document
289 | wordDocToExtract = iInteropWordUtils.OpenDocument(docFile, wordApp);
290 | var tableContent = ExtractTableContent(wordDocToExtract);
291 | // Extract list paragraphs, then headers, then the full paragraph set
292 | var listParagraphs = ExtractListParagraphs(wordDocToExtract);
293 | var headers = ExtractHeaders(wordDocToExtract, tableContent, listParagraphs);
294 | docContent = ExtractAllParagraphs(wordDocToExtract, headers, tableContent, listParagraphs);
295 |
296 | return docContent;
297 | }
298 | catch (Exception exception)
299 | {
300 | throw new Exception("Exception extracting content (" + nameof(ExtractDocumentContent) + ")\n", exception);
301 | }
302 | finally
303 | {
304 | // Close without saving and release resources
305 | wordDocToExtract?.Close(SaveChanges: false);
306 | }
307 | }
308 |
309 | private Dictionary<int, string> ExtractListParagraphs(Document wordDocToExtract)
310 | {
311 | var listParagraphs = new Dictionary<int, string>();
312 | foreach (List firstItem in wordDocToExtract.Lists.OfType<List>().Reverse())
313 | {
314 | if (firstItem.Range.ListFormat.ListString != null)
315 | {
316 | var totalValues = firstItem.Range.ListParagraphs.Count;
317 | bool foundNumeric = false;
318 | foreach (Paragraph item in firstItem.Range.ListParagraphs.OfType<Paragraph>().Reverse())
319 | {
320 | if (listParagraphs.ContainsKey(item.Range.Start))
321 | {
322 | break;
323 | }
324 |
325 | var isNumeric = Regex.IsMatch(item.Range.ListFormat?.ListString ?? string.Empty, Constants.RegexExp.HasNumbers);
326 | if (foundNumeric == false)
327 | {
328 | foundNumeric = isNumeric;
329 | }
330 |
331 | if (foundNumeric == true && isNumeric == false)
332 | {
333 | continue;
334 | }
335 |
336 | if (item.Range.ListFormat.ListLevelNumber == 1 && (listParagraphs.Count == 0 || listParagraphs.Keys.Max() < item.Range.Start))
337 | {
338 | listParagraphs.Add(item.Range.Start, item.Range.Sentences.First.Text);
339 | }
340 | }
341 | }
342 | }
343 |
344 | return listParagraphs;
345 | }
346 |
347 | private List<string> ExtractTableContent(Document wordDocToExtract)
348 | {
349 | var tblParaList = new List<string>();
350 | try
351 | {
352 | foreach (Table table in wordDocToExtract.Tables)
353 | {
354 | foreach (Paragraph tblPara in table.Range.Paragraphs)
355 | {
356 | var cleanText = iUtils.CleanTextFromNonAsciiChar(tblPara.Range.Text).Replace(" ", "");
357 | if (!string.IsNullOrEmpty(cleanText))
358 | {
359 | tblParaList.Add(iUtils.CleanTextFromNonAsciiChar(tblPara.Range.Text));
360 | }
361 | }
362 | }
363 | }
364 | catch (Exception)
365 | {
366 | }
367 |
368 | return tblParaList;
369 | }
370 |
371 | /// <summary>
372 | /// Extract headers
373 | /// </summary>
374 | /// <param name="wordDocToExtract"></param>
375 | /// <returns>headers keyed by paragraph start offset</returns>
376 | private Dictionary<int, string> ExtractHeaders(Document wordDocToExtract, List<string> tblParaList, Dictionary<int, string> listParagraphs)
377 | {
378 | try
379 | {
380 | var headers = new Dictionary<int, string>();
381 | foreach (Paragraph para in wordDocToExtract.Paragraphs)
382 | {
383 | try
384 | {
385 | var textStart = para.Range.Start;
386 | if (listParagraphs.Count > 0 && textStart <= listParagraphs.Keys.Max() && textStart >= listParagraphs.Keys.Min())
387 | {
388 | continue;
389 | }
390 |
391 | var paraText = para.Range.Text;
392 | var cleanTextWithoutSpecialChar = iUtils.CleanTextFromNonAsciiChar(Regex.Replace(paraText.ToLower().Trim(), Constants.RegexExp.NoSpecialCharRegex, string.Empty));
393 | if (!string.IsNullOrEmpty(cleanTextWithoutSpecialChar) && tblParaList.Contains(iUtils.CleanTextFromNonAsciiChar(paraText)) == false)
394 | {
395 | string headingStyle = null;
396 | try
397 | {
398 | headingStyle = (para.Range.get_Style() as Style).NameLocal;
399 | }
400 | catch (Exception)
401 | {
402 | headingStyle = string.Empty;
403 | }
404 |
405 | if (headingStyle.Equals(WordHeading1) || headingStyle.Equals(WordHeading2) || headingStyle.Equals(WordHeading3) || headingStyle.Equals(WordHeading4) || para.Range.Font.Bold == -1)
406 | {
407 | if (!Regex.IsMatch(paraText, Constants.RegexExp.OnlyNumericWithSpaces))
408 | {
409 | headers.Add(textStart, iUtils.CleanTextFromNonAsciiChar(Regex.Replace(paraText.Replace(".", " ").TrimStart(), Constants.RegexExp.OnlyNumericWithSpaces, string.Empty)));
410 | }
411 | }
412 | else if (para.Range.Words.First.Bold == -1 || para.Range.Font.Size > 12)
413 | {
414 | var wordCount = para.Range.Sentences.First.Words.Count;
415 | if (wordCount <= 6)
416 | {
417 | var firstWords = iUtils.CleanTextFromNonAsciiChar(Regex.Replace(para.Range.Sentences.First.Text.Replace(".", " ").TrimStart(), Constants.RegexExp.OnlyNumericWithSpaces, string.Empty));
418 | if (firstWords.Length <= 1)
419 | {
420 | firstWords = iUtils.CleanTextFromNonAsciiChar(Regex.Replace(para.Range.Sentences[2].Text.Replace(".", " ").TrimStart(), Constants.RegexExp.OnlyNumericWithSpaces, string.Empty));
421 | }
422 |
423 | headers.Add(textStart, firstWords);
424 | }
425 | else
426 | {
427 | var boldText = string.Empty;
428 | var wordCounter = wordCount <= 25 ? wordCount : 25;
429 | for (int i = 1; i <= wordCounter; i++) // scan at most 25 words for a bold prefix
430 | {
431 | if (para.Range.Sentences.First.Words[i].Bold == -1)
432 | {
433 | boldText += para.Range.Words[i].Text;
434 | }
435 | else
436 | {
437 | break;
438 | }
439 | }
440 |
441 | if (boldText != string.Empty)
442 | {
443 | headers.Add(textStart, iUtils.CleanTextFromNonAsciiChar((Regex.Replace(boldText.Replace(".", " ").TrimStart(), Constants.RegexExp.OnlyNumericWithSpaces, string.Empty))));
444 | }
445 | }
446 | }
447 | }
448 | }
449 | catch (Exception)
450 | {
451 | }
452 | }
453 |
454 | return headers;
455 | }
456 | catch (Exception exception)
457 | {
458 | throw new Exception("Exception in extracting headers (" + nameof(ExtractHeaders) + ")\n", exception);
459 | }
460 | }
461 | }
462 | }
463 |
-------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/Code/Utils.cs: --------------------------------------------------------------------------------
1 | //Copyright(c) Microsoft Corporation.All rights reserved.
2 | //Licensed under the MIT License.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.IO;
7 | using System.Text;
8 | using Microsoft.Office.Interop.Word;
9 | using Newtonsoft.Json;
10 | using System.Text.RegularExpressions;
11 | using System.Linq;
12 | using Microsoft.WindowsAzure.Storage.Blob;
13 | using Microsoft.WindowsAzure.Storage.Queue;
14 |
15 |
16 | namespace DSnA.WebJob.DocumentParser
17 | {
18 | using Microsoft.Azure;
19 | using Microsoft.WindowsAzure.Storage;
20 | using System.IO.Abstractions;
21 | using Table = Table;
22 | public interface IUtils
23 | {
24 | string CleanNonSupportedSparkChar(string dirtyString);
25 | string CleanTextFromNonAsciiChar(string dirtyString);
26 | List<string> ExtractLinksFromText(string content, bool isPreProcessingReq = false);
27 | string SerializeAndSaveJson(dynamic jsonData, string fileName);
28 | string SaveToCsvFile(List<string> csvLines, string fileName);
29 | string SaveJsonToFile(string jsonDoc, string fileName, string directory);
30 | FileMetaData ExtractFileMetadata(string fileLocation);
31 | void UploadFileToBlob(string fileLocation, CloudBlobClient blobClient);
32 | List<string> GetBlobListFromOutputContainer(CloudBlobClient blobClient);
33 | void CheckBlobUriInMsg(QueueMessage queueMsg);
34 | void DeleteInputFiles(List<string> filesToDelete);
35 | void CheckAllQueueExists(List<CloudQueue> queueList);
36 | string DownloadBlobFile(string blobUri, string locationToSave, CloudBlobClient blobClient);
37 | bool CheckUriIsValid(string inputUri);
38 | string ConvertPdfToWord(string file, string directoryToSave, Application wordApp);
39 | JsonDocumentStruct PrepareErrorJsonDoc(string fileLocation, Exception exp);
40 | Tuple<int, int> FindTableWithHeader(Document wordDocToExtract, List<string> tableHeaders);
41 | CloudBlobClient CreateCloudBlobClient(CloudStorageAccount StorageAccount);
42 | CloudQueue GetQueueReference(CloudStorageAccount StorageAccount);
43 | int GetQueueMessageDequeueCount(CloudQueueMessage queueMsg);
44 | }
45 |
46 | public class Utils : IUtils
47 | {
48 | private readonly ILogger iLogger;
49 | private readonly IFileSystem iFileSystem;
50 | private readonly IInteropWordUtils iInteropWordUtils;
51 | private static string outputContainerName = CloudConfigurationManager.GetSetting(Constants.ParserConfig.OutputContainerNameRef);
52 |
53 | public Utils(ILogger iLogger)
54 | {
55 | this.iLogger = iLogger;
56 | this.iFileSystem = new FileSystem();
57 | iInteropWordUtils = new InteropWordUtils();
58 | }
59 |
60 | public Utils(ILogger iLogger, IFileSystem iFileSystem)
61 | {
62 | this.iLogger = iLogger;
63 | this.iFileSystem = iFileSystem;
64 | iInteropWordUtils = new InteropWordUtils();
65 | }
66 |
67 | public Utils(ILogger iLogger, IFileSystem iFileSystem, IInteropWordUtils iInteropWordUtils)
68 | {
69 | this.iLogger = iLogger;
70 | this.iFileSystem = iFileSystem;
71 | this.iInteropWordUtils = iInteropWordUtils;
72 | }
73 |
74 | /// <summary>
75 | /// Clean non supported Spark filename characters
76 | /// </summary>
77 | /// <param name="dirtyString"></param>
78 | /// <returns>filename string without characters unsupported by Spark</returns>
79 | public string CleanNonSupportedSparkChar(string dirtyString)
80 | {
81 | if (string.IsNullOrEmpty(dirtyString))
82 | return dirtyString;
83 |
84 | return dirtyString.Replace("%20", "_").Replace("{", "").Replace("}", "").Replace("[", "").Replace("]", "");
85 | }
86 |
87 | /// <summary>
88 | /// Clean non ascii char from input text/string
89 | /// </summary>
90 | /// <param name="dirtyString"></param>
91 | /// <returns>text with only ascii char</returns>
92 | public string CleanTextFromNonAsciiChar(string dirtyString)
93 | {
94 | if (string.IsNullOrEmpty(dirtyString))
95 | return dirtyString;
96 |
97 | string cleanString = Regex.Replace(dirtyString, Constants.RegexExp.NoEscapeSequences, String.Empty);
98 | cleanString = Regex.Replace(cleanString, Constants.RegexExp.OnlyAsciiChar, String.Empty);
99 | cleanString = Regex.Replace(cleanString, "\u0001", String.Empty);
100 | cleanString = Regex.Replace(cleanString, "\u0015", String.Empty);
101 | cleanString = Regex.Replace(cleanString, Constants.RegexExp.OnlyWhiteSpaces, " ");
102 | return cleanString.Trim();
103 | }
104 |
105 | /// <summary>
106 | /// extract only hyperlinks from text
107 | /// if preprocessing = true -> removes all spaces and adds spaces only before App protocols
108 | /// -to distinguish hyperlinks from other strings.
109 | /// </summary>
110 | /// <param name="content"></param>
111 | /// <param name="isPreProcessingReq"></param>
112 | /// <returns>list of hyperlinks</returns>
113 | public List<string> ExtractLinksFromText(string content, bool isPreProcessingReq = false)
114 | {
115 | // do some preprocessing on the text
116 | if (isPreProcessingReq)
117 | {
118 | string stringWithSpacesRemoved = Regex.Replace(content, Constants.RegexExp.OnlyWhiteSpaces, String.Empty);
119 | content = Regex.Replace(stringWithSpacesRemoved, Constants.RegexExp.HyperlinkAppProtocols, word => String.Format(@" {0}", word.Value)); //add space before http or https or ftp so that next regex can pick the link
120 | }
121 |
122 | MatchCollection matches = Regex.Matches(content, Constants.RegexExp.OnlyHyperlinks);
123 | List<string> webLinks = matches.Cast<Match>().Select(match => match.Value).ToList();
124 | return webLinks;
125 | }
126 |
127 | /// <summary>
128 | /// Serialize json string and save it to location
129 | /// </summary>
130 | /// <param name="jsonData"></param>
131 | /// <param name="fileName"></param>
132 | /// <returns>location where json file is saved</returns>
133 | public string SerializeAndSaveJson(dynamic jsonData, string fileName)
134 | {
135 | try
136 | {
137 | if (String.IsNullOrEmpty(fileName))
138 | fileName = Constants.FileConfigs.TempFileName + "-" + DateTime.UtcNow.ToString(Constants.DateTimeFormat);
139 |
140 | string finalJson = JsonConvert.SerializeObject(jsonData, Formatting.Indented);
141 | var jsonOutputFileLocation = SaveJsonToFile(finalJson, fileName, Constants.FileConfigs.OutputDirectoryPath);
142 | return jsonOutputFileLocation;
143 | }
144 | catch (Exception exception)
145 | {
146 | throw new Exception("Exception occurred in saving JSON (SerializeAndSaveJson)\n", exception);
147 | }
148 | }
149 |
150 | public string SaveToCsvFile(List<string> csvLines, string fileName)
151 | {
152 | if (String.IsNullOrEmpty(fileName))
153 | fileName = Constants.FileConfigs.TempFileName + "-" + DateTime.UtcNow.ToString(Constants.DateTimeFormat);
154 |
155 | var directory = Constants.FileConfigs.OutputDirectoryPath;
156 | if (!iFileSystem.Directory.Exists(directory))
157 | iFileSystem.Directory.CreateDirectory(directory);
158 |
159 | string outputFileName = Path.Combine(directory, $"{fileName}.csv");
160 | using (StreamWriter sw = new StreamWriter(outputFileName))
161 | {
162 | foreach (var row in csvLines)
163 | {
164 | sw.WriteLine(row);
165 | }
166 | }
167 |
168 | return outputFileName;
169 | }
170 |
171 | /// <summary>
172 | /// Populate all metadata related to extracted file
173 | /// </summary>
174 | /// <param name="fileLocation"></param>
175 | ///
176 | /// <returns>FileMetaData</returns>
177 | public FileMetaData ExtractFileMetadata(string fileLocation)
178 | {
179 | try
180 | {
181 | FileMetaData fileData = new FileMetaData();
182 | var fileName = Path.GetFileName(fileLocation);
183 | fileData.AgreementNumber = fileName.Contains("_") ? fileName.Split('_')[0] : string.Empty;
184 | fileData.FileName = fileName;
185 | fileData.FileType = Path.GetExtension(fileLocation).Replace(".", "");
186 | fileData.ExtractionTimeStamp = DateTime.UtcNow.ToString(Constants.DateTimeFormat);
187 | return fileData;
188 | }
189 | catch (Exception exception)
190 | {
191 | throw new Exception("Error in extracting metadata of given file (ExtractFileMetadata)\n", exception);
192 | }
193 | }
194 |
195 | /// <summary>
196 | /// upload file to given blob location
197 | /// </summary>
198 | /// <param name="fileLocation"></param>
199 | /// <param name="blobClient"></param>
200 | public void UploadFileToBlob(string fileLocation, CloudBlobClient blobClient)
201 | {
202 | try
203 | {
204 | var container = blobClient.GetContainerReference(outputContainerName);
205 | ICloudBlob blob = container.GetBlockBlobReference(Path.GetFileName(fileLocation));
206 | if (blob == null)
207 | throw new Exception("Inaccessible blob location --> " + container?.Uri?.AbsolutePath + " (UploadFileToBlob)\n");
208 |
209 | blob.UploadFromFile(fileLocation);
210 | }
211 | catch (Exception exception)
212 | {
213 | throw new Exception("Error in uploading the output JSON file to blob location (" + fileLocation + ") in (UploadFileToBlob)\n", exception);
214 | }
215 | }
216 |
217 | public List<string> GetBlobListFromOutputContainer(CloudBlobClient blobClient)
218 | {
219 | try
220 | {
221 | var container = blobClient.GetContainerReference(outputContainerName);
222 | var blobList = container.ListBlobs(useFlatBlobListing: true);
223 | var outputBlobs = blobList.Select(s => s.Uri.Segments[s.Uri.Segments.Length - 1].Replace(".json", "")).ToList();
224 | return outputBlobs;
225 | }
226 | catch (Exception exception)
227 | {
228 | throw new Exception("Error while executing func GetBlobListFromOutputContainer", exception);
229 | }
230 | }
231 |
232 | /// <summary>
233 | /// higher level - check blob URI is valid
234 | /// </summary>
235 | /// <param name="queueMsg"></param>
236 | public void CheckBlobUriInMsg(QueueMessage queueMsg)
237 | {
238 | if (!CheckUriIsValid(queueMsg.FileInputUri))
239 | throw new Exception("Queue message is invalid" + "-->" + queueMsg.FileInputUri);
240 |
241 | if (!CheckUriIsValid(queueMsg.FileOutputUri))
242 | throw new Exception("Queue message is invalid" + "-->" + queueMsg.FileOutputUri);
243 | }
244 |
245 | /// <summary>
246 | /// Delete given list of files
247 | /// </summary>
248 | /// <param name="filesToDelete"></param>
249 | public void DeleteInputFiles(List<string> filesToDelete)
250 | {
251 | try
252 | {
253 | foreach (var file in filesToDelete)
254 | {
255 | if (iFileSystem.File.Exists(file))
256 | iFileSystem.File.Delete(file);
257 | }
258 | }
259 | catch (Exception exception)
260 | {
261 | throw new UnableToDeleteFileException("Unable to delete files related to reports (pdf doc or word doc or json output file)\n", exception);
output file)\n", exception); 262 | } 263 | } 264 | 265 | /// 266 | /// Check given list of Azure Queues exist 267 | /// 268 | /// 269 | public void CheckAllQueueExists(List queueList) 270 | { 271 | foreach (CloudQueue queue in queueList) 272 | { 273 | if (!queue.Exists()) 274 | throw new Exception("Message Queue is inaccessible or does not exist" + "-->" + queue.Uri + "(CheckAllQueueExists)\n"); 275 | } 276 | } 277 | 278 | /// 279 | /// Download blob file to local file system 280 | /// 281 | /// 282 | /// 283 | /// 284 | /// local file location where the file is saved 285 | public string DownloadBlobFile(string blobUri, string locationToSave, CloudBlobClient blobClient) 286 | { 287 | try 288 | { 289 | if (!iFileSystem.Directory.Exists(locationToSave)) 290 | iFileSystem.Directory.CreateDirectory(locationToSave); 291 | 292 | ICloudBlob blob = blobClient.GetBlobReferenceFromServer(new Uri(blobUri)); 293 | if (blob == null) 294 | throw new Exception("Inaccessible blob location " + blobUri + "\n"); 295 | 296 | string fileName = Path.GetFileName(new Uri(blobUri).LocalPath).Replace(" ", "_").Replace("{", "").Replace("}", "").Replace("[", "").Replace("]", ""); // to deal with spaces in filenames and invalid values for spark 297 | string localBlobLocation = Path.Combine(locationToSave, fileName); 298 | blob.DownloadToFile(localBlobLocation, FileMode.Create); 299 | return localBlobLocation; 300 | } 301 | catch (Exception exception) 302 | { 303 | throw new Exception("Exception occured in downloading file from Azure Blob Storage(DownloadBlobFile)\n", exception); 304 | } 305 | } 306 | 307 | /// 308 | /// check whether given uri is valid by recreating it 309 | /// 310 | /// 311 | /// true if valid, false otherwise 312 | public bool CheckUriIsValid(string inputUri) 313 | { 314 | try 315 | { 316 | Uri result; 317 | if (String.IsNullOrEmpty(inputUri) || String.IsNullOrWhiteSpace(inputUri)) 318 | return false; 319 | 320 | if (!Uri.TryCreate(inputUri, UriKind.Absolute, out result)) 321 | return false; 322 | 323 | if (!result.Scheme.Equals(Uri.UriSchemeHttp) && !result.Scheme.Equals(Uri.UriSchemeHttps)) 324 | return false; 325 | 326 | return true; 327 | } 328 | catch (Exception exception) 329 | { 330 | throw new Exception("Exception occured in checking blob URI(CheckUriIsValid)\n", exception); 331 | } 332 | } 333 | 334 | /// 335 | /// save json output to file in local filesystem 336 | /// 337 | /// 338 | /// 339 | /// 340 | /// saved local file location 341 | public string SaveJsonToFile(string jsonData, string fileName, string directory) 342 | { 343 | 344 | if (!iFileSystem.Directory.Exists(directory)) 345 | iFileSystem.Directory.CreateDirectory(directory); 346 | 347 | string outputFileName = Path.Combine(directory, $"{fileName}.json"); 348 | JsonTextWriter jsonTextWriter = new JsonTextWriter(iFileSystem.File.CreateText(outputFileName)); 349 | jsonTextWriter.Close(); 350 | // file is overwritten if already exists 351 | iFileSystem.File.WriteAllText(outputFileName, jsonData); 352 | return outputFileName; 353 | } 354 | 355 | /// 356 | /// Convert PDF document to MS Word document 357 | /// 358 | /// 359 | /// location of converted/saved word doc 360 | public string ConvertPdfToWord(string file, string directoryToSave, Application wordApp) 361 | { 362 | Document pdfAsWordDoc = null; 363 | try 364 | { 365 | if (!iFileSystem.Directory.Exists(directoryToSave)) 366 | iFileSystem.Directory.CreateDirectory(directoryToSave); 367 | 368 | pdfAsWordDoc = iInteropWordUtils.OpenDocument(file, wordApp); 369 | string 
/// <summary>
/// Convert a PDF document to an MS Word document
/// </summary>
/// <param name="file"></param>
/// <param name="directoryToSave"></param>
/// <param name="wordApp"></param>
/// <returns>location of the converted/saved Word doc</returns>
public string ConvertPdfToWord(string file, string directoryToSave, Application wordApp)
{
    Document pdfAsWordDoc = null;
    try
    {
        if (!iFileSystem.Directory.Exists(directoryToSave))
            iFileSystem.Directory.CreateDirectory(directoryToSave);

        pdfAsWordDoc = iInteropWordUtils.OpenDocument(file, wordApp);
        string convertedDocFileLocation = directoryToSave + "/" + Path.ChangeExtension(Path.GetFileName(file), ".doc");
        pdfAsWordDoc.SaveAs2(convertedDocFileLocation, WdSaveFormat.wdFormatDocument);
        return convertedDocFileLocation;
    }
    catch (Exception exception)
    {
        throw new Exception("Exception occurred while converting PDF document to Word document (ConvertPdfToWord)\n", exception);
    }
    finally
    {
        // Close without saving and release resources
        pdfAsWordDoc?.Close(SaveChanges: false);
    }
}

/// <summary>
/// On error, prepare a JSON document with the error details
/// </summary>
/// <param name="fileLocation"></param>
/// <param name="exp"></param>
/// <returns>JSON structure with error details</returns>
public JsonDocumentStruct PrepareErrorJsonDoc(string fileLocation, Exception exp)
{
    JsonDocumentStruct jsonDoc = new JsonDocumentStruct();
    jsonDoc.Errors = new Error();
    jsonDoc.Errors.IsError = true;
    jsonDoc.Errors.Description = exp.ToString();
    jsonDoc.FileProperties = ExtractFileMetadata(fileLocation);
    return jsonDoc;
}

/// <summary>
/// Find the table containing the provided table headers - mainly for documents containing tables
/// </summary>
/// <param name="wordDocToExtract"></param>
/// <param name="tableHeaders"></param>
/// <returns>table index and row index (to know where to start reading data from)</returns>
public Tuple<int, int> FindTableWithHeader(Document wordDocToExtract, List<string> tableHeaders)
{
    try
    {
        var tableIndex = 1;
        string cleanColHeader;
        var cellText = new StringBuilder();
        var columnHeadersAsList = new List<string>();
        foreach (Table table in wordDocToExtract.Tables)
        {
            if (table.Columns.Count == tableHeaders.Count)
            {
                // check each row: the header row can appear anywhere in the table
                for (var row = 1; row <= table.Rows.Count; row++)
                {
                    for (var col = 1; col <= table.Columns.Count; col++)
                    {
                        foreach (Paragraph para in table.Cell(row, col).Range.Paragraphs)
                            cellText.Append(para.Range.Text);

                        // looks for the exact wording of the provided headers
                        cleanColHeader = Regex.Replace(cellText.ToString(), Constants.RegexExp.NoSpecialCharRegex, "");
                        columnHeadersAsList.Add(tableHeaders.Find(x => cleanColHeader.ToLower().Equals(x.ToLower())));
                        cellText.Clear();
                    }

                    // if this table is the table we were looking for, break and return the table index
                    if (!columnHeadersAsList.Contains(null))
                        return Tuple.Create(tableIndex, row + 1);

                    columnHeadersAsList.Clear();
                }
            }

            tableIndex++;
        }

        return Tuple.Create(-1, -1);
    }
    catch (Exception exception)
    {
        throw new Exception("Exception occurred on finding table with provided headers in document (FindTableWithHeader)\n", exception);
    }
}
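
// A sketch of how FindTableWithHeader might be called (the header names are hypothetical):
//   Tuple<int, int> hit = utils.FindTableWithHeader(doc, new List<string> { "Symbol", "Title", "Date" });
//   if (hit.Item1 != -1)
//   {
//       // headers found in table hit.Item1; data rows start at row hit.Item2
//   }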
/// <summary>
/// Create a cloud blob client from the storage account
/// </summary>
/// <param name="storageAccount"></param>
/// <returns>Cloud blob client</returns>
public CloudBlobClient CreateCloudBlobClient(CloudStorageAccount storageAccount)
{
    return storageAccount?.CreateCloudBlobClient();
}

/// <summary>
/// Get a reference to the Azure queue from the storage account
/// </summary>
/// <param name="storageAccount"></param>
/// <returns>Queue reference</returns>
public CloudQueue GetQueueReference(CloudStorageAccount storageAccount)
{
    CloudQueueClient queueClient = storageAccount?.CreateCloudQueueClient();
    return queueClient?.GetQueueReference(CloudConfigurationManager.GetSetting(Constants.ParserConfig.MessageQueueRef));
}

/// <summary>
/// Get the queue message dequeue count
/// </summary>
/// <param name="queueMsg"></param>
/// <returns>message dequeue count</returns>
public int GetQueueMessageDequeueCount(CloudQueueMessage queueMsg)
{
    return queueMsg.DequeueCount;
}
    }
}
--------------------------------------------------------------------------------
/knowledge_extraction_paragraph_level.py:
--------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.


#%% Imports

import pandas as pd
import numpy as np
from nltk import word_tokenize  # requires the nltk 'punkt' and 'stopwords' data packages
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import re
import string
import spacy
# spaCy 3.x (pinned in requirements.txt) no longer supports the 'en' shortcut link;
# load the small English pipeline by its full name instead.
spacy_nlp = spacy.load('en_core_web_sm')
import gensim
from scipy import spatial
from collections import Counter
import matplotlib.pyplot as plt
import difflib


stop_words = set(stopwords.words('english'))

current_dir = './UN_Knowledge_Extraction/'
data_dir = current_dir + "data/"
output_dir = current_dir + "output/"

UN_DOCS_Paragraphs = pd.read_csv(data_dir + "UN_RES_DOCS_2009_2018.csv").fillna('').reset_index(drop=True)
w2v_google = gensim.models.KeyedVectors.load_word2vec_format(data_dir + 'GoogleNews-vectors-negative300.bin.gz', binary=True)


UNBIS_terms = pd.read_csv(data_dir + "UNBIS_terms.csv", encoding='cp1252')
UNBIS_terms = [term.lower() for term in UNBIS_terms['Term'].unique().tolist()]

SDG_Targets_Indicators = pd.read_csv(data_dir + "SDG_Targets_Indicators.csv", encoding='cp1252')
SDG = list(SDG_Targets_Indicators['SDG'].drop_duplicates())

Targets_SDG_dict = pd.Series(SDG_Targets_Indicators.loc[SDG_Targets_Indicators.Type == 'Targets'].SDG.values, index=SDG_Targets_Indicators.loc[SDG_Targets_Indicators.Type == 'Targets'].Content).to_dict()
Indicators_SDG_dict = pd.Series(SDG_Targets_Indicators.loc[SDG_Targets_Indicators.Type == 'Indicators'].SDG.values, index=SDG_Targets_Indicators.loc[SDG_Targets_Indicators.Type == 'Indicators'].Content).to_dict()

Targets = list(SDG_Targets_Indicators.loc[SDG_Targets_Indicators.Type == 'Targets']['Content'].drop_duplicates())
Indicators = list(SDG_Targets_Indicators.loc[SDG_Targets_Indicators.Type == 'Indicators']['Content'].drop_duplicates())


# collect the ten most frequent non-stopword tokens in each SDG's targets and indicators
SDG_Targets_Indicators_High_Frequency_Words = dict()
for sdg in list(SDG_Targets_Indicators.SDG.unique()):  # 'sdg' avoids clobbering the SDG list above
    target = [key for key, value in Targets_SDG_dict.items() if value == sdg]
    indicator = [key for key, value in Indicators_SDG_dict.items() if value == sdg]
    tokenizer = RegexpTokenizer(r'\w+')
    all_words = [w for w in tokenizer.tokenize(' '.join(target + indicator).lower().replace('\t', ' ')) if w not in stop_words]
    SDG_Targets_Indicators_High_Frequency_Words[sdg] = Counter(all_words).most_common(10)

for sdg in SDG_Targets_Indicators_High_Frequency_Words.keys():
    print(sdg, SDG_Targets_Indicators_High_Frequency_Words[sdg])


preambular_verb_list = open(data_dir + "preambular_verb_list.txt").read().splitlines()
operative_verb_list = open(data_dir + "operative_verb_list.txt").read().splitlines()
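
# Illustration of the paragraph classification below (the verbs are assumed to
# appear in the two lists loaded above):
#   "Recalling its resolution 70/1 ..."            -> first action verb "recalling" -> preambular
#   "Requests the Secretary-General to report ..." -> first action verb "requests"  -> operative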
UN_DOCS_Paragraphs['First_Action_Verb'] = ''
UN_DOCS_Paragraphs['Paragraph_Type'] = ''
UN_DOCS_Paragraphs['Key_Terms'] = [list() for x in range(len(UN_DOCS_Paragraphs.index))]
UN_DOCS_Paragraphs['Referenced_Resolutions'] = [list() for x in range(len(UN_DOCS_Paragraphs.index))]
UN_DOCS_Paragraphs['Referenced_Resolutions_Dates'] = [dict() for x in range(len(UN_DOCS_Paragraphs.index))]
UN_DOCS_Paragraphs['SDG'] = [list() for x in range(len(UN_DOCS_Paragraphs.index))]


for index, row in UN_DOCS_Paragraphs.iterrows():
    if index % 10000 == 0:
        print(index)
    # keep a lightly cleaned copy: the resolution-reference patterns further down
    # need the digits and punctuation that are stripped from Content below
    Content_raw = row['Content'].replace('\t', ' ')
    Content_raw = ''.join(filter(lambda x: x in string.printable, Content_raw))
    Content = Content_raw.translate(str.maketrans('', '', '(),:;?@{|}~.'))
    Content = Content.translate(str.maketrans('', '', string.digits))
    tokenized_word = word_tokenize(Content.lower())
    Content_space_separated = " " + " ".join(tokenized_word) + " "
    word_count = len(tokenized_word)


    if row['Type'] == 'Paragraph' and word_count >= 10:
        first_action_verb = ''
        try:
            first_action_verb = next(word for word in tokenized_word[:10] if word in preambular_verb_list + operative_verb_list)
        except StopIteration:
            pass
        # a paragraph starting with an upper-case letter is a new paragraph;
        # one starting lower-case continues the preceding paragraph
        if not Content[0].islower():
            UN_DOCS_Paragraphs.loc[index, 'First_Action_Verb'] = first_action_verb
        if first_action_verb in preambular_verb_list and not Content[0].islower():
            UN_DOCS_Paragraphs.loc[index, 'Paragraph_Type'] = 'preambular'
        elif first_action_verb in operative_verb_list and not Content[0].islower():
            UN_DOCS_Paragraphs.loc[index, 'Paragraph_Type'] = 'operative'
        elif Content[0].islower():
            previous_paragraph_types = list(UN_DOCS_Paragraphs.Paragraph_Type[(index - 5):(index - 1)])
            previous_paragraph_types_non_empty = [x for x in previous_paragraph_types if x != '']
            if len(previous_paragraph_types_non_empty) >= 1:
                UN_DOCS_Paragraphs.loc[index, 'Paragraph_Type'] = previous_paragraph_types_non_empty[-1]

    # match UNBIS thesaurus terms, longest first, removing each match so that
    # shorter terms nested inside a longer one are not double-counted; matching
    # on a lower-cased copy so capitalised occurrences are found too
    matching_terms = list(set([term for term in UNBIS_terms if " " + term + " " in Content_space_separated]))
    matching_terms.sort(key=len, reverse=True)
    key_terms = []
    Content_lower = Content.lower()
    for term in matching_terms:
        if term in Content_lower:
            key_terms.append(term)
            Content_lower = Content_lower.replace(term, '')
    UN_DOCS_Paragraphs.at[index, 'Key_Terms'] = key_terms
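    # e.g. if the UNBIS terms include both "human rights" and "rights" (illustrative),
    # a paragraph mentioning only "human rights" yields Key_Terms == ['human rights']:
    # the longer term is matched first and removed before "rights" is checked.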
    Referenced_Resolutions = re.findall(r'resolutions \w*-*\d+[/]*[.]*\d+\s*\(*\w*-*\w*\)* .* and all subsequent related resolutions|resolutions \w*-*\d+[/]*[.]*\d+\s*\(*\w*-*\w*\)* of [0-9]{1,2} [A-Za-z]{3,9} [0-9]{4}.* and \w*-*\d+[/]*[.]*\d+\s*\(*\w*-*\w*\)* of [0-9]{1,2} [A-Za-z]{3,9} [0-9]{4}|resolutions \w*-*\d+[/]*[.]*\d+\s*\(*\w*-*\w*\)* and \w*-*\d+[/]*[.]*\d+\s*\(*\w*-*\w*\)* of [0-9]{1,2} [A-Za-z]{3,9} [0-9]{4}|resolution \w*-*\d+[/]*[.]*\d+\s*\(*\w*-*\w*\)* of [0-9]{1,2} [A-Za-z]{3,9} [0-9]{4}|resolutions \w*-*\d+[/]*[.]*\d+\s*\(*\w*-*\w*\)*.* and \w*-*\d+[/]*\d+\s*\(*\w*-*\w*\)* of [0-9]{1,2} [A-Za-z]{3,9} [0-9]{4}|resolutions \w*-*\d+[/]*[.]*\d+\s*\(*\w*-*\w*\)*.* and \w*-*\d+[/]*\d+\s*\(*\w*-*\w*\)*|resolution \w*-*\d+[/]*[.]*\d+ \(\w*-*\w*\)|resolution \w*-*\d+[/]*[.]*\d+', Content_raw)
    Referenced_Resolutions_Dates = {}  # dict of resolution number -> date (the column holds dicts)
    month_numbers = {'January': '01', 'February': '02', 'March': '03', 'April': '04',
                     'May': '05', 'June': '06', 'July': '07', 'August': '08',
                     'September': '09', 'October': '10', 'November': '11', 'December': '12'}
    for referenced_resolution in Referenced_Resolutions:
        # normalise dates like "of 25 September 2015" to "of 25/09/2015"
        for month, number in month_numbers.items():
            referenced_resolution = re.sub(' ' + month + ' ', '/' + number + '/', referenced_resolution)
        referenced_resolution_split = re.split(',|and', referenced_resolution)
        for resolution in referenced_resolution_split:
            if re.search(r'resolution\w* (.*) of ([0-9]{1,2}/[0-9]{2}/[0-9]{4})', resolution):
                resolution_number = re.findall(r'resolution\w* (.*) of', resolution)[0]
                date = re.findall(r'of ([0-9]{1,2}/[0-9]{2}/[0-9]{4})', resolution)[0]
            elif re.search(r'\s*(.*) of ([0-9]{1,2}/[0-9]{2}/[0-9]{4})', resolution):
                resolution_number = re.findall(r'\s*(.*) of', resolution)[0]
                date = re.findall(r'of ([0-9]{1,2}/[0-9]{2}/[0-9]{4})', resolution)[0]
            elif re.search(r'resolution\w* (.*)', resolution):
                resolution_number = re.findall(r'resolution\w* (.*)', resolution)[0]
                date = 'NA'
            elif re.search(r'\w*-*\d+[/]*[.]*\d+\s*\(*\w*-*\w*\)*', resolution):
                resolution_number = re.findall(r'\w*-*\d+[/]*[.]*\d+\s*\(*\w*-*\w*\)*', resolution)[0]
                date = 'NA'
            else:
                continue  # fragment matches none of the patterns; skip rather than reuse stale values
            Referenced_Resolutions_Dates[resolution_number] = date
    UN_DOCS_Paragraphs.at[index, 'Referenced_Resolutions'] = Referenced_Resolutions
    UN_DOCS_Paragraphs.at[index, 'Referenced_Resolutions_Dates'] = Referenced_Resolutions_Dates
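    # Worked example (illustrative text): "resolution 70/1 of 25 September 2015"
    # matches the single-resolution pattern; the month substitution turns it into
    # "resolution 70/1 of 25/09/2015", which parses to {'70/1': '25/09/2015'}.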
    # crude keyword routing of each paragraph to a single SDG
    if any(x in tokenized_word for x in ['poverty', 'poor']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('No Poverty')
    elif any(x in Content.lower() for x in ['hunger', 'hungry', 'malnutrition', 'food crisis', 'sufficient food', 'food producers', 'food production', 'food reserves', 'food price', 'food insecurity', 'food security', 'undernutrition']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Zero Hunger')
    elif any(x in tokenized_word for x in ['health', 'well-being', 'mortality', 'disease']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Good Health and Well-Being')
    elif any(x in tokenized_word for x in ['education', 'educational']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Quality Education')
    elif 'gender equality' in Content.lower():  # two-word phrase: check the text, not single tokens
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Gender Equality')
    elif any(x in tokenized_word for x in ['water', 'sanitation', 'wastewater']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Clean Water and Sanitation')
    elif any(x in tokenized_word for x in ['energy', 'renewable']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Affordable and Clean Energy')
    elif any(x in tokenized_word for x in ['labour-intensive', 'employment']) or any(x in Content.lower() for x in ['child labour', 'labour rights', 'decent work', 'economic growth', 'economic productivity']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Decent Work and Economic Growth')
    elif any(x in tokenized_word for x in ['industry', 'innovation', 'infrastructure']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Industry, Innovation and Infrastructure')
    elif any(x in tokenized_word for x in ['inequalities', 'inequality']) and 'gender equality' not in Content.lower():
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Reduced Inequalities')
    elif 'sustainable cities' in Content.lower():
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Sustainable Cities and Communities')
    elif 'consumption and production' in Content.lower():
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Responsible Consumption and Production')
    elif any(x in Content.lower() for x in ['climate change', 'climate-related', 'natural disaster', 'national disaster', 'local disaster']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Climate Action')
    elif any(x in tokenized_word for x in ['marine', 'fisheries', 'coastal']) or 'oceans and seas' in Content.lower():
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Life Below Water')
    elif any(x in tokenized_word for x in ['biodiversity', 'land', 'inland', 'species']):  # 'land ' with a trailing space could never equal a token
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Life on Land')
    elif 'institutions' in tokenized_word and any(x in tokenized_word for x in ['peace', 'justice', 'strong']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Peace, Justice and Strong Institutions')
    elif any(x in tokenized_word for x in ['partner', 'partners', 'partnership', 'partnerships']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Partnerships for the Goals')

w2v_Targets = []
w2v_Indicators = []
Targets_isalpha = []
Indicators_isalpha = []

for i in range(len(Targets)):
    tokenized_word = word_tokenize(Targets[i].lower())
    tokenized_word = [word for word in tokenized_word if len(word) > 1]
    tokenized_word = [word for word in tokenized_word if word.isalpha()]
    Targets_isalpha.append(' '.join(tokenized_word))
    words_in_vocab = [word for word in tokenized_word if word in w2v_google.vocab]
    w2v_sum = np.sum(w2v_google[words_in_vocab], axis=0)
    #w2v_average = np.average(w2v_google[words_in_vocab], axis=0)
    w2v_Targets.append(w2v_sum)

for i in range(len(Indicators)):
    tokenized_word = word_tokenize(Indicators[i].lower())
    tokenized_word = [word for word in tokenized_word if len(word) > 1]
    tokenized_word = [word for word in tokenized_word if word.isalpha()]
    Indicators_isalpha.append(' '.join(tokenized_word))
    words_in_vocab = [word for word in tokenized_word if word in w2v_google.vocab]
    w2v_sum = np.sum(w2v_google[words_in_vocab], axis=0)
    #w2v_average = np.average(w2v_google[words_in_vocab], axis=0)
    w2v_Indicators.append(w2v_sum)
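
# Each target/indicator is embedded as the unnormalised sum of the word2vec vectors
# of its alphabetic, in-vocabulary words; the scores below use cosine similarity,
# which is scale-invariant, so the missing normalisation does not affect them.
# Sketch (assumes both words are in the GoogleNews vocabulary):
#   np.sum(w2v_google[['end', 'poverty']], axis=0)  # -> a single 300-d vector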

def Common_Substring(string1, string2):
    """Matching blocks between the two strings, longest first (the terminating
    zero-length block reported by SequenceMatcher yields a trailing '')."""
    substrings = []
    matches = difflib.SequenceMatcher(None, string1, string2).get_matching_blocks()
    for match in sorted(matches, key=lambda x: x[2], reverse=True):
        substrings.append(string1[match.a:match.a + match.size])
    return substrings


similarity_threshold_target = 0.9
similarity_threshold_indicator = 0.9

UN_DOCS_Paragraphs['Closest_Target'] = [list() for x in range(len(UN_DOCS_Paragraphs.index))]
UN_DOCS_Paragraphs['Closest_Indicator'] = [list() for x in range(len(UN_DOCS_Paragraphs.index))]
UN_DOCS_Paragraphs['Closest_Target_Similarity_Score'] = 0.0
UN_DOCS_Paragraphs['Closest_Indicator_Similarity_Score'] = 0.0

for row_index in range(len(UN_DOCS_Paragraphs)):
    if row_index % 1000 == 0:
        print(row_index)
    if UN_DOCS_Paragraphs.loc[row_index, 'Type'] == 'Paragraph':
        paragraph = UN_DOCS_Paragraphs.loc[row_index, 'Content']
        tokenized_word = word_tokenize(paragraph.lower().replace('\t', ' '))
        tokenized_word = [word for word in tokenized_word if len(word) > 1]
        tokenized_word = [word for word in tokenized_word if word.isalpha()]
        paragraph_isalpha = ' '.join(tokenized_word)

        similarity_with_target_common_substring = []
        for i in range(len(Targets_isalpha)):
            paragraph_target_common_substring = Common_Substring(paragraph_isalpha, Targets_isalpha[i])
            if len(paragraph_target_common_substring) == 0:
                similarity_with_target_common_substring.append(0.0)
            else:
                # keep the three longest common blocks and embed their words
                paragraph_target_common_substring = paragraph_target_common_substring[:3]
                paragraph_target_common_substring_aggregated = ' '.join(paragraph_target_common_substring)
                words_common_substring = paragraph_target_common_substring_aggregated.split()
                words_common_substring = [word for word in words_common_substring if word in tokenized_word]
                words_common_substring_in_vocab = [word for word in words_common_substring if word in w2v_google.vocab]
                if len(words_common_substring_in_vocab) >= 1:
                    w2v_common_substring = np.sum(w2v_google[words_common_substring_in_vocab], axis=0)
                    similarity_with_target_common_substring.append(1 - spatial.distance.cosine(w2v_Targets[i], w2v_common_substring))
                else:  # none of the words in the common substring are in the vocabulary
                    similarity_with_target_common_substring.append(len(words_common_substring) / len(Targets_isalpha[i].split()))

        similarity_with_indicator_common_substring = []
        for i in range(len(Indicators_isalpha)):
            paragraph_indicator_common_substring = Common_Substring(paragraph_isalpha, Indicators_isalpha[i])
            if len(paragraph_indicator_common_substring) == 0:
                similarity_with_indicator_common_substring.append(0.0)
            else:
                paragraph_indicator_common_substring = paragraph_indicator_common_substring[:3]
                paragraph_indicator_common_substring_aggregated = ' '.join(paragraph_indicator_common_substring)
                words_common_substring = paragraph_indicator_common_substring_aggregated.split()
                words_common_substring = [word for word in words_common_substring if word in tokenized_word]
                words_common_substring_in_vocab = [word for word in words_common_substring if word in w2v_google.vocab]
                if len(words_common_substring_in_vocab) >= 1:
                    w2v_common_substring = np.sum(w2v_google[words_common_substring_in_vocab], axis=0)
                    similarity_with_indicator_common_substring.append(1 - spatial.distance.cosine(w2v_Indicators[i], w2v_common_substring))
                else:  # none of the words in the common substring are in the vocabulary
                    similarity_with_indicator_common_substring.append(len(words_common_substring) / len(Indicators_isalpha[i].split()))

        UN_DOCS_Paragraphs.loc[row_index, 'Closest_Target_Similarity_Score'] = max(similarity_with_target_common_substring)
        UN_DOCS_Paragraphs.loc[row_index, 'Closest_Indicator_Similarity_Score'] = max(similarity_with_indicator_common_substring)

        similar_target_index = [i for i, similarity in enumerate(similarity_with_target_common_substring) if similarity >= similarity_threshold_target]
        similar_indicator_index = [i for i, similarity in enumerate(similarity_with_indicator_common_substring) if similarity >= similarity_threshold_indicator]
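        # The block below keeps only the single closest target or indicator
        # (whichever scores higher and clears its 0.9 threshold) and maps it
        # back to its SDG.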
        if ((len(similar_target_index) >= 1) and (max(similarity_with_target_common_substring) >= max(similarity_with_indicator_common_substring))):
            most_similar_target_index = similarity_with_target_common_substring.index(max(similarity_with_target_common_substring))
            most_similar_target = Targets[most_similar_target_index]
            UN_DOCS_Paragraphs.at[row_index, 'Closest_Target'].append(most_similar_target)
            if Targets_SDG_dict[most_similar_target] not in UN_DOCS_Paragraphs.at[row_index, 'SDG']:
                UN_DOCS_Paragraphs.at[row_index, 'SDG'].append(Targets_SDG_dict[most_similar_target])
        elif ((len(similar_indicator_index) >= 1) and (max(similarity_with_target_common_substring) <= max(similarity_with_indicator_common_substring))):
            most_similar_indicator_index = similarity_with_indicator_common_substring.index(max(similarity_with_indicator_common_substring))
            most_similar_indicator = Indicators[most_similar_indicator_index]
            UN_DOCS_Paragraphs.at[row_index, 'Closest_Indicator'].append(most_similar_indicator)
            if Indicators_SDG_dict[most_similar_indicator] not in UN_DOCS_Paragraphs.at[row_index, 'SDG']:
                UN_DOCS_Paragraphs.at[row_index, 'SDG'].append(Indicators_SDG_dict[most_similar_indicator])



country_list = pd.read_excel(data_dir + "country_list.xlsx").fillna('')
country_names = [country.strip().replace('&', 'and') for country in country_list['Country'].tolist()]

UN_agencies = pd.read_excel(data_dir + "agencies.xlsx").fillna('')
UN_known_orgs = pd.read_excel(data_dir + "un_entities_20191017.xlsx").fillna('')

UN_corporate_names = pd.read_excel(data_dir + "names_A60-72.xlsx").fillna('')
UN_corporate_names = [x for x in UN_corporate_names['Name'] if x not in country_names]
# drop parenthesised abbreviations, expand "UN", remove dots
UN_corporate_names = [re.sub(r"[\(].*?[\)]", "", x).replace('UN', 'United Nations').replace('.', '').strip() for x in UN_corporate_names]

additional_un_org_list = [
    'Advisory Committee on Administrative and Budgetary Questions',
    'African Union Mission in Somalia',
    'European Union Rule of Law Mission in Kosovo',
    'Special Political and Decolonization Committee (Fourth Committee)',
    'United Nations Conference on Environment and Development',
    'United Nations Entity for Gender Equality and the Empowerment of Women (UN-Women)',
    'Bretton Woods Institutions',
    'International Tribunal for the Former Yugoslavia',
    'United Nations Assistance Mission in Afghanistan',
    "United Nations Operation in Côte d'Ivoire",
    'Consultative Group on International Agricultural Research',
]

known_un_org_list = list(set(
    UN_agencies['Title'].tolist()
    + UN_known_orgs['Entity'].tolist()
    + UN_corporate_names
    + additional_un_org_list
))
known_un_org_list = [x for x in known_un_org_list if x not in country_names]


known_un_org_list = [org.translate(str.maketrans('', '', ',;:."')) for org in known_un_org_list]
#known_un_org_list = [re.sub(r'[^\x00-\x7F]+', ' ', org) for org in known_un_org_list]
known_un_org_list = [''.join([x if x in string.printable else '' for x in org]) for org in known_un_org_list]
known_un_org_list = [' '.join(w for w in org.split()) for org in known_un_org_list]
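
# Illustrative effect of the corporate-name cleaning above (the entry is made up):
#   "UN Development Programme (UNDP)" -> "United Nations Development Programme"
# (the parenthesised abbreviation is dropped before "UN" is expanded)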
known_un_org_w2v = dict()
for org in known_un_org_list:
    words_in_vocab = [word for word in word_tokenize(org.lower()) if word in w2v_google.vocab]
    if len(words_in_vocab) >= 1:
        w2v_sum = np.sum(w2v_google[words_in_vocab], axis=0)
        known_un_org_w2v[org] = w2v_sum
    else:
        known_un_org_w2v[org] = np.asarray([])


key_words_un_org_list = open(data_dir + "key_words_un_org_list.txt").read().splitlines()
key_words_not_un_org_list = open(data_dir + "key_words_not_un_org_list.txt").read().splitlines()

UN_DOCS_Paragraphs['word_cnt'] = 0
UN_DOCS_Paragraphs['Content_clean'] = ''

for index, row in UN_DOCS_Paragraphs.iterrows():
    if index % 10000 == 0:
        print(index)
    Content = row['Content'].replace('\t', ' ')
    Content = Content.replace(',', ', ')
    Content = Content.replace(';', '; ')
    Content = Content.replace('.', '. ')
    Content = re.sub(r'[0-9]{1,2}\.', ' ', Content)  # escaped dot: drop paragraph numbering such as "12." (unescaped, the '.' also swallowed the character after any digit run)
    Content = ''.join([x if x in string.printable else '' for x in Content])
    Content = ' '.join(w for w in Content.split() if not any(x.isdigit() for x in w))
    word_cnt = len(Content.split())
    UN_DOCS_Paragraphs.at[index, 'word_cnt'] = word_cnt
    UN_DOCS_Paragraphs.at[index, 'Content_clean'] = Content

UN_DOCS_Paragraphs = UN_DOCS_Paragraphs.sort_values(by=['SourceFile', 'Index'])


UN_DOCS_Resolutions_Content = UN_DOCS_Paragraphs.loc[(UN_DOCS_Paragraphs.Type == 'Paragraph')].groupby(['SourceFile'])['Content'].apply(' '.join).reset_index()
UN_DOCS_Resolutions_Content_clean = UN_DOCS_Paragraphs.loc[(UN_DOCS_Paragraphs.Type == 'Paragraph')].groupby(['SourceFile'])['Content_clean'].apply(' '.join).reset_index()
UN_DOCS_Resolutions = pd.merge(UN_DOCS_Resolutions_Content, UN_DOCS_Resolutions_Content_clean, on='SourceFile')

UN_DOCS_Resolutions['Organization_Names_known'] = [list() for x in range(len(UN_DOCS_Resolutions.index))]
UN_DOCS_Resolutions['Organization_Names_not_from_known_original'] = [list() for x in range(len(UN_DOCS_Resolutions.index))]
UN_DOCS_Resolutions['Organization_Names_not_from_known_inferred'] = [list() for x in range(len(UN_DOCS_Resolutions.index))]


def passes_org_name_filters(org):
    """Shared filters for spaCy ORG spans (a behaviour-preserving refactor of the
    two duplicated condition blocks in the original loop)."""
    return (
        len(org.split()) > 1
        and org not in known_un_org_list
        and org.lower().split()[-1] not in stop_words
        and (not any(key_word.lower() in org.lower() for key_word in key_words_not_un_org_list)
             or any(key_word.lower() in org.lower() for key_word in key_words_un_org_list))
        and not any(org.lower() in known_org.lower() for known_org in known_un_org_list)
        #and not any(known_org.lower() in org.lower() for known_org in known_un_org_list)
        and org.lower().split()[0] not in [word for word in operative_verb_list if word.endswith('s')]
        and not any(word in preambular_verb_list for word in org.lower().split())
    )


for index, row in UN_DOCS_Resolutions.iterrows():
    if index % 100 == 0:
        print(index)
    Content_clean = row['Content_clean']
    known_orgs = [known_org for known_org in known_un_org_list if known_org in Content_clean]
    UN_DOCS_Resolutions.at[index, 'Organization_Names_known'] = known_orgs

    extracted_orgs = list(set([str(element) for element in spacy_nlp(Content_clean).ents if element.label_ == 'ORG']))
    extracted_orgs = [org for org in extracted_orgs if all(char not in org for char in ['_', '/', '.'])]
    for i in range(len(extracted_orgs)):
        extracted_org = extracted_orgs[i].translate(str.maketrans('', '', string.digits))
        extracted_org = extracted_org.translate(str.maketrans('', '', ',;:.()'))
        if extracted_org.lower().startswith('the '):
            extracted_org = extracted_org[4:]
        extracted_orgs[i] = extracted_org  # keep the cleaned span in every case (the original only kept it when it began with "the ")
    extracted_orgs = list(set(extracted_orgs))
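    # Filtering sketch (the spans are illustrative): "Calls upon States" would fail
    # the filters, assuming "calls" is in the operative verb list; a multi-word span
    # that is not contained in any known-org name would pass them.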
    Organization_Names_not_from_known_original = []
    for org in extracted_orgs:
        if not passes_org_name_filters(org):
            continue
        if ' of the ' not in org:
            Organization_Names_not_from_known_original.append(org)
        else:
            # split "X of the Y" and keep whichever side is not already known
            org_split = org.split(' of the ')
            if (org_split[0] not in known_un_org_list) and (len(org_split[0].split()) > 1) and (org_split[1] in known_un_org_list):
                Organization_Names_not_from_known_original.append(org_split[0])
            elif (org_split[0] in known_un_org_list) and (org_split[1] not in known_un_org_list) and (len(org_split[1].split()) > 1):
                Organization_Names_not_from_known_original.append(org_split[1])
            elif (org_split[0] not in known_un_org_list) and (org_split[1] not in known_un_org_list):
                Organization_Names_not_from_known_original.append(org)

    UN_DOCS_Resolutions.at[index, 'Organization_Names_not_from_known_original'] = Organization_Names_not_from_known_original


    if (len(known_orgs) > 0):
        for org in UN_DOCS_Resolutions.at[index, 'Organization_Names_not_from_known_original']:
            tokenized_word = word_tokenize(org)
            tokenized_word_lower = word_tokenize(org.lower())
            words_in_vocab_lower = [word for word in tokenized_word_lower if word in w2v_google.vocab]
            if (len(words_in_vocab_lower) >= 1):
                org_w2v = np.sum(w2v_google[words_in_vocab_lower], axis=0)
            else:
                org_w2v = np.asarray([])
            common_words_length = []
            w2v_similarity = []
            for known_org in known_orgs:
                known_org_tokenized_word = word_tokenize(known_org)
                known_org_tokenized_word_lower = word_tokenize(known_org.lower())
                known_org_words_in_vocab_lower = [word for word in known_org_tokenized_word_lower if word in w2v_google.vocab]
                if len(known_org_words_in_vocab_lower) >= 1:
                    known_org_w2v = np.sum(w2v_google[known_org_words_in_vocab_lower], axis=0)
                else:
                    known_org_w2v = np.asarray([])

                common_words = [word for word in tokenized_word if (word in known_org_tokenized_word and word[0].isupper())]
                common_words_length.append(len(common_words))
                if ((len(org_w2v) == 0) or (len(known_org_w2v) == 0)):
                    w2v_similarity.append(0)
                else:
                    w2v_similarity.append(1 - spatial.distance.cosine(org_w2v, known_org_w2v))
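            # Below, each new name is linked to the known org sharing the most
            # capitalised words with it; ties are broken by word2vec cosine similarity.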
            if (max(common_words_length) == 0):
                # no shared capitalised words: keep the span paired with itself (no score)
                UN_DOCS_Resolutions.at[index, 'Organization_Names_not_from_known_inferred'].append((org, org))
            else:
                if (len([l for l in common_words_length if l == max(common_words_length)]) == 1):
                    known_org_index = common_words_length.index(max(common_words_length))
                    similarity_score = w2v_similarity[known_org_index]
                    known_org = known_orgs[known_org_index]
                    UN_DOCS_Resolutions.at[index, 'Organization_Names_not_from_known_inferred'].append((org, known_org, similarity_score))
                else:
                    known_org_index = [i for i, x in enumerate(common_words_length) if x == max(common_words_length)]
                    max_similarity_score = max([w2v_similarity[i] for i in known_org_index])
                    max_similarity_known_org_index = w2v_similarity.index(max_similarity_score)
                    known_org = known_orgs[max_similarity_known_org_index]
                    UN_DOCS_Resolutions.at[index, 'Organization_Names_not_from_known_inferred'].append((org, known_org, max_similarity_score))


Organization_Names_not_from_known = UN_DOCS_Resolutions['Organization_Names_not_from_known_original'].tolist()
Organization_Names_not_from_known = [x for sublist in Organization_Names_not_from_known for x in sublist]
Organization_Names_not_from_known_cnt = Counter(Organization_Names_not_from_known)
Organization_Names_not_from_known_cnt = pd.DataFrame.from_dict(Organization_Names_not_from_known_cnt, orient='index').reset_index()
Organization_Names_not_from_known_cnt = Organization_Names_not_from_known_cnt.rename(columns={'index': 'org_names', 0: 'count'}).sort_values(by='count', ascending=False).reset_index(drop=True)


UN_DOCS_Paragraphs['Country'] = [list() for x in range(len(UN_DOCS_Paragraphs.index))]
UN_DOCS_Paragraphs['Organization_Names_known'] = [list() for x in range(len(UN_DOCS_Paragraphs.index))]
UN_DOCS_Paragraphs['Organization_Names_not_from_known_original'] = [list() for x in range(len(UN_DOCS_Paragraphs.index))]
UN_DOCS_Paragraphs['Organization_Names_not_from_known_inferred'] = [list() for x in range(len(UN_DOCS_Paragraphs.index))]

# push the resolution-level organisation names back down to the paragraphs that mention them
for index, row in UN_DOCS_Paragraphs.iterrows():
    if index % 10000 == 0:
        print(index)
    SourceFile = row['SourceFile']
    Content_clean = row['Content_clean']
    Organization_Names_known_Resolution = UN_DOCS_Resolutions.loc[UN_DOCS_Resolutions['SourceFile'] == SourceFile, 'Organization_Names_known'].tolist()[0]
    Organization_Names_not_from_known_original_Resolution = UN_DOCS_Resolutions.loc[UN_DOCS_Resolutions['SourceFile'] == SourceFile]['Organization_Names_not_from_known_original'].tolist()[0]
    Organization_Names_not_from_known_inferred_Resolution = UN_DOCS_Resolutions.loc[UN_DOCS_Resolutions['SourceFile'] == SourceFile]['Organization_Names_not_from_known_inferred'].tolist()[0]
    Country = [country for country in country_names if country.lower() in Content_clean.lower()]
    Organization_Names_known = []
    for org in Organization_Names_known_Resolution:
        if org.lower() in Content_clean.lower():
            Organization_Names_known.append(org)
    Organization_Names_not_from_known_original = []
    for org in Organization_Names_not_from_known_original_Resolution:
        if org in Content_clean:
            Organization_Names_not_from_known_original.append(org)
    Organization_Names_not_from_known_inferred = []
    if len(Organization_Names_not_from_known_inferred_Resolution) >= 1:
        for org in Organization_Names_not_from_known_inferred_Resolution:
            if org[0] in Content_clean:
                Organization_Names_not_from_known_inferred.append(org)
    UN_DOCS_Paragraphs.at[index, 'Country'] = Country
    UN_DOCS_Paragraphs.at[index, 'Organization_Names_known'] = Organization_Names_known
    UN_DOCS_Paragraphs.at[index, 'Organization_Names_not_from_known_original'] = Organization_Names_not_from_known_original
    UN_DOCS_Paragraphs.at[index, 'Organization_Names_not_from_known_inferred'] = Organization_Names_not_from_known_inferred

UN_DOCS_Paragraphs = UN_DOCS_Paragraphs.drop(columns=['word_cnt', 'Content_clean'])
UN_DOCS_Paragraphs.to_excel(output_dir + 'output_UN_DOCS_paragraph_level.xlsx')

--------------------------------------------------------------------------------