├── Data
│   ├── agencies.xlsx
│   ├── country_list.xlsx
│   ├── names_A60-72.xlsx
│   ├── un_entities_20191017.xlsx
│   ├── Download GoogleNews-vectors-negative300
│   ├── key_words_un_org_list.txt
│   ├── preambular_verb_list.txt
│   ├── operative_verb_list.txt
│   └── key_words_not_un_org_list.txt
├── requirements.txt
├── CODE_OF_CONDUCT.md
├── tools
│   └── document-processor
│       ├── DSnA.WebJob.DocumentParser
│       │   ├── Code
│       │   │   ├── IStorageClientFactory.cs
│       │   │   ├── DefaultStorageClientFactory.cs
│       │   │   ├── Exceptions.cs
│       │   │   ├── InteropWordUtils.cs
│       │   │   ├── Constants.cs
│       │   │   ├── Logger.cs
│       │   │   ├── ParserClasses.cs
│       │   │   ├── DocumentParser.cs
│       │   │   ├── ParseHelper.cs
│       │   │   └── Utils.cs
│       │   ├── Interface
│       │   │   ├── IDocumentParser.cs
│       │   │   └── IStorageClient.cs
│       │   ├── LocalStorageClient.cs
│       │   ├── BlobStorageClient.cs
│       │   ├── Properties
│       │   │   └── AssemblyInfo.cs
│       │   ├── README.md
│       │   ├── App.config
│       │   ├── packages.config
│       │   ├── Program.cs
│       │   └── DSnA.WebJob.DocumentParser.csproj
│       └── DSnA.WebJob.DocumentParser.sln
├── SUPPORT.md
├── SECURITY.md
├── knowledge_extraction_resolution_level.py
├── README.md
├── .gitignore
├── LICENSE
└── knowledge_extraction_paragraph_level.py
--------------------------------------------------------------------------------
/Data/agencies.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/un-knowledge-extraction/HEAD/Data/agencies.xlsx
--------------------------------------------------------------------------------
/Data/country_list.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/un-knowledge-extraction/HEAD/Data/country_list.xlsx
--------------------------------------------------------------------------------
/Data/names_A60-72.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/un-knowledge-extraction/HEAD/Data/names_A60-72.xlsx
--------------------------------------------------------------------------------
/Data/un_entities_20191017.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/un-knowledge-extraction/HEAD/Data/un_entities_20191017.xlsx
--------------------------------------------------------------------------------
/Data/Download GoogleNews-vectors-negative300:
--------------------------------------------------------------------------------
It can be downloaded at https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit and then saved under the "Data" folder.
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
pandas==1.1.5
nltk==3.6.6
numpy==1.22.0
spacy==3.0.1
gensim==3.8.3
scipy==1.5.3
matplotlib==3.3.4
# re, string, and collections are part of the Python standard library
# and are not pip-installable packages, so they are not listed here.
--------------------------------------------------------------------------------
/Data/key_words_un_org_list.txt:
--------------------------------------------------------------------------------
Committee
Council
Conference
Fund
Organization
Entity
Department
Commission
Court
Board
Community
Office
Association
Government
Group
Summit
Subcommittee
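The keyword list above, together with key_words_not_un_org_list.txt further down, supports the "Organization Names" deliverable described in the README. A minimal sketch of how such a trigger-word list can be applied; it assumes only that the list is used for keyword matching (the actual logic lives in knowledge_extraction_paragraph_level.py and may differ):

```python
# Hedged illustration: flag phrases that contain a UN-organization trigger word.
with open("Data/key_words_un_org_list.txt", encoding="utf-8") as f:
    un_org_keywords = {line.strip() for line in f if line.strip()}

def looks_like_un_org(phrase):
    # "Economic and Social Council" matches on the trigger word "Council".
    return any(word in un_org_keywords for word in phrase.split())

print(looks_like_un_org("Economic and Social Council"))  # True
```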
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
# Microsoft Open Source Code of Conduct

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).

Resources:

- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
--------------------------------------------------------------------------------
/tools/document-processor/DSnA.WebJob.DocumentParser/Code/IStorageClientFactory.cs:
--------------------------------------------------------------------------------
//Copyright (c) Microsoft Corporation. All rights reserved.
//Licensed under the MIT License.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace DSnA.WebJob.DocumentParser
{
    public interface IStorageClientFactory
    {
        IStorageClient Create(string id, Dictionary<string, string> parameters, IUtils utils);
    }
}
--------------------------------------------------------------------------------
/tools/document-processor/DSnA.WebJob.DocumentParser/Interface/IDocumentParser.cs:
--------------------------------------------------------------------------------
//Copyright (c) Microsoft Corporation. All rights reserved.
//Licensed under the MIT License.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Microsoft.Office.Interop.Word;

namespace DSnA.WebJob.DocumentParser
{
    public interface IDocumentParser
    {
        string ParseDocuments(string uri, IStorageClient storageClient, Application wordApp, string outputFileFormat);
    }
}
--------------------------------------------------------------------------------
/tools/document-processor/DSnA.WebJob.DocumentParser/Interface/IStorageClient.cs:
--------------------------------------------------------------------------------
//Copyright (c) Microsoft Corporation. All rights reserved.
//Licensed under the MIT License.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace DSnA.WebJob.DocumentParser
{
    public class StorageObjectDescriptor
    {
        public string FileName { get; set; }
        public Uri Uri { get; set; }
    }

    public interface IStorageClient
    {
        string GetFile(StorageObjectDescriptor descriptor, string destinationFilePath);
        void SaveFile(string sourceUri, string destinationUri);
    }
}
--------------------------------------------------------------------------------
/tools/document-processor/DSnA.WebJob.DocumentParser/Code/DefaultStorageClientFactory.cs:
--------------------------------------------------------------------------------
//Copyright (c) Microsoft Corporation. All rights reserved.
//Licensed under the MIT License.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace DSnA.WebJob.DocumentParser
{
    public class DefaultStorageClientFactory : IStorageClientFactory
    {
        public const string BlobContainerNameKey = "container";

        public IStorageClient Create(string id, Dictionary<string, string> parameters, IUtils utils)
        {
            switch (id)
            {
                case "blob":
                    return new BlobStorageClient(parameters[BlobContainerNameKey], utils);

                default:
                    return new LocalStorageClient();
            }
        }
    }
}
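A minimal usage sketch for the factory above, mirroring how Program.cs (at the end of this dump) wires it together. The wrapper class and container name are hypothetical; `Utils` and `ConsoleLogger` are the repository's own types, constructed as in Program.cs:

```csharp
using System.Collections.Generic;

namespace DSnA.WebJob.DocumentParser
{
    internal static class FactoryUsageSketch // hypothetical helper, not part of the repository
    {
        internal static IStorageClient CreateClient(string storageType, string containerName)
        {
            var utils = new Utils(ConsoleLogger.Instance); // as constructed in Program.cs

            IStorageClientFactory factory = new DefaultStorageClientFactory();

            // "blob" selects BlobStorageClient; any other id falls back to LocalStorageClient.
            return factory.Create(storageType, new Dictionary<string, string>
            {
                { DefaultStorageClientFactory.BlobContainerNameKey, containerName }
            }, utils);
        }
    }
}
```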
--------------------------------------------------------------------------------
/tools/document-processor/DSnA.WebJob.DocumentParser/Code/Exceptions.cs:
--------------------------------------------------------------------------------
//Copyright (c) Microsoft Corporation. All rights reserved.
//Licensed under the MIT License.

using System;

namespace DSnA.WebJob.DocumentParser
{
    public class UnableToDeleteFileException : Exception
    {
        public UnableToDeleteFileException()
        {
        }

        public UnableToDeleteFileException(string message)
            : base(message)
        {
        }

        public UnableToDeleteFileException(string message, Exception inner)
            : base(message, inner)
        {
        }
    }

    public class LoggerException : Exception
    {
        public LoggerException()
        {
        }

        public LoggerException(string message)
            : base(message)
        {
        }

        public LoggerException(string message, Exception inner)
            : base(message, inner)
        {
        }
    }
}
--------------------------------------------------------------------------------
/tools/document-processor/DSnA.WebJob.DocumentParser/LocalStorageClient.cs:
--------------------------------------------------------------------------------
//Copyright (c) Microsoft Corporation. All rights reserved.
//Licensed under the MIT License.

using Microsoft.Azure;
using Microsoft.WindowsAzure.Storage;
using Microsoft.WindowsAzure.Storage.Blob;
using System;
using System.Diagnostics;
using System.Linq;

namespace DSnA.WebJob.DocumentParser
{
    public class LocalStorageClient : IStorageClient
    {
        public LocalStorageClient()
        {
        }

        public string GetFile(StorageObjectDescriptor descriptor, string destinationFilePath)
        {
            return System.Net.WebUtility.UrlDecode(descriptor.Uri.AbsolutePath).Replace("/", "\\");
        }

        public void SaveFile(string sourceUri, string destinationUri)
        {
            string sourceUriPath = System.IO.Path.GetDirectoryName(sourceUri);
            string sourceUriFileName = System.IO.Path.GetFileName(sourceUri);

            // Fall back to an "out_<name>" file next to the source when no destination is given.
            System.IO.File.Copy(sourceUri, destinationUri ?? $@"{sourceUriPath}\out_{sourceUriFileName}", true);
        }
    }
}
--------------------------------------------------------------------------------
/tools/document-processor/DSnA.WebJob.DocumentParser.sln:
--------------------------------------------------------------------------------

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.30804.86
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DSnA.WebJob.DocumentParser", "DSnA.WebJob.DocumentParser\DSnA.WebJob.DocumentParser.csproj", "{9E4D7884-0C36-429B-A4C9-1217D3CA7D4E}"
EndProject
Global
	GlobalSection(SolutionConfigurationPlatforms) = preSolution
		Debug|Any CPU = Debug|Any CPU
		Release|Any CPU = Release|Any CPU
	EndGlobalSection
	GlobalSection(ProjectConfigurationPlatforms) = postSolution
		{9E4D7884-0C36-429B-A4C9-1217D3CA7D4E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
		{9E4D7884-0C36-429B-A4C9-1217D3CA7D4E}.Debug|Any CPU.Build.0 = Debug|Any CPU
		{9E4D7884-0C36-429B-A4C9-1217D3CA7D4E}.Release|Any CPU.ActiveCfg = Release|Any CPU
		{9E4D7884-0C36-429B-A4C9-1217D3CA7D4E}.Release|Any CPU.Build.0 = Release|Any CPU
	EndGlobalSection
	GlobalSection(SolutionProperties) = preSolution
		HideSolutionNode = FALSE
	EndGlobalSection
	GlobalSection(ExtensibilityGlobals) = postSolution
		SolutionGuid = {202AC18B-E457-427C-B60C-2D7A1D2E6319}
	EndGlobalSection
EndGlobal
--------------------------------------------------------------------------------
/Data/preambular_verb_list.txt:
--------------------------------------------------------------------------------
acknowledging
acting
adhering
affirming
agreeing
alarmed
taking
anxious
appreciating
asserting
attaching
aware
bearing
being
believing
cognizant
commemorating
commending
concerned
concluding
concurring
confident
confirming
conscious
considering
continuing
convinced
deeming
deploring
disturbed
grieved
perturbed
regretting
shocked
desiring
desirous
determined
distressed
emphasizing
encouraged
endorsing
expressing
faithful
fearing
noting
recalling
gratified
guided
having
indignant
holding
hopeful
conformity
pursuance
inspired
invoking
opinion
keeping
mindful
observing
outraged
paying
pending
persuaded
realizing
recognizing
recollecting
referring
reiterating
restating
seeking
sharing
stressing
striving
condemning
trusting
underlining
urging
viewing
warning
welcoming
wishing
preventing
--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
# TODO: The maintainer of this repo has not yet edited this file

**REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?

- **No CSS support:** Fill out this template with information about how to file issues and get help.
6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/BlobStorageClient.cs: -------------------------------------------------------------------------------- 1 | //Copyright(c) Microsoft Corporation.All rights reserved. 2 | //Licensed under the MIT License. 3 | 4 | using Microsoft.Azure; 5 | using Microsoft.WindowsAzure.Storage; 6 | using Microsoft.WindowsAzure.Storage.Blob; 7 | using System; 8 | using System.Diagnostics; 9 | using System.Linq; 10 | 11 | namespace DSnA.WebJob.DocumentParser 12 | { 13 | public class BlobStorageClient : IStorageClient 14 | { 15 | private static readonly CloudStorageAccount StorageAccount = CloudStorageAccount.Parse(CloudConfigurationManager.GetSetting("StorageConnectionString")); 16 | 17 | private readonly IUtils _utils; 18 | 19 | public CloudBlobClient Client; 20 | public CloudBlobContainer Container; 21 | 22 | public BlobStorageClient(string containerName, IUtils utils) 23 | { 24 | _utils = utils; 25 | 26 | Client = utils.CreateCloudBlobClient(StorageAccount); 27 | Container = Client.GetContainerReference(containerName); 28 | } 29 | 30 | public string GetFile(StorageObjectDescriptor descriptor, string destinationFilePath) 31 | { 32 | return _utils.DownloadBlobFile(descriptor.Uri.AbsoluteUri, Constants.FileConfigs.WorkingDirectoryPath, Client); 33 | } 34 | 35 | public void SaveFile(string sourceUri, string destinationUri) 36 | { 37 | _utils.UploadFileToBlob(sourceUri, Client); 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // General Information about an assembly is controlled through the following 6 | // set of attributes. Change these attribute values to modify the information 7 | // associated with an assembly. 
[assembly: AssemblyTitle("DSnA.WebJob.DocumentParser")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("")]
[assembly: AssemblyProduct("DSnA.WebJob.DocumentParser")]
[assembly: AssemblyCopyright("Copyright © 2018")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]

// Setting ComVisible to false makes the types in this assembly not visible
// to COM components. If you need to access a type in this assembly from
// COM, set the ComVisible attribute to true on that type.
[assembly: ComVisible(false)]

// The following GUID is for the ID of the typelib if this project is exposed to COM
[assembly: Guid("9e4d7884-0c36-429b-a4c9-1217d3ca7d4e")]

// Version information for an assembly consists of the following four values:
//
//      Major Version
//      Minor Version
//      Build Number
//      Revision
//
// You can specify all the values or you can default the Build and Revision Numbers
// by using the '*' as shown below:
// [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("1.0.0.0")]
[assembly: AssemblyFileVersion("1.0.0.0")]
--------------------------------------------------------------------------------
/tools/document-processor/DSnA.WebJob.DocumentParser/README.md:
--------------------------------------------------------------------------------
# Document Parser

## Prerequisites

[Visual Studio 2019 Community](https://visualstudio.microsoft.com/vs/community/)

## Setup

Build the solution file **DSnA.WebJob.DocumentParser.sln** with Visual Studio. This generates the executable *DSnA.WebJob.DocumentParser.exe*.

## Configuration

In App.config, set values for the following keys:

StorageConnectionString (blob storage mode only): the storage connection string used to access the blob storage containers.

OutputFileFormat: "csv" or "json" output file format

StorageType: "blob" or "localstorage"

## How it works

The document parser extracts the content of a document file (PDF, Word) and creates a CSV or JSON output file in which the document data is classified into text, paragraphs, headings, sections, clauses, heading clauses, and additional information.

You can run it in two modes:

**Azure blob storage**: upload the documents to be processed to the input blob storage container, open a command prompt in the folder that contains *DSnA.WebJob.DocumentParser.exe*, and run it with the required arguments:

>DSnA.WebJob.DocumentParser.exe arg1 arg2 arg3

Options:

>arg1: **Required** - blob input container name

>arg2: **Required** - blob virtual directory name/path (/ root level)

>arg3: Optional - file name filter (if not present, all documents within the source folder will be processed)

The output files will be located in the blob storage *docparseoutput* container.
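For example, a hypothetical invocation (the container name, virtual directory, and file name below are placeholders, not values the tool requires):

>DSnA.WebJob.DocumentParser.exe documents resolutions/2018 A_RES_73_1.pdf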
**Local storage**: place the documents to be processed in a local folder in your file system, open a command prompt in the folder that contains *DSnA.WebJob.DocumentParser.exe*, and run it with the required arguments:

>DSnA.WebJob.DocumentParser.exe arg1 arg2 arg3

Options:

>arg1: **Required** - local storage source folder path

>arg2: **Required** - local storage output folder path

>arg3: Optional - file name filter (if not present, all documents within the source folder will be processed)

The output files will be located in the local output folder.

--------------------------------------------------------------------------------
/tools/document-processor/DSnA.WebJob.DocumentParser/Code/InteropWordUtils.cs:
--------------------------------------------------------------------------------
//Copyright (c) Microsoft Corporation. All rights reserved.
//Licensed under the MIT License.

using Microsoft.Office.Interop.Word;
using System;

namespace DSnA.WebJob.DocumentParser
{
    public interface IInteropWordUtils
    {
        Application CreateWordAppInstance();
        Document OpenDocument(string file, Application wordApp);
        void DisposeIneropObject(Application wordApp, bool saveChanges = false);
    }

    class InteropWordUtils : IInteropWordUtils
    {
        /// <summary>
        /// Creates a Word application instance.
        /// </summary>
        public Application CreateWordAppInstance()
        {
            return new Application
            {
                DisplayAlerts = WdAlertLevel.wdAlertsNone,
                Visible = false,
                Options = { SavePropertiesPrompt = false, SaveNormalPrompt = false, DisplayPasteOptions = false, DoNotPromptForConvert = true }
            };
        }

        /// <summary>
        /// Opens a Word document.
        /// </summary>
        public Document OpenDocument(string file, Application wordApp)
        {
            return wordApp.Documents.Open(file, ReadOnly: false);
        }

        /// <summary>
        /// Quits Word and releases all COM objects.
        /// </summary>
        public void DisposeIneropObject(Application wordApp, bool saveChanges = false)
        {
            try
            {
                wordApp.Quit(SaveChanges: saveChanges);
                System.Runtime.InteropServices.Marshal.ReleaseComObject(wordApp);
                GC.Collect();
                GC.WaitForPendingFinalizers();
            }
            catch (Exception)
            {
                // If quitting with the requested save option fails, retry without saving.
                wordApp.Quit(SaveChanges: false);
                System.Runtime.InteropServices.Marshal.ReleaseComObject(wordApp);
                GC.Collect();
                GC.WaitForPendingFinalizers();
            }
        }
    }
}
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------

## Security

Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).

If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.
8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 
40 | 41 | -------------------------------------------------------------------------------- /Data/operative_verb_list.txt: -------------------------------------------------------------------------------- 1 | accept 2 | accepts 3 | recommend 4 | recommends 5 | acknowledge 6 | acknowledges 7 | address 8 | addresses 9 | adopt 10 | adopts 11 | proclaim 12 | proclaims 13 | affirm 14 | affirms 15 | appeal 16 | appeals 17 | call 18 | calls 19 | draw 20 | draws 21 | pledge 22 | pledges 23 | reiterate 24 | reiterates 25 | request 26 | requests 27 | agree 28 | agrees 29 | decide 30 | decides 31 | endorse 32 | endorses 33 | invite 34 | invites 35 | note 36 | notes 37 | welcome 38 | welcomes 39 | amend 40 | amends 41 | applaud 42 | applauds 43 | appoint 44 | appoints 45 | approve 46 | approves 47 | assert 48 | asserts 49 | assure 50 | assures 51 | authorize 52 | authorizes 53 | await 54 | awaits 55 | believe 56 | believes 57 | condemn 58 | condemns 59 | censure 60 | censures 61 | commend 62 | commends 63 | commission 64 | commissions 65 | compliment 66 | compliments 67 | concur 68 | concurs 69 | confirm 70 | confirms 71 | congratulate 72 | congratulates 73 | consider 74 | considers 75 | convey 76 | conveys 77 | declare 78 | declares 79 | deem 80 | deems 81 | appreciate 82 | appreciates 83 | deplore 84 | deplores 85 | defer 86 | defers 87 | demand 88 | demands 89 | denounce 90 | denounces 91 | deprecate 92 | deprecates 93 | designate 94 | designates 95 | desire 96 | desires 97 | determine 98 | determines 99 | direct 100 | directs 101 | dissolve 102 | dissolves 103 | draw 104 | draws 105 | emphasize 106 | emphasizes 107 | empower 108 | empowers 109 | encourage 110 | encourages 111 | entrust 112 | entrusts 113 | envisage 114 | envisages 115 | establish 116 | establishes 117 | exhort 118 | exhorts 119 | expect 120 | expects 121 | express 122 | expresses 123 | extend 124 | extends 125 | maintain 126 | maintains 127 | support 128 | supports 129 | formulate 130 | formulates 131 | share 132 | shares 133 | reaffirm 134 | reaffirms 135 | insist 136 | insists 137 | instruct 138 | instructs 139 | invite 140 | invites 141 | look 142 | looks 143 | make 144 | makes 145 | mandate 146 | mandates 147 | offer 148 | offers 149 | pay 150 | pays 151 | propose 152 | proposes 153 | realize 154 | realizes 155 | reassert 156 | reasserts 157 | reassure 158 | reassures 159 | recall 160 | recalls 161 | recognize 162 | recognizes 163 | re-emphasize 164 | re-emphasizes 165 | refer 166 | refers 167 | regard 168 | regards 169 | register 170 | registers 171 | regret 172 | regrets 173 | reject 174 | rejects 175 | remind 176 | reminds 177 | renew 178 | renews 179 | resolve 180 | resolves 181 | seize 182 | seizes 183 | set 184 | sets 185 | warn 186 | warns 187 | state 188 | states 189 | stress 190 | stresses 191 | suggest 192 | suggests 193 | take 194 | takes 195 | transmit 196 | transmits 197 | trust 198 | trusts 199 | underline 200 | underlines 201 | urge 202 | urges -------------------------------------------------------------------------------- /Data/key_words_not_un_org_list.txt: -------------------------------------------------------------------------------- 1 | Goal 2 | Goals 3 | Agenda 4 | Outcome 5 | Headquarters 6 | Declaration 7 | Account 8 | Implementation 9 | Territory 10 | Territories 11 | Act 12 | Action 13 | Actions 14 | Programme 15 | Agreement 16 | Partnership 17 | Protection of Civilian Persons 18 | Time of War 19 | Framework 20 | Frameworks 21 | Consensus 22 | Convention 23 | Conventions 24 | Related 25 | Resolution 26 
Resolutions
Forum
Meeting
Strategy
Eradicate
General Service
Document
Deconstruction
Status
Statute
Protocol
Protocols
Illicit
Session
A/RES/
Movement
Chair
Treatment
Platform
Platforms
Plan
Weapons
National Food Security
Rules
Budget
Principle
Principles
System
Systems
Mechanism
Report
Pact
Compact
Trade
Consequences
United Nations Global Compact
Facility
Covenant
Covenants
Responsible
Treaty
Decade
Wider United Nations
Their
Expert
Personnel
Conservation
Field Service
Information
International Migration and Development
Coordinator
Armistice Line
Further
Day
Week
Month
Year
Criteria
El Niño
Fellowship
Safety of Maritime Navigation
Library
Doha Development Round
Journal
Review
Aid for Trade
Sea
Zone
International Health Regulations
International Mother
Goodwill Ambassadors
Chronicle
Involuntary Disappearances
Impact
Rapporteur
Rapporteurs
Record
Records
Ministers
Panel
University
Yearbook
Messengers
Terrorism
Dialogue
Officer
Target
Targets
Elimination
Council established
Repair and Assembly
Countries and Peoples
Model Strategies and Practical Measures
Ways and means
Challenge
Network
Safety and Security of Radioactive Sources
Guideline
Guidelines
Parties
Unregulated Fishing
Discrimination
Armed Robbery against Ships
Regular
International Search
Process
Branch
Context
Orthodox Good Friday
Seascape
Regional Security
Cooperation for
Application
Volunteer
Volunteers
Fishing Vessels
Alternative
Green Paper
Holy See
Need of Assistance
Olympic Truce
Mutual Understanding
Tapta
Census
Sport for Development and Peace
Campaign
Protection of Child Victims of Trafficking
Approach
Service
Commercial Shipping
Reduction of Underwater Noise
Chairs
Co-Chairs
--------------------------------------------------------------------------------
/tools/document-processor/DSnA.WebJob.DocumentParser/App.config:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/knowledge_extraction_resolution_level.py:
--------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
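"""Extract resolution-level metadata from parsed UN General Assembly resolutions.

Reads UN_RES_DOCS_2009_2018.csv (columns: SourceFile, Index, Content, Type) and,
for each resolution file, derives the session, agenda item, resolution number,
title, and adoption day/month/year, then writes the result to an Excel file
under the output directory.
"""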

#%% Imports
import pandas as pd
import re

current_dir = './UN_Knowledge_Extraction/'
data_dir = current_dir + "data/"
output_dir = current_dir + "output/"


UN_DOCS = pd.read_csv(data_dir + "UN_RES_DOCS_2009_2018.csv")
UN_DOCS_resolution_level = UN_DOCS[['SourceFile']].drop_duplicates().reset_index(drop=True)
UN_DOCS_resolution_level['Resolution_Session'] = ''
UN_DOCS_resolution_level['Resolution_Agenda_item'] = ''
UN_DOCS_resolution_level['Resolution_Number'] = ''
UN_DOCS_resolution_level['Resolution_Title'] = ''
UN_DOCS_resolution_level['Resolution_Adoption_DateMonthYear'] = ''
UN_DOCS_resolution_level['Resolution_Adoption_Day'] = ''
UN_DOCS_resolution_level['Resolution_Adoption_Month'] = ''
UN_DOCS_resolution_level['Resolution_Adoption_Year'] = ''

# Patterns for "<session>/<number>. <title>" headings, "... on <d Month yyyy>"
# adoption lines, and the day/month/year parts of the adoption date.
number_title_re = re.compile(r'(\d+/\d+)\s{0,1}\.\s{0,1}(.*)')
adoption_date_re = re.compile(r'(.*)on (\d{1,2}\s\w+\s\d{4})$')
date_parts_re = re.compile(r'(\d{1,2})\s(\w+)\s(\d{4})')

for index, row in UN_DOCS_resolution_level.iterrows():
    Resolution_Info = [''] * 5
    SourceFile = row['SourceFile']
    SourceFile_info_paragraphs = UN_DOCS.loc[UN_DOCS['SourceFile'] == SourceFile].sort_values(by=['Index']).fillna('').reset_index(drop=True)
    for i in range(len(SourceFile_info_paragraphs)):
        Content = SourceFile_info_paragraphs.loc[i, 'Content']
        Type = SourceFile_info_paragraphs.loc[i, 'Type']
        if Resolution_Info[0] == '' and Type == 'Session':
            Resolution_Info[0] = Content
        elif Resolution_Info[1] == '' and Type == 'AgendaItem':
            Resolution_Info[1] = Content
        elif Resolution_Info[2] == '' and Resolution_Info[3] == '' and number_title_re.match(Content):
            match = number_title_re.match(Content)
            Resolution_Info[2] = match.groups()[0]
            Resolution_Info[3] = match.groups()[1]
        elif Resolution_Info[4] == '' and adoption_date_re.match(Content):
            Resolution_Info[4] = adoption_date_re.match(Content).groups()[1]
    # Assign via .loc (chained .iloc[index][...] indexing writes to a copy and is lost).
    UN_DOCS_resolution_level.loc[index, 'Resolution_Session'] = Resolution_Info[0]
    UN_DOCS_resolution_level.loc[index, 'Resolution_Agenda_item'] = Resolution_Info[1]
    UN_DOCS_resolution_level.loc[index, 'Resolution_Number'] = Resolution_Info[2]
    UN_DOCS_resolution_level.loc[index, 'Resolution_Title'] = Resolution_Info[3]
    UN_DOCS_resolution_level.loc[index, 'Resolution_Adoption_DateMonthYear'] = Resolution_Info[4]
    if Resolution_Info[4] != '':
        date_parts = date_parts_re.match(Resolution_Info[4]).groups()
        UN_DOCS_resolution_level.loc[index, 'Resolution_Adoption_Day'] = date_parts[0]
        UN_DOCS_resolution_level.loc[index, 'Resolution_Adoption_Month'] = date_parts[1]
        UN_DOCS_resolution_level.loc[index, 'Resolution_Adoption_Year'] = date_parts[2]

UN_DOCS_resolution_level.to_excel(output_dir + 'output_UN_DOCS_resolution_level.xlsx')
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Automatic Information Extraction and Knowledge Elicitation for United Nations Documents

#### Context:
The processing of the considerable and rapidly growing amount of information within the UN system is left to very limited human capacities. The UN system produces a substantial amount of information that, if effectively mobilized, could greatly enhance the effectiveness and efficiency of the UN system.

#### Goal:
The goal is to pilot Microsoft Cognitive Services to unlock the strategic value of UN unstructured content by building on AI and semantic technologies. The idea is to showcase innovative smart services that use natural language processing and machine learning to effectively support policy and decision making, coordination, synergies, and accountability.

#### Data:
UN General Assembly resolutions (English only) between 2009 and 2018. In total, 3,138 resolution files in PDF format.

#### Data Reference:
Pre-trained word2vec embeddings trained on part of the Google News dataset (about 100 billion words): https://code.google.com/archive/p/word2vec/
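The repository expects these embeddings under the `Data` folder (see the download note earlier in this repository). A minimal loading sketch using the pinned gensim 3.8.3; the `.bin` file name is an assumption about the downloaded archive:

```python
from gensim.models import KeyedVectors

# Load the pre-trained 300-dimensional GoogleNews vectors (binary word2vec format).
vectors = KeyedVectors.load_word2vec_format(
    "Data/GoogleNews-vectors-negative300.bin", binary=True)

print(vectors.most_similar("resolution", topn=3))
```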
#### Deliverables:
##### Resolution Level:
- Resolution File Name
- Resolution Session
- Resolution Agenda Item
- Resolution Number
- Resolution Title
- Resolution Adoption Date/Month/Year

##### Paragraph Level:
- Paragraph Type
- First Action Verb
- Key Terms
- Referenced Resolutions
- Referenced Resolution Dates
- Sustainable Development Goals (SDG), Targets, and Indicators
- Country
- Organization Names


# Setup

1. Install requirements

   This code uses Python 3.7.

   ```
   pip install -r requirements.txt
   ```

2. Run Scripts

   a. Run the following file to extract resolution-level information: [knowledge_extraction_resolution_level.py](https://github.com/microsoft/UN-Knowledge-Extraction/blob/main/knowledge_extraction_resolution_level.py)

   ```
   python knowledge_extraction_resolution_level.py
   ```

   b. Run the following file to extract paragraph-level information: [knowledge_extraction_paragraph_level.py](https://github.com/microsoft/UN-Knowledge-Extraction/blob/main/knowledge_extraction_paragraph_level.py)

   ```
   python knowledge_extraction_paragraph_level.py
   ```

## Contributing

This project welcomes contributions and suggestions. Most contributions require you to agree to a
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.

When you submit a pull request, a CLA bot will automatically determine whether you need to provide
a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
provided by the bot. You will only need to do this once across all repos using our CLA.

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

## Trademarks

This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
trademarks or logos is subject to and must follow
81 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 82 | Any use of third-party trademarks or logos are subject to those third-party's policies. 83 | -------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/packages.config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/Code/Constants.cs: -------------------------------------------------------------------------------- 1 | //Copyright(c) Microsoft Corporation.All rights reserved. 2 | //Licensed under the MIT License. 3 | 4 | namespace DSnA.WebJob.DocumentParser 5 | { 6 | using System; 7 | using System.Text.RegularExpressions; 8 | 9 | public static class Constants 10 | { 11 | public static string DateTimeFormat => "MM-dd-yyyy_HH-mm-ss"; 12 | public static class FileConfigs 13 | { 14 | private static string _sourceDirectoryPath; 15 | private static string _outputDirectoryPath; 16 | 17 | public static string SourceDirectoryPath 18 | { 19 | get 20 | { 21 | return _sourceDirectoryPath; 22 | } 23 | 24 | set { _sourceDirectoryPath = string.Format(@"{0}", value); } 25 | } 26 | 27 | public static string OutputDirectoryPath 28 | { 29 | get 30 | { 31 | return string.IsNullOrEmpty(_outputDirectoryPath) ? @"\DocumentParser" : _outputDirectoryPath; 32 | } 33 | 34 | set { _outputDirectoryPath = string.Format(@"{0}", value); } 35 | } 36 | 37 | public static string WorkingDirectoryPath 38 | { 39 | get 40 | { 41 | return OutputDirectoryPath + @"\Temp"; 42 | } 43 | } 44 | 45 | public static string LogFileName = "log_" + DateTime.UtcNow.ToString("MM-dd-yyyy") + ".log"; 46 | public static string TempFileName => "JsonByExtractionProgram"; 47 | } 48 | 49 | public static class RegexExp 50 | { 51 | public static string NoSpecialCharRegex => "[\\W]+"; 52 | // match only strings with combination of numbers and spaces 53 | public static string OnlyNumericWithSpaces => "^([0-9\\s]+)$"; 54 | // to match company names in reports 55 | public static string CompanyNameRegex => "^(^[a-zA-Z\\d\\s]+[a-zA-Z\\d]+[a-zA-Z\\d\\W]*)$"; 56 | // regex to match dates like MMMM dd,YYYY (January 23, 2017) and its combinations 57 | public static string DateRegex => "^(\\s*\\w{3,9}?\\s*?\\d{1,2}?\\s*?,\\s*?\\d{4}?)"; 58 | public static string NoEscapeSequences => @"[\a\b\t\r\v\f\e]"; 59 | public static string OnlyAsciiChar => @"[^\u0000-\u007F]+"; 60 | public static string OnlyWhiteSpaces => @"\s+"; 61 | public static string OnlyHyperlinks => @"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"; 62 | public static string HyperlinkAppProtocols => @"(http|https|ftp)"; 63 | public static string HasNumbers => @"^(?=.*[0-9])"; 64 | 65 | public static string HasBulletPoint => @"^(|\u2022|\u2023|\u25E6|\u2043|\u2219|-|[a-z]\)|[a-z]\.)"; 66 | public static readonly Regex SessionRegEx = new Regex(@"(\w)*(-)?(\w)*(\s)*(session)", RegexOptions.IgnoreCase | RegexOptions.Compiled); 67 | public static readonly Regex AgendaItemRegEx = new 
Regex(@"(agenda)(\s)+(item)+(s)?(\s)+(\d)*(\s)*(and)*(\s)*([a-fA-F0-9\(\)])*", RegexOptions.IgnoreCase | RegexOptions.Compiled); 68 | } 69 | 70 | public static class ParserConfig 71 | { 72 | public static string MessageQueueRef => "QueueName"; 73 | public static string ConnectionUriRef => "KeyVaultUriForConnectionString"; 74 | public static int MaxDequeueCount => 5; 75 | public static string LogsContainerNameRef => "LogsContainerName"; 76 | public static string LogPrefix => "LogPrefix"; 77 | public static string OutputContainerNameRef => "OutputContainerName"; 78 | } 79 | 80 | public static class CsvFileConfig 81 | { 82 | public const string CsvFileFormat = "csv"; 83 | public const string JsonFileFormat = "json"; 84 | public static string Headers => "SourceFile,Index,Content,Type"; 85 | public static string ContentTypeBlobUri => "BlobUri"; 86 | public static string ContentTypeAgreementNumber => "AgreementNumber"; 87 | public static string ContentTypeFileType => "FileType"; 88 | public static string ContentTypeExtractionTimeStamp => "ExtractionTimeStamp"; 89 | public static string ContentTypeText => "Text"; 90 | public static string ContentTypeParagraph => "Paragraph"; 91 | public static string ContentTypeHeader => "Header"; 92 | public static string ContentTypeSection => "Section"; 93 | public static string ContentTypeClause => "Clause"; 94 | public static string ContentTypeHeaderClause => "HeaderClause"; 95 | public static string ContentTypeAdditionalInformation => "AdditionalInformation"; 96 | } 97 | } 98 | } -------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/Code/Logger.cs: -------------------------------------------------------------------------------- 1 | //Copyright(c) Microsoft Corporation.All rights reserved. 2 | //Licensed under the MIT License. 
3 | 4 | using System; 5 | 6 | namespace DSnA.WebJob.DocumentParser 7 | { 8 | using System.Diagnostics; 9 | using Microsoft.Azure; 10 | using Microsoft.WindowsAzure.Storage; 11 | using Microsoft.WindowsAzure.Storage.Blob; 12 | 13 | public interface ILogger 14 | { 15 | void Info(string message); 16 | void Error(string message, Exception exp); 17 | } 18 | public class Logger : ILogger 19 | { 20 | private static CloudStorageAccount StorageAccount = CloudStorageAccount.Parse(CloudConfigurationManager.GetSetting("StorageConnectionString")); 21 | 22 | private static CloudBlobClient blobClient = StorageAccount.CreateCloudBlobClient(); 23 | 24 | private static string logContainerName = CloudConfigurationManager.GetSetting(Constants.ParserConfig.LogsContainerNameRef); 25 | 26 | private static string logPrefix = CloudConfigurationManager.GetSetting(Constants.ParserConfig.LogPrefix); 27 | 28 | private static Logger LoggerInstance; 29 | private Logger() { } 30 | 31 | public static Logger Instance 32 | { 33 | get 34 | { 35 | if (LoggerInstance == null) 36 | { 37 | LoggerInstance = new Logger(); 38 | } 39 | 40 | return LoggerInstance; 41 | } 42 | } 43 | 44 | /// 45 | /// Write log text (info/error) to Azure Blob 46 | /// 47 | /// 48 | /// 49 | /// 50 | private void Write(string message, EventLogEntryType category, Exception exp = null) 51 | { 52 | try 53 | { 54 | // create blob client and container(if not exists) to store the logs in Azure Storage Account 55 | CloudBlobContainer logsContainer = blobClient.GetContainerReference(logContainerName); 56 | logsContainer.CreateIfNotExists(); 57 | // append information to blob - create logs for every day if not exists 58 | CloudAppendBlob appendBlob = logsContainer.GetAppendBlobReference($"log_{logPrefix}_{DateTime.UtcNow.ToString("MM-dd-yyyy")}.log"); 59 | if (!appendBlob.Exists()) 60 | appendBlob.CreateOrReplace(); 61 | 62 | if (exp != null) 63 | appendBlob.AppendText(String.Format("{0:u}\t[{1}]\t[{2}]\tMessage:{3}{4}{5}{6}", 64 | DateTime.UtcNow, Environment.MachineName, category.ToString().ToUpper(), message, Environment.NewLine, exp, Environment.NewLine)); 65 | else 66 | appendBlob.AppendText(String.Format("{0:u}\t[{1}]\t[{2}]\tMessage:{3}{4}", 67 | DateTime.UtcNow, Environment.MachineName, category.ToString().ToUpper(), message, Environment.NewLine)); 68 | } 69 | catch (Exception exception) 70 | { 71 | throw new LoggerException("Exception in Logging information/error", exception); 72 | } 73 | } 74 | 75 | /// 76 | /// Logs information text 77 | /// 78 | /// 79 | public void Info(string message) 80 | { 81 | Write(message, EventLogEntryType.Information); 82 | } 83 | 84 | /// 85 | /// Logs Exception text 86 | /// 87 | /// 88 | /// 89 | public void Error(string message, Exception exp) 90 | { 91 | Write(message, EventLogEntryType.Error, exp); 92 | } 93 | } 94 | public class ConsoleLogger : ILogger 95 | { 96 | private static ConsoleLogger LoggerInstance; 97 | private ConsoleLogger() { } 98 | 99 | 100 | public static ConsoleLogger Instance 101 | { 102 | get 103 | { 104 | if (LoggerInstance == null) 105 | { 106 | LoggerInstance = new ConsoleLogger(); 107 | } 108 | return LoggerInstance; 109 | } 110 | } 111 | 112 | /// 113 | /// Write log text (info/error) to Azure Blob 114 | /// 115 | /// 116 | /// 117 | /// 118 | private void Write(string message, EventLogEntryType category, Exception exp = null) 119 | { 120 | Console.WriteLine($"{category}: {message}"); 121 | 122 | if (exp != null) 123 | { 124 | Console.WriteLine(exp.Message); 125 | 
Console.WriteLine(exp.StackTrace); 126 | } 127 | } 128 | 129 | /// 130 | /// Logs information text 131 | /// 132 | /// 133 | public void Info(string message) 134 | { 135 | Write(message, EventLogEntryType.Information); 136 | } 137 | 138 | /// 139 | /// Logs Exception text 140 | /// 141 | /// 142 | /// 143 | public void Error(string message, Exception exp) 144 | { 145 | Write(message, EventLogEntryType.Error, exp); 146 | } 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/Code/ParserClasses.cs: -------------------------------------------------------------------------------- 1 | //Copyright(c) Microsoft Corporation.All rights reserved. 2 | //Licensed under the MIT License. 3 | 4 | namespace DSnA.WebJob.DocumentParser 5 | { 6 | using Newtonsoft.Json; 7 | using System.Collections.Generic; 8 | 9 | /// 10 | /// Represents output JSON document structure 11 | /// 12 | public class JsonDocumentStruct 13 | { 14 | public DocumentContent DocumentContent { get; set; } 15 | public FileMetaData FileProperties { get; set; } 16 | public Error Errors { get; set; } 17 | } 18 | 19 | public class JsonDocumentStructFlat 20 | { 21 | [JsonProperty(PropertyName = "agreementNumber")] 22 | public string AgreementNumber { get; set; } 23 | 24 | [JsonProperty(PropertyName = "fileName")] 25 | public string FileName { get; set; } 26 | 27 | [JsonProperty(PropertyName = "fileType")] 28 | public string FileType { get; set; } 29 | 30 | [JsonProperty(PropertyName = "imageStoreUri")] 31 | public string ImageStoreUri { get; set; } 32 | 33 | [JsonProperty(PropertyName = "extractionTimeStamp")] 34 | public string ExtractionTimeStamp { get; set; } 35 | 36 | [JsonProperty(PropertyName = "text")] 37 | public string Text { get; set; } 38 | 39 | [JsonProperty(PropertyName = "headers")] 40 | public Dictionary Headers { get; set; } 41 | 42 | [JsonProperty(PropertyName = "paragraphs")] 43 | public Dictionary Paragraphs { get; set; } 44 | 45 | [JsonProperty(PropertyName = "sections")] 46 | public Dictionary Sections { get; set; } 47 | 48 | [JsonProperty(PropertyName = "clauses")] 49 | public List Clauses { get; set; } 50 | 51 | [JsonProperty(PropertyName = "headerClauses")] 52 | public List HeaderClauses { get; set; } 53 | 54 | [JsonProperty(PropertyName = "additionalInformation")] 55 | public List AdditionalInformation { get; set; } 56 | } 57 | 58 | public class ReportExtractionResponse 59 | { 60 | public string location { get; set; } 61 | public string contentJson { get; set; } 62 | } 63 | 64 | public class Clauses 65 | { 66 | public Clauses() 67 | { 68 | this.Title = ""; 69 | this.Content = ""; 70 | this.Start = -1; 71 | this.End = -1; 72 | } 73 | 74 | public string Title { get; set; } 75 | public string Content { get; set; } 76 | public int Start { get; set; } 77 | public int End { get; set; } 78 | } 79 | 80 | /// 81 | /// Represents file meta data in Json output 82 | /// 83 | public class FileMetaData 84 | { 85 | [JsonProperty(PropertyName = "agreementNumber")] 86 | public string AgreementNumber { get; set; } 87 | 88 | [JsonProperty(PropertyName = "fileName")] 89 | public string FileName { get; set; } 90 | 91 | [JsonProperty(PropertyName = "fileType")] 92 | public string FileType { get; set; } 93 | 94 | [JsonProperty(PropertyName = "extractionTimeStamp")] 95 | public string ExtractionTimeStamp { get; set; } 96 | } 97 | 98 | /// 99 | /// Represents higher structure of red flag document content in Json output 100 | /// 101 | public 
class DocumentContent 102 | { 103 | [JsonProperty(PropertyName = "text")] 104 | public string Text { get; set; } 105 | [JsonProperty(PropertyName = "paragraphs")] 106 | public Dictionary Paragraphs { get; set; } 107 | 108 | [JsonProperty(PropertyName = "headers")] 109 | public Dictionary Headers { get; set; } 110 | 111 | [JsonProperty(PropertyName = "sections")] 112 | public Dictionary Sections { get; set; } 113 | 114 | [JsonProperty(PropertyName = "clauses")] 115 | public List Clauses { get; set; } 116 | 117 | [JsonProperty(PropertyName = "headerClauses")] 118 | public List HeaderClauses { get; set; } 119 | 120 | [JsonProperty(PropertyName = "additionalInformation")] 121 | public List AdditionalInformation { get; set; } 122 | } 123 | 124 | /// 125 | /// Represents structure of errors presented in Json output 126 | /// 127 | public class Error 128 | { 129 | public Error() 130 | { 131 | this.IsError = false; 132 | this.Description = ""; 133 | } 134 | 135 | public bool IsError { get; set; } 136 | public string Description { get; set; } 137 | } 138 | 139 | /// 140 | /// Represents structure of input message from OneVet Queue 141 | /// 142 | public class QueueMessage 143 | { 144 | public string DocumentId { get; set; } 145 | public string FileInputUri { get; set; } 146 | public string FileOutputUri { get; set; } 147 | public string RequestCreationDateTimeUtc { get; set; } 148 | public string DocumentTypeId { get; set; } 149 | } 150 | 151 | /// 152 | /// Represents structure for csv output files 153 | /// 154 | public class CsvDocumentFile 155 | { 156 | public CsvDocumentFile() 157 | { 158 | this.CsvOutputLines = new List() { Constants.CsvFileConfig.Headers }; 159 | } 160 | 161 | private List CsvOutputLines { get; set; } 162 | 163 | public void AddCsvLine(string sourceFile, string content, string type) 164 | { 165 | this.CsvOutputLines.Add(string.Format("{0},{1},\"{2}\",{3}", sourceFile, GetCurrentLineCount(), content, type)); 166 | } 167 | 168 | public List GetCsvOutputLines() 169 | { 170 | return this.CsvOutputLines; 171 | } 172 | 173 | private int GetCurrentLineCount() 174 | { 175 | return this.CsvOutputLines.Count > 0 ? this.CsvOutputLines.Count - 1 : this.CsvOutputLines.Count; 176 | } 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | .DS_Store 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Aa][Rr][Mm]/ 27 | [Aa][Rr][Mm]64/ 28 | bld/ 29 | [Bb]in/ 30 | [Oo]bj/ 31 | [Ll]og/ 32 | [Ll]ogs/ 33 | 34 | # Visual Studio 2015/2017 cache/options directory 35 | .vs/ 36 | # Uncomment if you have tasks that create the project's static files in wwwroot 37 | #wwwroot/ 38 | 39 | # Visual Studio 2017 auto generated files 40 | Generated\ Files/ 41 | 42 | # MSTest test Results 43 | [Tt]est[Rr]esult*/ 44 | [Bb]uild[Ll]og.* 45 | 46 | # NUnit 47 | *.VisualState.xml 48 | TestResult.xml 49 | nunit-*.xml 50 | 51 | # Build Results of an ATL Project 52 | [Dd]ebugPS/ 53 | [Rr]eleasePS/ 54 | dlldata.c 55 | 56 | # Benchmark Results 57 | BenchmarkDotNet.Artifacts/ 58 | 59 | # .NET Core 60 | project.lock.json 61 | project.fragment.lock.json 62 | artifacts/ 63 | 64 | # StyleCop 65 | StyleCopReport.xml 66 | 67 | # Files built by Visual Studio 68 | *_i.c 69 | *_p.c 70 | *_h.h 71 | *.ilk 72 | *.meta 73 | *.obj 74 | *.iobj 75 | *.pch 76 | *.pdb 77 | *.ipdb 78 | *.pgc 79 | *.pgd 80 | *.rsp 81 | *.sbr 82 | *.tlb 83 | *.tli 84 | *.tlh 85 | *.tmp 86 | *.tmp_proj 87 | *_wpftmp.csproj 88 | *.log 89 | *.vspscc 90 | *.vssscc 91 | .builds 92 | *.pidb 93 | *.svclog 94 | *.scc 95 | 96 | # Chutzpah Test files 97 | _Chutzpah* 98 | 99 | # Visual C++ cache files 100 | ipch/ 101 | *.aps 102 | *.ncb 103 | *.opendb 104 | *.opensdf 105 | *.sdf 106 | *.cachefile 107 | *.VC.db 108 | *.VC.VC.opendb 109 | 110 | # Visual Studio profiler 111 | *.psess 112 | *.vsp 113 | *.vspx 114 | *.sap 115 | 116 | # Visual Studio Trace Files 117 | *.e2e 118 | 119 | # TFS 2012 Local Workspace 120 | $tf/ 121 | 122 | # Guidance Automation Toolkit 123 | *.gpState 124 | 125 | # ReSharper is a .NET coding add-in 126 | _ReSharper*/ 127 | *.[Rr]e[Ss]harper 128 | *.DotSettings.user 129 | 130 | # TeamCity is a build add-in 131 | _TeamCity* 132 | 133 | # DotCover is a Code Coverage Tool 134 | *.dotCover 135 | 136 | # AxoCover is a Code Coverage Tool 137 | .axoCover/* 138 | !.axoCover/settings.json 139 | 140 | # Visual Studio code coverage results 141 | *.coverage 142 | *.coveragexml 143 | 144 | # NCrunch 145 | _NCrunch_* 146 | .*crunch*.local.xml 147 | nCrunchTemp_* 148 | 149 | # MightyMoose 150 | *.mm.* 151 | AutoTest.Net/ 152 | 153 | # Web workbench (sass) 154 | .sass-cache/ 155 | 156 | # Installshield output folder 157 | [Ee]xpress/ 158 | 159 | # DocProject is a documentation generator add-in 160 | DocProject/buildhelp/ 161 | DocProject/Help/*.HxT 162 | DocProject/Help/*.HxC 163 | DocProject/Help/*.hhc 164 | DocProject/Help/*.hhk 165 | DocProject/Help/*.hhp 166 | DocProject/Help/Html2 167 | DocProject/Help/html 168 | 169 | # Click-Once directory 170 | publish/ 171 | 172 | # Publish Web Output 173 | *.[Pp]ublish.xml 174 | *.azurePubxml 175 | # Note: Comment the next line if you want to checkin your web deploy settings, 176 | # but database connection strings (with potential passwords) will be unencrypted 177 | *.pubxml 178 | *.publishproj 179 | 180 | # Microsoft Azure Web App publish settings. 
Comment the next line if you want to 181 | # checkin your Azure Web App publish settings, but sensitive information contained 182 | # in these scripts will be unencrypted 183 | PublishScripts/ 184 | 185 | # NuGet Packages 186 | *.nupkg 187 | # NuGet Symbol Packages 188 | *.snupkg 189 | # The packages folder can be ignored because of Package Restore 190 | **/[Pp]ackages/* 191 | # except build/, which is used as an MSBuild target. 192 | !**/[Pp]ackages/build/ 193 | # Uncomment if necessary however generally it will be regenerated when needed 194 | #!**/[Pp]ackages/repositories.config 195 | # NuGet v3's project.json files produces more ignorable files 196 | *.nuget.props 197 | *.nuget.targets 198 | 199 | # Microsoft Azure Build Output 200 | csx/ 201 | *.build.csdef 202 | 203 | # Microsoft Azure Emulator 204 | ecf/ 205 | rcf/ 206 | 207 | # Windows Store app package directories and files 208 | AppPackages/ 209 | BundleArtifacts/ 210 | Package.StoreAssociation.xml 211 | _pkginfo.txt 212 | *.appx 213 | *.appxbundle 214 | *.appxupload 215 | 216 | # Visual Studio cache files 217 | # files ending in .cache can be ignored 218 | *.[Cc]ache 219 | # but keep track of directories ending in .cache 220 | !?*.[Cc]ache/ 221 | 222 | # Others 223 | ClientBin/ 224 | ~$* 225 | *~ 226 | *.dbmdl 227 | *.dbproj.schemaview 228 | *.jfm 229 | *.pfx 230 | *.publishsettings 231 | orleans.codegen.cs 232 | 233 | # Including strong name files can present a security risk 234 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 235 | #*.snk 236 | 237 | # Since there are multiple workflows, uncomment next line to ignore bower_components 238 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 239 | #bower_components/ 240 | 241 | # RIA/Silverlight projects 242 | Generated_Code/ 243 | 244 | # Backup & report files from converting an old project file 245 | # to a newer Visual Studio version. Backup files are not needed, 246 | # because we have git ;-) 247 | _UpgradeReport_Files/ 248 | Backup*/ 249 | UpgradeLog*.XML 250 | UpgradeLog*.htm 251 | ServiceFabricBackup/ 252 | *.rptproj.bak 253 | 254 | # SQL Server files 255 | *.mdf 256 | *.ldf 257 | *.ndf 258 | 259 | # Business Intelligence projects 260 | *.rdl.data 261 | *.bim.layout 262 | *.bim_*.settings 263 | *.rptproj.rsuser 264 | *- [Bb]ackup.rdl 265 | *- [Bb]ackup ([0-9]).rdl 266 | *- [Bb]ackup ([0-9][0-9]).rdl 267 | 268 | # Microsoft Fakes 269 | FakesAssemblies/ 270 | 271 | # GhostDoc plugin setting file 272 | *.GhostDoc.xml 273 | 274 | # Node.js Tools for Visual Studio 275 | .ntvs_analysis.dat 276 | node_modules/ 277 | 278 | # Visual Studio 6 build log 279 | *.plg 280 | 281 | # Visual Studio 6 workspace options file 282 | *.opt 283 | 284 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
285 | *.vbw
286 |
287 | # Visual Studio LightSwitch build output
288 | **/*.HTMLClient/GeneratedArtifacts
289 | **/*.DesktopClient/GeneratedArtifacts
290 | **/*.DesktopClient/ModelManifest.xml
291 | **/*.Server/GeneratedArtifacts
292 | **/*.Server/ModelManifest.xml
293 | _Pvt_Extensions
294 |
295 | # Paket dependency manager
296 | .paket/paket.exe
297 | paket-files/
298 |
299 | # FAKE - F# Make
300 | .fake/
301 |
302 | # CodeRush personal settings
303 | .cr/personal
304 |
305 | # Python Tools for Visual Studio (PTVS)
306 | __pycache__/
307 | *.pyc
308 |
309 | # Cake - Uncomment if you are using it
310 | # tools/**
311 | # !tools/packages.config
312 |
313 | # Tabs Studio
314 | *.tss
315 |
316 | # Telerik's JustMock configuration file
317 | *.jmconfig
318 |
319 | # BizTalk build output
320 | *.btp.cs
321 | *.btm.cs
322 | *.odx.cs
323 | *.xsd.cs
324 |
325 | # OpenCover UI analysis results
326 | OpenCover/
327 |
328 | # Azure Stream Analytics local run output
329 | ASALocalRun/
330 |
331 | # MSBuild Binary and Structured Log
332 | *.binlog
333 |
334 | # NVidia Nsight GPU debugger configuration file
335 | *.nvuser
336 |
337 | # MFractors (Xamarin productivity tool) working folder
338 | .mfractor/
339 |
340 | # Local History for Visual Studio
341 | .localhistory/
342 |
343 | # BeatPulse healthcheck temp database
344 | healthchecksdb
345 |
346 | # Backup folder for Package Reference Convert tool in Visual Studio 2017
347 | MigrationBackup/
348 |
349 | # Ionide (cross platform F# VS Code tools) working folder
350 | .ionide/
351 |
-------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/Program.cs: --------------------------------------------------------------------------------
1 | //Copyright(c) Microsoft Corporation.All rights reserved.
2 | //Licensed under the MIT License.
3 |
4 | using Microsoft.Azure;
5 | using Microsoft.Office.Interop.Word;
6 | using Microsoft.WindowsAzure.Storage;
7 | using Microsoft.WindowsAzure.Storage.Blob;
8 | using System;
9 | using System.Diagnostics;
10 | using System.Linq;
11 |
12 | namespace DSnA.WebJob.DocumentParser
13 | {
14 | class Program
15 | {
16 | private static string _storageType = CloudConfigurationManager.GetSetting("StorageType");
17 |
18 | static void Main(string[] args)
19 | {
20 | if (!ValidateArgs(args, _storageType))
21 | return;
22 |
23 | var logger = ConsoleLogger.Instance;
24 | var util = new Utils(logger);
25 |
26 | var args0 = _storageType == "blob" ? args[0] : Constants.FileConfigs.SourceDirectoryPath = args[0];
27 | var args1 = _storageType == "blob" ? args[1] : Constants.FileConfigs.OutputDirectoryPath = args[1];
28 | var args2 = args.Count() > 2 ? args[2] : null;
29 |
30 | IStorageClientFactory clientFactory = new DefaultStorageClientFactory();
31 | IStorageClient client = clientFactory.Create(_storageType, new System.Collections.Generic.Dictionary<string, string>() {
32 | { DefaultStorageClientFactory.BlobContainerNameKey, args0 }
33 | }, util);
34 |
35 | string[] uris = GetUris(client, _storageType == "blob" ?
args1 : Constants.FileConfigs.SourceDirectoryPath, args2, util); 36 | 37 | IDocumentParser parser = new DocumentParser(logger, util, new ParseHelper(logger, util)); 38 | 39 | var total = uris.Count(); 40 | if (total == 0) 41 | { 42 | Console.WriteLine("No files to process..."); 43 | } 44 | 45 | var counter = 0; 46 | 47 | var outputFileFormat = CloudConfigurationManager.GetSetting("OutputFileFormat"); 48 | 49 | Stopwatch stopWatch = new Stopwatch(); 50 | 51 | InteropWordUtils iInteropWordUtils = new InteropWordUtils(); 52 | 53 | // fire up word instance 54 | Application wordApp = iInteropWordUtils.CreateWordAppInstance(); 55 | 56 | int maxFailures = 3; 57 | int currentFailures = 0; 58 | 59 | try 60 | { 61 | foreach (var uri in uris) 62 | { 63 | stopWatch.Start(); 64 | counter++; 65 | Console.WriteLine($"Processing: {counter} out of {total}"); 66 | Console.WriteLine($"Processing: {uri}"); 67 | 68 | string result = null; 69 | 70 | try 71 | { 72 | result = parser.ParseDocuments(uri, client, wordApp, outputFileFormat); 73 | } 74 | catch (Exception ex) 75 | { 76 | Console.WriteLine(ex.Message); 77 | Console.WriteLine(ex.StackTrace); 78 | 79 | currentFailures++; 80 | 81 | if (currentFailures >= maxFailures) 82 | { 83 | throw new Exception("Max failure count reached."); 84 | } 85 | } 86 | 87 | stopWatch.Stop(); 88 | 89 | TimeSpan ts = stopWatch.Elapsed; 90 | 91 | string elapsedTime = String.Format("{0:00}:{1:00}:{2:00}.{3:00}", 92 | ts.Hours, ts.Minutes, ts.Seconds, 93 | ts.Milliseconds / 10); 94 | 95 | Console.WriteLine("RunTime " + elapsedTime); 96 | Console.WriteLine(result); 97 | stopWatch.Reset(); 98 | } 99 | } 100 | finally 101 | { 102 | iInteropWordUtils.DisposeIneropObject(wordApp); 103 | } 104 | 105 | Console.WriteLine("Press any key to exit..."); 106 | Console.ReadKey(); 107 | } 108 | 109 | private static string[] GetUris(IStorageClient client, string prefix, string filter, IUtils utils) 110 | { 111 | if (client is BlobStorageClient) return GetBlobUris(client as BlobStorageClient, prefix, filter, utils); 112 | else if (client is LocalStorageClient) return GetLocalUris(client as LocalStorageClient, prefix, filter, utils); 113 | else return null; 114 | } 115 | 116 | private static string[] GetBlobUris(BlobStorageClient client, string sourcePath, string filter, IUtils utils) 117 | { 118 | Console.WriteLine($"Listing Blobs in container {client.Container.Name} in folder {sourcePath}"); 119 | 120 | string blobPrefix = sourcePath == "null" || sourcePath == "/" ? null : sourcePath; 121 | 122 | var outputBlobList = utils.GetBlobListFromOutputContainer(client.Client); 123 | 124 | var blobList = client.Container.ListBlobs(prefix: blobPrefix, useFlatBlobListing: true); 125 | 126 | var filteredBlobList = blobList.Where(s => !outputBlobList.Contains(utils.CleanNonSupportedSparkChar(s.Uri.Segments[s.Uri.Segments.Length - 1]))).ToList(); 127 | 128 | if (filter != null) 129 | { 130 | filteredBlobList = filteredBlobList.Where(s => s.Uri.PathAndQuery.Contains(filter)).ToList(); 131 | } 132 | 133 | return filteredBlobList 134 | .Select(x => x.Uri.AbsoluteUri) 135 | .ToArray(); 136 | } 137 | 138 | private static string[] GetLocalUris(LocalStorageClient client, string sourcePath, string filter, IUtils utils) 139 | { 140 | string[] files = System.IO.Directory.GetFiles(sourcePath, "*", System.IO.SearchOption.AllDirectories); 141 | 142 | return (!string.IsNullOrEmpty(filter) 143 | ? 
files.Where(x => x.Contains(filter)).ToArray() 144 | : files); 145 | } 146 | 147 | private static bool ValidateArgs(string[] args, string storageType) 148 | { 149 | bool validArgs = true; 150 | 151 | switch (storageType) 152 | { 153 | case "blob": 154 | if (!args?.Any() ?? true) 155 | { 156 | Console.WriteLine(string.Format(" No arguments passed. \n\n DSnA.WebJob.DocumentParser.exe arg1 arg2 arg3 \n\n Options: \n\t arg1: Required - blob container name \n\t arg2: Required - virtual directory name (/ root level) \n\t arg3: Optional - document file name filter")); 157 | validArgs = false; 158 | } 159 | else if (!(args.Length >= 2 && args.Length < 4)) 160 | { 161 | Console.WriteLine(string.Format(" Incorrect number of arguments. \n\n DSnA.WebJob.DocumentParser.exe arg1 arg2 arg3 \n\n Options: \n\t arg1: Required - blob container name \n\t arg2: Required - virtual directory name (/ root level) \n\t arg3: Optional - document file name filter")); 162 | validArgs = false; 163 | } 164 | break; 165 | 166 | case "localstorage": 167 | if (!args?.Any() ?? true) 168 | { 169 | Console.WriteLine(string.Format(" No arguments passed. \n\n DSnA.WebJob.DocumentParser.exe arg1 arg2 arg3 \n\n Options: \n\t arg1: Required - local storage source folder path \n\t arg2: Required - local storage output folder path \n\t arg3: Optional - document file name filter")); 170 | validArgs = false; 171 | } 172 | else if (!(args.Length >= 2 && args.Length < 4)) 173 | { 174 | Console.WriteLine(string.Format(" Incorrect number of arguments. \n\n DSnA.WebJob.DocumentParser.exe arg1 arg2 arg3 \n\n Options: \n\t arg1: Required - local storage source folder path \n\t arg2: Required - local storage output folder path \n\t arg3: Optional - document file name filter")); 175 | validArgs = false; 176 | } 177 | break; 178 | 179 | default: 180 | return validArgs = false; 181 | } 182 | 183 | return validArgs; 184 | } 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/Code/DocumentParser.cs: -------------------------------------------------------------------------------- 1 | //Copyright(c) Microsoft Corporation.All rights reserved. 2 | //Licensed under the MIT License. 
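// How this class is driven end to end (an illustrative sketch, not the exact
// API surface): Program.cs builds the collaborators, obtains a Word Application
// instance through InteropWordUtils, and calls ParseDocuments once per URI.
// `storageClient` below stands in for whatever DefaultStorageClientFactory
// returns ("blob" or "localstorage"), and `inputUri` is a placeholder.
//
//   var logger = ConsoleLogger.Instance;
//   var utils = new Utils(logger);
//   IDocumentParser parser = new DocumentParser(logger, utils, new ParseHelper(logger, utils));
//   var wordUtils = new InteropWordUtils();
//   Application wordApp = wordUtils.CreateWordAppInstance();
//   try
//   {
//       // returns a status string such as "Finished Processing: <output path>"
//       string status = parser.ParseDocuments(inputUri, storageClient, wordApp,
//           Constants.CsvFileConfig.JsonFileFormat);
//       Console.WriteLine(status);
//   }
//   finally
//   {
//       wordUtils.DisposeIneropObject(wordApp);
//   }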
3 |
4 | using System;
5 | using System.IO;
6 | using Microsoft.WindowsAzure.Storage.Blob;
7 | using System.Collections.Generic;
8 | using Newtonsoft.Json;
9 | using Microsoft.Office.Interop.Word;
10 |
11 | namespace DSnA.WebJob.DocumentParser
12 | {
13 | public class DocumentParser : IDocumentParser
14 | {
15 | private readonly ILogger iLogger;
16 | private readonly IUtils iUtils;
17 | private readonly IParseHelper iparseHelper;
18 |
19 | public DocumentParser(ILogger iLogger, IUtils iUtils, IParseHelper iparseHelper)
20 | {
21 | this.iLogger = iLogger;
22 | this.iUtils = iUtils;
23 | this.iparseHelper = iparseHelper;
24 | }
25 |
26 | /// <summary>
27 | /// Main API for document extraction
28 | /// </summary>
29 | public string ParseDocuments(string uri, IStorageClient storageClient, Application wordApp, string outputFileFormat)
30 | {
31 | try
32 | {
33 | string fileLocation = "";
34 | var output = "";
35 | try
36 | {
37 | fileLocation = storageClient.GetFile(new StorageObjectDescriptor() { Uri = new Uri(uri) }, Constants.FileConfigs.WorkingDirectoryPath);
38 | var result = ExtractContentFromReports(fileLocation, outputFileFormat, uri, wordApp);
39 | storageClient.SaveFile(result.location, null);
40 | output = $"Finished Processing: {result.location}";
41 | iUtils.DeleteInputFiles(new List<string> { result.location });
42 | }
43 | catch (Exception exp)
44 | {
45 | iLogger.Error($"Error Processing: {uri}", exp);
46 | output = $"error processing: {uri}";
47 | }
48 |
49 | return output;
50 | }
51 | catch (Exception exp)
52 | {
53 | iLogger.Error("{" + nameof(ParseDocuments) + "} - exception occurred-Level 2", exp);
54 | throw;
55 | }
56 | finally
57 | {
58 | // force garbage collection to collect leftover COM objects
59 | GC.Collect();
60 | }
61 | }
62 |
63 | /// <summary>
64 | /// Extracts document content - Initial function encapsulating different extraction helper methods
65 | /// </summary>
66 | /// <param name="fileLocation"></param>
67 | /// <param name="outputFileFormat"></param>
68 | /// <returns>saved JSON output file location</returns>
69 | private ReportExtractionResponse ExtractContentFromReports(string fileLocation, string outputFileFormat, string originalFileLocation = null, Application wordApp = null)
70 | {
71 | var docFile = "";
72 | try
73 | {
74 | docFile = iUtils.ConvertPdfToWord(fileLocation, Constants.FileConfigs.WorkingDirectoryPath, wordApp);
75 | var documentContent = iparseHelper.ExtractDocumentContent(docFile, wordApp);
76 | ReportExtractionResponse reportExtractionResponse = null;
77 | switch (outputFileFormat)
78 | {
79 | case Constants.CsvFileConfig.JsonFileFormat:
80 | JsonDocumentStructFlat jsonDoc;
81 | string jsonOutputFileLocation;
82 | ExtractAsJsonFormat(fileLocation, originalFileLocation, documentContent, out jsonDoc, out jsonOutputFileLocation);
83 | reportExtractionResponse = new ReportExtractionResponse()
84 | {
85 | location = jsonOutputFileLocation,
86 | contentJson = JsonConvert.SerializeObject(jsonDoc, Formatting.Indented)
87 | };
88 | break;
89 |
90 | case Constants.CsvFileConfig.CsvFileFormat:
91 | default:
92 | string csvOutputFileLocation;
93 | ExtractAsCsvFormat(fileLocation, originalFileLocation, documentContent, out csvOutputFileLocation);
94 | reportExtractionResponse = new ReportExtractionResponse()
95 | {
96 | location = csvOutputFileLocation
97 | };
98 | break;
99 | }
100 |
101 | return reportExtractionResponse;
102 | }
103 | finally
104 | {
105 | iUtils.DeleteInputFiles(new List<string> { fileLocation, docFile });
106 | }
107 | }
108 |
109 |
110 | private void ExtractAsJsonFormat(string fileLocation, string originalFileLocation, DocumentContent documentContent, out
JsonDocumentStructFlat jsonDoc, out string jsonOutputFileLocation) 111 | { 112 | jsonDoc = new JsonDocumentStructFlat(); 113 | jsonDoc.ImageStoreUri = originalFileLocation; 114 | jsonDoc.Text = documentContent.Text; 115 | jsonDoc.Paragraphs = documentContent.Paragraphs; 116 | jsonDoc.Headers = documentContent.Headers; 117 | jsonDoc.Sections = documentContent.Sections; 118 | jsonDoc.Clauses = documentContent.Clauses; 119 | jsonDoc.HeaderClauses = documentContent.HeaderClauses; 120 | jsonDoc.AdditionalInformation = documentContent.AdditionalInformation; 121 | 122 | var fileProperties = iUtils.ExtractFileMetadata(fileLocation); 123 | jsonDoc.FileName = fileProperties.FileName; 124 | jsonDoc.FileType = fileProperties.FileType; 125 | jsonDoc.AgreementNumber = fileProperties.AgreementNumber; 126 | jsonDoc.ExtractionTimeStamp = fileProperties.ExtractionTimeStamp; 127 | jsonOutputFileLocation = iUtils.SerializeAndSaveJson(jsonDoc, Path.GetFileName(fileLocation)); 128 | } 129 | 130 | private void ExtractAsCsvFormat(string fileLocation, string originalFileLocation, DocumentContent documentContent, out string csvOutputFileLocation) 131 | { 132 | CsvDocumentFile csvDocumentFile = new CsvDocumentFile(); 133 | var fileProperties = iUtils.ExtractFileMetadata(fileLocation); 134 | csvDocumentFile.AddCsvLine(fileProperties.FileName, originalFileLocation, Constants.CsvFileConfig.ContentTypeBlobUri); 135 | csvDocumentFile.AddCsvLine(fileProperties.FileName, fileProperties.AgreementNumber, Constants.CsvFileConfig.ContentTypeAgreementNumber); 136 | csvDocumentFile.AddCsvLine(fileProperties.FileName, fileProperties.FileType, Constants.CsvFileConfig.ContentTypeFileType); 137 | csvDocumentFile.AddCsvLine(fileProperties.FileName, fileProperties.ExtractionTimeStamp, Constants.CsvFileConfig.ContentTypeExtractionTimeStamp); 138 | csvDocumentFile.AddCsvLine(fileProperties.FileName, iUtils.CleanTextFromNonAsciiChar(documentContent.Text), Constants.CsvFileConfig.ContentTypeText); 139 | 140 | foreach (var paragraph in documentContent.Paragraphs) 141 | { 142 | var paragraphCleanText = iUtils.CleanTextFromNonAsciiChar(paragraph.Value); 143 | if (!string.IsNullOrEmpty(paragraphCleanText)) 144 | csvDocumentFile.AddCsvLine(fileProperties.FileName, paragraphCleanText, Constants.CsvFileConfig.ContentTypeParagraph); 145 | } 146 | 147 | foreach (var header in documentContent.Headers) 148 | { 149 | var headerCleanText = iUtils.CleanTextFromNonAsciiChar(header.Value); 150 | if (!string.IsNullOrEmpty(headerCleanText)) 151 | csvDocumentFile.AddCsvLine(fileProperties.FileName, headerCleanText, Constants.CsvFileConfig.ContentTypeHeader); 152 | } 153 | 154 | foreach (var section in documentContent.Sections) 155 | { 156 | var sectionCleanText = iUtils.CleanTextFromNonAsciiChar(section.Value); 157 | if (!string.IsNullOrEmpty(sectionCleanText)) 158 | csvDocumentFile.AddCsvLine(fileProperties.FileName, sectionCleanText, Constants.CsvFileConfig.ContentTypeSection); 159 | } 160 | 161 | foreach (var clause in documentContent.Clauses) 162 | { 163 | var clauseCleanText = iUtils.CleanTextFromNonAsciiChar(clause.Content); 164 | if (!string.IsNullOrEmpty(clauseCleanText)) 165 | csvDocumentFile.AddCsvLine(fileProperties.FileName, clauseCleanText, Constants.CsvFileConfig.ContentTypeClause); 166 | } 167 | 168 | foreach (var headerClause in documentContent.HeaderClauses) 169 | { 170 | var headerClauseCleanText = iUtils.CleanTextFromNonAsciiChar(headerClause.Content); 171 | if (!string.IsNullOrEmpty(headerClauseCleanText)) 172 | 
csvDocumentFile.AddCsvLine(fileProperties.FileName, headerClauseCleanText, Constants.CsvFileConfig.ContentTypeHeaderClause); 173 | } 174 | 175 | foreach (var additionalInformation in documentContent.AdditionalInformation) 176 | { 177 | var additionalInformationCleanText = iUtils.CleanTextFromNonAsciiChar(additionalInformation); 178 | if (!string.IsNullOrEmpty(additionalInformationCleanText)) 179 | csvDocumentFile.AddCsvLine(fileProperties.FileName, additionalInformationCleanText, Constants.CsvFileConfig.ContentTypeAdditionalInformation); 180 | } 181 | 182 | csvOutputFileLocation = iUtils.SaveToCsvFile(csvDocumentFile.GetCsvOutputLines(), Path.GetFileNameWithoutExtension(fileLocation)); 183 | } 184 | } 185 | } -------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/DSnA.WebJob.DocumentParser.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | AnyCPU 7 | {9E4D7884-0C36-429B-A4C9-1217D3CA7D4E} 8 | Exe 9 | DSnA.WebJob.DocumentParser 10 | DSnA.WebJob.DocumentParser 11 | v4.6.2 12 | 512 13 | true 14 | 15 | 16 | AnyCPU 17 | true 18 | full 19 | false 20 | bin\Debug\ 21 | DEBUG;TRACE 22 | prompt 23 | 4 24 | 25 | 26 | AnyCPU 27 | pdbonly 28 | true 29 | bin\Release\ 30 | TRACE 31 | prompt 32 | 4 33 | 34 | 35 | 36 | ..\packages\DocumentFormat.OpenXml.2.8.1\lib\net46\DocumentFormat.OpenXml.dll 37 | 38 | 39 | ..\packages\Microsoft.Azure.KeyVault.Core.1.0.0\lib\net40\Microsoft.Azure.KeyVault.Core.dll 40 | 41 | 42 | True 43 | ..\packages\Microsoft.Office.Interop.Word.15.0.4797.1003\lib\net20\Microsoft.Office.Interop.Word.dll 44 | True 45 | 46 | 47 | ..\packages\Microsoft.Win32.Primitives.4.3.0\lib\net46\Microsoft.Win32.Primitives.dll 48 | 49 | 50 | ..\packages\Microsoft.WindowsAzure.ConfigurationManager.3.2.3\lib\net40\Microsoft.WindowsAzure.Configuration.dll 51 | 52 | 53 | ..\packages\WindowsAzure.Storage.9.3.2\lib\net45\Microsoft.WindowsAzure.Storage.dll 54 | True 55 | 56 | 57 | ..\packages\Newtonsoft.Json.11.0.2\lib\net45\Newtonsoft.Json.dll 58 | 59 | 60 | 61 | ..\packages\System.AppContext.4.3.0\lib\net46\System.AppContext.dll 62 | True 63 | 64 | 65 | 66 | 67 | ..\packages\System.Console.4.3.0\lib\net46\System.Console.dll 68 | 69 | 70 | 71 | ..\packages\System.Diagnostics.DiagnosticSource.4.3.0\lib\net46\System.Diagnostics.DiagnosticSource.dll 72 | 73 | 74 | ..\packages\System.Diagnostics.Tracing.4.3.0\lib\net462\System.Diagnostics.Tracing.dll 75 | 76 | 77 | ..\packages\System.Globalization.Calendars.4.3.0\lib\net46\System.Globalization.Calendars.dll 78 | 79 | 80 | ..\packages\System.IO.4.3.0\lib\net462\System.IO.dll 81 | 82 | 83 | ..\packages\System.IO.Abstractions.2.1.0.247\lib\net40\System.IO.Abstractions.dll 84 | 85 | 86 | ..\packages\System.IO.Compression.4.3.0\lib\net46\System.IO.Compression.dll 87 | True 88 | 89 | 90 | 91 | ..\packages\System.IO.Compression.ZipFile.4.3.0\lib\net46\System.IO.Compression.ZipFile.dll 92 | 93 | 94 | ..\packages\System.IO.FileSystem.4.3.0\lib\net46\System.IO.FileSystem.dll 95 | 96 | 97 | ..\packages\System.IO.FileSystem.Primitives.4.3.0\lib\net46\System.IO.FileSystem.Primitives.dll 98 | 99 | 100 | ..\packages\System.IO.Packaging.4.4.0\lib\net46\System.IO.Packaging.dll 101 | 102 | 103 | ..\packages\System.Net.Http.4.3.4\lib\net46\System.Net.Http.dll 104 | 105 | 106 | ..\packages\System.Net.Sockets.4.3.0\lib\net46\System.Net.Sockets.dll 107 | 108 | 109 | 110 | 
..\packages\System.Reflection.4.3.0\lib\net462\System.Reflection.dll 111 | 112 | 113 | ..\packages\System.Runtime.4.3.0\lib\net462\System.Runtime.dll 114 | 115 | 116 | ..\packages\System.Runtime.Extensions.4.3.0\lib\net462\System.Runtime.Extensions.dll 117 | 118 | 119 | ..\packages\System.Runtime.InteropServices.4.3.0\lib\net462\System.Runtime.InteropServices.dll 120 | 121 | 122 | ..\packages\System.Runtime.InteropServices.RuntimeInformation.4.3.0\lib\net45\System.Runtime.InteropServices.RuntimeInformation.dll 123 | True 124 | 125 | 126 | 127 | ..\packages\System.Security.Cryptography.Algorithms.4.3.0\lib\net461\System.Security.Cryptography.Algorithms.dll 128 | 129 | 130 | ..\packages\System.Security.Cryptography.Encoding.4.3.0\lib\net46\System.Security.Cryptography.Encoding.dll 131 | 132 | 133 | ..\packages\System.Security.Cryptography.Primitives.4.3.0\lib\net46\System.Security.Cryptography.Primitives.dll 134 | 135 | 136 | ..\packages\System.Security.Cryptography.X509Certificates.4.3.0\lib\net461\System.Security.Cryptography.X509Certificates.dll 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | ..\packages\System.Xml.ReaderWriter.4.3.0\lib\net46\System.Xml.ReaderWriter.dll 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | Designer 170 | 171 | 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | 23 | 24 | 25 | Apache License 26 | Version 2.0, January 2004 27 | http://www.apache.org/licenses/ 28 | 29 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 30 | 31 | 1. Definitions. 32 | 33 | "License" shall mean the terms and conditions for use, reproduction, 34 | and distribution as defined by Sections 1 through 9 of this document. 35 | 36 | "Licensor" shall mean the copyright owner or entity authorized by 37 | the copyright owner that is granting the License. 38 | 39 | "Legal Entity" shall mean the union of the acting entity and all 40 | other entities that control, are controlled by, or are under common 41 | control with that entity. 
For the purposes of this definition, 42 | "control" means (i) the power, direct or indirect, to cause the 43 | direction or management of such entity, whether by contract or 44 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 45 | outstanding shares, or (iii) beneficial ownership of such entity. 46 | 47 | "You" (or "Your") shall mean an individual or Legal Entity 48 | exercising permissions granted by this License. 49 | 50 | "Source" form shall mean the preferred form for making modifications, 51 | including but not limited to software source code, documentation 52 | source, and configuration files. 53 | 54 | "Object" form shall mean any form resulting from mechanical 55 | transformation or translation of a Source form, including but 56 | not limited to compiled object code, generated documentation, 57 | and conversions to other media types. 58 | 59 | "Work" shall mean the work of authorship, whether in Source or 60 | Object form, made available under the License, as indicated by a 61 | copyright notice that is included in or attached to the work 62 | (an example is provided in the Appendix below). 63 | 64 | "Derivative Works" shall mean any work, whether in Source or Object 65 | form, that is based on (or derived from) the Work and for which the 66 | editorial revisions, annotations, elaborations, or other modifications 67 | represent, as a whole, an original work of authorship. For the purposes 68 | of this License, Derivative Works shall not include works that remain 69 | separable from, or merely link (or bind by name) to the interfaces of, 70 | the Work and Derivative Works thereof. 71 | 72 | "Contribution" shall mean any work of authorship, including 73 | the original version of the Work and any modifications or additions 74 | to that Work or Derivative Works thereof, that is intentionally 75 | submitted to Licensor for inclusion in the Work by the copyright owner 76 | or by an individual or Legal Entity authorized to submit on behalf of 77 | the copyright owner. For the purposes of this definition, "submitted" 78 | means any form of electronic, verbal, or written communication sent 79 | to the Licensor or its representatives, including but not limited to 80 | communication on electronic mailing lists, source code control systems, 81 | and issue tracking systems that are managed by, or on behalf of, the 82 | Licensor for the purpose of discussing and improving the Work, but 83 | excluding communication that is conspicuously marked or otherwise 84 | designated in writing by the copyright owner as "Not a Contribution." 85 | 86 | "Contributor" shall mean Licensor and any individual or Legal Entity 87 | on behalf of whom a Contribution has been received by Licensor and 88 | subsequently incorporated within the Work. 89 | 90 | 2. Grant of Copyright License. Subject to the terms and conditions of 91 | this License, each Contributor hereby grants to You a perpetual, 92 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 93 | copyright license to reproduce, prepare Derivative Works of, 94 | publicly display, publicly perform, sublicense, and distribute the 95 | Work and such Derivative Works in Source or Object form. 96 | 97 | 3. Grant of Patent License. 
Subject to the terms and conditions of 98 | this License, each Contributor hereby grants to You a perpetual, 99 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 100 | (except as stated in this section) patent license to make, have made, 101 | use, offer to sell, sell, import, and otherwise transfer the Work, 102 | where such license applies only to those patent claims licensable 103 | by such Contributor that are necessarily infringed by their 104 | Contribution(s) alone or by combination of their Contribution(s) 105 | with the Work to which such Contribution(s) was submitted. If You 106 | institute patent litigation against any entity (including a 107 | cross-claim or counterclaim in a lawsuit) alleging that the Work 108 | or a Contribution incorporated within the Work constitutes direct 109 | or contributory patent infringement, then any patent licenses 110 | granted to You under this License for that Work shall terminate 111 | as of the date such litigation is filed. 112 | 113 | 4. Redistribution. You may reproduce and distribute copies of the 114 | Work or Derivative Works thereof in any medium, with or without 115 | modifications, and in Source or Object form, provided that You 116 | meet the following conditions: 117 | 118 | (a) You must give any other recipients of the Work or 119 | Derivative Works a copy of this License; and 120 | 121 | (b) You must cause any modified files to carry prominent notices 122 | stating that You changed the files; and 123 | 124 | (c) You must retain, in the Source form of any Derivative Works 125 | that You distribute, all copyright, patent, trademark, and 126 | attribution notices from the Source form of the Work, 127 | excluding those notices that do not pertain to any part of 128 | the Derivative Works; and 129 | 130 | (d) If the Work includes a "NOTICE" text file as part of its 131 | distribution, then any Derivative Works that You distribute must 132 | include a readable copy of the attribution notices contained 133 | within such NOTICE file, excluding those notices that do not 134 | pertain to any part of the Derivative Works, in at least one 135 | of the following places: within a NOTICE text file distributed 136 | as part of the Derivative Works; within the Source form or 137 | documentation, if provided along with the Derivative Works; or, 138 | within a display generated by the Derivative Works, if and 139 | wherever such third-party notices normally appear. The contents 140 | of the NOTICE file are for informational purposes only and 141 | do not modify the License. You may add Your own attribution 142 | notices within Derivative Works that You distribute, alongside 143 | or as an addendum to the NOTICE text from the Work, provided 144 | that such additional attribution notices cannot be construed 145 | as modifying the License. 146 | 147 | You may add Your own copyright statement to Your modifications and 148 | may provide additional or different license terms and conditions 149 | for use, reproduction, or distribution of Your modifications, or 150 | for any such Derivative Works as a whole, provided Your use, 151 | reproduction, and distribution of the Work otherwise complies with 152 | the conditions stated in this License. 153 | 154 | 5. Submission of Contributions. Unless You explicitly state otherwise, 155 | any Contribution intentionally submitted for inclusion in the Work 156 | by You to the Licensor shall be under the terms and conditions of 157 | this License, without any additional terms or conditions. 
158 | Notwithstanding the above, nothing herein shall supersede or modify 159 | the terms of any separate license agreement you may have executed 160 | with Licensor regarding such Contributions. 161 | 162 | 6. Trademarks. This License does not grant permission to use the trade 163 | names, trademarks, service marks, or product names of the Licensor, 164 | except as required for reasonable and customary use in describing the 165 | origin of the Work and reproducing the content of the NOTICE file. 166 | 167 | 7. Disclaimer of Warranty. Unless required by applicable law or 168 | agreed to in writing, Licensor provides the Work (and each 169 | Contributor provides its Contributions) on an "AS IS" BASIS, 170 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 171 | implied, including, without limitation, any warranties or conditions 172 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 173 | PARTICULAR PURPOSE. You are solely responsible for determining the 174 | appropriateness of using or redistributing the Work and assume any 175 | risks associated with Your exercise of permissions under this License. 176 | 177 | 8. Limitation of Liability. In no event and under no legal theory, 178 | whether in tort (including negligence), contract, or otherwise, 179 | unless required by applicable law (such as deliberate and grossly 180 | negligent acts) or agreed to in writing, shall any Contributor be 181 | liable to You for damages, including any direct, indirect, special, 182 | incidental, or consequential damages of any character arising as a 183 | result of this License or out of the use or inability to use the 184 | Work (including but not limited to damages for loss of goodwill, 185 | work stoppage, computer failure or malfunction, or any and all 186 | other commercial damages or losses), even if such Contributor 187 | has been advised of the possibility of such damages. 188 | 189 | 9. Accepting Warranty or Additional Liability. While redistributing 190 | the Work or Derivative Works thereof, You may choose to offer, 191 | and charge a fee for, acceptance of support, warranty, indemnity, 192 | or other liability obligations and/or rights consistent with this 193 | License. However, in accepting such obligations, You may act only 194 | on Your own behalf and on Your sole responsibility, not on behalf 195 | of any other Contributor, and only if You agree to indemnify, 196 | defend, and hold each Contributor harmless for any liability 197 | incurred by, or claims asserted against, such Contributor by reason 198 | of your accepting any such warranty or additional liability. 199 | 200 | END OF TERMS AND CONDITIONS 201 | 202 | APPENDIX: How to apply the Apache License to your work. 203 | 204 | To apply the Apache License to your work, attach the following 205 | boilerplate notice, with the fields enclosed by brackets "[]" 206 | replaced with your own identifying information. (Don't include 207 | the brackets!) The text should be enclosed in the appropriate 208 | comment syntax for the file format. We also recommend that a 209 | file or class name and description of purpose be included on the 210 | same "printed page" as the copyright notice for easier 211 | identification within third-party archives. 212 | 213 | Copyright [yyyy] [name of copyright owner] 214 | 215 | Licensed under the Apache License, Version 2.0 (the "License"); 216 | you may not use this file except in compliance with the License. 
217 | You may obtain a copy of the License at
218 |
219 | http://www.apache.org/licenses/LICENSE-2.0
220 |
221 | Unless required by applicable law or agreed to in writing, software
222 | distributed under the License is distributed on an "AS IS" BASIS,
223 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
224 | See the License for the specific language governing permissions and
225 | limitations under the License.
226 |
-------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/Code/ParseHelper.cs: --------------------------------------------------------------------------------
1 | //Copyright(c) Microsoft Corporation.All rights reserved.
2 | //Licensed under the MIT License.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Linq;
7 | using System.Text;
8 | using System.Text.RegularExpressions;
9 | using Microsoft.Office.Interop.Word;
10 |
11 | namespace DSnA.WebJob.DocumentParser
12 | {
13 | public interface IParseHelper
14 | {
15 | DocumentContent ExtractDocumentContent(string docFile, Application wordApp);
16 | }
17 |
18 | public class ParseHelper : IParseHelper
19 | {
20 | internal static string WordHeading1 = "Heading 1";
21 | internal static string WordHeading2 = "Heading 2";
22 | internal static string WordHeading3 = "Heading 3";
23 | internal static string WordHeading4 = "Heading 4";
24 | private readonly ILogger iLogger;
25 | private readonly IUtils iUtils;
26 | private readonly IInteropWordUtils iInteropWordUtils;
27 |
28 | public ParseHelper(ILogger iLogger, IUtils iUtils)
29 | {
30 | this.iLogger = iLogger;
31 | this.iUtils = iUtils;
32 | iInteropWordUtils = new InteropWordUtils();
33 | }
34 |
35 | public ParseHelper(ILogger iLogger, IUtils iUtils, IInteropWordUtils iInteropWordUtils)
36 | {
37 | this.iLogger = iLogger;
38 | this.iUtils = iUtils;
39 | this.iInteropWordUtils = iInteropWordUtils;
40 | }
41 |
42 | /// <summary>
43 | /// Extract all paragraphs
44 | /// </summary>
45 | /// <param name="wordDocToExtract"></param>
46 | /// <returns></returns>
47 | private DocumentContent ExtractAllParagraphs(Document wordDocToExtract, Dictionary<int, string> headers, List<string> tableContent, Dictionary<int, string> listParagraphs)
48 | {
49 | try
50 | {
51 | var fullContent = string.Empty;
52 | var paragraphs = new Dictionary<int, string>();
53 | var sections = new Dictionary<int, string>();
54 |
55 | List<Clauses> clauses = new List<Clauses>();
56 | List<Clauses> headerClauses = new List<Clauses>();
57 | List<string> additionalInformation = new List<string>();
58 |
59 | foreach (Paragraph para in wordDocToExtract.Paragraphs)
60 | {
61 | var text = para.Range.Text;
62 | var cleanText = iUtils.CleanTextFromNonAsciiChar(text);
63 | var textLastSentence = para.Range.Sentences.Last.Text;
64 | var textStart = para.Range.Start;
65 | fullContent += text;
66 | var listNumber = para.Range.ListFormat.ListString;
67 |
68 | if (!string.IsNullOrEmpty(listNumber))
69 | {
70 | text = $"{listNumber.Trim()} {text}";
71 | }
72 |
73 | if (textStart > 250 && headers.ContainsKey(textStart))
74 | {
75 | if (headerClauses.Count > 0)
76 | {
77 | headerClauses.Last().End = textStart - 1;
78 | }
79 |
80 | headerClauses.Add(new Clauses
81 | {
82 | Title = headers[textStart],
83 | Content = text,
84 | Start = textStart
85 | });
86 |
87 | if (headers.Keys.Max() == textStart)
88 | {
89 | headerClauses.Last().End = para.Range.End;
90 |
91 | if (!string.IsNullOrEmpty(listNumber))
92 | {
93 | headerClauses.Last().Content = para.Range.ListFormat.List.Range.Text;
94 | }
95 | }
96 | }
97 | else if (headerClauses.Count >= 1 && textStart <= headers.Keys.Max() && tableContent.Contains(cleanText) == false)
98 | {
99 | headerClauses.Last().Content += text;
100 | }
101 |
102 | if (listParagraphs.Count > 0 && listParagraphs.ContainsKey(textStart))
103 | {
104 | if (!string.IsNullOrEmpty(cleanText))
105 | {
106 | sections.Add(textStart, text);
107 | }
108 |
109 | if (clauses.Count > 0)
110 | {
111 | clauses.Last().End = textStart - 1;
112 | }
113 |
114 | clauses.Add(new Clauses
115 | {
116 | Title = listParagraphs[textStart],
117 | Content = text,
118 | Start = textStart
119 | });
120 |
121 | if (listParagraphs.Keys.Max() == textStart)
122 | {
123 | var nextPara = para.Next();
124 |
125 | if (nextPara != null)
126 | {
127 | clauses.Last().End = nextPara.Range.End;
128 | clauses.Last().Content += nextPara?.Range?.Text ?? string.Empty;
129 | }
130 | else
131 | {
132 | clauses.Last().End = para.Range.End;
133 | }
134 | }
135 | }
136 | else if (clauses.Count >= 1 && textStart <= listParagraphs.Keys.Max() && tableContent.Contains(cleanText) == false)
137 | {
138 | if (!string.IsNullOrEmpty(cleanText))
139 | {
140 | sections.Add(textStart, text);
141 | }
142 |
143 | clauses.Last().Content += text;
144 | }
145 |
146 | paragraphs.Add(textStart, text);
147 | }
148 |
149 | if (headerClauses.Count == 0)
150 | {
151 | headerClauses.Add(new Clauses());
152 | }
153 |
154 | if (clauses.Count == 0)
155 | {
156 | clauses.Add(new Clauses());
157 | }
158 |
159 | if (sections.Count == 0)
160 | {
161 | sections.Add(-1, string.Empty);
162 | }
163 |
164 | if (headers.Count == 0)
165 | {
166 | headers.Add(-1, string.Empty);
167 | }
168 |
169 | StringBuilder rangesContent = new StringBuilder();
170 | List<Tuple<int, int>> ranges = new List<Tuple<int, int>>();
171 | List<WdStoryType> rangeTypes = new List<WdStoryType>();
172 |
173 | foreach (Range range in wordDocToExtract.StoryRanges)
174 | {
175 | Range currentRange = range;
176 |
177 | do
178 | {
179 | if (RangeStoryTypeIsHeaderOrFooter(currentRange) &&
180 | CurrentRangeHaveShapeRanges(currentRange))
181 | {
182 | foreach (Shape shape in currentRange.ShapeRange)
183 | {
184 | if (shape.TextFrame.HasText == 0)
185 | {
186 | continue;
187 | }
188 |
189 | Range shapeRange = shape.TextFrame.TextRange;
190 |
191 | rangesContent.Append(RemoveLineBreaks(shapeRange.Text));
192 | ranges.Add(new Tuple<int, int>(shapeRange.Start, shapeRange.End));
193 | rangeTypes.Add(currentRange.StoryType);
194 | }
195 | }
196 | else
197 | {
198 | rangesContent.Append(RemoveLineBreaks(currentRange.Text));
199 | ranges.Add(new Tuple<int, int>(currentRange.Start, currentRange.End));
200 | rangeTypes.Add(currentRange.StoryType);
201 | }
202 |
203 | bool hasMatch = false;
204 | MatchCollection matches = Constants.RegexExp.SessionRegEx.Matches(rangesContent.ToString());
205 |
206 | foreach (Match match in matches)
207 | {
208 | additionalInformation.Add($"{string.Join("\t", rangeTypes.Select(x => x.ToString()))}|{string.Join("\t", ranges.Select(x => $"{{{x.Item1},{x.Item2}}}"))}|{match.Index}|{match.Value}");
209 | hasMatch = true;
210 | }
211 |
212 | matches = Constants.RegexExp.AgendaItemRegEx.Matches(rangesContent.ToString());
213 |
214 | foreach (Match match in matches)
215 | {
216 | additionalInformation.Add($"{string.Join("\t", rangeTypes.Select(x => x.ToString()))}|{string.Join("\t", ranges.Select(x => $"{{{x.Item1},{x.Item2}}}"))}|{match.Index}|{match.Value}");
217 | hasMatch = true;
218 | }
219 |
220 | if (hasMatch)
221 | {
222 | rangesContent.Clear();
223 | ranges.Clear();
224 | rangeTypes.Clear();
225 | }
226 |
227 | currentRange = currentRange.NextStoryRange;
228 | } while (currentRange != null);
229 | }
230 |
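// At this point the single pass over the document body is complete:
// `paragraphs` maps each paragraph's Range.Start offset to its text,
// `headers` and `sections` map offsets to heading/section text, and
// `clauses`/`headerClauses` carry Start/End offsets with their accumulated
// content. `additionalInformation` holds the session and agenda-item regex
// matches collected from story ranges (headers, footers and text-frame
// shapes). The placeholder entries added above keep downstream consumers
// from operating on empty collections.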
231 | return new DocumentContent()
232 | {
233 | Text = fullContent,
234 | Paragraphs = paragraphs,
235 | Sections = sections,
236 | Clauses = clauses,
237 | Headers = headers,
238 | HeaderClauses = headerClauses,
239 | AdditionalInformation = additionalInformation
240 | };
241 | }
242 | catch (Exception exception)
243 | {
244 | throw new Exception("Exception in extracting data\n", exception);
245 | }
246 | }
247 |
248 | public static string RemoveLineBreaks(string text)
249 | {
250 | if (text == "\n"
251 | || text == "\r\n")
252 | {
253 | return " ";
254 | }
255 |
256 | return text
257 | .Replace("\r", string.Empty)
258 | .Replace("\n", string.Empty);
259 | }
260 |
261 | private static bool RangeStoryTypeIsHeaderOrFooter(Range range)
262 | {
263 | return (range.StoryType == WdStoryType.wdEvenPagesHeaderStory ||
264 | range.StoryType == WdStoryType.wdPrimaryHeaderStory ||
265 | range.StoryType == WdStoryType.wdEvenPagesFooterStory ||
266 | range.StoryType == WdStoryType.wdPrimaryFooterStory ||
267 | range.StoryType == WdStoryType.wdFirstPageHeaderStory ||
268 | range.StoryType == WdStoryType.wdFirstPageFooterStory);
269 | }
270 |
271 | private static bool CurrentRangeHaveShapeRanges(Range range)
272 | {
273 | return range.ShapeRange.Count > 0;
274 | }
275 |
276 | /// <summary>
277 | /// Extracts content (red flags, company name, report date) from document
278 | /// </summary>
279 | /// <param name="docFile"></param>
280 | /// <returns>document content</returns>
281 | public DocumentContent ExtractDocumentContent(string docFile, Application wordApp)
282 | {
283 | Document wordDocToExtract = null;
284 |
285 | try
286 | {
287 | DocumentContent docContent = new DocumentContent();
288 | // open the document only in read only mode - so that no edits are made on the document
289 | wordDocToExtract = iInteropWordUtils.OpenDocument(docFile, wordApp);
290 | var tableContent = ExtractTableContent(wordDocToExtract);
291 | // Extract list paragraphs, then headers, then the full paragraph set
292 | var listParagraphs = ExtractListParagraphs(wordDocToExtract);
293 | var headers = ExtractHeaders(wordDocToExtract, tableContent, listParagraphs);
294 | docContent = ExtractAllParagraphs(wordDocToExtract, headers, tableContent, listParagraphs);
295 |
296 | return docContent;
297 | }
298 | catch (Exception exception)
299 | {
300 | throw new Exception("Exception extracting content (" + nameof(ExtractDocumentContent) + ")\n", exception);
301 | }
302 | finally
303 | {
304 | // Close without saving and release resources
305 | wordDocToExtract?.Close(SaveChanges: false);
306 | }
307 | }
308 |
309 | private Dictionary<int, string> ExtractListParagraphs(Document wordDocToExtract)
310 | {
311 | var listParagraphs = new Dictionary<int, string>();
312 | foreach (List firstItem in wordDocToExtract.Lists.OfType<List>().Reverse())
313 | {
314 | if (firstItem.Range.ListFormat.ListString != null)
315 | {
316 | var totalValues = firstItem.Range.ListParagraphs.Count;
317 | bool foundNumeric = false;
318 | foreach (Paragraph item in firstItem.Range.ListParagraphs.OfType<Paragraph>().Reverse())
319 | {
320 | if (listParagraphs.ContainsKey(item.Range.Start))
321 | {
322 | break;
323 | }
324 |
325 | var isNumeric = Regex.IsMatch(item.Range.ListFormat?.ListString ?? string.Empty, Constants.RegexExp.HasNumbers);
326 | if (foundNumeric == false)
327 | {
328 | foundNumeric = isNumeric;
329 | }
330 |
331 | if (foundNumeric == true && isNumeric == false)
332 | {
333 | continue;
334 | }
335 |
336 | if (item.Range.ListFormat.ListLevelNumber == 1 && (listParagraphs.Count == 0 || listParagraphs.Keys.Max() < item.Range.Start))
337 | {
338 | listParagraphs.Add(item.Range.Start, item.Range.Sentences.First.Text);
339 | }
340 | }
341 | }
342 | }
343 |
344 | return listParagraphs;
345 | }
346 |
347 | private List<string> ExtractTableContent(Document wordDocToExtract)
348 | {
349 | var tblParaList = new List<string>();
350 | try
351 | {
352 | foreach (Table table in wordDocToExtract.Tables)
353 | {
354 | foreach (Paragraph tblPara in table.Range.Paragraphs)
355 | {
356 | var cleanText = iUtils.CleanTextFromNonAsciiChar(tblPara.Range.Text).Replace(" ", "");
357 | if (!string.IsNullOrEmpty(cleanText))
358 | {
359 | tblParaList.Add(iUtils.CleanTextFromNonAsciiChar(tblPara.Range.Text));
360 | }
361 | }
362 | }
363 | }
364 | catch (Exception)
365 | {
366 | }
367 |
368 | return tblParaList;
369 | }
370 |
371 | /// <summary>
372 | /// Extract headers
373 | /// </summary>
374 | /// <param name="wordDocToExtract"></param>
375 | /// <returns>headers keyed by paragraph start offset</returns>
376 | private Dictionary<int, string> ExtractHeaders(Document wordDocToExtract, List<string> tblParaList, Dictionary<int, string> listParagraphs)
377 | {
378 | try
379 | {
380 | var headers = new Dictionary<int, string>();
381 | foreach (Paragraph para in wordDocToExtract.Paragraphs)
382 | {
383 | try
384 | {
385 | var textStart = para.Range.Start;
386 | if (listParagraphs.Count > 0 && textStart <= listParagraphs.Keys.Max() && textStart >= listParagraphs.Keys.Min())
387 | {
388 | continue;
389 | }
390 |
391 | var paraText = para.Range.Text;
392 | var cleanTextWithoutSpecialChar = iUtils.CleanTextFromNonAsciiChar(Regex.Replace(paraText.ToLower().Trim(), Constants.RegexExp.NoSpecialCharRegex, string.Empty));
393 | if (!string.IsNullOrEmpty(cleanTextWithoutSpecialChar) && tblParaList.Contains(iUtils.CleanTextFromNonAsciiChar(paraText)) == false)
394 | {
395 | string headingStyle = null;
396 | try
397 | {
398 | headingStyle = (para.Range.get_Style() as Style).NameLocal;
399 | }
400 | catch (Exception)
401 | {
402 | headingStyle = string.Empty;
403 | }
404 |
405 | if (headingStyle.Equals(WordHeading1) || headingStyle.Equals(WordHeading2) || headingStyle.Equals(WordHeading3) || headingStyle.Equals(WordHeading4) || para.Range.Font.Bold == -1)
406 | {
407 | if (!Regex.IsMatch(paraText, Constants.RegexExp.OnlyNumericWithSpaces))
408 | {
409 | headers.Add(textStart, iUtils.CleanTextFromNonAsciiChar(Regex.Replace(paraText.Replace(".", " ").TrimStart(), Constants.RegexExp.OnlyNumericWithSpaces, string.Empty)));
410 | }
411 | }
412 | else if (para.Range.Words.First.Bold == -1 || para.Range.Font.Size > 12)
413 | {
414 | var wordCount = para.Range.Sentences.First.Words.Count;
415 | if (wordCount <= 6)
416 | {
417 | var firstWords = iUtils.CleanTextFromNonAsciiChar(Regex.Replace(para.Range.Sentences.First.Text.Replace(".", " ").TrimStart(), Constants.RegexExp.OnlyNumericWithSpaces, string.Empty));
418 | if (firstWords.Length <= 1)
419 | {
420 | firstWords = iUtils.CleanTextFromNonAsciiChar(Regex.Replace(para.Range.Sentences[2].Text.Replace(".", " ").TrimStart(), Constants.RegexExp.OnlyNumericWithSpaces, string.Empty));
421 | }
422 |
423 | headers.Add(textStart, firstWords);
424 | }
425 | else
426 | {
427 | var boldText = string.Empty;
428 | var wordCounter = wordCount <= 25 ? wordCount : 25;
429 | for (int i = 1; i <= wordCounter; i++) // scan at most 25 words for a bold prefix
430 | {
431 | if (para.Range.Sentences.First.Words[i].Bold == -1)
432 | {
433 | boldText += para.Range.Words[i].Text;
434 | }
435 | else
436 | {
437 | break;
438 | }
439 | }
440 |
441 | if (boldText != string.Empty)
442 | {
443 | headers.Add(textStart, iUtils.CleanTextFromNonAsciiChar((Regex.Replace(boldText.Replace(".", " ").TrimStart(), Constants.RegexExp.OnlyNumericWithSpaces, string.Empty))));
444 | }
445 | }
446 | }
447 | }
448 | }
449 | catch (Exception)
450 | {
451 | }
452 | }
453 |
454 | return headers;
455 | }
456 | catch (Exception exception)
457 | {
458 | throw new Exception("Exception in extracting headers (" + nameof(ExtractHeaders) + ")\n", exception);
459 | }
460 | }
461 | }
462 | }
463 |
-------------------------------------------------------------------------------- /tools/document-processor/DSnA.WebJob.DocumentParser/Code/Utils.cs: --------------------------------------------------------------------------------
1 | //Copyright(c) Microsoft Corporation.All rights reserved.
2 | //Licensed under the MIT License.
3 |
4 | using System;
5 | using System.Collections.Generic;
6 | using System.IO;
7 | using System.Text;
8 | using Microsoft.Office.Interop.Word;
9 | using Newtonsoft.Json;
10 | using System.Text.RegularExpressions;
11 | using System.Linq;
12 | using Microsoft.WindowsAzure.Storage.Blob;
13 | using Microsoft.WindowsAzure.Storage.Queue;
14 |
15 |
16 | namespace DSnA.WebJob.DocumentParser
17 | {
18 | using Microsoft.Azure;
19 | using Microsoft.WindowsAzure.Storage;
20 | using System.IO.Abstractions;
21 | using Table = Table;
22 | public interface IUtils
23 | {
24 | string CleanNonSupportedSparkChar(string dirtyString);
25 | string CleanTextFromNonAsciiChar(string dirtyString);
26 | List<string> ExtractLinksFromText(string content, bool isPreProcessingReq = false);
27 | string SerializeAndSaveJson(dynamic jsonData, string fileName);
28 | string SaveToCsvFile(List<string> csvLines, string fileName);
29 | string SaveJsonToFile(string jsonDoc, string fileName, string directory);
30 | FileMetaData ExtractFileMetadata(string fileLocation);
31 | void UploadFileToBlob(string fileLocation, CloudBlobClient blobClient);
32 | List<string> GetBlobListFromOutputContainer(CloudBlobClient blobClient);
33 | void CheckBlobUriInMsg(QueueMessage queueMsg);
34 | void DeleteInputFiles(List<string> filesToDelete);
35 | void CheckAllQueueExists(List<CloudQueue> queueList);
36 | string DownloadBlobFile(string blobUri, string locationToSave, CloudBlobClient blobClient);
37 | bool CheckUriIsValid(string inputUri);
38 | string ConvertPdfToWord(string file, string directoryToSave, Application wordApp);
39 | JsonDocumentStruct PrepareErrorJsonDoc(string fileLocation, Exception exp);
40 | Tuple<int, int> FindTableWithHeader(Document wordDocToExtract, List<string> tableHeaders);
41 | CloudBlobClient CreateCloudBlobClient(CloudStorageAccount StorageAccount);
42 | CloudQueue GetQueueReference(CloudStorageAccount StorageAccount);
43 | int GetQueueMessageDequeueCount(CloudQueueMessage queueMsg);
44 | }
45 |
46 | public class Utils : IUtils
47 | {
48 | private readonly ILogger iLogger;
49 | private readonly IFileSystem iFileSystem;
50 | private readonly IInteropWordUtils iInteropWordUtils;
51 | private static string outputContainerName = CloudConfigurationManager.GetSetting(Constants.ParserConfig.OutputContainerNameRef);
52 |
53 | public Utils(ILogger iLogger)
54 | {
55 | this.iLogger = iLogger;
56 | this.iFileSystem = new FileSystem();
57 | iInteropWordUtils = new InteropWordUtils();
58 | }
59 |
60 | public Utils(ILogger iLogger, IFileSystem iFileSystem)
61 | {
62 | this.iLogger = iLogger;
63 | this.iFileSystem = iFileSystem;
64 | iInteropWordUtils = new InteropWordUtils();
65 | }
66 |
67 | public Utils(ILogger iLogger, IFileSystem iFileSystem, IInteropWordUtils iInteropWordUtils)
68 | {
69 | this.iLogger = iLogger;
70 | this.iFileSystem = iFileSystem;
71 | this.iInteropWordUtils = iInteropWordUtils;
72 | }
73 |
74 | /// <summary>
75 | /// Clean non supported Spark filename characters
76 | /// </summary>
77 | /// <param name="dirtyString"></param>
78 | /// <returns>filename string without characters unsupported by Spark</returns>
79 | public string CleanNonSupportedSparkChar(string dirtyString)
80 | {
81 | if (string.IsNullOrEmpty(dirtyString))
82 | return dirtyString;
83 |
84 | return dirtyString.Replace("%20", "_").Replace("{", "").Replace("}", "").Replace("[", "").Replace("]", "");
85 | }
86 |
87 | /// <summary>
88 | /// Clean non ascii char from input text/string
89 | /// </summary>
90 | /// <param name="dirtyString"></param>
91 | /// <returns>text with only ascii char</returns>
92 | public string CleanTextFromNonAsciiChar(string dirtyString)
93 | {
94 | if (string.IsNullOrEmpty(dirtyString))
95 | return dirtyString;
96 |
97 | string cleanString = Regex.Replace(dirtyString, Constants.RegexExp.NoEscapeSequences, String.Empty);
98 | cleanString = Regex.Replace(cleanString, Constants.RegexExp.OnlyAsciiChar, String.Empty);
99 | cleanString = Regex.Replace(cleanString, "\u0001", String.Empty);
100 | cleanString = Regex.Replace(cleanString, "\u0015", String.Empty);
101 | cleanString = Regex.Replace(cleanString, Constants.RegexExp.OnlyWhiteSpaces, " ");
102 | return cleanString.Trim();
103 | }
104 |
105 | /// <summary>
106 | /// extract only hyperlinks from text
107 | /// if preprocessing = true -> removes all spaces and adds spaces only before App protocols
108 | /// -to distinguish hyperlinks from other strings.
109 | /// </summary>
110 | /// <param name="content"></param>
111 | /// <param name="isPreProcessingReq"></param>
112 | /// <returns>list of hyperlinks</returns>
113 | public List<string> ExtractLinksFromText(string content, bool isPreProcessingReq = false)
114 | {
115 | // do some preprocessing on the text
116 | if (isPreProcessingReq)
117 | {
118 | string stringWithSpacesRemoved = Regex.Replace(content, Constants.RegexExp.OnlyWhiteSpaces, String.Empty);
119 | content = Regex.Replace(stringWithSpacesRemoved, Constants.RegexExp.HyperlinkAppProtocols, word => String.Format(@" {0}", word.Value)); //add space before http or https or ftp so that next regex can pick the link
120 | }
121 |
122 | MatchCollection matches = Regex.Matches(content, Constants.RegexExp.OnlyHyperlinks);
123 | List<string> webLinks = matches.Cast<Match>().Select(match => match.Value).ToList();
124 | return webLinks;
125 | }
126 |
127 | /// <summary>
128 | /// Serialize json string and save it to location
129 | /// </summary>
130 | /// <param name="jsonData"></param>
131 | /// <param name="fileName"></param>
132 | /// <returns>location where json file is saved</returns>
133 | public string SerializeAndSaveJson(dynamic jsonData, string fileName)
134 | {
135 | try
136 | {
137 | if (String.IsNullOrEmpty(fileName))
138 | fileName = Constants.FileConfigs.TempFileName + "-" + DateTime.UtcNow.ToString(Constants.DateTimeFormat);
139 |
140 | string finalJson = JsonConvert.SerializeObject(jsonData, Formatting.Indented);
141 | var jsonOutputFileLocation = SaveJsonToFile(finalJson, fileName, Constants.FileConfigs.OutputDirectoryPath);
142 | return jsonOutputFileLocation;
143 | }
144 | catch (Exception exception)
145 | {
146 | throw new Exception("Exception occurred in saving JSON (SerializeAndSaveJson)\n", exception);
147 | }
148 | }
149 |
150 | public string SaveToCsvFile(List<string> csvLines, string fileName)
151 | {
152 | if (String.IsNullOrEmpty(fileName))
153 | fileName = Constants.FileConfigs.TempFileName + "-" + DateTime.UtcNow.ToString(Constants.DateTimeFormat);
154 |
155 | var directory = Constants.FileConfigs.OutputDirectoryPath;
156 | if (!iFileSystem.Directory.Exists(directory))
157 | iFileSystem.Directory.CreateDirectory(directory);
158 |
159 | string outputFileName = Path.Combine(directory, $"{fileName}.csv");
160 | using (StreamWriter sw = new StreamWriter(outputFileName))
161 | {
162 | foreach (var row in csvLines)
163 | {
164 | sw.WriteLine(row);
165 | }
166 | }
167 |
168 | return outputFileName;
169 | }
170 |
171 | /// <summary>
172 | /// Populate all metadata related to extracted file
173 | /// </summary>
174 | /// <param name="fileLocation"></param>
175 | ///
176 | /// <returns>FileMetaData</returns>
177 | public FileMetaData ExtractFileMetadata(string fileLocation)
178 | {
179 | try
180 | {
181 | FileMetaData fileData = new FileMetaData();
182 | var fileName = Path.GetFileName(fileLocation);
183 | fileData.AgreementNumber = fileName.Contains("_") ? fileName.Split('_')[0] : string.Empty;
184 | fileData.FileName = fileName;
185 | fileData.FileType = Path.GetExtension(fileLocation).Replace(".", "");
186 | fileData.ExtractionTimeStamp = DateTime.UtcNow.ToString(Constants.DateTimeFormat);
187 | return fileData;
188 | }
189 | catch (Exception exception)
190 | {
191 | throw new Exception("Error in extracting metadata of given file (ExtractFileMetadata)\n", exception);
192 | }
193 | }
194 |
195 | /// <summary>
196 | /// upload file to given blob location
197 | /// </summary>
198 | /// <param name="fileLocation"></param>
199 | /// <param name="blobClient"></param>
200 | public void UploadFileToBlob(string fileLocation, CloudBlobClient blobClient)
201 | {
202 | try
203 | {
204 | var container = blobClient.GetContainerReference(outputContainerName);
205 | ICloudBlob blob = container.GetBlockBlobReference(Path.GetFileName(fileLocation));
206 | if (blob == null)
207 | throw new Exception("Inaccessible blob location --> " + container?.Uri?.AbsolutePath + " (UploadFileToBlob)\n");
208 |
209 | blob.UploadFromFile(fileLocation);
210 | }
211 | catch (Exception exception)
212 | {
213 | throw new Exception("Error in uploading the output JSON file to blob location (" + fileLocation + ") in (UploadFileToBlob)\n", exception);
214 | }
215 | }
216 |
217 | public List<string> GetBlobListFromOutputContainer(CloudBlobClient blobClient)
218 | {
219 | try
220 | {
221 | var container = blobClient.GetContainerReference(outputContainerName);
222 | var blobList = container.ListBlobs(useFlatBlobListing: true);
223 | var outputBlobs = blobList.Select(s => s.Uri.Segments[s.Uri.Segments.Length - 1].Replace(".json", "")).ToList();
224 | return outputBlobs;
225 | }
226 | catch (Exception exception)
227 | {
228 | throw new Exception("Error while executing func GetBlobListFromOutputContainer", exception);
229 | }
230 | }
231 |
232 | /// <summary>
233 | /// higher level - check blob URI is valid
234 | /// </summary>
235 | /// <param name="queueMsg"></param>
236 | public void CheckBlobUriInMsg(QueueMessage queueMsg)
237 | {
238 | if (!CheckUriIsValid(queueMsg.FileInputUri))
239 | throw new Exception("Queue message is invalid" + "-->" + queueMsg.FileInputUri);
240 |
241 | if (!CheckUriIsValid(queueMsg.FileOutputUri))
242 | throw new Exception("Queue message is invalid" + "-->" + queueMsg.FileOutputUri);
243 | }
244 |
245 | /// <summary>
246 | /// Delete given list of files
247 | /// </summary>
248 | /// <param name="filesToDelete"></param>
249 | public void DeleteInputFiles(List<string> filesToDelete)
250 | {
251 | try
252 | {
253 | foreach (var file in filesToDelete)
254 | {
255 | if (iFileSystem.File.Exists(file))
256 | iFileSystem.File.Delete(file);
257 | }
258 | }
259 | catch (Exception exception)
260 | {
261 | throw new UnableToDeleteFileException("Unable to delete files related to reports (pdf doc or word doc or json output file)\n", exception);
output file)\n", exception); 262 | } 263 | } 264 | 265 | /// 266 | /// Check given list of Azure Queues exist 267 | /// 268 | /// 269 | public void CheckAllQueueExists(List queueList) 270 | { 271 | foreach (CloudQueue queue in queueList) 272 | { 273 | if (!queue.Exists()) 274 | throw new Exception("Message Queue is inaccessible or does not exist" + "-->" + queue.Uri + "(CheckAllQueueExists)\n"); 275 | } 276 | } 277 | 278 | /// 279 | /// Download blob file to local file system 280 | /// 281 | /// 282 | /// 283 | /// 284 | /// local file location where the file is saved 285 | public string DownloadBlobFile(string blobUri, string locationToSave, CloudBlobClient blobClient) 286 | { 287 | try 288 | { 289 | if (!iFileSystem.Directory.Exists(locationToSave)) 290 | iFileSystem.Directory.CreateDirectory(locationToSave); 291 | 292 | ICloudBlob blob = blobClient.GetBlobReferenceFromServer(new Uri(blobUri)); 293 | if (blob == null) 294 | throw new Exception("Inaccessible blob location " + blobUri + "\n"); 295 | 296 | string fileName = Path.GetFileName(new Uri(blobUri).LocalPath).Replace(" ", "_").Replace("{", "").Replace("}", "").Replace("[", "").Replace("]", ""); // to deal with spaces in filenames and invalid values for spark 297 | string localBlobLocation = Path.Combine(locationToSave, fileName); 298 | blob.DownloadToFile(localBlobLocation, FileMode.Create); 299 | return localBlobLocation; 300 | } 301 | catch (Exception exception) 302 | { 303 | throw new Exception("Exception occured in downloading file from Azure Blob Storage(DownloadBlobFile)\n", exception); 304 | } 305 | } 306 | 307 | /// 308 | /// check whether given uri is valid by recreating it 309 | /// 310 | /// 311 | /// true if valid, false otherwise 312 | public bool CheckUriIsValid(string inputUri) 313 | { 314 | try 315 | { 316 | Uri result; 317 | if (String.IsNullOrEmpty(inputUri) || String.IsNullOrWhiteSpace(inputUri)) 318 | return false; 319 | 320 | if (!Uri.TryCreate(inputUri, UriKind.Absolute, out result)) 321 | return false; 322 | 323 | if (!result.Scheme.Equals(Uri.UriSchemeHttp) && !result.Scheme.Equals(Uri.UriSchemeHttps)) 324 | return false; 325 | 326 | return true; 327 | } 328 | catch (Exception exception) 329 | { 330 | throw new Exception("Exception occured in checking blob URI(CheckUriIsValid)\n", exception); 331 | } 332 | } 333 | 334 | /// 335 | /// save json output to file in local filesystem 336 | /// 337 | /// 338 | /// 339 | /// 340 | /// saved local file location 341 | public string SaveJsonToFile(string jsonData, string fileName, string directory) 342 | { 343 | 344 | if (!iFileSystem.Directory.Exists(directory)) 345 | iFileSystem.Directory.CreateDirectory(directory); 346 | 347 | string outputFileName = Path.Combine(directory, $"{fileName}.json"); 348 | JsonTextWriter jsonTextWriter = new JsonTextWriter(iFileSystem.File.CreateText(outputFileName)); 349 | jsonTextWriter.Close(); 350 | // file is overwritten if already exists 351 | iFileSystem.File.WriteAllText(outputFileName, jsonData); 352 | return outputFileName; 353 | } 354 | 355 | /// 356 | /// Convert PDF document to MS Word document 357 | /// 358 | /// 359 | /// location of converted/saved word doc 360 | public string ConvertPdfToWord(string file, string directoryToSave, Application wordApp) 361 | { 362 | Document pdfAsWordDoc = null; 363 | try 364 | { 365 | if (!iFileSystem.Directory.Exists(directoryToSave)) 366 | iFileSystem.Directory.CreateDirectory(directoryToSave); 367 | 368 | pdfAsWordDoc = iInteropWordUtils.OpenDocument(file, wordApp); 369 | string 
/// <summary>
/// Convert a PDF document to an MS Word document
/// </summary>
/// <param name="file"></param>
/// <param name="directoryToSave"></param>
/// <param name="wordApp"></param>
/// <returns>location of the converted/saved Word doc</returns>
public string ConvertPdfToWord(string file, string directoryToSave, Application wordApp)
{
    Document pdfAsWordDoc = null;
    try
    {
        if (!iFileSystem.Directory.Exists(directoryToSave))
            iFileSystem.Directory.CreateDirectory(directoryToSave);

        pdfAsWordDoc = iInteropWordUtils.OpenDocument(file, wordApp);
        string convertedDocFileLocation = directoryToSave + "/" + Path.ChangeExtension(Path.GetFileName(file), ".doc");
        pdfAsWordDoc.SaveAs2(convertedDocFileLocation, WdSaveFormat.wdFormatDocument);
        return convertedDocFileLocation;
    }
    catch (Exception exception)
    {
        throw new Exception("Exception occurred while converting PDF document to Word document (ConvertPdfToWord)\n", exception);
    }
    finally
    {
        // Close without saving and release resources
        pdfAsWordDoc?.Close(SaveChanges: false);
    }
}

/// <summary>
/// On error, prepare a JSON document with the error details
/// </summary>
/// <param name="fileLocation"></param>
/// <param name="exp"></param>
/// <returns>JSON structure with error details</returns>
public JsonDocumentStruct PrepareErrorJsonDoc(string fileLocation, Exception exp)
{
    JsonDocumentStruct jsonDoc = new JsonDocumentStruct();
    jsonDoc.Errors = new Error();
    jsonDoc.Errors.IsError = true;
    jsonDoc.Errors.Description = exp.ToString();
    jsonDoc.FileProperties = ExtractFileMetadata(fileLocation);
    return jsonDoc;
}

/// <summary>
/// Find the table containing the provided table headers - mainly for documents containing tables
/// </summary>
/// <param name="wordDocToExtract"></param>
/// <param name="tableHeaders"></param>
/// <returns>table index and row index (to know where to start reading data from)</returns>
public Tuple<int, int> FindTableWithHeader(Document wordDocToExtract, List<string> tableHeaders)
{
    try
    {
        var tableIndex = 1;
        string cleanColHeader;
        var cellText = new StringBuilder();
        var columnHeadersAsList = new List<string>();
        foreach (Table table in wordDocToExtract.Tables)
        {
            if (table.Columns.Count == tableHeaders.Count)
            {
                // check each row: the header row can appear anywhere in the table
                for (var row = 1; row <= table.Rows.Count; row++)
                {
                    for (var col = 1; col <= table.Columns.Count; col++)
                    {
                        foreach (Paragraph para in table.Cell(row, col).Range.Paragraphs)
                            cellText.Append(para.Range.Text);

                        // looks for the exact wording of the provided headers
                        cleanColHeader = Regex.Replace(cellText.ToString(), Constants.RegexExp.NoSpecialCharRegex, "");
                        columnHeadersAsList.Add(tableHeaders.Find(x => cleanColHeader.ToLower().Equals(x.ToLower())));
                        cellText.Clear();
                    }

                    // if this table is the table we were looking for, break and return the table index
                    if (!columnHeadersAsList.Contains(null))
                        return Tuple.Create(tableIndex, row + 1);

                    columnHeadersAsList.Clear();
                }
            }

            tableIndex++;
        }

        return Tuple.Create(-1, -1);
    }
    catch (Exception exception)
    {
        throw new Exception("Exception occurred on finding table with provided headers in document (FindTableWithHeader)\n", exception);
    }
}
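
// A sketch of how FindTableWithHeader might be called (the header names are hypothetical):
//   Tuple<int, int> hit = utils.FindTableWithHeader(doc, new List<string> { "Symbol", "Title", "Date" });
//   if (hit.Item1 != -1)
//   {
//       // headers found in table hit.Item1; data rows start at row hit.Item2
//   }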
/// <summary>
/// Create a cloud blob client from the storage account
/// </summary>
/// <param name="storageAccount"></param>
/// <returns>Cloud blob client</returns>
public CloudBlobClient CreateCloudBlobClient(CloudStorageAccount storageAccount)
{
    return storageAccount?.CreateCloudBlobClient();
}

/// <summary>
/// Get a reference to the Azure queue from the storage account
/// </summary>
/// <param name="storageAccount"></param>
/// <returns>Queue reference</returns>
public CloudQueue GetQueueReference(CloudStorageAccount storageAccount)
{
    CloudQueueClient queueClient = storageAccount?.CreateCloudQueueClient();
    return queueClient?.GetQueueReference(CloudConfigurationManager.GetSetting(Constants.ParserConfig.MessageQueueRef));
}

/// <summary>
/// Get the queue message dequeue count
/// </summary>
/// <param name="queueMsg"></param>
/// <returns>message dequeue count</returns>
public int GetQueueMessageDequeueCount(CloudQueueMessage queueMsg)
{
    return queueMsg.DequeueCount;
}
    }
}
--------------------------------------------------------------------------------
/knowledge_extraction_paragraph_level.py:
--------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.


#%% Imports

import pandas as pd
import numpy as np
from nltk import word_tokenize  # requires the nltk 'punkt' and 'stopwords' data packages
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import re
import string
import spacy
# spaCy 3.x (pinned in requirements.txt) no longer supports the 'en' shortcut link;
# load the small English pipeline by its full name instead.
spacy_nlp = spacy.load('en_core_web_sm')
import gensim
from scipy import spatial
from collections import Counter
import matplotlib.pyplot as plt
import difflib


stop_words = set(stopwords.words('english'))

current_dir = './UN_Knowledge_Extraction/'
data_dir = current_dir + "data/"
output_dir = current_dir + "output/"

UN_DOCS_Paragraphs = pd.read_csv(data_dir + "UN_RES_DOCS_2009_2018.csv").fillna('').reset_index(drop=True)
w2v_google = gensim.models.KeyedVectors.load_word2vec_format(data_dir + 'GoogleNews-vectors-negative300.bin.gz', binary=True)


UNBIS_terms = pd.read_csv(data_dir + "UNBIS_terms.csv", encoding='cp1252')
UNBIS_terms = [term.lower() for term in UNBIS_terms['Term'].unique().tolist()]

SDG_Targets_Indicators = pd.read_csv(data_dir + "SDG_Targets_Indicators.csv", encoding='cp1252')
SDG = list(SDG_Targets_Indicators['SDG'].drop_duplicates())

Targets_SDG_dict = pd.Series(SDG_Targets_Indicators.loc[SDG_Targets_Indicators.Type == 'Targets'].SDG.values, index=SDG_Targets_Indicators.loc[SDG_Targets_Indicators.Type == 'Targets'].Content).to_dict()
Indicators_SDG_dict = pd.Series(SDG_Targets_Indicators.loc[SDG_Targets_Indicators.Type == 'Indicators'].SDG.values, index=SDG_Targets_Indicators.loc[SDG_Targets_Indicators.Type == 'Indicators'].Content).to_dict()

Targets = list(SDG_Targets_Indicators.loc[SDG_Targets_Indicators.Type == 'Targets']['Content'].drop_duplicates())
Indicators = list(SDG_Targets_Indicators.loc[SDG_Targets_Indicators.Type == 'Indicators']['Content'].drop_duplicates())


# collect the ten most frequent non-stopword tokens in each SDG's targets and indicators
SDG_Targets_Indicators_High_Frequency_Words = dict()
for sdg in list(SDG_Targets_Indicators.SDG.unique()):  # 'sdg' avoids clobbering the SDG list above
    target = [key for key, value in Targets_SDG_dict.items() if value == sdg]
    indicator = [key for key, value in Indicators_SDG_dict.items() if value == sdg]
    tokenizer = RegexpTokenizer(r'\w+')
    all_words = [w for w in tokenizer.tokenize(' '.join(target + indicator).lower().replace('\t', ' ')) if w not in stop_words]
    SDG_Targets_Indicators_High_Frequency_Words[sdg] = Counter(all_words).most_common(10)

for sdg in SDG_Targets_Indicators_High_Frequency_Words.keys():
    print(sdg, SDG_Targets_Indicators_High_Frequency_Words[sdg])


preambular_verb_list = open(data_dir + "preambular_verb_list.txt").read().splitlines()
operative_verb_list = open(data_dir + "operative_verb_list.txt").read().splitlines()
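
# Illustration of the paragraph classification below (the verbs are assumed to
# appear in the two lists loaded above):
#   "Recalling its resolution 70/1 ..."            -> first action verb "recalling" -> preambular
#   "Requests the Secretary-General to report ..." -> first action verb "requests"  -> operative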
UN_DOCS_Paragraphs['First_Action_Verb'] = ''
UN_DOCS_Paragraphs['Paragraph_Type'] = ''
UN_DOCS_Paragraphs['Key_Terms'] = [list() for x in range(len(UN_DOCS_Paragraphs.index))]
UN_DOCS_Paragraphs['Referenced_Resolutions'] = [list() for x in range(len(UN_DOCS_Paragraphs.index))]
UN_DOCS_Paragraphs['Referenced_Resolutions_Dates'] = [dict() for x in range(len(UN_DOCS_Paragraphs.index))]
UN_DOCS_Paragraphs['SDG'] = [list() for x in range(len(UN_DOCS_Paragraphs.index))]


for index, row in UN_DOCS_Paragraphs.iterrows():
    if index % 10000 == 0:
        print(index)
    # keep a lightly cleaned copy: the resolution-reference patterns further down
    # need the digits and punctuation that are stripped from Content below
    Content_raw = row['Content'].replace('\t', ' ')
    Content_raw = ''.join(filter(lambda x: x in string.printable, Content_raw))
    Content = Content_raw.translate(str.maketrans('', '', '(),:;?@{|}~.'))
    Content = Content.translate(str.maketrans('', '', string.digits))
    tokenized_word = word_tokenize(Content.lower())
    Content_space_separated = " " + " ".join(tokenized_word) + " "
    word_count = len(tokenized_word)


    if row['Type'] == 'Paragraph' and word_count >= 10:
        first_action_verb = ''
        try:
            first_action_verb = next(word for word in tokenized_word[:10] if word in preambular_verb_list + operative_verb_list)
        except StopIteration:
            pass
        # a paragraph starting with an upper-case letter is a new paragraph;
        # one starting lower-case continues the preceding paragraph
        if not Content[0].islower():
            UN_DOCS_Paragraphs.loc[index, 'First_Action_Verb'] = first_action_verb
        if first_action_verb in preambular_verb_list and not Content[0].islower():
            UN_DOCS_Paragraphs.loc[index, 'Paragraph_Type'] = 'preambular'
        elif first_action_verb in operative_verb_list and not Content[0].islower():
            UN_DOCS_Paragraphs.loc[index, 'Paragraph_Type'] = 'operative'
        elif Content[0].islower():
            previous_paragraph_types = list(UN_DOCS_Paragraphs.Paragraph_Type[(index - 5):(index - 1)])
            previous_paragraph_types_non_empty = [x for x in previous_paragraph_types if x != '']
            if len(previous_paragraph_types_non_empty) >= 1:
                UN_DOCS_Paragraphs.loc[index, 'Paragraph_Type'] = previous_paragraph_types_non_empty[-1]

    # match UNBIS thesaurus terms, longest first, removing each match so that
    # shorter terms nested inside a longer one are not double-counted; matching
    # on a lower-cased copy so capitalised occurrences are found too
    matching_terms = list(set([term for term in UNBIS_terms if " " + term + " " in Content_space_separated]))
    matching_terms.sort(key=len, reverse=True)
    key_terms = []
    Content_lower = Content.lower()
    for term in matching_terms:
        if term in Content_lower:
            key_terms.append(term)
            Content_lower = Content_lower.replace(term, '')
    UN_DOCS_Paragraphs.at[index, 'Key_Terms'] = key_terms
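    # e.g. if the UNBIS terms include both "human rights" and "rights" (illustrative),
    # a paragraph mentioning only "human rights" yields Key_Terms == ['human rights']:
    # the longer term is matched first and removed before "rights" is checked.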
    Referenced_Resolutions = re.findall(r'resolutions \w*-*\d+[/]*[.]*\d+\s*\(*\w*-*\w*\)* .* and all subsequent related resolutions|resolutions \w*-*\d+[/]*[.]*\d+\s*\(*\w*-*\w*\)* of [0-9]{1,2} [A-Za-z]{3,9} [0-9]{4}.* and \w*-*\d+[/]*[.]*\d+\s*\(*\w*-*\w*\)* of [0-9]{1,2} [A-Za-z]{3,9} [0-9]{4}|resolutions \w*-*\d+[/]*[.]*\d+\s*\(*\w*-*\w*\)* and \w*-*\d+[/]*[.]*\d+\s*\(*\w*-*\w*\)* of [0-9]{1,2} [A-Za-z]{3,9} [0-9]{4}|resolution \w*-*\d+[/]*[.]*\d+\s*\(*\w*-*\w*\)* of [0-9]{1,2} [A-Za-z]{3,9} [0-9]{4}|resolutions \w*-*\d+[/]*[.]*\d+\s*\(*\w*-*\w*\)*.* and \w*-*\d+[/]*\d+\s*\(*\w*-*\w*\)* of [0-9]{1,2} [A-Za-z]{3,9} [0-9]{4}|resolutions \w*-*\d+[/]*[.]*\d+\s*\(*\w*-*\w*\)*.* and \w*-*\d+[/]*\d+\s*\(*\w*-*\w*\)*|resolution \w*-*\d+[/]*[.]*\d+ \(\w*-*\w*\)|resolution \w*-*\d+[/]*[.]*\d+', Content_raw)
    Referenced_Resolutions_Dates = {}  # dict of resolution number -> date (the column holds dicts)
    month_numbers = {'January': '01', 'February': '02', 'March': '03', 'April': '04',
                     'May': '05', 'June': '06', 'July': '07', 'August': '08',
                     'September': '09', 'October': '10', 'November': '11', 'December': '12'}
    for referenced_resolution in Referenced_Resolutions:
        # normalise dates like "of 25 September 2015" to "of 25/09/2015"
        for month, number in month_numbers.items():
            referenced_resolution = re.sub(' ' + month + ' ', '/' + number + '/', referenced_resolution)
        referenced_resolution_split = re.split(',|and', referenced_resolution)
        for resolution in referenced_resolution_split:
            if re.search(r'resolution\w* (.*) of ([0-9]{1,2}/[0-9]{2}/[0-9]{4})', resolution):
                resolution_number = re.findall(r'resolution\w* (.*) of', resolution)[0]
                date = re.findall(r'of ([0-9]{1,2}/[0-9]{2}/[0-9]{4})', resolution)[0]
            elif re.search(r'\s*(.*) of ([0-9]{1,2}/[0-9]{2}/[0-9]{4})', resolution):
                resolution_number = re.findall(r'\s*(.*) of', resolution)[0]
                date = re.findall(r'of ([0-9]{1,2}/[0-9]{2}/[0-9]{4})', resolution)[0]
            elif re.search(r'resolution\w* (.*)', resolution):
                resolution_number = re.findall(r'resolution\w* (.*)', resolution)[0]
                date = 'NA'
            elif re.search(r'\w*-*\d+[/]*[.]*\d+\s*\(*\w*-*\w*\)*', resolution):
                resolution_number = re.findall(r'\w*-*\d+[/]*[.]*\d+\s*\(*\w*-*\w*\)*', resolution)[0]
                date = 'NA'
            else:
                continue  # fragment matches none of the patterns; skip rather than reuse stale values
            Referenced_Resolutions_Dates[resolution_number] = date
    UN_DOCS_Paragraphs.at[index, 'Referenced_Resolutions'] = Referenced_Resolutions
    UN_DOCS_Paragraphs.at[index, 'Referenced_Resolutions_Dates'] = Referenced_Resolutions_Dates
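    # Worked example (illustrative text): "resolution 70/1 of 25 September 2015"
    # matches the single-resolution pattern; the month substitution turns it into
    # "resolution 70/1 of 25/09/2015", which parses to {'70/1': '25/09/2015'}.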
    # crude keyword routing of each paragraph to a single SDG
    if any(x in tokenized_word for x in ['poverty', 'poor']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('No Poverty')
    elif any(x in Content.lower() for x in ['hunger', 'hungry', 'malnutrition', 'food crisis', 'sufficient food', 'food producers', 'food production', 'food reserves', 'food price', 'food insecurity', 'food security', 'undernutrition']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Zero Hunger')
    elif any(x in tokenized_word for x in ['health', 'well-being', 'mortality', 'disease']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Good Health and Well-Being')
    elif any(x in tokenized_word for x in ['education', 'educational']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Quality Education')
    elif 'gender equality' in Content.lower():  # two-word phrase: check the text, not single tokens
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Gender Equality')
    elif any(x in tokenized_word for x in ['water', 'sanitation', 'wastewater']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Clean Water and Sanitation')
    elif any(x in tokenized_word for x in ['energy', 'renewable']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Affordable and Clean Energy')
    elif any(x in tokenized_word for x in ['labour-intensive', 'employment']) or any(x in Content.lower() for x in ['child labour', 'labour rights', 'decent work', 'economic growth', 'economic productivity']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Decent Work and Economic Growth')
    elif any(x in tokenized_word for x in ['industry', 'innovation', 'infrastructure']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Industry, Innovation and Infrastructure')
    elif any(x in tokenized_word for x in ['inequalities', 'inequality']) and 'gender equality' not in Content.lower():
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Reduced Inequalities')
    elif 'sustainable cities' in Content.lower():
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Sustainable Cities and Communities')
    elif 'consumption and production' in Content.lower():
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Responsible Consumption and Production')
    elif any(x in Content.lower() for x in ['climate change', 'climate-related', 'natural disaster', 'national disaster', 'local disaster']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Climate Action')
    elif any(x in tokenized_word for x in ['marine', 'fisheries', 'coastal']) or 'oceans and seas' in Content.lower():
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Life Below Water')
    elif any(x in tokenized_word for x in ['biodiversity', 'land', 'inland', 'species']):  # 'land ' with a trailing space could never equal a token
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Life on Land')
    elif 'institutions' in tokenized_word and any(x in tokenized_word for x in ['peace', 'justice', 'strong']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Peace, Justice and Strong Institutions')
    elif any(x in tokenized_word for x in ['partner', 'partners', 'partnership', 'partnerships']):
        UN_DOCS_Paragraphs.at[index, 'SDG'].append('Partnerships for the Goals')

w2v_Targets = []
w2v_Indicators = []
Targets_isalpha = []
Indicators_isalpha = []

for i in range(len(Targets)):
    tokenized_word = word_tokenize(Targets[i].lower())
    tokenized_word = [word for word in tokenized_word if len(word) > 1]
    tokenized_word = [word for word in tokenized_word if word.isalpha()]
    Targets_isalpha.append(' '.join(tokenized_word))
    words_in_vocab = [word for word in tokenized_word if word in w2v_google.vocab]
    w2v_sum = np.sum(w2v_google[words_in_vocab], axis=0)
    #w2v_average = np.average(w2v_google[words_in_vocab], axis=0)
    w2v_Targets.append(w2v_sum)

for i in range(len(Indicators)):
    tokenized_word = word_tokenize(Indicators[i].lower())
    tokenized_word = [word for word in tokenized_word if len(word) > 1]
    tokenized_word = [word for word in tokenized_word if word.isalpha()]
    Indicators_isalpha.append(' '.join(tokenized_word))
    words_in_vocab = [word for word in tokenized_word if word in w2v_google.vocab]
    w2v_sum = np.sum(w2v_google[words_in_vocab], axis=0)
    #w2v_average = np.average(w2v_google[words_in_vocab], axis=0)
    w2v_Indicators.append(w2v_sum)
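
# Each target/indicator is embedded as the unnormalised sum of the word2vec vectors
# of its alphabetic, in-vocabulary words; the scores below use cosine similarity,
# which is scale-invariant, so the missing normalisation does not affect them.
# Sketch (assumes both words are in the GoogleNews vocabulary):
#   np.sum(w2v_google[['end', 'poverty']], axis=0)  # -> a single 300-d vector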

def Common_Substring(string1, string2):
    """Matching blocks between the two strings, longest first (the terminating
    zero-length block reported by SequenceMatcher yields a trailing '')."""
    substrings = []
    matches = difflib.SequenceMatcher(None, string1, string2).get_matching_blocks()
    for match in sorted(matches, key=lambda x: x[2], reverse=True):
        substrings.append(string1[match.a:match.a + match.size])
    return substrings


similarity_threshold_target = 0.9
similarity_threshold_indicator = 0.9

UN_DOCS_Paragraphs['Closest_Target'] = [list() for x in range(len(UN_DOCS_Paragraphs.index))]
UN_DOCS_Paragraphs['Closest_Indicator'] = [list() for x in range(len(UN_DOCS_Paragraphs.index))]
UN_DOCS_Paragraphs['Closest_Target_Similarity_Score'] = 0.0
UN_DOCS_Paragraphs['Closest_Indicator_Similarity_Score'] = 0.0

for row_index in range(len(UN_DOCS_Paragraphs)):
    if row_index % 1000 == 0:
        print(row_index)
    if UN_DOCS_Paragraphs.loc[row_index, 'Type'] == 'Paragraph':
        paragraph = UN_DOCS_Paragraphs.loc[row_index, 'Content']
        tokenized_word = word_tokenize(paragraph.lower().replace('\t', ' '))
        tokenized_word = [word for word in tokenized_word if len(word) > 1]
        tokenized_word = [word for word in tokenized_word if word.isalpha()]
        paragraph_isalpha = ' '.join(tokenized_word)

        similarity_with_target_common_substring = []
        for i in range(len(Targets_isalpha)):
            paragraph_target_common_substring = Common_Substring(paragraph_isalpha, Targets_isalpha[i])
            if len(paragraph_target_common_substring) == 0:
                similarity_with_target_common_substring.append(0.0)
            else:
                # keep the three longest common blocks and embed their words
                paragraph_target_common_substring = paragraph_target_common_substring[:3]
                paragraph_target_common_substring_aggregated = ' '.join(paragraph_target_common_substring)
                words_common_substring = paragraph_target_common_substring_aggregated.split()
                words_common_substring = [word for word in words_common_substring if word in tokenized_word]
                words_common_substring_in_vocab = [word for word in words_common_substring if word in w2v_google.vocab]
                if len(words_common_substring_in_vocab) >= 1:
                    w2v_common_substring = np.sum(w2v_google[words_common_substring_in_vocab], axis=0)
                    similarity_with_target_common_substring.append(1 - spatial.distance.cosine(w2v_Targets[i], w2v_common_substring))
                else:  # none of the words in the common substring are in the vocabulary
                    similarity_with_target_common_substring.append(len(words_common_substring) / len(Targets_isalpha[i].split()))

        similarity_with_indicator_common_substring = []
        for i in range(len(Indicators_isalpha)):
            paragraph_indicator_common_substring = Common_Substring(paragraph_isalpha, Indicators_isalpha[i])
            if len(paragraph_indicator_common_substring) == 0:
                similarity_with_indicator_common_substring.append(0.0)
            else:
                paragraph_indicator_common_substring = paragraph_indicator_common_substring[:3]
                paragraph_indicator_common_substring_aggregated = ' '.join(paragraph_indicator_common_substring)
                words_common_substring = paragraph_indicator_common_substring_aggregated.split()
                words_common_substring = [word for word in words_common_substring if word in tokenized_word]
                words_common_substring_in_vocab = [word for word in words_common_substring if word in w2v_google.vocab]
                if len(words_common_substring_in_vocab) >= 1:
                    w2v_common_substring = np.sum(w2v_google[words_common_substring_in_vocab], axis=0)
                    similarity_with_indicator_common_substring.append(1 - spatial.distance.cosine(w2v_Indicators[i], w2v_common_substring))
                else:  # none of the words in the common substring are in the vocabulary
                    similarity_with_indicator_common_substring.append(len(words_common_substring) / len(Indicators_isalpha[i].split()))

        UN_DOCS_Paragraphs.loc[row_index, 'Closest_Target_Similarity_Score'] = max(similarity_with_target_common_substring)
        UN_DOCS_Paragraphs.loc[row_index, 'Closest_Indicator_Similarity_Score'] = max(similarity_with_indicator_common_substring)

        similar_target_index = [i for i, similarity in enumerate(similarity_with_target_common_substring) if similarity >= similarity_threshold_target]
        similar_indicator_index = [i for i, similarity in enumerate(similarity_with_indicator_common_substring) if similarity >= similarity_threshold_indicator]
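        # The block below keeps only the single closest target or indicator
        # (whichever scores higher and clears its 0.9 threshold) and maps it
        # back to its SDG.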
        if ((len(similar_target_index) >= 1) and (max(similarity_with_target_common_substring) >= max(similarity_with_indicator_common_substring))):
            most_similar_target_index = similarity_with_target_common_substring.index(max(similarity_with_target_common_substring))
            most_similar_target = Targets[most_similar_target_index]
            UN_DOCS_Paragraphs.at[row_index, 'Closest_Target'].append(most_similar_target)
            if Targets_SDG_dict[most_similar_target] not in UN_DOCS_Paragraphs.at[row_index, 'SDG']:
                UN_DOCS_Paragraphs.at[row_index, 'SDG'].append(Targets_SDG_dict[most_similar_target])
        elif ((len(similar_indicator_index) >= 1) and (max(similarity_with_target_common_substring) <= max(similarity_with_indicator_common_substring))):
            most_similar_indicator_index = similarity_with_indicator_common_substring.index(max(similarity_with_indicator_common_substring))
            most_similar_indicator = Indicators[most_similar_indicator_index]
            UN_DOCS_Paragraphs.at[row_index, 'Closest_Indicator'].append(most_similar_indicator)
            if Indicators_SDG_dict[most_similar_indicator] not in UN_DOCS_Paragraphs.at[row_index, 'SDG']:
                UN_DOCS_Paragraphs.at[row_index, 'SDG'].append(Indicators_SDG_dict[most_similar_indicator])



country_list = pd.read_excel(data_dir + "country_list.xlsx").fillna('')
country_names = [country.strip().replace('&', 'and') for country in country_list['Country'].tolist()]

UN_agencies = pd.read_excel(data_dir + "agencies.xlsx").fillna('')
UN_known_orgs = pd.read_excel(data_dir + "un_entities_20191017.xlsx").fillna('')

UN_corporate_names = pd.read_excel(data_dir + "names_A60-72.xlsx").fillna('')
UN_corporate_names = [x for x in UN_corporate_names['Name'] if x not in country_names]
# drop parenthesised abbreviations, expand "UN", remove dots
UN_corporate_names = [re.sub(r"[\(].*?[\)]", "", x).replace('UN', 'United Nations').replace('.', '').strip() for x in UN_corporate_names]

additional_un_org_list = [
    'Advisory Committee on Administrative and Budgetary Questions',
    'African Union Mission in Somalia',
    'European Union Rule of Law Mission in Kosovo',
    'Special Political and Decolonization Committee (Fourth Committee)',
    'United Nations Conference on Environment and Development',
    'United Nations Entity for Gender Equality and the Empowerment of Women (UN-Women)',
    'Bretton Woods Institutions',
    'International Tribunal for the Former Yugoslavia',
    'United Nations Assistance Mission in Afghanistan',
    "United Nations Operation in Côte d'Ivoire",
    'Consultative Group on International Agricultural Research',
]

known_un_org_list = list(set(
    UN_agencies['Title'].tolist()
    + UN_known_orgs['Entity'].tolist()
    + UN_corporate_names
    + additional_un_org_list
))
known_un_org_list = [x for x in known_un_org_list if x not in country_names]


known_un_org_list = [org.translate(str.maketrans('', '', ',;:."')) for org in known_un_org_list]
#known_un_org_list = [re.sub(r'[^\x00-\x7F]+', ' ', org) for org in known_un_org_list]
known_un_org_list = [''.join([x if x in string.printable else '' for x in org]) for org in known_un_org_list]
known_un_org_list = [' '.join(w for w in org.split()) for org in known_un_org_list]
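
# Illustrative effect of the corporate-name cleaning above (the entry is made up):
#   "UN Development Programme (UNDP)" -> "United Nations Development Programme"
# (the parenthesised abbreviation is dropped before "UN" is expanded)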
known_un_org_w2v = dict()
for org in known_un_org_list:
    words_in_vocab = [word for word in word_tokenize(org.lower()) if word in w2v_google.vocab]
    if len(words_in_vocab) >= 1:
        w2v_sum = np.sum(w2v_google[words_in_vocab], axis=0)
        known_un_org_w2v[org] = w2v_sum
    else:
        known_un_org_w2v[org] = np.asarray([])


key_words_un_org_list = open(data_dir + "key_words_un_org_list.txt").read().splitlines()
key_words_not_un_org_list = open(data_dir + "key_words_not_un_org_list.txt").read().splitlines()

UN_DOCS_Paragraphs['word_cnt'] = 0
UN_DOCS_Paragraphs['Content_clean'] = ''

for index, row in UN_DOCS_Paragraphs.iterrows():
    if index % 10000 == 0:
        print(index)
    Content = row['Content'].replace('\t', ' ')
    Content = Content.replace(',', ', ')
    Content = Content.replace(';', '; ')
    Content = Content.replace('.', '. ')
    Content = re.sub(r'[0-9]{1,2}\.', ' ', Content)  # escaped dot: drop paragraph numbering such as "12." (unescaped, the '.' also swallowed the character after any digit run)
    Content = ''.join([x if x in string.printable else '' for x in Content])
    Content = ' '.join(w for w in Content.split() if not any(x.isdigit() for x in w))
    word_cnt = len(Content.split())
    UN_DOCS_Paragraphs.at[index, 'word_cnt'] = word_cnt
    UN_DOCS_Paragraphs.at[index, 'Content_clean'] = Content

UN_DOCS_Paragraphs = UN_DOCS_Paragraphs.sort_values(by=['SourceFile', 'Index'])


UN_DOCS_Resolutions_Content = UN_DOCS_Paragraphs.loc[(UN_DOCS_Paragraphs.Type == 'Paragraph')].groupby(['SourceFile'])['Content'].apply(' '.join).reset_index()
UN_DOCS_Resolutions_Content_clean = UN_DOCS_Paragraphs.loc[(UN_DOCS_Paragraphs.Type == 'Paragraph')].groupby(['SourceFile'])['Content_clean'].apply(' '.join).reset_index()
UN_DOCS_Resolutions = pd.merge(UN_DOCS_Resolutions_Content, UN_DOCS_Resolutions_Content_clean, on='SourceFile')

UN_DOCS_Resolutions['Organization_Names_known'] = [list() for x in range(len(UN_DOCS_Resolutions.index))]
UN_DOCS_Resolutions['Organization_Names_not_from_known_original'] = [list() for x in range(len(UN_DOCS_Resolutions.index))]
UN_DOCS_Resolutions['Organization_Names_not_from_known_inferred'] = [list() for x in range(len(UN_DOCS_Resolutions.index))]


def passes_org_name_filters(org):
    """Shared filters for spaCy ORG spans (a behaviour-preserving refactor of the
    two duplicated condition blocks in the original loop)."""
    return (
        len(org.split()) > 1
        and org not in known_un_org_list
        and org.lower().split()[-1] not in stop_words
        and (not any(key_word.lower() in org.lower() for key_word in key_words_not_un_org_list)
             or any(key_word.lower() in org.lower() for key_word in key_words_un_org_list))
        and not any(org.lower() in known_org.lower() for known_org in known_un_org_list)
        #and not any(known_org.lower() in org.lower() for known_org in known_un_org_list)
        and org.lower().split()[0] not in [word for word in operative_verb_list if word.endswith('s')]
        and not any(word in preambular_verb_list for word in org.lower().split())
    )


for index, row in UN_DOCS_Resolutions.iterrows():
    if index % 100 == 0:
        print(index)
    Content_clean = row['Content_clean']
    known_orgs = [known_org for known_org in known_un_org_list if known_org in Content_clean]
    UN_DOCS_Resolutions.at[index, 'Organization_Names_known'] = known_orgs

    extracted_orgs = list(set([str(element) for element in spacy_nlp(Content_clean).ents if element.label_ == 'ORG']))
    extracted_orgs = [org for org in extracted_orgs if all(char not in org for char in ['_', '/', '.'])]
    for i in range(len(extracted_orgs)):
        extracted_org = extracted_orgs[i].translate(str.maketrans('', '', string.digits))
        extracted_org = extracted_org.translate(str.maketrans('', '', ',;:.()'))
        if extracted_org.lower().startswith('the '):
            extracted_org = extracted_org[4:]
        extracted_orgs[i] = extracted_org  # keep the cleaned span in every case (the original only kept it when it began with "the ")
    extracted_orgs = list(set(extracted_orgs))
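    # Filtering sketch (the spans are illustrative): "Calls upon States" would fail
    # the filters, assuming "calls" is in the operative verb list; a multi-word span
    # that is not contained in any known-org name would pass them.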
    Organization_Names_not_from_known_original = []
    for org in extracted_orgs:
        if not passes_org_name_filters(org):
            continue
        if ' of the ' not in org:
            Organization_Names_not_from_known_original.append(org)
        else:
            # split "X of the Y" and keep whichever side is not already known
            org_split = org.split(' of the ')
            if (org_split[0] not in known_un_org_list) and (len(org_split[0].split()) > 1) and (org_split[1] in known_un_org_list):
                Organization_Names_not_from_known_original.append(org_split[0])
            elif (org_split[0] in known_un_org_list) and (org_split[1] not in known_un_org_list) and (len(org_split[1].split()) > 1):
                Organization_Names_not_from_known_original.append(org_split[1])
            elif (org_split[0] not in known_un_org_list) and (org_split[1] not in known_un_org_list):
                Organization_Names_not_from_known_original.append(org)

    UN_DOCS_Resolutions.at[index, 'Organization_Names_not_from_known_original'] = Organization_Names_not_from_known_original


    if (len(known_orgs) > 0):
        for org in UN_DOCS_Resolutions.at[index, 'Organization_Names_not_from_known_original']:
            tokenized_word = word_tokenize(org)
            tokenized_word_lower = word_tokenize(org.lower())
            words_in_vocab_lower = [word for word in tokenized_word_lower if word in w2v_google.vocab]
            if (len(words_in_vocab_lower) >= 1):
                org_w2v = np.sum(w2v_google[words_in_vocab_lower], axis=0)
            else:
                org_w2v = np.asarray([])
            common_words_length = []
            w2v_similarity = []
            for known_org in known_orgs:
                known_org_tokenized_word = word_tokenize(known_org)
                known_org_tokenized_word_lower = word_tokenize(known_org.lower())
                known_org_words_in_vocab_lower = [word for word in known_org_tokenized_word_lower if word in w2v_google.vocab]
                if len(known_org_words_in_vocab_lower) >= 1:
                    known_org_w2v = np.sum(w2v_google[known_org_words_in_vocab_lower], axis=0)
                else:
                    known_org_w2v = np.asarray([])

                common_words = [word for word in tokenized_word if (word in known_org_tokenized_word and word[0].isupper())]
                common_words_length.append(len(common_words))
                if ((len(org_w2v) == 0) or (len(known_org_w2v) == 0)):
                    w2v_similarity.append(0)
                else:
                    w2v_similarity.append(1 - spatial.distance.cosine(org_w2v, known_org_w2v))
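            # Below, each new name is linked to the known org sharing the most
            # capitalised words with it; ties are broken by word2vec cosine similarity.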
            if (max(common_words_length) == 0):
                # no shared capitalised words: keep the span paired with itself (no score)
                UN_DOCS_Resolutions.at[index, 'Organization_Names_not_from_known_inferred'].append((org, org))
            else:
                if (len([l for l in common_words_length if l == max(common_words_length)]) == 1):
                    known_org_index = common_words_length.index(max(common_words_length))
                    similarity_score = w2v_similarity[known_org_index]
                    known_org = known_orgs[known_org_index]
                    UN_DOCS_Resolutions.at[index, 'Organization_Names_not_from_known_inferred'].append((org, known_org, similarity_score))
                else:
                    known_org_index = [i for i, x in enumerate(common_words_length) if x == max(common_words_length)]
                    max_similarity_score = max([w2v_similarity[i] for i in known_org_index])
                    max_similarity_known_org_index = w2v_similarity.index(max_similarity_score)
                    known_org = known_orgs[max_similarity_known_org_index]
                    UN_DOCS_Resolutions.at[index, 'Organization_Names_not_from_known_inferred'].append((org, known_org, max_similarity_score))


Organization_Names_not_from_known = UN_DOCS_Resolutions['Organization_Names_not_from_known_original'].tolist()
Organization_Names_not_from_known = [x for sublist in Organization_Names_not_from_known for x in sublist]
Organization_Names_not_from_known_cnt = Counter(Organization_Names_not_from_known)
Organization_Names_not_from_known_cnt = pd.DataFrame.from_dict(Organization_Names_not_from_known_cnt, orient='index').reset_index()
Organization_Names_not_from_known_cnt = Organization_Names_not_from_known_cnt.rename(columns={'index': 'org_names', 0: 'count'}).sort_values(by='count', ascending=False).reset_index(drop=True)


UN_DOCS_Paragraphs['Country'] = [list() for x in range(len(UN_DOCS_Paragraphs.index))]
UN_DOCS_Paragraphs['Organization_Names_known'] = [list() for x in range(len(UN_DOCS_Paragraphs.index))]
UN_DOCS_Paragraphs['Organization_Names_not_from_known_original'] = [list() for x in range(len(UN_DOCS_Paragraphs.index))]
UN_DOCS_Paragraphs['Organization_Names_not_from_known_inferred'] = [list() for x in range(len(UN_DOCS_Paragraphs.index))]

# push the resolution-level organisation names back down to the paragraphs that mention them
for index, row in UN_DOCS_Paragraphs.iterrows():
    if index % 10000 == 0:
        print(index)
    SourceFile = row['SourceFile']
    Content_clean = row['Content_clean']
    Organization_Names_known_Resolution = UN_DOCS_Resolutions.loc[UN_DOCS_Resolutions['SourceFile'] == SourceFile, 'Organization_Names_known'].tolist()[0]
    Organization_Names_not_from_known_original_Resolution = UN_DOCS_Resolutions.loc[UN_DOCS_Resolutions['SourceFile'] == SourceFile]['Organization_Names_not_from_known_original'].tolist()[0]
    Organization_Names_not_from_known_inferred_Resolution = UN_DOCS_Resolutions.loc[UN_DOCS_Resolutions['SourceFile'] == SourceFile]['Organization_Names_not_from_known_inferred'].tolist()[0]
    Country = [country for country in country_names if country.lower() in Content_clean.lower()]
    Organization_Names_known = []
    for org in Organization_Names_known_Resolution:
        if org.lower() in Content_clean.lower():
            Organization_Names_known.append(org)
    Organization_Names_not_from_known_original = []
    for org in Organization_Names_not_from_known_original_Resolution:
        if org in Content_clean:
            Organization_Names_not_from_known_original.append(org)
    Organization_Names_not_from_known_inferred = []
    if len(Organization_Names_not_from_known_inferred_Resolution) >= 1:
        for org in Organization_Names_not_from_known_inferred_Resolution:
            if org[0] in Content_clean:
                Organization_Names_not_from_known_inferred.append(org)
    UN_DOCS_Paragraphs.at[index, 'Country'] = Country
    UN_DOCS_Paragraphs.at[index, 'Organization_Names_known'] = Organization_Names_known
    UN_DOCS_Paragraphs.at[index, 'Organization_Names_not_from_known_original'] = Organization_Names_not_from_known_original
    UN_DOCS_Paragraphs.at[index, 'Organization_Names_not_from_known_inferred'] = Organization_Names_not_from_known_inferred

UN_DOCS_Paragraphs = UN_DOCS_Paragraphs.drop(columns=['word_cnt', 'Content_clean'])
UN_DOCS_Paragraphs.to_excel(output_dir + 'output_UN_DOCS_paragraph_level.xlsx')

--------------------------------------------------------------------------------