├── module
    ├── modulePDF.sym
    ├── Makefile.am
    ├── modulePDF.h
    └── modulePDF.c
├── tools
    ├── Makefile.am
    ├── armadito-pdf
    │   ├── Makefile.am
    │   └── main.c
    ├── cli_analyzer
    │   ├── Makefile
    │   ├── scandir.bat
    │   ├── scandir.sh
    │   └── main.c
    ├── perl_poc
    │   └── lib
    │   │   ├── conf
    │   │       └── Config.pm
    │   │   ├── analysis
    │   │       ├── CVEs.pm
    │   │       ├── ObjectAnalysis.pm
    │   │       └── DocumentStruct.pm
    │   │   └── utils
    │   │       └── CleanRewriting.pm
    └── cli_parser
    │   └── parser.c
├── autogen.sh
├── Makefile.am
├── .gitignore
├── sonar-project.properties
├── lib
    ├── libarmadito-pdf.pc.in
    ├── Makefile.am
    ├── TODO
    ├── src
    │   ├── log.c
    │   ├── osdeps.c
    │   ├── armaditopdf.c
    │   ├── pdfStructs.c
    │   ├── utils.c
    │   └── pdfStructAnalysis.c
    ├── includes
    │   ├── pdfParsing.h
    │   ├── pdfAnalysis.h
    │   ├── osdeps.h
    │   ├── log.h
    │   ├── utils.h
    │   ├── armaditopdf.h
    │   ├── filters.h
    │   └── pdfStructs.h
    └── Spec.txt
├── win32
    └── ArmaditoPDF
    │   ├── ArmaditoPDF.sln
    │   └── ArmaditoPDF
    │       ├── ArmaditoPDF.vcxproj.filters
    │       └── ArmaditoPDF.vcxproj
├── README.md
├── CHANGES
├── configure.ac
└── .travis.yml


/module/modulePDF.sym:
--------------------------------------------------------------------------------
1 | module
2 | 


--------------------------------------------------------------------------------
/tools/Makefile.am:
--------------------------------------------------------------------------------
1 | SUBDIRS=armadito-pdf


--------------------------------------------------------------------------------
/autogen.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -x
3 | aclocal --force
4 | libtoolize --force --automake --copy 
5 | #autoheader --force
6 | automake --foreign --add-missing --force-missing --copy
7 | autoconf --force
8 | 
9 | 


--------------------------------------------------------------------------------
/Makefile.am:
--------------------------------------------------------------------------------
 1 | if COND_LIBRARY
 2 | LIB_DIR = lib
 3 | endif
 4 | 
 5 | if COND_MODULE
 6 | MOD_DIR = module
 7 | endif
 8 | 
 9 | if COND_TOOLS
10 | TOOLS_DIR = tools
11 | endif
12 | 
13 | SUBDIRS = $(LIB_DIR) $(TOOLS_DIR) $(MOD_DIR)


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | Makefile
 2 | Makefile.in
 3 | autom4te.cache/*
 4 | compile
 5 | config.log
 6 | config.status
 7 | configure
 8 | aclocal.m4
 9 | config.guess
10 | config.sub
11 | depcomp
12 | install-sh
13 | ltmain.sh
14 | missing
15 | *~
16 | version.m4
17 | *.lo
18 | *.o
19 | 


--------------------------------------------------------------------------------
/sonar-project.properties:
--------------------------------------------------------------------------------
1 | sonar.projectKey=armadito:mod-pdf:DEV
2 | sonar.projectName=armadito-mod-pdf
3 | sonar.projectVersion=1.0
4 | sonar.sources=.
5 | sonar.branch=DEV
6 | sonar.exclusions=armadito-av/**/*,cov-int/**/*
7 | sonar.cfamily.build-wrapper-output=build-wrapper-out
8 | 


--------------------------------------------------------------------------------
/tools/armadito-pdf/Makefile.am:
--------------------------------------------------------------------------------
 1 | AUTOMAKE_OPTIONS=subdir-objects no-dependencies
 2 | 
 3 | AM_CPPFLAGS=
 4 | armadito_pdf_LDADD= @LIBARMADITO_PDF_LIBS@
 5 | armadito_pdf_CFLAGS= @LIBARMADITO_PDF_CFLAGS@
 6 | #armadito_pdf_CFLAGS= -I$(top_srcdir)/lib/includes
 7 | 
 8 | 
 9 | 
10 | bin_PROGRAMS = armadito-pdf
11 | armadito_pdf_SOURCES= main.c


--------------------------------------------------------------------------------
/lib/libarmadito-pdf.pc.in:
--------------------------------------------------------------------------------
 1 | prefix=@prefix@
 2 | exec_prefix=@exec_prefix@
 3 | includedir=@includedir@
 4 | libdir=@libdir@
 5 | 
 6 | Name: libarmadito-pdf
 7 | Description: Armadito PDF library
 8 | URL: https://github.com/armadito/armadito-mod-pdf
 9 | Version: @PACKAGE_VERSION@
10 | Cflags: -I${includedir}
11 | Libs: -L${libdir} -larmadito-pdf


--------------------------------------------------------------------------------
/tools/cli_analyzer/Makefile:
--------------------------------------------------------------------------------
 1 | # Stand-alone build of the a6oPDFAnalyzer command-line tool (no autotools).
 2 | # The library sources are compiled directly from ../../lib.
 3 | 
 4 | # None of these targets produces a file named after the target.
 5 | .PHONY: all lib clean
 6 | 
 7 | all:
 8 | 	gcc -g -o a6oPDFAnalyzer -Wall -Wextra main.c ../../lib/src/*.c -I../../lib/includes
 9 | 
10 | # Build the library sources as a shared object (same source paths as "all").
11 | lib:
12 | 	gcc -fPIC -g -c -Wall -Wextra ../../lib/src/*.c -I../../lib/includes
13 | 	gcc -shared -Wl,-soname,a6oPDFAnalyzer.so.1 -o a6oPDFAnalyzer-1.0.1.so *.o -lc
14 | 
15 | # -f keeps "clean" from failing when there is nothing to remove.
16 | clean:
17 | 	rm -f a6oPDFAnalyzer a6oPDFAnalyzer-1.0.1.so
18 | 	rm -f *.o
19 | 


--------------------------------------------------------------------------------
/tools/cli_analyzer/scandir.bat:
--------------------------------------------------------------------------------
 1 | @echo off
 2 | REM Run the analyzer on every file found recursively under the given directory.
 3 | set ANALYZER="a6oPDFAnalyzer.exe"
 4 | set "DIRPATH=%~1"
 5 | REM %~1 strips surrounding quotes, so the test also works for quoted paths.
 6 | if "%~1"=="" goto :help
 7 | REM File names are echoed to the console; analyzer output is appended to result.txt.
 8 | REM for /R in %DIRPATH%\ %%A do echo "entry = %%A"
 9 | REM - FOR /R [[drive:]path] %%parameter IN (set) DO command
10 | FOR /R "%DIRPATH%" %%E IN (*) DO echo %%E && %ANALYZER% "%%E" >> result.txt
11 | 	
12 | goto :end
13 | 
14 | :help
15 | echo syntax: scandir.bat [directoryPath]
16 | 
17 | :end
18 | echo - Exiting...


--------------------------------------------------------------------------------
/module/Makefile.am:
--------------------------------------------------------------------------------
 1 | AUTOMAKE_OPTIONS=subdir-objects
 2 | 
 3 | modulesdir=$(libdir)/armadito/modules
 4 | modules_LTLIBRARIES=modulePDF.la
 5 | 
 6 | 
 7 | modulePDF_la_SOURCES= \
 8 | modulePDF.c \
 9 | modulePDF.h
10 | 
11 | 
12 | AM_CPPFLAGS= 
13 | modulePDF_la_LDFLAGS= -module -avoid-version -export-symbols "$(srcdir)/modulePDF.sym"
14 | modulePDF_la_CFLAGS= -I$(top_srcdir)/lib/includes
15 | modulePDF_la_LIBADD=../lib/libarmadito-pdf.la
16 | modulePDF_la_CFLAGS+= @LIBARMADITO_CFLAGS@ 
17 | modulePDF_la_LIBADD+= @LIBARMADITO_LIBS@
18 | 
19 | 
20 | install-exec-hook:
21 | 	-rm -f "$(DESTDIR)$(modulesdir)"/modulePDF.la "$(DESTDIR)$(modulesdir)"/modulePDF.a
22 | 
23 | install-data-hook:
24 | 	-rm -f "$(DESTDIR)$(modulesdir)"/modulePDF.la "$(DESTDIR)$(modulesdir)"/modulePDF.a
25 | 
26 | EXTRA_DIST=modulePDF.sym


--------------------------------------------------------------------------------
/tools/cli_analyzer/scandir.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # Run the analyzer on every entry of the directory given as first parameter
 4 | # and append its output to the result file given as second parameter.
 5 | # Per-coefficient statistics are then written to stats.txt.
 6 | 
 7 | # VARIABLES
 8 | DIR=$1
 9 | RES_FILE=$2
10 | EXE=./a6oPDFAnalyzer
11 | 
12 | # Both parameters are mandatory. Exit statuses stay in the 0-255 range that
13 | # POSIX sh supports: 1 = directory missing, 2 = result file missing.
14 | if [ -z "$DIR" ]
15 |   then
16 |     echo "Missing parameters"
17 |     echo "SYNTAX :: scandir.sh [directory] [result_file]"
18 |     exit 1
19 | fi
20 | 
21 | if [ -z "$RES_FILE" ]
22 |   then
23 |     echo "Missing parameters"
24 |     echo "SYNTAX :: scandir.sh [directory] [result_file]"
25 |     exit 2
26 | fi
27 | 
28 | # Every expansion is quoted so that paths containing spaces are handled.
29 | for f in "$DIR"/* ; do
30 | 	echo "Processing $f ..."
31 | 	"$EXE" "$f" >> "$RES_FILE"
32 | 	#mv "$f" "$DIR/Treated/"
33 | done
34 | 
35 | 
36 | # Stats coef
37 | grep -e 'Coef =' "$RES_FILE" | sort | uniq -c > stats.txt
38 | 
39 | 
40 | exit 0
41 | 


--------------------------------------------------------------------------------
/module/modulePDF.h:
--------------------------------------------------------------------------------
 1 | /***
 2 | 
 3 | Copyright (C) 2015, 2016 Teclib'
 4 | 
 5 | This file is part of Armadito module PDF.
 6 | 
 7 | Armadito module PDF is free software: you can redistribute it and/or modify
 8 | it under the terms of the GNU General Public License as published by
 9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 | 
12 | Armadito module PDF is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | GNU General Public License for more details.
16 | 
17 | You should have received a copy of the GNU General Public License
18 | along with Armadito module PDF.  If not, see <http://www.gnu.org/licenses/>.
19 | 
20 | ***/
21 | 
22 | #include <libarmadito/armadito.h>
23 | #include <string.h>
24 | 
25 | #include <armaditopdf.h>
26 | #include <osdeps.h>
27 | 
28 | #define MALICIOUS_COEF 70


--------------------------------------------------------------------------------
/win32/ArmaditoPDF/ArmaditoPDF.sln:
--------------------------------------------------------------------------------
 1 | 
 2 | Microsoft Visual Studio Solution File, Format Version 12.00
 3 | # Visual Studio 2013
 4 | VisualStudioVersion = 12.0.31101.0
 5 | MinimumVisualStudioVersion = 10.0.40219.1
 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ArmaditoPDF", "ArmaditoPDF\ArmaditoPDF.vcxproj", "{667A295C-61CD-47A7-AAFC-5B7F6088CDB5}"
 7 | EndProject
 8 | Global
 9 | 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | 		Debug|Win32 = Debug|Win32
11 | 		Release|Win32 = Release|Win32
12 | 	EndGlobalSection
13 | 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
14 | 		{667A295C-61CD-47A7-AAFC-5B7F6088CDB5}.Debug|Win32.ActiveCfg = Debug|Win32
15 | 		{667A295C-61CD-47A7-AAFC-5B7F6088CDB5}.Debug|Win32.Build.0 = Debug|Win32
16 | 		{667A295C-61CD-47A7-AAFC-5B7F6088CDB5}.Release|Win32.ActiveCfg = Release|Win32
17 | 		{667A295C-61CD-47A7-AAFC-5B7F6088CDB5}.Release|Win32.Build.0 = Release|Win32
18 | 	EndGlobalSection
19 | 	GlobalSection(SolutionProperties) = preSolution
20 | 		HideSolutionNode = FALSE
21 | 	EndGlobalSection
22 | EndGlobal
23 | 


--------------------------------------------------------------------------------
/tools/perl_poc/lib/conf/Config.pm:
--------------------------------------------------------------------------------
 1 | package Config;
 2 | # NOTE(review): Perl ships a core module also named Config - confirm callers load this one.
 3 | use strict;
 4 | 
 5 | # GLOBAL CONFIG
 6 | our $DEBUG = "no";
 7 | our $ANALYSIS_TIMEOUT = 5;
 8 | our $MAX_REP_DETECTION = 150;
 9 | 
10 | 
11 | # PDF STRUCT TESTS coefs
12 | our $ENCRYPTED_PDF = "ENCRYPTED_PDF";
13 | our $EMPTY_PAGES_WITH_ACTIVE_CONTENT = 99;
14 | our $EMPTY_PAGES_CONTENT = 70;
15 | our $OBJECT_COLLISION_PLUS_BAD_XREF = 90;
16 | our $OBJECT_COLLISION = 10;
17 | our $BAD_XREF_OFFSET = 30;
18 | our $TRAILER_NOT_FOUND = 30;
19 | our $BAD_TRAILER = 40;
20 | our $OBFUSCATED_OBJECTS = 40;
21 | our $MALICIOUS_URI = 50;
22 | our $MULTIPLE_HEADERS = 50;
23 | 
24 | # NOTE(review): $ENCRYPTED_PDF above is a string while every other entry in that
25 | # section is numeric - presumably a status label rather than a coefficient; confirm.
26 | # OBJECT ANALYSIS TESTS coefs
27 | our $ACTIVE_CONTENT = 40;
28 | our $SHELLCODE = 40;
29 | our $PATTERN_REPETITION = 40;
30 | our $DANGEROUS_PATTERN_HIGH = 90;
31 | our $DANGEROUS_PATTERN_MEDIUM = 40;
32 | our $DANGEROUS_PATTERN_LOW = 20;
33 | our $TIME_EXCEEDED = 20;
34 | 
35 | 
36 | # CVEs TESTS
37 | our $CVE_2010_2883_DETECTED = 50;
38 | our $CVE_2010_2883_BAD_FONT_FILE_LENGTH = 40;
39 | 
40 | # Presumably the total score from which a document is reported as malware - TODO confirm.
41 | our $MALWARE_DETECTION_COEF = 70;
42 | 
43 | # A Perl module file must end with a true value.
44 | 	
45 | 1;;
46 | 


--------------------------------------------------------------------------------
/lib/Makefile.am:
--------------------------------------------------------------------------------
 1 | AUTOMAKE_OPTIONS=subdir-objects no-dependencies
 2 | 
 3 | lib_LTLIBRARIES = libarmadito-pdf.la
 4 | libarmadito_pdf_la_LDFLAGS = -version-number 0:12:6
 5 | 
 6 | AM_CPPFLAGS= -I$(top_srcdir)/lib/includes
 7 | 
 8 | libarmadito_pdf_la_SOURCES = \
 9 | $(top_srcdir)/lib/src/armaditopdf.c \
10 | $(top_srcdir)/lib/src/filters.c \
11 | $(top_srcdir)/lib/src/log.c \
12 | $(top_srcdir)/lib/src/osdeps.c \
13 | $(top_srcdir)/lib/src/pdfObjectsAnalysis.c \
14 | $(top_srcdir)/lib/src/pdfParsing.c \
15 | $(top_srcdir)/lib/src/pdfStructAnalysis.c \
16 | $(top_srcdir)/lib/src/pdfStructs.c \
17 | $(top_srcdir)/lib/src/utils.c
18 | 
19 | armadito_pdfincludedir=$(includedir)/libarmadito-pdf
20 | 
21 | armadito_pdfinclude_HEADERS =\
22 | $(top_srcdir)/lib/includes/armaditopdf.h \
23 | $(top_srcdir)/lib/includes/filters.h \
24 | $(top_srcdir)/lib/includes/log.h \
25 | $(top_srcdir)/lib/includes/miniz.c \
26 | $(top_srcdir)/lib/includes/osdeps.h \
27 | $(top_srcdir)/lib/includes/pdfAnalysis.h \
28 | $(top_srcdir)/lib/includes/pdfParsing.h \
29 | $(top_srcdir)/lib/includes/pdfStructs.h \
30 | $(top_srcdir)/lib/includes/utils.h
31 | 
32 | 
33 | pkgconfigdir = $(libdir)/pkgconfig
34 | pkgconfig_DATA = libarmadito-pdf.pc
35 | 
36 | # The .pc file is produced by the make rule below, so it has to be listed
37 | # explicitly to be removed by "make clean", and its template has to be listed
38 | # explicitly to be shipped by "make dist".
39 | # NOTE(review): assumes configure.ac does not also generate it - confirm.
40 | CLEANFILES = libarmadito-pdf.pc
41 | EXTRA_DIST = libarmadito-pdf.pc.in
42 | 
43 | # Substitute the installation directories and package version into the template.
44 | libarmadito-pdf.pc: libarmadito-pdf.pc.in
45 | 		sed -e 's![@]prefix[@]!$(prefix)!g' \
46 | 		    -e 's![@]exec_prefix[@]!$(exec_prefix)!g' \
47 | 		    -e 's![@]includedir[@]!$(includedir)!g' \
48 | 		    -e 's![@]libdir[@]!$(libdir)!g' \
49 | 		    -e 's![@]PACKAGE_VERSION[@]!$(PACKAGE_VERSION)!g' \
50 | $(top_srcdir)/lib/libarmadito-pdf.pc.in > $@


--------------------------------------------------------------------------------
/lib/TODO:
--------------------------------------------------------------------------------
 1 | 
 2 | ---------
 3 | TODO LIST
 4 | ---------
 5 | 
 6 | // TODO :: checkMagicNumber :: search the header in the 1024 first bytes.
 7 | // TODO :: checkMagicNumber :: Handle XDP files.
 8 | // TODO :: printAnalysisReport :: filter report information by log level.
 9 | // TODO :: getPDFContent :: set max_size limit.
10 | // TODO :: removeComments :: split this function (implement function get_line, etc.)
11 | // TODO :: check offset :: once comments have been removed from the document, the offsets will be incorrect.
12 | // TODO :: getObjectInfo :: fill obj->dico_len;
13 | // TODO :: replaceString :: replace all occurrences.
14 | // TODO :: Notation :: empty_doc_with_active_content (if no error).
15 | // TODO :: getActions :: get other potentially dangerous actions (OpenActions - GoToE - GoToR - etc.)
16 | // TODO :: decodeObjectStream :: check if the stream is encrypted. (/Encrypt in the dico)
17 | // TODO :: pdfParsing :: continue if the parsing failure is due to bad stream decode.
18 | // TODO :: getJSContentInXFA :: Check the keyword javascript
19 | // TODO :: decodeObjectStream :: do not try to decode an object twice.
20 | // TODO :: getEmbeddedFile :: Handle the case <</EF <</F 3 0 R>> >>
21 | // TODO :: all :: set error codes.
22 | // TODO :: all :: use obj->dico_len instead of strlen(dico).
23 | // TODO :: documentStructureAnalysis :: check trailers.
24 | // TODO :: TOFIX :: filters implementations.
25 | // TODO :: TOFIX :: removeComments() function implementation.
26 | // TODO :: FlateDecode :: check that the stream is well-formed (e.g. '\r')
27 | // TODO :: all :: declare a public API. (for version 1.0.0)
28 | // TODO :: all :: api documentation.


--------------------------------------------------------------------------------
/lib/src/log.c:
--------------------------------------------------------------------------------
 1 | /***
 2 | 
 3 | Copyright (C) 2015, 2016 Teclib'
 4 | 
 5 | This file is part of Armadito module PDF.
 6 | 
 7 | Armadito module PDF is free software: you can redistribute it and/or modify
 8 | it under the terms of the GNU General Public License as published by
 9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 | 
12 | Armadito module PDF is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | GNU General Public License for more details.
16 | 
17 | You should have received a copy of the GNU General Public License
18 | along with Armadito module PDF.  If not, see <http://www.gnu.org/licenses/>.
19 | 
20 | ***/
21 | 
22 | #include "log.h"
23 | 
24 | 
25 | static enum log_level current_max_level = default_max_level;
26 | 
27 | 
28 | /* Set the threshold used by cli_log(): messages with a greater level are dropped. */
29 | void set_current_log_level(enum log_level level)
30 | {
31 | 	/* Only the file-local threshold is updated; nothing is printed. */
32 | 	current_max_level = level;
33 | }
34 | 
35 | /* Map a log level to the tag printed in front of a message ("" if unknown). */
36 | char * lvl_tostring(enum log_level level)
37 | {
38 | 	char * tag = "";
39 | 
40 | 	if (level == LOG_LEVEL_ERROR)
41 | 		tag = "<error>";
42 | 	else if (level == LOG_LEVEL_WARNING)
43 | 		tag = "<warning>";
44 | 	else if (level == LOG_LEVEL_INFO)
45 | 		tag = "<info>";
46 | 	else if (level == LOG_LEVEL_DEBUG)
47 | 		tag = "<debug>";
48 | 
49 | 	return tag;
50 | }
51 | 
52 | /*
53 |  * Print a printf-style message on stdout, prefixed with the tag of its level.
54 |  * Nothing is printed when level is greater than the current maximum level.
55 |  */
56 | void cli_log(enum log_level level, const char * fmt, ...)
57 | {
58 | 	va_list args;
59 | 
60 | 	if (level <= current_max_level) {
61 | 		printf("%s ", lvl_tostring(level));
62 | 
63 | 		va_start(args, fmt);
64 | 		vprintf(fmt, args);
65 | 		va_end(args);
66 | 	}
67 | }


--------------------------------------------------------------------------------
/lib/includes/pdfParsing.h:
--------------------------------------------------------------------------------
 1 | /***
 2 | 
 3 | Copyright (C) 2015, 2016 Teclib'
 4 | 
 5 | This file is part of Armadito module PDF.
 6 | 
 7 | Armadito module PDF is free software: you can redistribute it and/or modify
 8 | it under the terms of the GNU General Public License as published by
 9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 | 
12 | Armadito module PDF is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | GNU General Public License for more details.
16 | 
17 | You should have received a copy of the GNU General Public License
18 | along with Armadito module PDF.  If not, see <http://www.gnu.org/licenses/>.
19 | 
20 | ***/
21 | 
22 | 
23 | 
24 | #ifndef _pdf_Parsing_h_
25 | #define _pdf_Parsing_h_
26 | 
27 | 
28 | #include "pdfStructs.h"
29 | 
30 | 
31 | #define LARGE_FILE_SIZE 1500000
32 | 
33 | 
34 | /***** pdf Parsing functions prototypes *****/
35 | 
36 | int parsePDF(struct pdfDocument * pdf);
37 | int checkMagicNumber(struct pdfDocument * pdf);
38 | int getPDFContent(struct pdfDocument * pdf);
39 | int extractObjectFromObjStream(struct pdfDocument * pdf, struct pdfObject *obj);
40 | int getObjectInfos(struct pdfObject * obj, struct pdfDocument * pdf);
41 | int getPDFObjects(struct pdfDocument * pdf);
42 | int getPDFTrailers(struct pdfDocument * pdf);
43 | int getPDFTrailers_2(struct pdfDocument * pdf);
44 | int decodeObjectStream(struct pdfObject * obj);
45 | int removeComments(struct pdfDocument * pdf);
46 | char * getObjectDictionary(struct pdfObject * obj, struct pdfDocument * pdf);
47 | char * getObjectType(struct pdfObject * obj);
48 | char * getObjectStream(struct pdfObject * obj);
49 | char * getStreamFilters(struct pdfObject * obj);
50 | char * hexaObfuscationDecode(char * dico);
51 | char *removeCommentLine(char * src, int size, int * ret_len);
52 | 
53 | 
54 | #endif
55 | 


--------------------------------------------------------------------------------
/lib/includes/pdfAnalysis.h:
--------------------------------------------------------------------------------
 1 | /***
 2 | 
 3 | Copyright (C) 2015, 2016 Teclib'
 4 | 
 5 | This file is part of Armadito module PDF.
 6 | 
 7 | Armadito module PDF is free software: you can redistribute it and/or modify
 8 | it under the terms of the GNU General Public License as published by
 9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 | 
12 | Armadito module PDF is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | GNU General Public License for more details.
16 | 
17 | You should have received a copy of the GNU General Public License
18 | along with Armadito module PDF.  If not, see <http://www.gnu.org/licenses/>.
19 | 
20 | ***/
21 | 
22 | 
23 | 
24 | #ifndef _pdf_Analysis_h_
25 | #define _pdf_Analysis_h_
26 | 
27 | 
28 | #include "pdfStructs.h"
29 | 
30 | 
31 | /***** pdf Structure analysis functions prototypes *****/
32 | int documentStructureAnalysis(struct pdfDocument * pdf);
33 | int checkXRef(struct pdfDocument * pdf);
34 | int checkEmptyDocument(struct pdfDocument * pdf);
35 | int checkTrailer(struct pdfDocument * pdf);
36 | 
37 | 
38 | /***** pdf Objects analysis functions prototypes *****/
39 | int getDangerousContent(struct pdfDocument* pdf);
40 | int getJavaScript(struct pdfDocument * pdf, struct pdfObject* obj);
41 | int getXFA(struct pdfDocument * pdf, struct pdfObject* obj);
42 | int getEmbeddedFile(struct pdfDocument * pdf, struct pdfObject* obj);
43 | int getInfoObject(struct pdfDocument * pdf);
44 | int unknownPatternRepetition(char * stream, int size, struct pdfDocument * pdf, struct pdfObject * obj);
45 | int findDangerousKeywords(char * stream, struct pdfDocument * pdf, struct pdfObject * obj);
46 | int getURI(struct pdfDocument * pdf, struct pdfObject * obj);
47 | int getJSContentInXFA(char * stream, int size, struct pdfObject * obj, struct pdfDocument * pdf);
48 | 
49 | #endif
50 | 


--------------------------------------------------------------------------------
/lib/includes/osdeps.h:
--------------------------------------------------------------------------------
 1 | /***
 2 | 
 3 | Copyright (C) 2015, 2016 Teclib'
 4 | 
 5 | This file is part of Armadito module PDF.
 6 | 
 7 | Armadito module PDF is free software: you can redistribute it and/or modify
 8 | it under the terms of the GNU General Public License as published by
 9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 | 
12 | Armadito module PDF is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | GNU General Public License for more details.
16 | 
17 | You should have received a copy of the GNU General Public License
18 | along with Armadito module PDF.  If not, see <http://www.gnu.org/licenses/>.
19 | 
20 | ***/
21 | 
22 | #ifndef _os_deps_h_
23 | #define _os_deps_h_
24 | 
25 | #include <stdio.h>
26 | 
27 | typedef int(*dirent_scan_cb)(int fd, char * filename);
28 | int os_scan_dir(char * path, int recurse, dirent_scan_cb dirent_cb, void * data);
29 | 
30 | #ifdef _WIN32
31 | 
32 | #include <Windows.h>
33 | #include <io.h>
34 | #define os_strncat strncat_s
35 | #define os_sprintf sprintf_s
36 | #define os_sscanf sscanf_s
37 | #define os_strncpy strncpy_s
38 | #define os_strdup _strdup
39 | #define os_lseek _lseek
40 | #define os_read _read
41 | #define os_fileno _fileno
42 | FILE * os_fopen(const char * filename, const char * mode);
43 | 
44 | 
45 | #else
46 | 
47 | #include <unistd.h>
48 | #define os_fopen fopen
49 | #define os_sprintf snprintf
50 | #define os_sscanf sscanf
51 | #define os_strdup strdup
52 | #define os_lseek lseek
53 | #define os_read read
54 | #define os_fileno fileno
55 | //#define os_sprintf(buffer,sizeOfBuffer, format,...) sprintf(buffer, format,...)
56 | int os_strncat(char *strDest, size_t numberOfElements, const char *strSource, size_t count);
57 | int os_strncpy(char *strDest, size_t numberOfElements, const char *strSource, size_t count);
58 | 
59 | #endif
60 | 
61 | #endif


--------------------------------------------------------------------------------
/lib/includes/log.h:
--------------------------------------------------------------------------------
 1 | /***
 2 | 
 3 | Copyright (C) 2015, 2016 Teclib'
 4 | 
 5 | This file is part of Armadito module PDF.
 6 | 
 7 | Armadito module PDF is free software: you can redistribute it and/or modify
 8 | it under the terms of the GNU General Public License as published by
 9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 | 
12 | Armadito module PDF is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | GNU General Public License for more details.
16 | 
17 | You should have received a copy of the GNU General Public License
18 | along with Armadito module PDF.  If not, see <http://www.gnu.org/licenses/>.
19 | 
20 | ***/
21 | 
22 | #ifndef _log_h_
23 | #define _log_h_
24 | 
25 | #include <stdio.h>
26 | #include <stdarg.h>
27 | 
28 | 
29 | #define default_max_level LOG_LEVEL_WARNING
30 | #define print_report 1	// print the analysis report.
31 | 
32 | enum log_level {
33 | 	LOG_LEVEL_ERROR = 1 << 1,
34 | 	LOG_LEVEL_WARNING = 1 << 2,
35 | 	LOG_LEVEL_INFO = 1 << 3,
36 | 	LOG_LEVEL_DEBUG = 1 << 4,
37 | 	LOG_LEVEL_NONE = 1 << 5,
38 | };
39 | 
40 | void cli_log(enum log_level level, const char * fmt, ...);
41 | void set_current_log_level(enum log_level level);
42 | 
43 | #ifdef _WIN32
44 | 
45 | #define err_log(fmt, ...) cli_log(LOG_LEVEL_ERROR,(fmt),__VA_ARGS__)
46 | #define warn_log(fmt, ...) cli_log(LOG_LEVEL_WARNING,(fmt),__VA_ARGS__)
47 | #define dbg_log(fmt, ...) cli_log(LOG_LEVEL_DEBUG,(fmt),__VA_ARGS__)
48 | #define info_log(fmt, ...) cli_log(LOG_LEVEL_INFO,(fmt),__VA_ARGS__)
49 | 
50 | #else
51 | 
52 | #define err_log(fmt, ...) cli_log(LOG_LEVEL_ERROR,(fmt),##__VA_ARGS__)
53 | #define warn_log(fmt, ...) cli_log(LOG_LEVEL_WARNING,(fmt),##__VA_ARGS__)
54 | #define dbg_log(fmt, ...) cli_log(LOG_LEVEL_DEBUG,(fmt),##__VA_ARGS__)
55 | #define info_log(fmt, ...) cli_log(LOG_LEVEL_INFO,(fmt),##__VA_ARGS__)
56 | 
57 | #endif
58 | 
59 | 
60 | #endif


--------------------------------------------------------------------------------
/lib/includes/utils.h:
--------------------------------------------------------------------------------
 1 | /***
 2 | 
 3 | Copyright (C) 2015, 2016 Teclib'
 4 | 
 5 | This file is part of Armadito module PDF.
 6 | 
 7 | Armadito module PDF is free software: you can redistribute it and/or modify
 8 | it under the terms of the GNU General Public License as published by
 9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 | 
12 | Armadito module PDF is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | GNU General Public License for more details.
16 | 
17 | You should have received a copy of the GNU General Public License
18 | along with Armadito module PDF.  If not, see <http://www.gnu.org/licenses/>.
19 | 
20 | ***/
21 | 
22 | 
23 | 
24 | #ifndef _utils_h_
25 | #define _utils_h_
26 | 
27 | 
28 | #include "pdfStructs.h"
29 | 
30 | /* Utils functions prototypes */
31 | 
32 | void * searchPattern(char* src, char* pat , int pat_size ,  int size);
33 | struct pdfObject * getPDFObjectByRef(struct pdfDocument * pdf, char * ref);
34 | struct pdfObject * getPDFNextObjectByRef(struct pdfDocument * pdf, struct pdfObject * obj, char * ref);
35 | void printObject(struct pdfObject * obj);
36 | void printObjectByRef(struct pdfDocument * pdf, char * ref);
37 | void printObjectInFile(struct pdfObject * obj);
38 | void printPDFObjects(struct pdfDocument * pdf);
39 | int getNumber(char* ptr, int size);
40 | char* getNumber_s(char* ptr, int size);
41 | char * getIndirectRef(char * ptr, int size);
42 | char * getDelimitedStringContent(char * src, char * delimiter1, char * delimiter2, int src_len);
43 | char * getIndirectRefInString(char * ptr, int size);
44 | char * getPattern(char * ptr, int size, int len);
45 | char * getUnicodeInString(char * stream, int size);
46 | char * getHexa(char * dico, int size);
47 | char * replaceInString(char * src, char * toReplace , char * pat);
48 | char * toBinary(char * stream, int size);
49 | char * binarytoChar(char * binary, int size, int * returned_size);
50 | void printStream(char * stream, int size);
51 | void debugPrint(char * stream, int len); // print in a debug file
52 | 
53 | 
54 | 
55 | 
56 | #endif
57 | 


--------------------------------------------------------------------------------
/lib/includes/armaditopdf.h:
--------------------------------------------------------------------------------
 1 | /***
 2 | 
 3 | Copyright (C) 2015, 2016 Teclib'
 4 | 
 5 | This file is part of Armadito module PDF.
 6 | 
 7 | Armadito module PDF is free software: you can redistribute it and/or modify
 8 | it under the terms of the GNU General Public License as published by
 9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 | 
12 | Armadito module PDF is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | GNU General Public License for more details.
16 | 
17 | You should have received a copy of the GNU General Public License
18 | along with Armadito module PDF.  If not, see <http://www.gnu.org/licenses/>.
19 | 
20 | ***/
21 | 
22 | 
23 | 
24 | #ifndef _armadito_pdf_h_
25 | #define _armadito_pdf_h_
26 | 
27 | #include "pdfStructs.h"
28 | 
29 | #define a6o_pdf_ver "0.12.6"
30 | 
31 | 
32 | // Tests Coefficients
33 | 
34 | //#define bad_header 
35 | //#define encrypted
36 | #define EMPTY_PAGE_CONTENT 99
37 | #define OBJECT_COLLISION 10
38 | #define OBJECT_COLLISION_AND_BAD_XREF 60
39 | #define BAD_TRAILER 40
40 | #define BAD_XREF_OFFSET 30
41 | #define BAD_OBJ_OFFSET 20
42 | #define OBFUSCATED_OBJECT 50 
43 | #define MULTIPLE_HEADERS 50
44 | #define MALICIOUS_COMMENTS 50
45 | 
46 | #define ACTIVE_CONTENT 40
47 | #define SHELLCODE 40
48 | #define PATTERN_HIGH_REPETITION 40
49 | #define DANGEROUS_KEYWORD_HIGH 90
50 | #define DANGEROUS_KEYWORD_MEDIUM 40
51 | #define DANGEROUS_KEYWORD_LOW 20
52 | #define TIME_EXCEEDED 20
53 | 
54 | //#define LARGE_FILE_SIZE 1500000
55 | 
56 | 
57 | enum err_code {
58 | 
59 | 	unexpected_error = -1 << 0,
60 | 	bad_pdf_version = -1 << 1,
61 | 	bad_pdf_header = -1 << 2,
62 | 	unsupported_file = -1 << 3,
63 | 	bad_xref_format = -1 << 4,
64 | 	bad_trailer_format = -1 << 5,
65 | 	bad_object_format = -1 << 6,
66 | 	stream_not_decoded = -1 << 7
67 | 
68 | };
69 | 
70 | 
71 | /* Functions */
72 | char * getVersion();
73 | int analyzePDF(char * filename);
74 | int analyzePDF_fd(int fd, char * filename);
75 | int analyzePDF_ex(int fd, char * filename);
76 | int calcSuspiciousCoefficient(struct pdfDocument * pdf);
77 | void printAnalysisReport(struct pdfDocument * pdf);
78 | 
79 | 
80 | 
81 | #endif
82 | 


--------------------------------------------------------------------------------
/lib/includes/filters.h:
--------------------------------------------------------------------------------
 1 | /***
 2 | 
 3 | Copyright (C) 2015, 2016 Teclib'
 4 | 
 5 | This file is part of Armadito module PDF.
 6 | 
 7 | Armadito module PDF is free software: you can redistribute it and/or modify
 8 | it under the terms of the GNU General Public License as published by
 9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 | 
12 | Armadito module PDF is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | GNU General Public License for more details.
16 | 
17 | You should have received a copy of the GNU General Public License
18 | along with Armadito module PDF.  If not, see <http://www.gnu.org/licenses/>.
19 | 
20 | ***/
21 | 
22 | 
23 |     
24 | #ifndef _filters_h_
25 | #define _filters_h_
26 | 
27 | 
28 | #include "pdfStructs.h"
29 | 
30 | 
31 | // LZWDecode
32 | #define FIRST_CODE 258
33 | #define EOD_MARKER 257
34 | #define CLEAR_TABLE 256
35 | 
36 | #define MAX_CODES 512
37 | 
38 | struct LZWdico{
39 | 	/* One node of the dictionary used by the LZWDecode filter; nodes are chained through `next`. */
40 | 	unsigned short code;	/* code of this node — presumably an LZW code (added ones starting at FIRST_CODE), TODO confirm */
41 | 	char * entry;	/* data associated with the code */
42 | 	int entry_len;	/* length of `entry` — presumably in bytes, TODO confirm */
43 | 
44 | 	struct LZWdico * next;	/* following node of the list */
45 | };
46 | 
47 | /* Function prototypes */
48 | /* One decoder per stream filter listed in the README; each takes the raw stream and its pdfObject. */
49 | char * FlateDecode(char * stream, struct pdfObject* obj);
50 | char * ASCIIHexDecode(char * stream, struct pdfObject * obj);
51 | char * LZWDecode(char* stream, struct pdfObject * obj);
52 | char * ASCII85Decode(char * stream, struct pdfObject * obj);
53 | char * CCITTFaxDecode(char* stream, struct pdfObject * obj);
54 | 
55 | // LZWDecode functions (helpers working on the struct LZWdico list).
56 | struct LZWdico * initDico(int code, char * entry);
57 | struct LZWdico * initDico_(int code, char * entry, int len);
58 | int addInDico(struct LZWdico * dico, int code, char * entry);
59 | void freeDico(struct LZWdico * dico);
60 | char * getEntryInDico(struct LZWdico * dico, int code);
61 | unsigned short readData(char ** data, unsigned int * partial_code, unsigned int * partial_bits, unsigned int code_len);
62 | void printDico(struct LZWdico * dico);
63 | 
64 | // CCITTFaxDecode functions.
65 | int getRunLengthCodeInTable(char ** table, char * bits, int table_size);
66 | int getMakeUpCodeInTable(char ** table, char *bits, int table_size);
67 | 
68 | 
69 | // ASCII85Decode functions
70 | char * getTuple(char * data, int len);
71 | 
72 | 
73 | // CCITTFaxDecode code tables; only declared here (extern), their definitions live in a source file.
74 | 
75 | extern char * WHITE_RUN_LENGTH_TERMINATING_CODES[];
76 | extern char * BLACK_RUN_LENGTH_TERMINATING_CODES[];
77 | extern char * WHITE_MAKE_UP_CODES[];
78 | extern char * BLACK_MAKE_UP_CODES[];
79 | extern int WHITE_BLACK_MAKE_UP_CODES_VALUES[];
80 | 
81 | 
82 | #endif


--------------------------------------------------------------------------------
/tools/perl_poc/lib/analysis/CVEs.pm:
--------------------------------------------------------------------------------
 1 | package CVEs;
 2 | 
 3 | use strict;
 4 | 
 5 | my $DEBUG = "no";
 6 | 
 7 | # CVE_2010_2883_Detection(\%pdfObjects): inspects the font file referenced by every /FontDescriptor
 8 | # object. Returns "DETECTED" if a decoded font stream contains "SING", "BAD_FONT_FILE_LENGTH" if only
 9 | # a "length1" mismatch with the decoded stream was found, and "none" otherwise.
10 | sub CVE_2010_2883_Detection{
11 | 	my $ref = shift;
12 | 	my %pdfObjects = %{$ref};
13 | 	my $fontfile;
14 | 	my $status = "none";
15 | 	print "\n\n:::CVE_2010_2883_Detection:::\n" unless $DEBUG eq "no";
16 | 
17 | 	# Get font descriptors objects
18 | 	my @objs = values(%pdfObjects);
19 | 	foreach(@objs){
20 | 
21 | 		if( exists($_->{"type"}) && $_->{"type"} eq "/FontDescriptor" ){
22 | 			print "Found FontDescriptor object :: $_->{ref}\n" unless $DEBUG eq "no";
23 | 
24 | 			if(exists($_->{"fontfile"}) && $_->{"fontfile"} =~ /(\d+\s\d\sR)/){
25 | 				$fontfile = $1 ;
26 | 				$fontfile =~ s/R/obj/;	# turn the "N G R" reference into the "N G obj" key of %pdfObjects
27 | 				print "font File found :: $fontfile\n" unless $DEBUG eq "no";
28 | 			}else{
29 | 				next;
30 | 			}
31 | 
32 | 			# Get the font file stream
33 | 			if(exists($pdfObjects{$fontfile}) && exists($pdfObjects{$fontfile}->{"stream_d"}) && length($pdfObjects{$fontfile}->{"stream_d"}) > 0 ){
34 | 
35 | 				my $fontstream = $pdfObjects{$fontfile}->{"stream_d"};
36 | 				#print "font stream = $fontstream\n";
37 | 
38 | 				# Check the length of the decoded stream /!\
39 | 				#my $realen = length();
40 | 				print "Lenght1 = ".$pdfObjects{$fontfile}->{"length1"}."\n" unless ($DEBUG eq "no" or ! exists($pdfObjects{$fontfile}->{"length1"})) ;
41 | 				print "Real length = ".length($fontstream)."\n" unless $DEBUG eq "no";
42 | 				if(exists($pdfObjects{$fontfile}->{"length1"})  && $pdfObjects{$fontfile}->{"length1"} != length($fontstream)){
43 | 					print "Warning :: Font File decoded stream Length is Wrong :: ".$pdfObjects{$fontfile}->{"length1"}." :: ".length($fontstream)."\n" unless $DEBUG eq "no";
44 | 					#$TESTS_CAT_3{"CVE_2010_2883"} = "BAD_FONT_FILE_LENGTH";
45 | 					$status = "BAD_FONT_FILE_LENGTH" unless $status eq "DETECTED";	# never downgrade a detection made on an earlier font file
46 | 				}
47 | 
48 | 				# Check TrueType required tables
49 | 				# - cmap - glyf - head - hhea - hmtx - loca - maxp - name - post
50 | 				# Detect the SING (Smart INdependent Glyphlets) string
51 | 				if($fontstream =~ /SING/ ){
52 | 					print "Warning :: Found SING (Smart INdependent Glyphlets) :: Possible CVE_2010_2883\n" unless $DEBUG eq "no";
53 | 					#$TESTS_CAT_3{"CVE_2010_2883"} = "DETECTED";
54 | 					$status = "DETECTED";
55 | 
56 | 					# TODO combine with previous test (bad_font_file_length) to detect CVE
57 | 				}
58 | 
59 | 
60 | 			}else{
61 | 				print "Warning :: CVE_2010_2883_Detection :: Font File Object $fontfile is not defined :\n" unless $DEBUG eq "no";
62 | 			}
63 | 
64 | 		}
65 | 	}
66 | 
67 | 	return $status;
68 | }
69 | 
70 | 1;
71 | 
72 | __END__
73 | 
74 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ARMADITO PDF ANALYZER
 2 | =====================
 3 | [![Build Status](https://travis-ci.org/armadito/armadito-mod-pdf.svg?branch=DEV)](https://travis-ci.org/armadito/armadito-mod-pdf)
 4 | <a href="https://scan.coverity.com/projects/armadito-armadito-mod-pdf">
 5 |   <img alt="Coverity Scan Build Status"
 6 |        src="https://scan.coverity.com/projects/10496/badge.svg"/>
 7 | </a>
 8 | 
 9 | Armadito module PDF is a heuristic module for PDF document analysis.
10 | 
11 | Copyright (C) Teclib', 2015, 2016
12 | 
13 | See Online documentation at : http://armadito-av.readthedocs.io/en/latest/
14 | 
15 | License : GPLv3 <https://www.gnu.org/licenses/license-list.html#GNUGPLv3>
16 | 
17 | What is it?
18 | -----------
19 | 
20 | Armadito PDF analyzer is a module for PDF documents scanning that includes:
21 | 
22 | * a PDF parser
23 | 
24 | * a heuristic analyzer that computes the document confidence level
25 | 
26 | Licensing
27 | ---------
28 | 
29 | Armadito PDF analyzer is licensed under the GPLv3 https://www.gnu.org/licenses/license-list.html#GNUGPLv3	
30 | 
31 | Dependencies
32 | ------------
33 | 
34 | > miniz.c
35 | 
36 | FEATURES
37 | --------
38 | 
39 | ==> Parsing	<==
40 | 
41 | * Remove PostScript comments in the content of the document.
42 | * Get PDF version in header (Ex: %PDF-1.7).
43 | * Get trailers and xref table or xref objects.
44 | * Get information about the objects described in the document (reference, dictionary, type, stream, filters, etc).
45 | * Extract objects embedded in stream objects.
46 | * Decode object streams encoded with filters : FlateDecode, ASCIIHexDecode, ASCII85Decode, LZWDecode, CCITTFaxDecode
47 | 
48 | ==> Analysis <==
49 | 
50 | * Tests based on PDF document structure (according to PDF specifications):
51 | 	- Check the PDF header version (from version 1.1 to 1.7).
52 | 	- Check if the content of the document is encrypted.
53 | 	- Check that the document contains non-empty pages.
54 | 	- Check object collision in object declaration.
55 | 	- Check trailers format.
56 | 	- Check xref table and xref object.
57 | 	- Check the presence of malicious Postscript comments (which could cause parsing errors).
58 | 
59 | 
60 | * Tests based on PDF objects content:
61 | 	- Get potentially malicious active contents (JavaScripts, Embedded files, Forms, URI, etc.)
62 | 	- JavaScript content analysis (malicious keywords, pattern repetition, unicode strings, etc).
63 | 	- Info object content analysis (search potentially malicious strings).
64 | 	- Check if object dictionary is hexa obfuscated.
65 | 
66 | 
67 | ==>	Notation <==
68 | 
69 | * A suspicious coefficient is attributed to each test.
70 | * Compute the suspicious coefficient of the PDF document.
71 | 
72 | 
73 | LIMITATIONS
74 | -----------
75 | 
76 | - Supported PDF versions are: %PDF-1.1 to %PDF-1.7.
77 | - PDF documents with encrypted content are not supported.
78 | - Comment removal is skipped for documents larger than 2 MB.
79 | 


--------------------------------------------------------------------------------
/module/modulePDF.c:
--------------------------------------------------------------------------------
 1 | /***
 2 | 
 3 | Copyright (C) 2015, 2016 Teclib'
 4 | 
 5 | This file is part of Armadito module PDF.
 6 | 
 7 | Armadito module PDF is free software: you can redistribute it and/or modify
 8 | it under the terms of the GNU General Public License as published by
 9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 | 
12 | Armadito module PDF is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | GNU General Public License for more details.
16 | 
17 | You should have received a copy of the GNU General Public License
18 | along with Armadito module PDF.  If not, see <http://www.gnu.org/licenses/>.
19 | 
20 | ***/
21 | 
22 | #include "modulePDF.h"
23 | 
24 | struct modulePDF_data {	/* per-module data; in this file only its size is used (module.size) */
25 | 	const char *tmp_dir;	/* not referenced by the code in this file */
26 | 	int late_days;	/* not referenced by the code in this file */
27 | 	int critical_days;	/* not referenced by the code in this file */
28 | };
29 | 
30 | /* Module init callback: there is no state to set up, so it only logs a message and reports success. */
31 | static enum a6o_mod_status modulePDF_init(struct a6o_module *module)
32 | {
33 | 	a6o_log(A6O_LOG_MODULE, A6O_LOG_LEVEL_INFO, "module PDF initialized successfully!");
34 | 	return A6O_MOD_OK;
35 | }
36 | 
37 | /* Module close callback: init acquires nothing, so there is nothing to release here. */
38 | static enum a6o_mod_status modulePDF_close(struct a6o_module *module)
39 | {
40 | 	return A6O_MOD_OK;
41 | }
42 | 
43 | static enum a6o_update_status modulePDF_info(struct a6o_module *module, struct a6o_module_info *info){
44 | 	/* Fixed module update date: 8 June 2016, 08:30 local time (tm_mon is 0-based, tm_year counts from 1900). */
45 | 	struct tm timeptr = { .tm_sec = 0, .tm_min = 30, .tm_hour = 8, .tm_mday = 8, .tm_mon = 5, .tm_year = 116,
46 | 	                      .tm_isdst = 0 }; /* members are named because ISO C does not fix the member order of struct tm */
47 | 
48 | 	/* mktime() converts the broken-down local time into the time_t reported as the module update timestamp. */
49 | 	info->mod_update_ts = mktime(&timeptr);
50 | 
51 | 	return A6O_UPDATE_OK;
52 | }
53 | 
54 | 
55 | static enum a6o_file_status modulePDF_scan(struct a6o_module *module, int fd, const char *path, const char *mime_type, char **pmod_report) {
56 | 	/*
57 | 	 * Scan callback: runs analyzePDF_ex() on the file and maps its result to a file status.
58 | 	 *   -1                -> A6O_FILE_IERROR
59 | 	 *   -2                -> A6O_FILE_UNDECIDED (not supported: encrypted contents or bad header)
60 | 	 *   other negative    -> A6O_FILE_IERROR (these used to be reported as CLEAN; assumes a valid
61 | 	 *                        coefficient is never negative — TODO confirm against analyzePDF_ex)
62 | 	 *   <  MALICIOUS_COEF -> A6O_FILE_CLEAN
63 | 	 *   >= MALICIOUS_COEF -> A6O_FILE_SUSPICIOUS, with a report string stored in *pmod_report
64 | 	 */
65 | 	int ret = analyzePDF_ex(fd,path);
66 | 
67 | 	if (ret == -2)
68 | 		return A6O_FILE_UNDECIDED;
69 | 
70 | 	if (ret < 0)
71 | 		return A6O_FILE_IERROR;
72 | 
73 | 	if (ret < MALICIOUS_COEF)
74 | 		return A6O_FILE_CLEAN;
75 | 
76 | 	if (pmod_report != NULL)	/* do not dereference a missing report slot */
77 | 		*pmod_report = os_strdup("ModulePDF!SuspiciousPDF");
78 | 
79 | 	return A6O_FILE_SUSPICIOUS;
80 | }
81 | 
82 | 
83 | static const char *modulePDF_mime_types[] = {	/* NULL-terminated list, referenced by module.supported_mime_types */
84 | 	"application/pdf",
85 | 	NULL,
86 | };
87 | 
88 | struct a6o_module module = {	/* module descriptor; "module" is the symbol listed in modulePDF.sym */
89 | 	.init_fun = modulePDF_init,
90 | 	.conf_table = NULL,	/* no configuration table */
91 | 	.post_init_fun = NULL,	/* no post-init step */
92 | 	.scan_fun = modulePDF_scan,
93 | 	.close_fun = modulePDF_close,
94 | 	.info_fun = modulePDF_info,
95 | 	.supported_mime_types = modulePDF_mime_types,
96 | 	.name = "modulePDF",
97 | 	.size = sizeof(struct modulePDF_data),	/* size of the per-module data structure */
98 | };
99 | 


--------------------------------------------------------------------------------
/CHANGES:
--------------------------------------------------------------------------------
 1 | :: Change Log :: 
 2 | All notable changes to the project Armadito-pdf will be documented in this file.
 3 | This project adheres to [Semantic Versioning](http://semver.org).
 4 | 
 5 | 
 6 | Mon, 27 May 2016 :: 0.10.1 :: (ufausther)
 7 | -----------------------------------------
 8 | *** New version 0.10.1 pushed in github (DEV branch).
 9 | 
10 | 
11 | Wed, 1 June 2016 :: 0.11.0 :: (ufausther)
12 | -----------------------------------------
13 | 
14 | + add custom log function (cli_log with macro definition.)
15 | + add "filename" field to pdfDocument struct.
16 | + modif function printAnalysisReport(struct pdfDocument * pdf) prototype.
17 | + add errors count in analysis report.
18 | + add functions headers.
19 | + remove duplicated call to checkMagicNumber() function
20 | + rename function getNumber_a(...) => getNumber_s(...)
21 | + improve error handling.
22 | + FlateDecode filter : increase destination buffer size on MZ_BUFF_ERROR error.
23 | 
24 | Fri, 03 June 2016 :: 0.11.1 :: (ufausther)
25 | ------------------------------------------
26 | 
27 | + FlateDecode filter : increase destination buffer size on MZ_BUFF_ERROR error.
28 | 
29 | 
30 | Mon, 06 June 2016 :: 0.11.1 :: (ufausther)
31 | ------------------------------------------
32 | 
33 | + translate/update README.md
34 | 
35 | Tue, 07 June 2016 :: 0.12.0 :: (ufausther)
36 | ------------------------------------------
37 | 
38 | + Add header files (armaditopdf.h - pdfStructs.h - pdfParsing.h - pdfAnalysis.h - utils.h ) and corresponding sources files.
39 | + folders reorganisation.
40 | + define version string in header (#define a6o_pdf_ver)
41 | + Add change log file.
42 | 
43 | Tue, 07 June 2016 :: 0.12.1 :: (ufausther)
44 | ------------------------------------------
45 | 
46 | + bug fix: getDelimitedStringContent() temporary buffer was not updated.
47 | 
48 | 
49 | Wed, 08 June 2016 :: 0.12.1 :: (ufausther)
50 | ------------------------------------------
51 | 
52 | + modif for linux compatibility.
53 | 
54 | Thu, 09 June 2016 :: 0.12.2 :: (ufausther)
55 | ------------------------------------------
56 | 
57 | + fix some crashes detected with fuzzing
58 | 
59 | Tue, 14 June 2016 :: 0.12.3 :: (ufausther)
60 | ------------------------------------------
61 | 
62 | + fix crash :: getStreamFilters :: malformed dictionary.
63 | + recursive scan fix :: scan was stopped on error.
64 | + checkXref :: skip white spaces at the end of the line.
65 | 
66 | Wed, 15 June 2016 :: 0.12.3 :: (ufausther)
67 | ------------------------------------------
68 | 
69 | + fix crashes :: extractObjectFromObjStream() :: invalid offset (/First) and number of objects (/N) in dictionary
70 | 
71 | 
72 | Fri, 19 Aug 2016 :: 0.12.4 :: (ufausther)
73 | ------------------------------------------
74 | 
75 | + fix warning in function getTrailers :: test file = clam.pdf
76 | + getEmbeddedFile :: looking for FileSpec object is not necessary for embedded file detection.
77 | 
78 | 
79 | Thu, 9 Feb 2017 :: 0.12.5 :: (ufausther)
80 | ------------------------------------------
81 | 
82 | + Fix some memory leaks.
83 | + Fix default log level to warn.
84 | + Minor changes
85 | + pdfParsing: improve code.
86 | 
87 | 
88 | Mon, 20 Feb 2017 :: 0.12.6 :: (ufausther)
89 | ------------------------------------------
90 | 
91 | + Minor changes
92 | + Fix resource leaks
93 | + Get library version with getVersion() function.
94 | 


--------------------------------------------------------------------------------
/win32/ArmaditoPDF/ArmaditoPDF/ArmaditoPDF.vcxproj.filters:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="utf-8"?>
 2 | <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
 3 |   <ItemGroup>
 4 |     <Filter Include="Fichiers sources">
 5 |       <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
 6 |       <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
 7 |     </Filter>
 8 |     <Filter Include="Fichiers d%27en-tête">
 9 |       <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
10 |       <Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
11 |     </Filter>
12 |     <Filter Include="Fichiers de ressources">
13 |       <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
14 |       <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
15 |     </Filter>
16 |     <Filter Include="Fichiers sources\osdeps">
17 |       <UniqueIdentifier>{de5d87ae-3c70-4a71-877f-f5cef05b237e}</UniqueIdentifier>
18 |     </Filter>
19 |     <Filter Include="Fichiers d%27en-tête\osdeps">
20 |       <UniqueIdentifier>{2cf89655-f2a4-4131-a986-3db75eda9ff0}</UniqueIdentifier>
21 |     </Filter>
22 |   </ItemGroup>
23 |   <ItemGroup>
24 |     <ClCompile Include="..\..\..\lib\src\armaditopdf.c">
25 |       <Filter>Fichiers sources</Filter>
26 |     </ClCompile>
27 |     <ClCompile Include="..\..\..\lib\src\filters.c">
28 |       <Filter>Fichiers sources</Filter>
29 |     </ClCompile>
30 |     <ClCompile Include="..\..\..\lib\src\log.c">
31 |       <Filter>Fichiers sources</Filter>
32 |     </ClCompile>
33 |     <ClCompile Include="..\..\..\tools\cli_analyzer\main.c">
34 |       <Filter>Fichiers sources</Filter>
35 |     </ClCompile>
36 |     <ClCompile Include="..\..\..\lib\src\pdfObjectsAnalysis.c">
37 |       <Filter>Fichiers sources</Filter>
38 |     </ClCompile>
39 |     <ClCompile Include="..\..\..\lib\src\pdfParsing.c">
40 |       <Filter>Fichiers sources</Filter>
41 |     </ClCompile>
42 |     <ClCompile Include="..\..\..\lib\src\pdfStructAnalysis.c">
43 |       <Filter>Fichiers sources</Filter>
44 |     </ClCompile>
45 |     <ClCompile Include="..\..\..\lib\src\pdfStructs.c">
46 |       <Filter>Fichiers sources</Filter>
47 |     </ClCompile>
48 |     <ClCompile Include="..\..\..\lib\src\utils.c">
49 |       <Filter>Fichiers sources</Filter>
50 |     </ClCompile>
51 |     <ClCompile Include="..\..\..\lib\src\osdeps.c">
52 |       <Filter>Fichiers sources\osdeps</Filter>
53 |     </ClCompile>
54 |   </ItemGroup>
55 |   <ItemGroup>
56 |     <ClInclude Include="..\..\..\lib\includes\filters.h">
57 |       <Filter>Fichiers d%27en-tête</Filter>
58 |     </ClInclude>
59 |     <ClInclude Include="..\..\..\lib\includes\armaditopdf.h">
60 |       <Filter>Fichiers d%27en-tête</Filter>
61 |     </ClInclude>
62 |     <ClInclude Include="..\..\..\lib\includes\log.h">
63 |       <Filter>Fichiers d%27en-tête</Filter>
64 |     </ClInclude>
65 |     <ClInclude Include="..\..\..\lib\includes\pdfAnalysis.h">
66 |       <Filter>Fichiers d%27en-tête</Filter>
67 |     </ClInclude>
68 |     <ClInclude Include="..\..\..\lib\includes\pdfParsing.h">
69 |       <Filter>Fichiers d%27en-tête</Filter>
70 |     </ClInclude>
71 |     <ClInclude Include="..\..\..\lib\includes\pdfStructs.h">
72 |       <Filter>Fichiers d%27en-tête</Filter>
73 |     </ClInclude>
74 |     <ClInclude Include="..\..\..\lib\includes\utils.h">
75 |       <Filter>Fichiers d%27en-tête</Filter>
76 |     </ClInclude>
77 |     <ClInclude Include="..\..\..\lib\includes\osdeps.h">
78 |       <Filter>Fichiers d%27en-tête\osdeps</Filter>
79 |     </ClInclude>
80 |   </ItemGroup>
81 | </Project>


--------------------------------------------------------------------------------
/configure.ac:
--------------------------------------------------------------------------------
  1 | 
  2 | AC_INIT(armadito-pdf, [0.12.6], ufausther@teclib.com)
  3 | AM_INIT_AUTOMAKE([foreign])
  4 | 
  5 | # collect arguments
  6 | # debug
  7 | AC_ARG_ENABLE([debug],
  8 | 	AS_HELP_STRING([--enable-debug],[enable debugging @<:@default is yes@:>@]),
  9 | 	,
 10 | 	enable_debug="yes")
 11 | 
 12 | # armadito pdf library
 13 | # (ACTION-IF-GIVEN also runs for --disable-FEATURE, so the value is taken from $enableval.)
 14 | AC_ARG_ENABLE([lib],
 15 | 	AS_HELP_STRING([--enable-lib],[enable armadito-pdf library @<:@default is no@:>@]),
 16 | 	[enable_lib="$enableval"],
 17 | 	[enable_lib="no"])
 18 | 
 19 | # module
 20 | AC_ARG_ENABLE([module],
 21 | 	AS_HELP_STRING([--enable-module],[enable armadito-av module @<:@default is no@:>@]),
 22 | 	[enable_module="$enableval"],
 23 | 	[enable_module="no"])
 24 | 
 25 | # cli tools
 26 | AC_ARG_ENABLE([tools],
 27 | 	AS_HELP_STRING([--enable-tools],[enable armadito-pdf cli tools @<:@default is no@:>@]),
 28 | 	[enable_tools="$enableval"],
 29 | 	[enable_tools="no"])
 30 | 
 31 | PKG_PROG_PKG_CONFIG
 32 | 
 33 | # check for analysis module sources
 34 | AC_MSG_CHECKING(for armadito-pdf library sources)
 35 | AC_CONFIG_SRCDIR(lib/src/armaditopdf.c)
 36 | AC_MSG_RESULT(yes)
 37 | 
 38 | # check for needed programs
 39 | AC_PROG_CC([gcc])
 40 | AC_PROG_LIBTOOL
 41 | 
 42 | # check for debug arg
 43 | AC_MSG_CHECKING(for debug)
 44 | AC_MSG_RESULT($enable_debug)
 45 | AM_CONDITIONAL([DEBUG], [test "$enable_debug" = "yes"])
 46 | if test "$enable_debug" = "yes"; then
 47 |    AC_DEFINE(DEBUG,1,[Define DEBUG to enable debug messages])
 48 |    CFLAGS="${CFLAGS} -g"
 49 | fi
 50 | 
 51 | 
 52 | # check for library arg
 53 | AC_MSG_CHECKING(for armadito PDF library)
 54 | AC_MSG_RESULT($enable_lib)
 55 | AM_CONDITIONAL([COND_LIBRARY], [test "$enable_lib" = "yes"])
 56 | 
 57 | # check for module arg
 58 | AC_MSG_CHECKING(for armadito module)
 59 | AC_MSG_RESULT($enable_module)
 60 | AM_CONDITIONAL([COND_MODULE], [test "$enable_module" = "yes"])
 61 | 
 62 | # check for libarmadito if module is enabled.
 63 | if test "$enable_module" = "yes"; then
 64 | 	# check for libarmadito
 65 | 	PKG_CHECK_MODULES(LIBARMADITO, libarmadito, [HAVE_LIBARMADITO=yes], [HAVE_LIBARMADITO=no])
 66 | 	# check for libarmadito/armadito.h in source tree
 67 | 	if test "$HAVE_LIBARMADITO" = "no"; then
 68 | 	   OLD_CPPFLAGS="$CPPFLAGS"
 69 | 	   CPPFLAGS=-I$srcdir/../../armadito-core/libmodule/include
 70 | 	   AC_CHECK_HEADER([libarmadito/armadito.h], [], [AC_MSG_ERROR([cannot find Armadito module library])], [])
 71 | 	   CPPFLAGS="$OLD_CPPFLAGS"
 72 | 	   LIBARMADITO_CFLAGS='-I$(top_srcdir)/../../armadito-core/libmodule/include'
 73 | 	   AC_SUBST([LIBARMADITO_CFLAGS])
 74 | 	fi
 75 | fi
 76 | 
 77 | # check for tools arg
 78 | AC_MSG_CHECKING(for cli tools)
 79 | AC_MSG_RESULT($enable_tools)
 80 | AM_CONDITIONAL([COND_TOOLS], [test "$enable_tools" = "yes"])
 81 | 
 82 | # check for libarmadito-pdf if tools are enabled.
 83 | if test "$enable_tools" = "yes"; then
 84 | 
 85 | 	# check for libarmadito-pdf
 86 | 	PKG_CHECK_MODULES(LIBARMADITO_PDF, libarmadito-pdf, [HAVE_LIBARMADITO_PDF=yes], [HAVE_LIBARMADITO_PDF=no])
 87 | 	if test "$HAVE_LIBARMADITO_PDF" = "no"; then
 88 | 		AC_MSG_ERROR("cannot find ARMADITO-PDF library: skipping armadito-pdf tool build.")
 89 | 	fi
 90 | fi
 91 | 
 92 | 
 93 | # Output files
 94 | AC_CONFIG_FILES([
 95 | Makefile
 96 | ])
 97 | 
 98 | # conditional makefile for library.
 99 | AM_COND_IF([COND_LIBRARY],
100 | 	AC_CONFIG_FILES([
101 | 	lib/Makefile
102 | ]))
103 | 
104 | # conditional makefile for tools.
105 | AM_COND_IF([COND_TOOLS],
106 | 	AC_CONFIG_FILES([
107 | 	tools/Makefile
108 | 	tools/armadito-pdf/Makefile
109 | ]))
110 | 
111 | # conditional makefile for module.
112 | AM_COND_IF([COND_MODULE],
113 | 	AC_CONFIG_FILES([
114 | 	module/Makefile
115 | ]))
116 | 
117 | AC_OUTPUT
118 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: C
 2 | sudo: required
 3 | dist: trusty
 4 | addons:
 5 |   apt:
 6 |     packages:
 7 |     - libtool
 8 |     - libglib2.0-dev
 9 |     - libmagic-dev
10 |     - libxml2-dev
11 |     - libmicrohttpd-dev
12 |     - libcurl4-openssl-dev
13 |     - libjansson-dev
14 |     - libjansson4
15 |     - autoconf-archive
16 |   coverity_scan:
17 |     project:
18 |       name: armadito/armadito-mod-pdf
19 |       description: Build submitted via Travis CI
20 |     notification_email: vhamon@teclib.com
21 |     build_command_prepend: "./configure --enable-lib PKG_CONFIG_PATH=/home/travis/build/armadito-core/lib/pkgconfig;
22 |       make clean"
23 |     build_command: make -j 4
24 |     branch_pattern: coverity_scan
25 |   sonarqube:
26 |     branches :
27 |       - DEV
28 | deploy:
29 |   skip_cleanup: true
30 |   provider: launchpad
31 |   slug: "~armadito/armadito-mod-pdf/+git/main"
32 |   oauth_token:
33 |     secure: d8X9y9hEPB0zgibcvVL2Q2qrDoM3o2/Wh28bZcD/A4TH5pgDK9oYbgtU5ywozK/CScpHEHmRTjZhwq9Mh4xVNni7XIKigpHbAKH0NhR6wKryQwR3VwUgRBKxp/UAnWFSv9RiPT7fG5tOQpHaD+8O8N06vLXZ0p4xV29jwnxPEvYEUt0fKmo4SWI18HQLpoFCuQLchgzvPs1X+iixWms5BmBlDIhnwx79geaMLF6sCc9f4TcvJ8yT9s4VbH/qYhsbCHB8lkRiBY1qQqw4sN67gQhJ9oY5J/AbGMJSdb4nimMhhUfkiME8iQVOx07YOaQJ6pMz5VYWpF7dOLlXdvEgc5caVs/gENmpb270aQHUSILn2A+3NI+LnSW3R3dkrtLU+HX9zVtMvnus/8T1f5KSecLvH+mRU4J6RTl9+MEDYA6sD59Ie2sBPqgljymgq8DJ8yoVqBzPzVCk95n5KTu21xobHgqXn4QfB/vGqQbE3U9XBFloALzsd4kMp+W37JtztVgaTlG+YNslCzWUP+f8pO6wHaRzJeVHC9/h24b5fk87TtvwTtVrxinNFXzj4i42A+6zlsW8BZXJ9I8jgaNhKGiWpHUy/iH9EADo2Y2rtYd09adAelEsIIJ4X+/N2+QfRjnDMGUV+8v3xUbLayQT3nb2G05X8YA19NlDoXZATYU=
34 |   oauth_token_secret:
35 |     secure: MnRk4gTNMwpSUodIwrEvdPg5mrufFjodZElpWomHg7gNBOfp/rsbzk4MEeu8sqxq1s0lUQUPzw4uiMy7tKy0x3Gn4XAhFMW9WDOtanDQV3qfPiFxlxHpfg2p+Tu5Ol6TnSYRI0l/vC3rN/pEJPOJoWJu6md6LEXX9U5Itub+rcmiMyygz11bMY46jY+vSZhLcT1LmgK6sojyCdIBFOjd+ACpvK+Lu/6Kj/QYhz3txOjMbsEgz+L2giSgs3j9zmpbg4zPFCL2Cq04muXVr3mOW1fmAXWop6DEs0rc9ovkqJwKaNcPWc1Inm5QDaAcR5ckeY/Oy4L/mHe2zv3/d7RpubrdudhFJzYogsHY2r7kGtQYNCfXaj5QHVRxCnvHhvMCiBI4lrqFZEMZii9eXSQJZJk32Cz20RiTvHeJwQsuukizfuHIzCNLwbb4aumR6m3uAL2U3QJE7j2ZfgkSF1boOvCQplCRiBL6kexNLHvTlVGHbt8a+Eptp0KE+7lKXk9o7CONWxLaCVGOe+2zTms7b5/4FYLD4gajoBRUmaTe0KZVlvvKaoQanj1+ay9SA7pIlBMHoTmnoHHZVu8l2lOERXd4VM1bE+zWH5MULy76cSIhuerwJg5rPUbizVpFsAVrk346gUKcGzWkibY2uSvUyVEoef2jiX62uJ9FUmVUgFo=
36 |   on:
37 |     tags: true
38 | env:
39 |   global:
40 |   - secure: ZqqOmdvcZ/tUhR87wlAhnDeFgkCywwYlAchKMIIGJPjGpsMEZU/m+kKi49YJEZrC6i7QO/WexCi3TVIFWaIGtWXCyri0rJIIDPdJtiJ3SNvDhwjcB+eNOsNVnv3Hg8OllZV3+GvoKNKNrwYtYraill4C3+8PeDLLpOI6wQmNJWUsLMRHlSXM6ZDS9bxILrNHFobvTdtuM9wkfwh0JfoIWMdYnzmqrRrhT+bEZaDHMHp7GTSiFBL2lvqlutMWMn8fvTLn35M8839WyFPcr/lnFZZZcSKJBqM8GySq1yNr9NNAoUUhhmP0Oope8iks59mkDQRyRbVpkOYQGkJChhGt2/LQE5W0WjpmWS+5cAqYFlqJKs2AFPNI9a94HI/XZjO0RwQqkpbNHc06wbPIYf0h1aPi75xYsC07NhdwvS40NYou82IjnpffdRDrqNcVP4yBASqaQlaDCrr35nPGm993vPCVa7i0NxrqWmtf5KFo/Nk6Lxh3nWFLABLZI8DEIogOCnVF2W+Q9LaSmbUcmsovV2XNdLNYoKETVlZp5F0h7mXjyYsRN8dexPZb8DBYNWCI5PbJS4gf98JjDRxKVuIbWqRg8W/KbMdU78d4aetznVeLkh7SnVyFtR4ImYEod1xFVdulh1F09VAfLBk/J6HTs84uqdW65w1TKHAtZyO/77Q=
41 |   - secure: eFWWkuzR9vYPgXPz1f0JyZQFFdipAu5b5fOKXeDyzlb27DG0jQYUeRou6xdtXvH9jKgAtRxwa0IdtV+2cEW+deTr/bxvWlGGLHHnQSgtgvq/BgbDptuTsSVWfRhuNJogNGB5BaMd33lyX5nKrH6Jbz8+qp8DnE2l96kThSbByyamVG/7Q9sXAHEfCUI4UdRSuGijB18ezzzxloDuA3pZPaa6laDXxXSCVRyaY+qREL6qwhWK9LwNdqtSuqcFs5ppIYhz3c0KCMMKb86cZrtJTXolSK6Oe6sjXy+Pj+E1D895jZF2k1YDQ7YLFNhYyFClC85l2NvJBitr5l7FUHhQGsuLfFk3z2yMkJE9UbDOog98EjiRH/VRICReFrEN78D8k8JSziyIXB0FGHoIKEUXV3n9Og++AkFF4qdwxPC/VGGComZBAGC7VMEg67AIuG2vKoCgl7GHvPu/lW0cCZUE0wt+RZOXB6mJv6Fr9qmXWibarv/wA7gItDj1PP8pgxSWn69b4U8aNwoYHNYyBldqK7YRevqwAnaFfohMUmYPJnE80pWLDe2i/4T+Wl8XhuOubG9k3uncXZ6uB4mzVooITYiAzTzesqkU4ix62bm3C0g6h0XawQjzHbGpK0Je2AX7DMlCLyjef+R/4Cj6hP2rsK1IrJbh5McwoSTErydzQU8=
42 | before_install:  # add the Coverity Scan certificate to the system CA bundle, then fetch the Sonar build wrapper
43 | - echo -n | openssl s_client -connect scan.coverity.com:443 | sed -ne '/-BEGIN CERTIFICATE-/,/-END
44 |   CERTIFICATE-/p' | sudo tee -a /etc/ssl/certs/ca-certificates.crt
45 | - wget https://sonarqube.com/static/cpp/build-wrapper-linux-x86.zip
46 | - unzip build-wrapper-linux-x86.zip
47 | - export PATH=$PWD/build-wrapper-linux-x86:$PATH
48 | compiler:
49 | - gcc
50 | before_script:  # build and install armadito-av first, then generate this project's build system
51 | - git clone https://github.com/armadito/armadito-av -b DEV
52 | - cd armadito-av/
53 | - "./autogen.sh"
54 | - mkdir -p /home/travis/build/armadito-core
55 | - "./configure --prefix=/home/travis/build/armadito-core"
56 | - make
57 | - make install
58 | - cd /home/travis/build/armadito/armadito-mod-pdf
59 | - "./autogen.sh"
60 | script:
61 | - "./configure --enable-lib PKG_CONFIG_PATH=/home/travis/build/armadito-core/lib/pkgconfig"
62 | - build-wrapper-linux-x86-64 --out-dir build-wrapper-out make clean all
63 | - sonar-scanner -X -Dsonar.host.url=https://sonarqube.com -Dsonar.login=$SONAR_TOKEN
64 | 


--------------------------------------------------------------------------------
/lib/src/osdeps.c:
--------------------------------------------------------------------------------
  1 | /***
  2 | 
  3 | Copyright (C) 2015, 2016 Teclib'
  4 | 
  5 | This file is part of Armadito module PDF.
  6 | 
  7 | Armadito module PDF is free software: you can redistribute it and/or modify
  8 | it under the terms of the GNU General Public License as published by
  9 | the Free Software Foundation, either version 3 of the License, or
 10 | (at your option) any later version.
 11 | 
 12 | Armadito module PDF is distributed in the hope that it will be useful,
 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 15 | GNU General Public License for more details.
 16 | 
 17 | You should have received a copy of the GNU General Public License
 18 | along with Armadito module PDF.  If not, see <http://www.gnu.org/licenses/>.
 19 | 
 20 | ***/
 21 | 
 22 | 
 23 | 
 24 | #include "osdeps.h"
 25 | #include <string.h>
 26 | #include "log.h"
 27 | 
 28 | 
 29 | #ifdef _WIN32
 30 | 
 31 | /* Windows wrapper around fopen_s(): returns the stream it stored, or the initial NULL if it stored none. */
 32 | FILE * os_fopen(const char * filename, const char * mode)
 33 | {
 34 | 	FILE * stream = NULL;
 35 | 
 36 | 	/* fopen_s() hands the stream back through its first argument; its status code is not inspected. */
 37 | 	fopen_s(&stream, filename, mode);
 38 | 
 39 | 	return stream;
 40 | }
 41 | 
 42 | 
 43 | /*
 44 |  * Windows implementation: walks the entries of directory `path` and calls dirent_cb(fd, entryPath)
 45 |  * for each of them; sub-directories are scanned recursively when recurse >= 1.
 46 |  * Returns 0 on success, -1 on invalid parameter or allocation failure, -2 if `path` is not an
 47 |  * existing directory, or the non-zero value returned by a failed recursive scan.
 48 |  * `data` is only forwarded to recursive calls; the callback always receives fd = -1.
 49 |  */
 50 | int os_scan_dir(char * path, int recurse, dirent_scan_cb dirent_cb, void * data){
 51 | 	char * rpath = NULL, *entryPath = NULL;
 52 | 	int ret = 0;
 53 | 	size_t size = 0;
 54 | 	DWORD attrs = 0;
 55 | 	HANDLE fh = INVALID_HANDLE_VALUE;
 56 | 	WIN32_FIND_DATAA fdata;
 57 | 	int fd = -1;
 58 | 
 59 | 	if (path == NULL || dirent_cb == NULL){
 60 | 		err_log("scan_dir :: invalid parameter\n");
 61 | 		return -1;
 62 | 	}
 63 | 
 64 | 	dbg_log("scan_dir :: path = %s\n", path);
 65 | 
 66 | 	// Check if it is a directory // TODO :: os_scan_dir :: scan a file.
 67 | 	// INVALID_FILE_ATTRIBUTES has every bit set, so it must be ruled out before testing the flag.
 68 | 	attrs = GetFileAttributesA(path);
 69 | 	if (attrs == INVALID_FILE_ATTRIBUTES || !(attrs & FILE_ATTRIBUTE_DIRECTORY)) {
 70 | 		err_log("scan_dir :: (%s) is not a directory\n", path);
 71 | 		return -2;
 72 | 	}
 73 | 
 74 | 	size = strlen(path) + 3; // path + "\\*" + terminating NUL
 75 | 	rpath = (char*)calloc(size + 1, sizeof(char));
 76 | 	if (rpath == NULL){
 77 | 		err_log("scan_dir :: memory allocation failed\n");
 78 | 		return -1;
 79 | 	}
 80 | 	sprintf_s(rpath, size, "%s\\*", path);
 81 | 	dbg_log("scan_dir :: rpath = %s\n",rpath);
 82 | 
 83 | 	// The ANSI variants are called explicitly so that they match WIN32_FIND_DATAA in UNICODE builds too.
 84 | 	fh = FindFirstFileA(rpath, &fdata);
 85 | 	if (fh == INVALID_HANDLE_VALUE) {
 86 | 		warn_log("scan_dir :: FindFirstFileA call failed :: err= [%d]\n", GetLastError());
 87 | 		goto clean;
 88 | 	}
 89 | 
 90 | 	// do/while: the entry returned by FindFirstFileA itself must be processed as well.
 91 | 	do {
 92 | 		// exclude paths "." and ".."
 93 | 		if (!strcmp(fdata.cFileName, ".") || !strcmp(fdata.cFileName, ".."))
 94 | 			continue;
 95 | 
 96 | 		// build the entry complete path: path + '\\' + name + terminating NUL.
 97 | 		size = strlen(path) + strlen(fdata.cFileName) + 2;
 98 | 		entryPath = (char*)calloc(size + 1, sizeof(char));
 99 | 		if (entryPath == NULL){
100 | 			err_log("scan_dir :: memory allocation failed\n");
101 | 			ret = -1;
102 | 			break;
103 | 		}
104 | 		sprintf_s(entryPath, size, "%s\\%s", path, fdata.cFileName);
105 | 		dbg_log("scan_dir :: cfilename = %s\n", fdata.cFileName);
106 | 		dbg_log("scan_dir :: entryPath = %s\n", entryPath);
107 | 
108 | 		// If it is a directory and we do recursive scan (attributes come from the find data itself)
109 | 		if ((fdata.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) && recurse >= 1) {
110 | 			ret = os_scan_dir(entryPath, recurse, dirent_cb, data);
111 | 		}
112 | 		else {
113 | 			(*dirent_cb)(fd,entryPath);
114 | 		}
115 | 
116 | 		free(entryPath);
117 | 		entryPath = NULL;
118 | 		if (ret != 0)
119 | 			break; // a failed recursive scan stops the walk, as before
120 | 	} while (FindNextFileA(fh, &fdata) != FALSE);
121 | 
122 | 	// TODO :: os_scan_dir :: get stats.
123 | 
124 | clean:
125 | 	free(rpath);
126 | 	if (fh != INVALID_HANDLE_VALUE)
127 | 		FindClose(fh); // only close a search handle that was actually opened
128 | 
129 | 	return ret;
130 | }
131 | 
132 | 
133 | 
134 | #else
135 | 
136 | // Linux part for compatibility: bounded helpers that honour numberOfElements (capacity of strDest)
137 | // and always NUL-terminate. They return 0 on success and -1 on a NULL argument or when the result
138 | // would not fit, in which case strDest is left untouched.
139 | int os_strncat(char *strDest, size_t numberOfElements, const char *strSource, size_t count) {
140 | 	size_t used = 0, n = 0;
141 | 	if (strDest == NULL || strSource == NULL) return -1;
142 | 	while (used < numberOfElements && strDest[used] != '\0') used++;   // current length of strDest
143 | 	while (n < count && strSource[n] != '\0') n++;                     // characters to append (at most count)
144 | 	if (used >= numberOfElements || n >= numberOfElements - used) return -1; // unterminated dest / no room
145 | 	memcpy(strDest + used, strSource, n);
146 | 	strDest[used + n] = '\0';
147 | 	return 0;
148 | }
149 | 
150 | int os_strncpy(char *strDest, size_t numberOfElements, const char *strSource, size_t count) {
151 | 	size_t n = 0;
152 | 	if (strDest == NULL || strSource == NULL) return -1;
153 | 	while (n < count && strSource[n] != '\0') n++;   // characters to copy (at most count)
154 | 	if (n >= numberOfElements) return -1;            // no room for n characters plus the terminator
155 | 	memcpy(strDest, strSource, n);
156 | 	strDest[n] = '\0';   // plain strncpy() leaves strDest unterminated when the source fills count
157 | 	return 0;
158 | }
158 | 
159 | 
160 | int os_scan_dir(char * path, int recurse, dirent_scan_cb dirent_cb, void * data){
161 | 
162 | 	int ret = 0;
163 | 
164 | 	return ret;
165 | }
166 | 	
167 | 
168 | #endif


--------------------------------------------------------------------------------
/tools/cli_analyzer/main.c:
--------------------------------------------------------------------------------
  1 | /***
  2 | 
  3 | Copyright (C) 2015, 2016 Teclib'
  4 | 
  5 | This file is part of Armadito module PDF.
  6 | 
  7 | Armadito module PDF is free software: you can redistribute it and/or modify
  8 | it under the terms of the GNU General Public License as published by
  9 | the Free Software Foundation, either version 3 of the License, or
 10 | (at your option) any later version.
 11 | 
 12 | Armadito module PDF is distributed in the hope that it will be useful,
 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 15 | GNU General Public License for more details.
 16 | 
 17 | You should have received a copy of the GNU General Public License
 18 | along with Armadito module PDF.  If not, see <http://www.gnu.org/licenses/>.
 19 | 
 20 | ***/
 21 | 
 22 | 
 23 |     
 24 | #include "armaditopdf.h"
 25 | #include "osdeps.h"
 26 | #include "log.h"
 27 | #include "getopt.h"
 28 | 
 29 | 
/* Command-line settings filled by parse_options() and consumed by process_opts(). */
struct scan_options {
	char *path_to_scan;	/* strdup()'d copy of the first non-option argument; freed by main() */
	enum log_level log_level;	/* level selected with -l / --log-level; main() presets it to -1 */
};
 34 | 
 35 | 
/* Long options handed to getopt_long(); the last field is the value returned for each of them. */
static struct option cli_option_def[] = {
	{"help",        no_argument, 0, 'h'},
	{"version",     no_argument, 0, 'v'},
	{"log-level",   required_argument, 0, 'l'},
	{0, 0, 0, 0}	/* all-zero entry terminates the table */
};
 42 | 
 43 | 
 44 | void Usage(){
 45 | 
 46 | 	fprintf(stderr, "usage: armadito-pdf [options] FILE|DIR\n");
 47 | 	fprintf(stderr, "\n");
 48 | 	fprintf(stderr, "Armadito PDF scanner\n");
 49 | 	fprintf(stderr, "\n");
 50 | 	fprintf(stderr, "Options:\n");
 51 | 	fprintf(stderr, "  --help  -h                    print help and quit\n");
 52 | 	fprintf(stderr, "  --version -V                  print program version\n");
 53 | 	fprintf(stderr, "  --log-level=LEVEL | -l LEVEL  set log level [debug=X; warning=Y; error=Z]\n");
 54 | 	fprintf(stderr, "\n");
 55 | 
 56 | 	exit(-1);
 57 | 
 58 | }
 59 | 
 60 | void Version(){
 61 | 
 62 | 	printf("armadito-pdf v%s (c) 2015 - 2017 by Teclib\n",a6o_pdf_ver);
 63 | 	exit(1);
 64 | }
 65 | 
 66 | 
 67 | int parse_options(int argc, char ** argv, struct scan_options * opts){
 68 | 
 69 | 	while(1){
 70 | 
 71 | 		int c, option_index = 0;
 72 | 
 73 | 		c = getopt_long (argc, argv, "hvil:", cli_option_def, &option_index);
 74 | 		
 75 | 		/* Detect the end of the options. */
 76 | 		if (c == -1){
 77 | 			break;
 78 | 		}
 79 | 
 80 | 		switch(c){
 81 | 
 82 | 			case 'h':
 83 | 				Usage();
 84 | 			break;
 85 | 
 86 | 			case 'v':
 87 | 				Version();
 88 | 			break;
 89 | 
 90 | 			case 'l':
 91 | 				
 92 | 				if(!strcmp("error",optarg))
 93 | 					opts->log_level = LOG_LEVEL_ERROR;
 94 | 				else if(!strcmp("warn",optarg))
 95 | 					opts->log_level = LOG_LEVEL_WARNING;
 96 | 				else if(!strcmp("info",optarg))
 97 | 					opts->log_level = LOG_LEVEL_INFO;
 98 | 				else if(!strcmp("debug",optarg))
 99 | 					opts->log_level = LOG_LEVEL_DEBUG;
100 | 				else if(!strcmp("none",optarg))
101 | 					opts->log_level = LOG_LEVEL_NONE;
102 | 				else{
103 | 					fprintf(stderr, "Option Error: Bad log level value\n");
104 | 					Usage();
105 | 					abort();
106 | 				}
107 | 			break;
108 | 
109 | 			default:
110 | 				abort();
111 | 			break;
112 | 		}	
113 | 
114 | 	}
115 | 
116 | 	if (optind < argc){
117 | 
118 | 		opts->path_to_scan = strdup(argv[optind]);
119 | 
120 | 	}else{
121 | 		fprintf(stderr, "Argument Error: Missing file or directory path\n");
122 | 		Usage();
123 | 	}
124 | 
125 | 	return 0;
126 | }
127 | 
128 | 
129 | // Launch a scan directory
130 | int do_scan(struct scan_options * opts){
131 | 
132 | 	int ret;
133 | 	FILE * f = NULL;
134 | 	int fd = -1;
135 | 
136 | 	// analysis with opened file descriptor.	
137 | 	if(!(f = os_fopen(opts->path_to_scan,"rb"))){
138 | 		err_log("Can't open file %s\n", opts->path_to_scan);
139 | 		return -1;
140 | 	}
141 | 
142 | 	fd = os_fileno(f);
143 | 	ret = analyzePDF_ex(fd, opts->path_to_scan);
144 | 	fclose(f);
145 | 
146 | 	return ret;
147 | }
148 | 
149 | 
150 | // launch a task according to options and parameters.
151 | int process_opts(struct scan_options * opts){
152 | 
153 | 	
154 | 	if(opts == NULL || opts->path_to_scan == NULL){
155 | 		return -1;
156 | 	}
157 | 
158 | 	// Set log level
159 | 	if(opts->log_level > 0)
160 | 		set_current_log_level(opts->log_level);
161 | 
162 | 	return do_scan(opts);
163 | 
164 | }
165 | 
166 | 
167 | int main (int argc, char ** argv){
168 | 
169 | 	int ret = 0;
170 | 	struct scan_options * opts = NULL;
171 | 
172 | 	if( !(opts = (struct scan_options*)calloc(1,sizeof(struct scan_options)))){
173 | 		err_log("Memory allocation failed!\n");
174 | 		return -1;
175 | 	}
176 | 
177 | 	opts->log_level = -1;
178 | 	opts->path_to_scan = NULL;
179 | 
180 | 	parse_options(argc,argv,opts);
181 | 
182 | 	ret = process_opts(opts);
183 | 
184 | 	if(opts->path_to_scan != NULL){
185 | 		free(opts->path_to_scan);
186 | 		opts->path_to_scan = NULL;
187 | 	}
188 | 
189 | 	free(opts);
190 | 	opts = NULL;
191 | 
192 | 	return ret;
193 | 	
194 | }


--------------------------------------------------------------------------------
/tools/cli_parser/parser.c:
--------------------------------------------------------------------------------
  1 | /***
  2 | 
  3 | Copyright (C) 2015, 2016 Teclib'
  4 | 
  5 | This file is part of Armadito module PDF.
  6 | 
  7 | Armadito module PDF is free software: you can redistribute it and/or modify
  8 | it under the terms of the GNU General Public License as published by
  9 | the Free Software Foundation, either version 3 of the License, or
 10 | (at your option) any later version.
 11 | 
 12 | Armadito module PDF is distributed in the hope that it will be useful,
 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 15 | GNU General Public License for more details.
 16 | 
 17 | You should have received a copy of the GNU General Public License
 18 | along with Armadito module PDF.  If not, see <http://www.gnu.org/licenses/>.
 19 | 
 20 | ***/
 21 | 
 22 | 
 23 |     
 24 | #include "armaditopdf.h"
 25 | 
 26 | 
 27 | 
 28 | void Banner(){
 29 | 	
 30 | 	printf("----------------------------\n");
 31 | 	printf("-- ARMADITO PDF ANALYZER  --\n");
 32 | 	printf("----------------------------\n\n");
 33 | 
 34 | 	return;
 35 | }
 36 | 
 37 | 
 38 | void Helper(){
 39 | 	
 40 | 	printf("ARMADITO PDF ANALYZER :: No file in parameter\n");
 41 | 	printf("Command : ./pdfAnalyzer [filename]\n\n");
 42 | 
 43 | 	return;
 44 | }
 45 | 
 46 | void Commands(){
 47 | 
 48 | 	Banner();
 49 | 	
 50 | 	printf("Commands list:\n");
 51 | 	printf("- avscan :: launch a complete analysis and display report\n");
 52 | 	printf("- decode [obj_ref] :: decode object stream\n");
 53 | 	printf("- dump [obj_ref] :: dump object stream\n");
 54 | 	printf("- exit :: exit the parser.\n");
 55 | 	printf("- object [obj_ref] :: display object infos\n");
 56 | 	printf("- quit :: exit the parser.\n");
 57 | 	printf("\n");
 58 | 	printf("objects actions:\n");
 59 | 	printf("- decode [obj_ref] :: decode object stream\n");
 60 | 	printf("- object [obj_ref] :: display object infos\n");
 61 | 	printf("\n");
 62 | 
 63 | 	return;
 64 | }
 65 | 
 66 | 
 67 | 
 68 | 
 69 | int main (int argc, char ** argv){
 70 | 
 71 | 	int ret;
 72 | 	FILE * f = NULL;
 73 | 	int fd = -1;
 74 | 	struct pdfDocument * pdf = NULL;
 75 | 	struct pdfObject * obj = NULL;
 76 | 	char cmd[512] = {0};
 77 | 	char params[512] = {0};	
 78 | 
 79 | 	
 80 | 	#ifdef DEBUG
 81 | 	Banner();
 82 | 	#endif
 83 | 
 84 | 	if(argc < 2){
 85 | 		Helper();
 86 | 		return (-1);
 87 | 	}
 88 | 	
 89 | 	//printf ("Analyzing file : %s\n",argv[1]);
 90 | 	if(!(f = os_fopen(argv[1],"rb"))){
 91 | 		printf("[-] Error :: main :: Error while opening file %s\n",argv[1]);
 92 | 		return -1;
 93 | 	}
 94 | 	
 95 | 
 96 | 	// Initialize the pdfDocument struct
 97 | 	if(!(pdf = initPDFDocument())){
 98 | 		printf("[-] Error :: analyzePDF :: Error while allocating memory for pdfDocument structure\n");
 99 | 		fclose(f);
100 | 		return -1;
101 | 	}
102 | 	pdf->fh = f;
103 | 
104 | 	if ( parsePDF(pdf)< 0){
105 | 		printf("[-] Error :: parsing error\n");
106 | 		return -2;
107 | 	}
108 | 	
109 | 	
110 | 
111 | 	while(1){
112 | 
113 | 		printf("enter a command:\nUHPDF>");
114 | 		scanf("%s",&cmd);
115 | 		//scanf("%[^\t\r\n]",&cmd);
116 | 		
117 | 		if(strncmp(cmd,"quit",4) == 0 || strncmp(cmd,"exit",4) == 0 ){
118 | 
119 | 			break;
120 | 
121 | 		}else if(strncmp(cmd,"help",4) == 0){
122 | 
123 | 			Commands();
124 | 
125 | 		}else if(strncmp(cmd,"avscan",6) == 0){
126 | 
127 | 			printf("[TODO] :: av scan\n");
128 | 
129 | 		}else if(strncmp(cmd,"info",4) == 0){
130 | 
131 | 			printf("[TODO] :: display infos.\n");
132 | 
133 | 		}else if(strncmp(cmd,"obj",3) == 0){
134 | 
135 | 			//printf("[TODO] :: display object. %s \n",params);
136 | 			printf("Enter an object reference: UHPDF>");
137 | 			scanf("%10s",params);
138 | 			//printf("params = %s\n",params );
139 | 			sprintf(ref, "%s 0 obj",params );
140 | 			printf("object = %s\n",ref );
141 | 			
142 | 			//printf("Decoding object :: %s\n","83 0 obj");
143 | 			obj = getPDFObjectByRef(pdf,ref);
144 | 
145 | 			if(obj == NULL){
146 | 				printf("[-] Error :: Object [%s] not found!\n",ref);				
147 | 				continue;
148 | 			}			
149 | 
150 | 			printf("Display object :: %s\n","83 0 obj");
151 | 
152 | 			printObject(obj);
153 | 			
154 | 
155 | 
156 | 		}else if(strncmp(cmd,"decode",6) == 0){
157 | 
158 | 			printf("[TODO] :: display object.\n");
159 | 
160 | 			printf("Enter object reference: UHPDF>");
161 | 			scanf("%s",params);
162 | 			//printf("params = %s\n",params );
163 | 			sprintf(ref, "%s 0 obj",params );
164 | 			printf("object = %s\n",ref );
165 | 
166 | 			obj = getPDFObjectByRef(pdf,ref);
167 | 
168 | 			if(obj == NULL){
169 | 				printf("[-] Error :: Object [%s] not found!\n",ref);				
170 | 				continue;
171 | 			}
172 | 			decodeObjectStream(obj);			
173 | 
174 | 
175 | 		}
176 | 		else{
177 | 			printf("Command [%s] not found. See Help (command: help)!\n",cmd);
178 | 		}
179 | 
180 | 
181 | 
182 | 	}
183 | 
184 | 	//fclose(f);
185 | 	freePDFDocumentStruct(pdf);
186 | 
187 | 	//system("pause");
188 | 	
189 | 	return ret;
190 | }


--------------------------------------------------------------------------------
/tools/armadito-pdf/main.c:
--------------------------------------------------------------------------------
  1 | /***
  2 | 
  3 | Copyright (C) 2015 - 2017 Teclib'
  4 | 
  5 | This file is part of Armadito module PDF.
  6 | 
  7 | Armadito module PDF is free software: you can redistribute it and/or modify
  8 | it under the terms of the GNU General Public License as published by
  9 | the Free Software Foundation, either version 3 of the License, or
 10 | (at your option) any later version.
 11 | 
 12 | Armadito module PDF is distributed in the hope that it will be useful,
 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 15 | GNU General Public License for more details.
 16 | 
 17 | You should have received a copy of the GNU General Public License
 18 | along with Armadito module PDF.  If not, see <http://www.gnu.org/licenses/>.
 19 | 
 20 | ***/
 21 | 
 22 | 
 23 | #include "libarmadito-pdf/armaditopdf.h"
 24 | #include "libarmadito-pdf/osdeps.h"
 25 | #include "libarmadito-pdf/log.h"
 26 | #include "getopt.h"
 27 | 
 28 | #define TOOL_VERSION "0.13.2"
 29 | 
 30 | 
/* Settings gathered from the command line by parse_options() and read by process_opts(). */
struct scan_options {
	char *path_to_scan;	/* heap copy (strdup) of the first non-option argument; released by main() */
	enum log_level log_level;	/* level given with -l / --log-level; initialised to -1 by main() */
};
 35 | 
 36 | 
/* Long-option table for getopt_long(); the fourth field is the value returned when the option is seen. */
static struct option cli_option_def[] = {
	{"help",        no_argument, 0, 'h'},
	{"version",     no_argument, 0, 'v'},
	{"log-level",   required_argument, 0, 'l'},
	{0, 0, 0, 0}	/* all-zero entry marks the end of the table */
};
 43 | 
 44 | 
 45 | void Usage(){
 46 | 
 47 | 	fprintf(stderr, "usage: armadito-pdf [options] FILE\n");
 48 | 	fprintf(stderr, "\n");
 49 | 	fprintf(stderr, "Armadito PDF scanner\n");
 50 | 	fprintf(stderr, "\n");
 51 | 	fprintf(stderr, "Options:\n");
 52 | 	fprintf(stderr, "  --help  -h                    print help and quit\n");
 53 | 	fprintf(stderr, "  --version -V                  print program version\n");
 54 | 	fprintf(stderr, "  --log-level=LEVEL | -l LEVEL  set log level [error, warn, info, debug, none]\n");
 55 | 	fprintf(stderr, "\n");
 56 | 
 57 | 	exit(-1);
 58 | 
 59 | }
 60 | 
 61 | void Version(){
 62 | 
 63 | 	printf("armadito-pdf v%s (using libarmadito-pdf v%s) \nCopyright (C) 2015 - 2017 by Teclib.\n",TOOL_VERSION,getVersion());
 64 | 	exit(1);
 65 | }
 66 | 
 67 | 
 68 | int parse_options(int argc, char ** argv, struct scan_options * opts){
 69 | 
 70 | 	while(1){
 71 | 
 72 | 		int c, option_index = 0;
 73 | 
 74 | 		c = getopt_long (argc, argv, "hvil:", cli_option_def, &option_index);
 75 | 		
 76 | 		/* Detect the end of the options. */
 77 | 		if (c == -1){
 78 | 			break;
 79 | 		}
 80 | 
 81 | 		switch(c){
 82 | 
 83 | 			case 'h':
 84 | 				Usage();
 85 | 			break;
 86 | 
 87 | 			case 'v':
 88 | 				Version();
 89 | 			break;
 90 | 
 91 | 			case 'l':
 92 | 				
 93 | 				if(!strcmp("error",optarg))
 94 | 					opts->log_level = LOG_LEVEL_ERROR;
 95 | 				else if(!strcmp("warn",optarg))
 96 | 					opts->log_level = LOG_LEVEL_WARNING;
 97 | 				else if(!strcmp("info",optarg))
 98 | 					opts->log_level = LOG_LEVEL_INFO;
 99 | 				else if(!strcmp("debug",optarg))
100 | 					opts->log_level = LOG_LEVEL_DEBUG;
101 | 				else if(!strcmp("none",optarg))
102 | 					opts->log_level = LOG_LEVEL_NONE;
103 | 				else{
104 | 					fprintf(stderr, "Option Error: Bad log level value\n");
105 | 					Usage();
106 | 					abort();
107 | 				}
108 | 			break;
109 | 
110 | 			default:
111 | 				abort();
112 | 			break;
113 | 		}	
114 | 
115 | 	}
116 | 
117 | 	if (optind < argc){
118 | 
119 | 		opts->path_to_scan = strdup(argv[optind]);
120 | 
121 | 	}else{
122 | 		fprintf(stderr, "Argument Error: Missing file or directory path\n");
123 | 		Usage();
124 | 	}
125 | 
126 | 	return 0;
127 | }
128 | 
129 | 
130 | // Launch a scan directory
131 | int do_scan(struct scan_options * opts){
132 | 
133 | 	int ret;
134 | 	FILE * f = NULL;
135 | 	int fd = -1;
136 | 
137 | 	// analysis with opened file descriptor.	
138 | 	if(!(f = os_fopen(opts->path_to_scan,"rb"))){
139 | 		err_log("Can't open file %s\n", opts->path_to_scan);
140 | 		return -1;
141 | 	}
142 | 
143 | 	fd = os_fileno(f);
144 | 	ret = analyzePDF_ex(fd, opts->path_to_scan);
145 | 	fclose(f);
146 | 
147 | 	return ret;
148 | }
149 | 
150 | 
151 | // launch a task according to options and parameters.
152 | int process_opts(struct scan_options * opts){
153 | 
154 | 	
155 | 	if(opts == NULL || opts->path_to_scan == NULL){
156 | 		return -1;
157 | 	}
158 | 
159 | 	// Set log level
160 | 	if(opts->log_level > 0)
161 | 		set_current_log_level(opts->log_level);
162 | 
163 | 	return do_scan(opts);
164 | 
165 | }
166 | 
167 | 
168 | int main (int argc, char ** argv){
169 | 
170 | 	int ret = 0;
171 | 	struct scan_options * opts = NULL;
172 | 
173 | 	if( !(opts = (struct scan_options*)calloc(1,sizeof(struct scan_options)))){
174 | 		err_log("Memory allocation failed!\n");
175 | 		return -1;
176 | 	}
177 | 
178 | 	opts->log_level = -1;
179 | 	opts->path_to_scan = NULL;
180 | 
181 | 	parse_options(argc,argv,opts);
182 | 
183 | 	ret = process_opts(opts);
184 | 
185 | 	if(opts->path_to_scan != NULL){
186 | 		free(opts->path_to_scan);
187 | 		opts->path_to_scan = NULL;
188 | 	}
189 | 
190 | 	free(opts);
191 | 	opts = NULL;
192 | 
193 | 	return ret;
194 | 	
195 | }


--------------------------------------------------------------------------------
/lib/includes/pdfStructs.h:
--------------------------------------------------------------------------------
  1 | /***
  2 | 
  3 | Copyright (C) 2015, 2016 Teclib'
  4 | 
  5 | This file is part of Armadito module PDF.
  6 | 
  7 | Armadito module PDF is free software: you can redistribute it and/or modify
  8 | it under the terms of the GNU General Public License as published by
  9 | the Free Software Foundation, either version 3 of the License, or
 10 | (at your option) any later version.
 11 | 
 12 | Armadito module PDF is distributed in the hope that it will be useful,
 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 15 | GNU General Public License for more details.
 16 | 
 17 | You should have received a copy of the GNU General Public License
 18 | along with Armadito module PDF.  If not, see <http://www.gnu.org/licenses/>.
 19 | 
 20 | ***/
 21 | 
 22 | 
 23 | 
 24 | #ifndef _pdf_Structs_h_
 25 | #define _pdf_Structs_h_
 26 | 
 27 | 
 28 | #include <stdio.h>
 29 | #include <stdlib.h>
 30 | #include <string.h>
 31 | 
 32 | 
// PDF object structure: one object of the document, chained to the next one through `next`.
struct pdfObject{
	
	char * reference; // reference of the object, e.g. "12 0 obj"
	char * content; // the content of the object (obj...endobj)
	char * dico;	// the dictionary (if any)
	char * type;	// the type of the object (if any)
	char * stream;	// the content stream (stream...endstream)
	char * filters;	// NOTE(review): presumably the filters applied to the stream - confirm
	char * decoded_stream;	// decoded stream content; its size is decoded_stream_size
	int offset;	// offset (in bytes) in the file
	int stream_size;	// size in bytes of the object's stream
	int tmp_stream_size; // temporary size of the stream (between two decoding passes)
	int decoded_stream_size;	// size in bytes of the object's decoded stream
	int content_size;	// size in bytes of the object's content
	int errors;		// errors in parsing
	
	struct pdfObject* next;	// next object in the list.

	
};
 54 | 
 55 | 
// PDF Trailer structure: one trailer of the document, chained to the next one through `next`.
struct pdfTrailer{

	int offset;	// offset in the document
	char * content; // content of the trailer
	char * dico;	// NOTE(review): presumably the trailer dictionary - confirm
	struct pdfTrailer* next;	// next trailer in the document
	
};
 65 | 
 66 | 
// PDF Cross-reference table structure: one table of the document, chained to the next one through `next`.
struct pdfXRef{

	int offset;	// offset in the document
	char * content; // content of the XRef
	struct pdfXRef* next;	// next cross-reference table in the document
	
};
 75 | 
 76 | 
// Suite of tests according to the PDF structure specifications.
// Each field holds the outcome of one structural check.
struct testsPDFStruct{

	int bad_header;	// when the PDF header is incorrect
	int encrypted;	// when the document is encrypted
	int empty_page_content;	// when all pages are empty of content
	int object_collision;	// when two objects have the same reference in the document.
	int bad_trailer;	// when the trailer is in an incorrect form
	int bad_xref_offset; // when the offset of the xref table is incorrect
	int bad_obj_offset; // when at least one object's offset in the reference table is incorrect
	int obfuscated_object;	// when an object dictionary is obfuscated with hexadecimal encoding
	int multiple_headers; // when several headers are found in the document.
	int large_file;	// NOTE(review): undocumented; presumably set for files above a size limit - confirm
	int comments;	// if PostScript comments are found in the PDF.
	int malicious_comments; // malicious comments found (potentially defeat PDF parsers).

};
 94 | 
 95 | 
// Suite of tests for PDF objects content.
// The first group records findings of the object analysis, the second group counts active-content objects.
struct testsPDFObjAnalysis{

	int active_content;	// presence of js, embedded files, or forms.
	int shellcode;	// presence of shellcode in an object stream content
	int pattern_high_repetition;	// high-scale repetition of a pattern in a stream content
	int dangerous_keyword_high;	// potentially dangerous keyword (high level)
	int dangerous_keyword_medium;	// potentially dangerous keyword (medium level)
	int dangerous_keyword_low;	// potentially dangerous keyword (low level)
	int time_exceeded;	// when the analysis of an object stream exceeds a given duration.	

	int js; // number of js contents
	int xfa; // number of xfa objects
	int ef; // number of ef objects


};
113 | 
114 | 
// PDF Document structure: the parsed document together with its analysis results.
struct pdfDocument{
	
	FILE * fh;	// File handle of the document
	int fd;	// NOTE(review): presumably the file descriptor of the document - confirm
	char * fname;	// NOTE(review): presumably the file name of the document - confirm
	char * content;	// NOTE(review): presumably the raw content of the document - confirm
	struct pdfObject * objects;	// List of objects
	int coef;	// Suspicious coefficient
	int size;	// size in bytes of the PDF
	char * version;	// PDF specification version
	struct pdfTrailer* trailers;	// list of trailers (linked through pdfTrailer.next)
	struct pdfXRef* xref;	// list of cross-reference tables (linked through pdfXRef.next)
	struct testsPDFStruct * testStruct;	// results of the structure tests
	struct testsPDFObjAnalysis * testObjAnalysis;	// results of the object content tests
	double scan_time; // time elapsed in seconds for parse or scan.
	int errors; // processing errors

};
134 | 
135 | 
136 | 
/* pdf structures functions prototypes */

/* Creation functions: each one returns a pointer to a structure of the named type. */
struct pdfDocument* initPDFDocument();
struct pdfObject* initPDFObject();
struct pdfTrailer* initPDFTrailer();
struct testsPDFStruct * initTestsPDFStruct();
struct testsPDFObjAnalysis * initTestsPDFObjAnalysisStruct();

/* Release functions for the structures above. */
void freePDFDocumentStruct(struct pdfDocument * pdf);
void freePDFObjectStruct(struct pdfObject * obj);
void freePDFTrailerStruct(struct pdfTrailer * trailer);

/* List insertion helpers. NOTE: the two functions take their arguments in opposite order. */
int addObjectInList(struct pdfObject* obj, struct pdfDocument* pdf);
int addTrailerInList(struct pdfDocument * pdf, struct pdfTrailer * trailer);
152 | 
153 | #endif
154 | 


--------------------------------------------------------------------------------
/win32/ArmaditoPDF/ArmaditoPDF/ArmaditoPDF.vcxproj:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="utf-8"?>
  2 | <Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  3 |   <ItemGroup Label="ProjectConfigurations">
  4 |     <ProjectConfiguration Include="Debug|Win32">
  5 |       <Configuration>Debug</Configuration>
  6 |       <Platform>Win32</Platform>
  7 |     </ProjectConfiguration>
  8 |     <ProjectConfiguration Include="Release|Win32">
  9 |       <Configuration>Release</Configuration>
 10 |       <Platform>Win32</Platform>
 11 |     </ProjectConfiguration>
 12 |   </ItemGroup>
 13 |   <PropertyGroup Label="Globals">
 14 |     <ProjectGuid>{667A295C-61CD-47A7-AAFC-5B7F6088CDB5}</ProjectGuid>
 15 |     <Keyword>Win32Proj</Keyword>
 16 |     <RootNamespace>ArmaditoPDF</RootNamespace>
 17 |   </PropertyGroup>
 18 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
 19 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
 20 |     <ConfigurationType>Application</ConfigurationType>
 21 |     <UseDebugLibraries>true</UseDebugLibraries>
 22 |     <PlatformToolset>v120</PlatformToolset>
 23 |     <CharacterSet>MultiByte</CharacterSet>
 24 |   </PropertyGroup>
 25 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
 26 |     <ConfigurationType>Application</ConfigurationType>
 27 |     <UseDebugLibraries>false</UseDebugLibraries>
 28 |     <PlatformToolset>v120</PlatformToolset>
 29 |     <WholeProgramOptimization>true</WholeProgramOptimization>
 30 |     <CharacterSet>MultiByte</CharacterSet>
 31 |   </PropertyGroup>
 32 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
 33 |   <ImportGroup Label="ExtensionSettings">
 34 |   </ImportGroup>
 35 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
 36 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
 37 |   </ImportGroup>
 38 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
 39 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
 40 |   </ImportGroup>
 41 |   <PropertyGroup Label="UserMacros" />
 42 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
 43 |     <LinkIncremental>true</LinkIncremental>
 44 |   </PropertyGroup>
 45 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
 46 |     <LinkIncremental>false</LinkIncremental>
 47 |   </PropertyGroup>
 48 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
 49 |     <ClCompile>
 50 |       <PrecompiledHeader>
 51 |       </PrecompiledHeader>
 52 |       <WarningLevel>Level3</WarningLevel>
 53 |       <Optimization>Disabled</Optimization>
 54 |       <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
 55 |       <SDLCheck>true</SDLCheck>
 56 |       <AdditionalIncludeDirectories>$(SolutionDir)..\..\lib\includes;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
 57 |     </ClCompile>
 58 |     <Link>
 59 |       <SubSystem>Console</SubSystem>
 60 |       <GenerateDebugInformation>true</GenerateDebugInformation>
 61 |     </Link>
 62 |   </ItemDefinitionGroup>
 63 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
 64 |     <ClCompile>
 65 |       <WarningLevel>Level3</WarningLevel>
 66 |       <PrecompiledHeader>
 67 |       </PrecompiledHeader>
 68 |       <Optimization>MaxSpeed</Optimization>
 69 |       <FunctionLevelLinking>true</FunctionLevelLinking>
 70 |       <IntrinsicFunctions>true</IntrinsicFunctions>
 71 |       <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
 72 |       <SDLCheck>true</SDLCheck>
 73 |       <AdditionalIncludeDirectories>$(SolutionDir)..\..\includes;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
 74 |     </ClCompile>
 75 |     <Link>
 76 |       <SubSystem>Console</SubSystem>
 77 |       <GenerateDebugInformation>true</GenerateDebugInformation>
 78 |       <EnableCOMDATFolding>true</EnableCOMDATFolding>
 79 |       <OptimizeReferences>true</OptimizeReferences>
 80 |     </Link>
 81 |   </ItemDefinitionGroup>
 82 |   <ItemGroup>
 83 |     <ClCompile Include="..\..\..\tools\cli_analyzer\main.c" />
 84 |     <ClCompile Include="..\..\..\lib\src\armaditopdf.c" />
 85 |     <ClCompile Include="..\..\..\lib\src\filters.c" />
 86 |     <ClCompile Include="..\..\..\lib\src\log.c" />
 87 |     <ClCompile Include="..\..\..\lib\src\osdeps.c" />
 88 |     <ClCompile Include="..\..\..\lib\src\pdfObjectsAnalysis.c" />
 89 |     <ClCompile Include="..\..\..\lib\src\pdfParsing.c" />
 90 |     <ClCompile Include="..\..\..\lib\src\pdfStructAnalysis.c" />
 91 |     <ClCompile Include="..\..\..\lib\src\pdfStructs.c" />
 92 |     <ClCompile Include="..\..\..\lib\src\utils.c" />
 93 |   </ItemGroup>
 94 |   <ItemGroup>
 95 |     <ClInclude Include="..\..\..\lib\includes\armaditopdf.h" />
 96 |     <ClInclude Include="..\..\..\lib\includes\filters.h" />
 97 |     <ClInclude Include="..\..\..\lib\includes\log.h" />
 98 |     <ClInclude Include="..\..\..\lib\includes\osdeps.h" />
 99 |     <ClInclude Include="..\..\..\lib\includes\pdfAnalysis.h" />
100 |     <ClInclude Include="..\..\..\lib\includes\pdfParsing.h" />
101 |     <ClInclude Include="..\..\..\lib\includes\pdfStructs.h" />
102 |     <ClInclude Include="..\..\..\lib\includes\utils.h" />
103 |   </ItemGroup>
104 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
105 |   <ImportGroup Label="ExtensionTargets">
106 |   </ImportGroup>
107 | </Project>


--------------------------------------------------------------------------------
/lib/Spec.txt:
--------------------------------------------------------------------------------
  1 | ::::::::::::::::::: PDF ANALYZER SPECIFICATIONS :::::::::::::::::::::::
  2 | 
  3 | Author: Ulrich FAUSTHER
  4 | Modification date: 21/01/2015
  5 | -----------------------------------------------------------------------
  6 | 
  7 | Plan:
  8 | 
  9 | I- Description 
 10 | 
 11 | II- Parsing du PDF
 12 | 	II-a	PDF Header
 13 | 	II-b	Objets
 14 | 	II-c	Trailer
 15 | 	II-d	Cross-reference table
 16 | 	 
 17 | 
 18 | III- Analyse Anti-malware
 19 | 
 20 | 	III-a	Format PDF
 21 | 	III-b	Analyse des objets
 22 | 	III-c	CVEs detection
 23 | 	III-d	Coefficient de suspicion
 24 | 	III-e	Cas non pris en compte
 25 | 
 26 | IV- Réécriture de PDF
 27 | 
 28 | 	IV-a	Principes de réécriture de PDF
 29 | 	IV-b	Cas non pris en compte
 30 | 	
 31 | -----------------------------------------------------------------------
 32 | 
 33 | I- Description 
 34 | 
 35 | 
 36 | II- Parsing du PDF
 37 | 
  38 | Cette étape a pour but d'extraire toutes les informations nécessaires à l'analyse antivirale du PDF.
 39 | 
 40 | -------	II-a	PDF Header
 41 | 	
 42 | 	
 43 | 	Vérification de l'entête du document à analyser. 
  44 | 	L'entête du document doit être de la forme: %PDF-x.y (où x.y représente la version du PDF) Exemple : %PDF-1.7 (version 1.7 des spécifications).
 45 | 	
  46 | 	/!\NB: L'analyseur prend également en compte le format XDP (XML Data Package), qui représente la forme XML d'un PDF. Dans ce format, le contenu du PDF est encodé dans le XML en base64.
  47 | 		L'analyseur va alors extraire le PDF packagé dans le XML, puis vérifier le header. À noter que la suite de l'analyse se fera sur le PDF extrait.
 48 | 	
  49 | 	Dans le cas où l'entête serait incorrect, l'analyse s'arrête en produisant un rapport d'erreur.
 50 | 		
 51 | -------	II-b	Objets
 52 | 	
 53 | 	
 54 | 	Récupération de tous les objets qui constituent le document PDF et parsing des attributs.
 55 | 	Les attributs de l'objet récupérés sont rangés dans une structure (table de hashage).
  56 | 	Les principaux attributs d'un objet qui sont récupérés sont:
  57 | 		* La référence (Ex: 16 0 obj);
  58 | 		* L'offset en octets du début de l'objet dans le document;
  59 | 		* Le contenu intégral de l'objet (11 0 obj......endobj);
 60 | 	
 61 | 	Si présent:
  62 | 		* Le dictionnaire de l'objet (<<...>>);
 63 | 		* Le type de l'objet.
 64 | 		* Puis d'autres attributs specifiques au type d'objet. etc...
 65 | 	
 66 | 	/!\NB: A partir de la version 1.5 de PDF : Récupération des objets intégrés dans des objets de type /ObjStm (object stream).
  67 | 		Dans ce cas, des attributs sont rajoutés à la structure de l'objet à savoir ( objstm = l'objet stream dans lequel il est intégré; objStmOff = l'offset dans l'object stream );
 68 | 		
 69 | 	
  70 | 	Décodage des streams présents dans les objets; les filtres implémentés sont:
 71 | 		* FlateDecode
 72 | 		* ASCIIHexDecode
 73 | 		* ASCII85Decode
 74 | 		* LZWDecode
 75 | 		* CCITTFaxDecode
 76 | 		* DCTDecode
 77 | 
 78 | 		
 79 | 	
 80 | 	
 81 | -------	II-c	Trailer
 82 | 
 83 | 
 84 | 	Récupération des trailers du PDF. (TODO:: Récupérer seulement le "dernier" Trailer mis à jour)
  85 | 	Conformément aux spécifications PDF (jusqu'à la version 1.4 incluse), le trailer du document est défini sous la forme: 
 86 | 		trailer
 87 | 		<<dico>>
 88 | 		startxref
 89 | 		xref_table_offset
 90 | 		%%EOF
  91 | 	À partir de la version 1.5 de PDF, le trailer peut être de la forme:
 92 | 		startxref
 93 | 		xref_stream_offset
 94 | 		%%EOF
 95 | 	
 96 | 	Dans le cas où la première forme du trailer n'aurait pas été retrouvée, essayer de l'obtenir sous la deuxième forme.
 97 | 
 98 | 	/!\NB: Pour les documents PDF ayant été mis à jour, on peut retrouver plusieurs trailers.
 99 | 
100 | 
101 | -------	II-d	Cross-reference table
102 | 
103 | 
104 | 	Récupération de la table de référence des objets.
105 | 	La récupération de la table de référence des objets se fera lors de la vérification de sa conformité (voir partie III-a).
106 | 	
107 | 	Conformément aux spécifications PDF (jusqu'à la version 1.4 incluse); la table de reference des objets est de la forme:
108 | 		xref
109 | 		0 3
110 | 		0000000000 65535 f
111 | 		0000000009 00000 n
112 | 		0000000098 00000 n
113 | 	
114 | 	A partir de la version 1.5, la table de référence peut etre représentée via des objets de type (/XRef).
115 | 	/!\NB: Pour les documents PDF ayant été mis à jour, on peut retrouver plusieurs tables de référence.
116 | 	 
117 | 
118 | 
119 | III- Analyse Anti-malware
120 | 
121 | Objectif: Déterminer le niveau de suspicion (ou de dangerosité) du document analysé.
122 | Pour ce faire, plusieurs tests divisés en 3 catégories seront effectués sur le document.
123 | 	* Tests portant sur la structure du document.
124 | 	* Tests analysant le contenu des objets.
125 | 	* Tests d'identification d'exploitation de vulnérabilités (CVE).
126 | 
127 | 
128 | -------	III-a	Format PDF
129 | 
130 | 
131 | 	L'objectif de cette suite de tests est de déterminer si le document respecte bien les spécifications sur la structure du PDF.
132 | 	
133 | 	* Vérification du trailer
134 | 		Vérifier qu'un trailer a bien été trouvé dans le document.
135 | 		Si oui, vérifier les paramètres du dictionaire du trailer (Ex: /root 1 0 R :: vérifier que l'objet '1 0 obj' référencé est bien de type /Catalog).
136 | 		
137 | 	* Vérification de la table de reference des objets (Xref table).
138 | 		Vérifier que l'offset de la table de référence (ou de l'objet de type XRef) défini dans le trailer est correct.
139 | 		Si ok, vérifier les entrées de la table de réference (par rapport aux offsets des objets).
140 | 		
141 | 	* Vérification du contenu du document.
142 | 		Vérifier que le document ne contient pas uniquement des pages vides.
143 | 		/!\NB: Un document PDF dont toutes les pages sont vides est considéré comme suspect.
144 | 		
145 | 	* Collisions d'objets
146 | 		Vérifier qu'un objet n'est pas défini plusieurs fois dans le PDF (avec une table de référence incorrecte).
147 | 		
148 | 
149 | 	* Detection de PDF dont le contenu est chiffré.
150 | 		/!\NB: Dans ce cas, ne pas continuer le traitement du document.
151 | 		
152 | 	
153 | 	
154 | -------	III-b	Analyse des objets
155 | 
156 | 	L'objectif de cette suite de tests est de détecter des éléments potentiellement dangereux dans le contenu des objets du PDF.
157 | 	
158 | 	* Recherche d'objets potentiellement dangereux.
159 | 		-> javascript,
160 | 		-> embedded file,
161 | 		-> formulaire XFA,
162 | 		-> action de type /Launch,
163 | 		-> URI
164 | 		-> etc.
165 | 		
166 | 	* Recherche de pattern potentiellement dangereux dans les objects.
167 | 		-> shellcode,
168 | 		-> pattern à forte répetition,
169 | 		-> unicode strings,
170 | 		-> mots clés potentiellement dangereux (heapSpray, payload, shellcode, etc.),
171 | 		-> javascript fonctions (StringfromChar, byteToChar, eval, unescape, split, etc.),
172 | 		-> path traversal URI.
173 | 		
174 | 
175 | -------	III-c	CVEs exploit detection
176 | 
177 | 
178 | 	L'objectif de ces tests est de determiner une possible exploitation d'une vulnérabilité CVE.
179 | 	
180 | 	Les vulnérabilités implémentées sont:
181 | 		* CVE-2010-2883
182 | 		
183 | 		
184 | -------	III-d	Coefficient de suspicion
185 | 
186 | 	Afin de définir un coefficient de suspicion du document analysé, un coefficient va être attribué au résultat de chaque test:
187 | 		* $EMPTY_PAGES_WITH_ACTIVE_CONTENT = 99;
188 | 		* $EMPTY_PAGES_CONTENT = 70;
189 | 		* $OBJECT_COLLISION_PLUS_BAD_XREF = 90;
190 | 		* $OBJECT_COLLISION = 10;
191 | 		* $BAD_XREF_OFFSET = 30;
192 | 		* $TRAILER_NOT_FOUND = 30;
193 | 		* $BAD_TRAILER = 40;
194 | 		* $OBFUSCATED_OBJECTS = 40;
195 | 		* $ACTIVE_CONTENT = 40;
196 | 		* $SHELLCODE = 40;
197 | 		* $PATTERN_REPETITION = 40;
198 | 		* $DANGEROUS_PATTERN_HIGH = 90;
199 | 		* $DANGEROUS_PATTERN_MEDIUM = 40;
200 | 		* $DANGEROUS_PATTERN_LOW = 20;
201 | 		* $TIME_EXCEEDED = 20;
202 | 		* $MALICIOUS_URI = 50;
203 | 		* $MULTIPLE_HEADERS = 50;
204 | 		
205 | 	
206 | 	NB: Ces coefficients doivent etre calibrés afin d'obtenir de meilleurs resultats.
207 | 	
208 | 	Le coefficient de suspicion est la somme de tous les résultats des tests positifs. Plus le coefficient est élevé, plus le document est suspect.
209 | 
210 | 
211 | 	III-e	Cas non pris en compte
212 | 
213 | IV- Réécriture de PDF
214 | 
215 | 	IV-a	Principes de réécriture de PDF
216 | 	IV-b	Cas non pris en compte
217 | 


--------------------------------------------------------------------------------
/lib/src/armaditopdf.c:
--------------------------------------------------------------------------------
  1 | /***
  2 | 
  3 | Copyright (C) 2015, 2016 Teclib'
  4 | 
  5 | This file is part of Armadito module PDF.
  6 | 
  7 | Armadito module PDF is free software: you can redistribute it and/or modify
  8 | it under the terms of the GNU General Public License as published by
  9 | the Free Software Foundation, either version 3 of the License, or
 10 | (at your option) any later version.
 11 | 
 12 | Armadito module PDF is distributed in the hope that it will be useful,
 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 15 | GNU General Public License for more details.
 16 | 
 17 | You should have received a copy of the GNU General Public License
 18 | along with Armadito module PDF.  If not, see <http://www.gnu.org/licenses/>.
 19 | 
 20 | ***/
 21 | 
 22 | 
 23 | 
 24 | #include "armaditopdf.h"
 25 | #include "pdfParsing.h"
 26 | #include "pdfAnalysis.h"
 27 | #include "osdeps.h"
 28 | #include "log.h"
 29 | #include <time.h>
 30 | 
 31 | 
 32 | 
 33 | char * getVersion(){
 34 | 	return a6o_pdf_ver;
 35 | }
 36 | 
 37 | 
 38 | /*
 39 | printAnalysisReport() :: print a report of the analysis (debug only).
 40 | parameters:
 41 | - struct pdfDocument * pdf
 42 | returns:
 43 | - none.
 44 | */
 45 | // TODO :: printAnalysisReport :: filter report informations by log level.
 46 | void printAnalysisReport(struct pdfDocument * pdf){
 47 | 
 48 | 
 49 | 	if (!print_report || pdf == NULL){
 50 | 		return;
 51 | 	}
 52 | 
 53 | 	printf("\n\n");
 54 | 	printf("----------------------------------\n");
 55 | 	printf("-- ARMADITO PDF ANALYZER REPORT --\n");
 56 | 	printf("----------------------------------\n\n");
 57 | 
 58 | 	printf("Filename = %s\n",pdf->fname);
 59 | 	if (pdf->version)
 60 | 		printf("PDF version = %s\n",pdf->version);
 61 | 
 62 | 	printf("size = %d bytes\n", pdf->size);
 63 | 	
 64 | 	printf("\n\n");
 65 | 	printf("::: PDF Document Structure Tests :::\n\n");
 66 | 
 67 | 
 68 | 	printf("bad_header  = %d\n", pdf->testStruct->bad_header);
 69 | 	printf("encrypted  = %d\n", pdf->testStruct->encrypted);
 70 | 	printf("empty_page_content  = %d\n", pdf->testStruct->empty_page_content);
 71 | 	printf("object_collision  = %d\n", pdf->testStruct->object_collision);
 72 | 	printf("bad_trailer  = %d\n", pdf->testStruct->bad_trailer);
 73 | 	printf("bad_xref_offset  = %d\n", pdf->testStruct->bad_xref_offset);
 74 | 	printf("bad_obj_offset  = %d\n", pdf->testStruct->bad_obj_offset);
 75 | 	printf("obfuscated_object  = %d\n", pdf->testStruct->obfuscated_object);
 76 | 	printf("multiple_headers  = %d\n", pdf->testStruct->multiple_headers);
 77 | 	printf("postscript_comments = %d\n", pdf->testStruct->comments);
 78 | 	printf("malicious_comments = %d\n", pdf->testStruct->malicious_comments);
 79 | 
 80 | 	printf("\n\n");
 81 | 	printf("::: PDF Object Analysis Tests :::\n\n");
 82 | 
 83 | 	printf("active_content = %d\n", pdf->testObjAnalysis->active_content);
 84 | 		printf(" - js content = %d\n", pdf->testObjAnalysis->js);
 85 | 		printf(" - xfa content = %d\n", pdf->testObjAnalysis->xfa);
 86 | 		printf(" - ef content = %d\n", pdf->testObjAnalysis->ef);
 87 | 	printf("shellcode = %d\n", pdf->testObjAnalysis->shellcode); 
 88 | 	printf("pattern_high_repetition = %d\n", pdf->testObjAnalysis->pattern_high_repetition); 
 89 | 	printf("dangerous_keyword_high = %d\n", pdf->testObjAnalysis->dangerous_keyword_high); 
 90 | 	printf("dangerous_keyword_medium = %d\n", pdf->testObjAnalysis->dangerous_keyword_medium); 
 91 | 	printf("dangerous_keyword_low = %d\n", pdf->testObjAnalysis->dangerous_keyword_low); 
 92 | 	printf("time_exceeded = %d\n", pdf->testObjAnalysis->time_exceeded);
 93 | 
 94 | 
 95 | 	printf("\n\n");
 96 | 	printf("::: Suspicious Coefficient :::\n\n");
 97 | 	printf("errors = %d\n", pdf->errors);
 98 | 
 99 | 	if(pdf->testStruct->bad_header > 0)
100 | 		printf("Coef = BAD_HEADER\n");
101 | 	else
102 | 		if(pdf->testStruct->large_file > 0)
103 | 			printf("Coef = %d (LARGE_FILE)\n",pdf->coef);
104 | 		else
105 | 			if(pdf->testStruct->encrypted > 0)
106 | 				printf("Coef = Encrypted_PDF\n");
107 | 			else
108 | 				printf("Coef = %d\n",pdf->coef);
109 | 
110 | 
111 | 	printf("-------------------------------------------------------\n");
112 | 	//printf("-------------------------------------------------------\n");
113 | 	printf("Execution time : %.2lf sec \n",pdf->scan_time);
114 | 	printf("-------------------------------------------------------\n");
115 | 	printf("-------------------------------------------------------\n\n");
116 | 
117 | 	return;
118 | 
119 | }
120 | 
121 | 
122 | // This function calc the suspicious coefficient according to the tests results
123 | // TODO Improve  this fucntion by calc the coef with the operation coef += test_result * test_coef
124 | int calcSuspiciousCoefficient(struct pdfDocument * pdf){
125 | 
126 | 	// check parameters
127 | 	if(pdf == NULL){
128 | 		return -1;
129 | 	}
130 | 
131 | 	// PDF Document Structure tests
132 | 	/*
133 | 	EMPTY_PAGE_CONTENT 99
134 | 	OBJECT_COLLISION 10
135 | 	BAD_TRAILER 40
136 | 	BAD_XREF_OFFSET 30
137 | 	BAD_OBJ_OFFSET 20
138 | 	OBFUSCATED_OBJECT 50 
139 | 	MULTIPLE_HEADERS 50
140 | 	*/
141 | 
142 | 	pdf->coef = 0;
143 | 
144 | 	if(pdf->testStruct->encrypted > 0 ){
145 | 		pdf->coef = -2;
146 | 		return -2;
147 | 	}
148 | 
149 | 	if(pdf->testStruct->empty_page_content > 0){
150 | 		pdf->coef = EMPTY_PAGE_CONTENT;
151 | 		return 0;
152 | 	}
153 | 
154 | 	if(pdf->testStruct->object_collision > 0 && ( pdf->testStruct->bad_obj_offset > 0 || pdf->testStruct->bad_xref_offset > 0 )){
155 | 		pdf->coef += OBJECT_COLLISION_AND_BAD_XREF;
156 | 	}else{
157 | 
158 | 		if(pdf->testStruct->object_collision > 0){
159 | 			pdf->coef += OBJECT_COLLISION;
160 | 		}
161 | 
162 | 		if(pdf->testStruct->bad_obj_offset > 0){
163 | 			pdf->coef += BAD_OBJ_OFFSET;
164 | 		}
165 | 
166 | 		if( pdf->testStruct->bad_xref_offset > 0){
167 | 			pdf->coef += BAD_XREF_OFFSET;
168 | 		}
169 | 	}
170 | 
171 | 	if(pdf->testStruct->bad_trailer > 0){
172 | 		pdf->coef += BAD_TRAILER;
173 | 	}
174 | 
175 | 	if(pdf->testStruct->multiple_headers > 0){
176 | 		pdf->coef += MULTIPLE_HEADERS;
177 | 	}
178 | 
179 | 	if(pdf->testStruct->obfuscated_object > 0){
180 | 		pdf->coef += OBFUSCATED_OBJECT;
181 | 	}
182 | 
183 | 	if(pdf->testStruct->malicious_comments > 0){
184 | 		pdf->coef += MALICIOUS_COMMENTS;
185 | 	}
186 | 
187 | 
188 | 	// PDF Objects Analysis tests
189 | 	/*
190 | 	ACTIVE_CONTENT 40
191 | 	SHELLCODE 40
192 | 	PATTERN_HIGH_REPETITION 40
193 | 	DANGEROUS_KEYWORD_HIGH 90
194 | 	DANGEROUS_KEYWORD_MEDIUM 40
195 | 	DANGEROUS_KEYWORD_LOW 20
196 | 	TIME_EXCEEDED 20
197 | 	*/
198 | 
199 | 
200 | 	if(pdf->testObjAnalysis->active_content > 0){
201 | 		pdf->coef += ACTIVE_CONTENT;
202 | 	}
203 | 
204 | 	if(pdf->testObjAnalysis->shellcode > 0){
205 | 		pdf->coef += SHELLCODE;
206 | 	}
207 | 
208 | 	if(pdf->testObjAnalysis->pattern_high_repetition > 0){
209 | 		pdf->coef += PATTERN_HIGH_REPETITION;
210 | 	}
211 | 
212 | 	if(pdf->testObjAnalysis->dangerous_keyword_high > 0){
213 | 		pdf->coef += DANGEROUS_KEYWORD_HIGH;
214 | 	}
215 | 
216 | 	if(pdf->testObjAnalysis->dangerous_keyword_medium > 0){
217 | 		pdf->coef += DANGEROUS_KEYWORD_MEDIUM;
218 | 	}
219 | 
220 | 	if(pdf->testObjAnalysis->dangerous_keyword_low > 0){
221 | 		pdf->coef += DANGEROUS_KEYWORD_LOW;
222 | 	}
223 | 
224 | 	if(pdf->testObjAnalysis->time_exceeded > 0){
225 | 		pdf->coef += TIME_EXCEEDED;
226 | 	}
227 | 
228 | 	
229 | 	return 0;
230 | 
231 | }
232 | 
233 | 
234 | /* 
235 | 	analyzePDF_ex() :: Analyze pdf extension function 
236 | 	parameters: 
237 | 		- int fd (file descriptor of the file to analyze)
238 | 		- char * filename (file name of the file).
239 | 	returns:
240 | 		- the suspicious coefficient (>=0) on success.
241 | 		- an error code (<0) on error.
242 | */
243 | int analyzePDF_ex(int fd, char * filename){
244 | 
245 | 	int ret = 0;
246 | 	struct pdfDocument * pdf = NULL;
247 | 	time_t start_time =0, end_time = 0;
248 | 	double time_elapsed = 0;
249 | 	int res = 0;
250 | 	FILE * fh = NULL;
251 | 
252 | 
253 | 	if (fd < 0 && filename == NULL){
254 | 		err_log("analyzePDF_ex :: invalid parameters!",0);
255 | 		return -1;
256 | 	}
257 | 
258 | 	dbg_log("analyzePDF_ex :: Analyzing file :: [%s]\n", filename);
259 | 
260 | 	// open the file if fd is invalid	
261 | 	if (fd < 0 && !(fh = os_fopen(filename, "rb"))){
262 | 		err_log("analyzePDF_ex :: Can't open file %s\n", filename);
263 | 		return -1;
264 | 	}
265 | 	
266 | 
267 | 	// Initialize pdfDocument struct
268 | 	if (!(pdf = initPDFDocument())){
269 | 		err_log("analyzePDF_ex :: pdfDocument initialization failed!\n");
270 | 
271 | 		if(fh != NULL)
272 | 			fclose(fh);
273 | 
274 | 		return -1;
275 | 	}
276 | 
277 | 	pdf->fh = fh;
278 | 	pdf->fd = fd;
279 | 	pdf->fname = os_strdup(filename);
280 | 
281 | 	// start time initialization.
282 | 	time(&start_time);	
283 | 
284 | 	// Parse pdf document content.	
285 | 	if ((ret = parsePDF(pdf)) < 0){
286 | 		err_log("analyzePDF_ex :: parsing PDF document failed\n");
287 | 		goto clean;
288 | 	}
289 | 
290 | 	/* this is for debug purpose only */
291 | 	// printPDFObjects(pdf);
292 | 	// printObjectReferences(pdf);
293 | 
294 | 
295 | 	// PDF objects analysis.
296 | 	if ((ret = getDangerousContent(pdf)) < 0){
297 | 		err_log("analyzePDF_ex :: get dangerous content failed\n");
298 | 		goto clean;
299 | 	}
300 | 	
301 | 
302 | 	// Document structure analysis
303 | 	if((ret = documentStructureAnalysis(pdf))< 0){
304 | 		err_log("analyzePDF_ex :: document structure Analysis failed\n");
305 | 		goto clean;
306 | 	}
307 | 
308 | 
309 | clean:
310 | 
311 | 	time(&end_time);
312 | 	time_elapsed = difftime(end_time, start_time);
313 | 
314 | 	pdf->scan_time = time_elapsed;
315 | 
316 | 	// calc supicious coefficient of the document.
317 | 	calcSuspiciousCoefficient(pdf);
318 | 
319 | 	// print report. (debug only)
320 | 	printAnalysisReport(pdf);
321 | 
322 | 	if (ret >= 0){
323 | 		ret = pdf->coef;
324 | 		dbg_log("[armaditoPDF] Coef = %d\n", ret);
325 | 	}
326 | 	
327 | 	if (pdf != NULL){
328 | 		freePDFDocumentStruct(pdf);
329 | 	}
330 | 
331 | 
332 | 	return ret;
333 | 
334 | 
335 | }


--------------------------------------------------------------------------------
/lib/src/pdfStructs.c:
--------------------------------------------------------------------------------
  1 | /***
  2 | 
  3 | Copyright (C) 2015, 2016 Teclib'
  4 | 
  5 | This file is part of Armadito module PDF.
  6 | 
  7 | Armadito module PDF is free software: you can redistribute it and/or modify
  8 | it under the terms of the GNU General Public License as published by
  9 | the Free Software Foundation, either version 3 of the License, or
 10 | (at your option) any later version.
 11 | 
 12 | Armadito module PDF is distributed in the hope that it will be useful,
 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 15 | GNU General Public License for more details.
 16 | 
 17 | You should have received a copy of the GNU General Public License
 18 | along with Armadito module PDF.  If not, see <http://www.gnu.org/licenses/>.
 19 | 
 20 | ***/
 21 | 
 22 | 
 23 | #include "pdfStructs.h"
 24 | #include "log.h"
 25 | 
 26 | 
 27 | /*
 28 | freePDFObjectStruct() :: free the allocated memory PDF object structure.
 29 | parameters:
 30 | - struct pdfObject * pdf (the pdf object list pointer).
 31 | returns:
 32 | - none
 33 | */
 34 | void freePDFObjectStruct(struct pdfObject * obj){
 35 | 
 36 | 	struct pdfObject * tmp = NULL;
 37 | 
 38 | 	if(obj == NULL){
 39 | 		return ;
 40 | 	}
 41 | 
 42 | 	while(obj != NULL){
 43 | 
 44 | 		tmp = obj;
 45 | 		obj = obj->next;
 46 | 
 47 | 		// free all elements		
 48 | 		free(tmp->reference);
 49 | 		free(tmp->content);
 50 | 		free(tmp->dico);
 51 | 		free(tmp->type);
 52 | 		free(tmp->stream);
 53 | 		free(tmp->filters);
 54 | 		free(tmp->decoded_stream);
 55 | 		
 56 | 		free(tmp);
 57 | 		tmp = NULL;
 58 | 
 59 | 	}
 60 | 
 61 | 	return;
 62 | 
 63 | }
 64 | 
 65 | 
 66 | /*
 67 | freePDFTrailerStruct() :: free the allocated memory PDF trailer structure.
 68 | parameters:
 69 | - struct pdfTrailer * pdf (the pdf trailer list pointer).
 70 | returns:
 71 | - none
 72 | */
 73 | void freePDFTrailerStruct(struct pdfTrailer * trailer){
 74 | 
 75 | 	struct pdfTrailer * tmp = NULL;
 76 | 
 77 | 	if(trailer == NULL){
 78 | 		return ;
 79 | 	}
 80 | 
 81 | 	while(trailer!= NULL){
 82 | 		
 83 | 		tmp = trailer;
 84 | 		trailer = trailer->next;
 85 | 
 86 | 		free(tmp->dico);
 87 | 		free(tmp->content);
 88 | 				
 89 | 		free(tmp);
 90 | 		tmp = NULL;
 91 | 
 92 | 	}
 93 | 
 94 | 	return;
 95 | 
 96 | }
 97 | 
 98 | 
 99 | /*
100 | freePDFDocumentStruct() :: free the allocated memory PDF document structure.
101 | parameters:
102 | - struct pdfDocument * pdf (the pdf document pointer).
103 | returns:
104 | - none
105 | */
106 | void freePDFDocumentStruct(struct pdfDocument * pdf){
107 | 
108 | 	
109 | 	if(pdf == NULL){
110 | 		return ;
111 | 	}
112 | 	
113 | 	if (pdf->fname != NULL){
114 | 		free(pdf->fname);
115 | 		pdf->fname = NULL;
116 | 	}
117 | 	
118 | 	// Free objects
119 | 	if (pdf->objects != NULL){
120 | 		freePDFObjectStruct(pdf->objects);		
121 | 	}
122 | 	
123 | 	// Free trailer
124 | 	if (pdf->trailers != NULL){
125 | 		freePDFTrailerStruct(pdf->trailers);
126 | 	}
127 | 	
128 | 	if (pdf->fh != NULL){
129 | 		fclose(pdf->fh);
130 | 		pdf->fh = NULL;
131 | 	}
132 | 
133 | 	if (pdf->version != NULL){
134 | 		free(pdf->version);
135 | 		pdf->version = NULL;
136 | 	}		
137 | 
138 | 	if (pdf->content != NULL){
139 | 		free(pdf->content);
140 | 		pdf->content = NULL;
141 | 	}
142 | 
143 | 	if (pdf->testStruct != NULL){
144 | 		free(pdf->testStruct);
145 | 		pdf->testStruct = NULL;
146 | 	}
147 | 
148 | 	if (pdf->testObjAnalysis != NULL){
149 | 		free(pdf->testObjAnalysis);
150 | 		pdf->testObjAnalysis = NULL;
151 | 	}
152 | 
153 | 	
154 | 	free(pdf);
155 | 	pdf = NULL;
156 | 	
157 | 
158 | 	return ;
159 | 
160 | }
161 | 
162 | 
163 | /*
164 | addObjectInList() :: add an object in the pdf document object list
165 | parameters:
166 | - struct pdfObject * obj (pdf object pointer)
167 | - struct pdfDocument * pdf (pdf document pointer)
168 | returns: (int)
169 | - 0 on success.
170 | - an error code (<0) on error.
171 | */
172 | int addObjectInList(struct pdfObject* obj, struct pdfDocument* pdf){
173 | 
174 | 	struct pdfObject* tmp = NULL;
175 | 
176 | 	if(obj == NULL || pdf == NULL){		
177 | 		err_log("addObjectInList :: invalid parameter\n");
178 | 		return -1;	
179 | 	}
180 | 	
181 | 	if(pdf->objects == NULL){
182 | 		pdf->objects = obj;
183 | 	}else{
184 | 		
185 | 		tmp = pdf->objects;
186 | 
187 | 		// Object collision detection
188 | 		if(strncmp(tmp->reference,obj->reference,strlen(tmp->reference)) == 0 && strncmp(tmp->reference,obj->reference,strlen(obj->reference)) == 0){
189 | 			
190 | 			warn_log("addObjectInList :: Object collision :: %s\n", obj->reference);
191 | 			pdf->testStruct->object_collision ++;
192 | 		}
193 | 
194 | 
195 | 		while(tmp->next != NULL){
196 | 
197 | 			// Object collision detection			
198 | 			tmp = tmp->next;
199 | 
200 | 			if(strncmp(tmp->reference,obj->reference,strlen(tmp->reference)) == 0 && strncmp(tmp->reference,obj->reference,strlen(obj->reference)) == 0){
201 | 				warn_log("addObjectInList :: Object collision :: %s\n", obj->reference);
202 | 				pdf->testStruct->object_collision ++;
203 | 			}	
204 | 		}
205 | 		tmp->next = obj;
206 | 		
207 | 	}
208 | 	
209 | 	return 0;
210 | }
211 | 
212 | 
213 | /*
214 | initTestsPDFStruct() :: Initialize PDF Tests structure.
215 | parameters:
216 | - none
217 | returns: (struct testsPDFStruct *)
218 | - the testsPDFStruct pointer on success.
219 | - NULL on error.
220 | */
221 | struct testsPDFStruct * initTestsPDFStruct(){
222 | 
223 | 	struct testsPDFStruct * testStruct = NULL;
224 | 
225 | 	if( !(testStruct = (struct testsPDFStruct *)calloc(1,sizeof(struct testsPDFStruct)) ) ){
226 | 		err_log("initTestsPDFStruct :: memory allocation failed\n");
227 | 		return NULL;
228 | 	}
229 | 
230 | 	testStruct->bad_header = 0;
231 | 	testStruct->encrypted = 0;
232 | 	testStruct->empty_page_content = 0;
233 | 	testStruct->object_collision = 0;
234 | 	testStruct->bad_trailer = 0;
235 | 	testStruct->bad_xref_offset = 0;
236 | 	testStruct->bad_obj_offset = 0;
237 | 	testStruct->obfuscated_object = 0;
238 | 	testStruct->multiple_headers = 0;
239 | 	testStruct->large_file = 0;
240 | 	testStruct->comments = 0;
241 | 	testStruct->malicious_comments = 0;
242 | 
243 | 	return testStruct;
244 | }
245 | 
246 | 
247 | /*
248 | initTestsPDFObjAnalysisStruct() :: Initialize PDF Tests structure.
249 | parameters:
250 | - none
251 | returns: (struct testsPDFObjAnalysis *)
252 | - the testsPDFObjAnalysis pointer on success.
253 | - NULL on error.
254 | */
255 | struct testsPDFObjAnalysis * initTestsPDFObjAnalysisStruct(){
256 | 
257 | 	struct testsPDFObjAnalysis * testObjAnalysis = NULL;
258 | 
259 | 	if( !(testObjAnalysis = (struct testsPDFObjAnalysis *)calloc(1,sizeof(struct testsPDFObjAnalysis)) ) ){
260 | 		err_log("initTestsPDFObjAnalysisStruct :: memory allocation failed\n");
261 | 		return NULL;
262 | 	}
263 | 
264 | 	testObjAnalysis->active_content = 0;
265 | 	testObjAnalysis->shellcode = 0;
266 | 	testObjAnalysis->pattern_high_repetition = 0;
267 | 	testObjAnalysis->dangerous_keyword_high = 0;
268 | 	testObjAnalysis->dangerous_keyword_medium = 0;
269 | 	testObjAnalysis->dangerous_keyword_low = 0;
270 | 	testObjAnalysis->time_exceeded = 0;
271 | 
272 | 	testObjAnalysis->js = 0;
273 | 	testObjAnalysis->xfa = 0;
274 | 	testObjAnalysis->ef = 0;
275 | 
276 | 	return testObjAnalysis;
277 | }
278 | 
279 | 
280 | /*
281 | initPDFDocument() :: Initialize pdfDocument structure.
282 | parameters:
283 | - none
284 | returns: (struct pdfDocument *)
285 | - the pdfDocument pointer on success.
286 | - NULL on error.
287 | */
288 | struct pdfDocument* initPDFDocument(){
289 | 
290 | 	struct pdfDocument* pdf = NULL;
291 | 	int err = 0;
292 | 	
293 | 	if( (pdf = (struct pdfDocument *)calloc(1,sizeof(struct pdfDocument))) == NULL ){
294 | 		err_log("initPDFDocument :: memory allocation failed\n");
295 | 		err++;
296 | 		goto clean;		
297 | 	}
298 | 
299 | 	if( (pdf->testStruct = initTestsPDFStruct()) == NULL ){
300 | 		err_log("initPDFDocument :: testsPDFstruct initialization failed!\n");
301 | 		err++;
302 | 		goto clean;
303 | 	}
304 | 
305 | 	if( (pdf->testObjAnalysis = initTestsPDFObjAnalysisStruct()) == NULL ){
306 | 		err_log("initPDFDocument :: testsPDFObjAnalysisStruct initialization failed!\n");
307 | 		err++;
308 | 		goto clean;		
309 | 	}
310 | 
311 | 	// Initialize entries
312 | 	pdf->fh = NULL;
313 | 	pdf->fd = -1;
314 | 	pdf->fname = NULL;
315 | 	pdf->content = NULL;
316 | 	pdf->objects =NULL;
317 | 	pdf->coef = 0;
318 | 	pdf->size = 0;
319 | 	pdf->version = NULL;
320 | 	pdf->trailers = NULL;
321 | 	pdf->xref = NULL;
322 | 	pdf->errors = 0;
323 | 	pdf->scan_time=0;
324 | 
325 | clean:
326 | 	if (err != 0){
327 | 		if (pdf != NULL){
328 | 			freePDFDocumentStruct(pdf);
329 | 			pdf = NULL;
330 | 		}
331 | 	}
332 | 
333 | 	return pdf;
334 | 
335 | }
336 | 
337 | 
338 | /*
339 | initPDFObject() :: Initialize pdfObject object structure.
340 | parameters:
341 | - none
342 | returns: (struct pdfObject *)
343 | - the pdfObject pointer on success.
344 | - NULL on error.
345 | */
346 | struct pdfObject* initPDFObject(){
347 | 
348 | 	struct pdfObject* obj = NULL;
349 | 	
350 | 	
351 | 	if( !(obj = (struct pdfObject*)calloc(1,sizeof(struct pdfObject)) ) ){
352 | 		err_log("initPDFObject :: memory allocation failed\n");
353 | 		return NULL;
354 | 	}
355 | 	
356 | 	// Initialize entries
357 | 	obj->reference = NULL;
358 | 	obj->content = NULL;
359 | 	obj->dico = NULL;
360 | 	obj->type = NULL;
361 | 	obj->stream = NULL;
362 | 	obj->filters = NULL;
363 | 	obj->decoded_stream = NULL;
364 | 	obj->offset = 0;
365 | 	obj->next = NULL;
366 | 	obj->stream_size = 0;
367 | 	obj->tmp_stream_size = 0;
368 | 	obj->content_size = 0;
369 | 	obj->decoded_stream_size = 0;
370 | 	obj->errors = 0;
371 | 	
372 | 	return obj;
373 | 
374 | }
375 | 
376 | 
377 | /*
378 | initPDFTrailer() :: Initialize pdf trailer structure
379 | parameters:
380 | - none
381 | returns: (struct pdfTrailer *)
382 | - the pdfTrailer pointer on success.
383 | - NULL on error.
384 | */
385 | struct pdfTrailer* initPDFTrailer(){
386 | 
387 | 	struct pdfTrailer* trailer = NULL;
388 | 	
389 | 	if( !(trailer = (struct pdfTrailer *)calloc(1,sizeof(struct pdfTrailer)) ) ){
390 | 		err_log("initPDFTrailer :: memory allocation failed\n");
391 | 		return NULL;
392 | 	}
393 | 	
394 | 	// Initialize entries
395 | 	trailer->offset = 0;
396 | 	trailer->content = NULL;
397 | 	trailer->dico = NULL;
398 | 	trailer->next = NULL;
399 | 	
400 | 	return trailer;
401 | 
402 | }
403 | 
404 | 
405 | /*
406 | addTrailerInList() ::  add a trailer in the list of trailers
407 | parameters:
408 | - struct pdfDocument * pdf
409 | - struct pdfTrailer * trailer
410 | returns: (int)
411 | - 0 on success
412 | - -1 on error.
413 | */
414 | int addTrailerInList(struct pdfDocument * pdf, struct pdfTrailer * trailer){
415 | 
416 | 	struct pdfTrailer * tmp =  NULL;
417 | 
418 | 	if(pdf == NULL || trailer == NULL){		
419 | 		err_log("addTrailerInList :: invalid parameters\n");
420 | 		return -1;
421 | 	}
422 | 
423 | 	
424 | 	if(pdf->trailers == NULL){
425 | 		pdf->trailers = trailer;
426 | 	}else{
427 | 		
428 | 		tmp = pdf->trailers;
429 | 		while(tmp->next != NULL){
430 | 			tmp = tmp->next;	
431 | 		}
432 | 		tmp->next = trailer;
433 | 		
434 | 	}
435 | 
436 | 	return 0;
437 | }


--------------------------------------------------------------------------------
/tools/perl_poc/lib/analysis/ObjectAnalysis.pm:
--------------------------------------------------------------------------------
  1 | package ObjectAnalysis;
  2 | 
  3 | use strict;
  4 | 
  5 | use lib::conf::Config;
  6 | 
  7 | my $DEBUG = "no";
  8 | 
  9 | 
 10 | # This function analyzes uri (for example detect a path traversal pattern in URI object.)
 11 | # TODO to improve
 12 | sub URI_analysis{
 13 | 
 14 | 	my $obj_ref = shift;
 15 | 	
 16 | 	
 17 | 	#print "Warning :: URI_analysis :: $obj_ref->{ref}\n";
 18 | 	
 19 | 	
 20 | 	my $test = "../../../myPath";
 21 | 	#my $test = "..\..\..\myPath";
 22 | 	
 23 | 	
 24 | 	# Path traversal detection
 25 | 	#if($obj_ref->{uri} =~ /([\.\.\/|\.\.\\])+/){
 26 | 	#if($test =~ /(\.\.\/)+/){
 27 | 	if($obj_ref->{uri} =~ /(\.\.\/)+/){
 28 | 	#if($obj_ref->{uri} =~ /(\.\.\/|\.\.\\)+/){
 29 | 		print "Warning :: URI_analysis :: Found path traversal in $obj_ref->{ref} URI :: $obj_ref->{uri}\n";
 30 | 		
 31 | 		if(exists($main::TESTS_CAT_2{"Malicious URI"})){
 32 | 			$main::TESTS_CAT_2{"Malicious URI"} ++;
 33 | 		}else{
 34 | 			$main::TESTS_CAT_2{"Malicious URI"} =1;
 35 | 		}
 36 | 		
 37 | 	}
 38 | 	
 39 | 	
 40 | 	# potential dangerous pattern :: system32
 41 | 	if($obj_ref->{uri} =~ /(system32)+/){
 42 | 		print "Warning :: URI_analysis :: Found potentially dangerous pattern in $obj_ref->{ref} URI :: $obj_ref->{uri}\n";
 43 | 		#$main::%TESTS_CAT_2{"Malicious URI"} ++;
 44 | 		
 45 | 		if(exists($main::TESTS_CAT_2{"Malicious URI"})){
 46 | 			$main::TESTS_CAT_2{"Malicious URI"} ++;
 47 | 		}else{
 48 | 			$main::TESTS_CAT_2{"Malicious URI"} =1;
 49 | 		}
 50 | 	}
 51 | 	
 52 | 	
 53 | 
 54 | }
 55 | 
 56 | 
 57 | 
 58 | # The basic analysis consists to parse the content of object and detect all potential dangerous patterns.
 59 | # Returns "none" - "high" - "medium" - or "low"
 60 | sub DangerousKeywordsResearch{
 61 | 
 62 | 	# 
 63 | 	#$TESTS_CAT_2{"Dangerous Pattern High"} ;
 64 | 	#$TESTS_CAT_2{"Dangerous Pattern Medium"};
 65 | 	#$TESTS_CAT_2{"Dangerous Pattern Low"};
 66 | 
 67 | 	my ($obj_ref,$content) = @_;
 68 | 	
 69 | 	if(!$content){
 70 | 		#print "Error :: DangerousKeywordsResearch :: empty content\n";
 71 | 		return "none";
 72 | 	}
 73 | 	
 74 | 	
 75 | 	
 76 | 	# keywords (HIGH) :: HeapSpray - heap - spray - hack - shellcode - shell - Execute - exe - exploit - pointers - memory - exportDataObject -app.LaunchURL -byteToChar - system32  - payload
 77 | 	if( $content =~ /(HeapSpray|heap|spray|hack|shellcode|shell|Execute|pointers|byteToChar|system32|payload|console)/si ){
 78 | 		#$TESTS_CAT_2{"Dangerous Pattern High"} ++;
 79 | 		print "Dangerous Pattern \(High\) found :: $1 :: in $obj_ref->{ref} \n";
 80 | 		return "High";
 81 | 	}
 82 | 	
 83 | 	# Unicode detection
 84 | 	my @rep_unicode = ($content =~ /(\%u[a-f0-9]{4})/gi);
 85 | 	my $count = @rep_unicode;
 86 | 	print "unicode string = $count :: @rep_unicode\n" if ($count > 0);
 87 | 	
 88 | 	if($count > 10){
 89 | 		print "Warning :: DangerousKeywordsResearch :: Found unicode strings :: @rep_unicode\n";
 90 | 		return "High";
 91 | 	}
 92 | 	
 93 | 	# TODO combinaison between unicode and medium
 94 | 
 95 | 	
 96 | 	# Javascript keywords (MEDIUM) :: substring - toSring - split - eval - String.replace - unescape - exportDataObject - StringfromChar - util.print
 97 | 	if( $content =~ /(toString|substring|split|eval|addToolButton|String\.replace|unescape|exportDataObject|StringfromChar|util\.print)/si ){
 98 | 		#$TESTS_CAT_2{"Dangerous Pattern Medium"} ++;
 99 | 		print "Dangerous Pattern \(Medium\) found :: $1 :: in $obj_ref->{ref} \n";
100 | 		return "Medium";
101 | 	}
102 | 	
103 | 	
104 | 	# javascript keywords :: 
105 | 	# 
106 | 	# 
107 | 	# NOP detection "90"
108 | 	# 
109 | 	# %u... like   %u4141%u4141%u63a5%u4a80%u0000
110 | 	
111 | 	
112 | 	
113 | 	# TODO Look for JavaScript in XFA block Ex: <script name="ADBE::FileAttachment" contentType="application/x-javascript" ></script>
114 | 	
115 | 	return "none";
116 | }
117 | 
118 | 
# Detects the wide repetition of an unknown pattern inside an object's content.
#
# After stripping all whitespace, every window of $cpt (5) consecutive
# characters is taken as a candidate pattern and its (non-overlapping)
# occurrences in the content are counted.
#
# Parameter : the content to analyse (string).
# Returns   :
#    1  as soon as one pattern occurs more than $Config::MAX_REP_DETECTION times,
#    n  otherwise, the number of patterns occurring more than $nb_rep_max times
#       and more than twice the standard deviation of all the counts,
#    0  when the content is empty (or false) or nothing suspicious is found,
#   -1  when the scan takes longer than $Config::ANALYSIS_TIMEOUT seconds.
#
# Test1 files	:: unknown pattern repetition
# 618b5fcf762bc7397a22e568753858c9
# 6254e7e17d9796028bdc56ba81022617
# 6bffa8f1f0155a554fcdca6a1839576e
# 8e88d64028093d2ef6a633c83ee28e44
# b400e8d3635f91176e1d56a38e6aa590
# c8c39082dfca15d5ded02ca050a96112
# de8bcc90ecd0049a1ab4e5a5087359b4
# fa2ddb10d9184dba0f90c88b7786f6ec
sub Unknown_Pattern_Repetition_Detection{

	my $objcontent = shift;
	my $result = 0;
	my %h;                # pattern => number of occurrences in the content
	my $cpt = 5;          # length (in characters) of the patterns to look for
	my $nb_rep_max = 200; # minimal number of repetitions reported by the statistical pass

	if(!$objcontent){
		return 0;
	}

	my $start_time = time;

	# Remove all white characters for a better processing
	$objcontent =~ s/\s//g;

	# Offset of the last full window. The previous bound ($#a-$cpt) stopped one
	# window too early, so the final pattern of the content was never examined.
	my $last_start = length($objcontent) - $cpt;

	for (my $i = 0 ; $i <= $last_start ; $i++){

		my $pat = substr($objcontent, $i, $cpt);

		# Count each distinct pattern only once
		if(!exists($h{$pat})){

			# number of (non-overlapping) occurrences in the whole content
			my $count = () = $objcontent =~ /\Q$pat/g;
			$h{$pat} = $count;

			if($count > $Config::MAX_REP_DETECTION){
				print "FOUND = $pat => $count\n\n" unless $DEBUG eq "yes";
				$result ++;
				return $result;
			}
		}

		# Give up when the analysis of this content lasts too long
		if(time - $start_time > $Config::ANALYSIS_TIMEOUT){
			print "TIME_EXCEEDED\n";
			return -1;
		}
	}

	my $nb = keys %h;
	if($nb == 0){
		return 0;
	}

	my $sum = 0;
	$sum += $_ foreach values %h;

	# Standard deviation of the occurrence counts
	my $moyenne = $sum/$nb; # mean

	print "100% => $sum :: cpt =>  $cpt :: m => $moyenne \n" unless $DEBUG eq "no";

	my $var = 0; # variance
	foreach my $value (values %h){
		$var += ($value-$moyenne)*($value-$moyenne);
	}
	$var = $var/$nb;
	my $et = sqrt($var); # standard deviation

	print "moyenne = $moyenne :: nb = $nb :: variance = $var :: ecartype = $et\n" unless $DEBUG eq "no";

	# Report the patterns that are both frequent and statistical outliers
	while (my ($key, $value) = each %h)  {

		if($value > 2*$et  && $value > $nb_rep_max){
			print "FOUND = $key => $value :: \n\n" unless $DEBUG eq "yes";
			$result ++ ;
		}
	}

	return $result;

}
237 | 
238 | 
# Variant of the unknown-pattern repetition detection that records, for every
# 5-character pattern, the offsets at which it is repeated.
#
# For each window of $cpt characters (whitespace removed first) that is not yet
# in the table, the rest of the content is scanned from the first
# non-overlapping position; when at least one repetition is found the pattern
# is stored with the list of its offsets (first occurrence included).
#
# Parameter : the content to analyse (string).
# Returns   : the number of patterns seen more than 30 times and more than
#             twice the standard deviation of all the counts; nothing (empty
#             list / undef) when the content is empty or false.
sub Unknown_Pattern_Repetition_Detection__{

	my $objcontent = shift;
	my $result = 0;
	my %h;       # pattern => reference to the array of offsets where it occurs
	my $cpt = 5; # length (in characters) of the patterns to look for

	if(!$objcontent){
		return;
	}

	# Remove all white characters for a better processing
	$objcontent =~ s/\s//g;

	# Offset of the last full window. The previous bound ($#a-$cpt) stopped one
	# window too early, so the final pattern of the content was never examined.
	my $last_start = length($objcontent) - $cpt;

	for (my $i = 0 ; $i <= $last_start ; $i++){

		my $pat = substr($objcontent, $i, $cpt);

		# Pattern already recorded with all its offsets
		next if exists($h{$pat});

		for (my $j = $i+$cpt ; $j <= $last_start ; $j++){

			next if substr($objcontent, $j, $cpt) ne $pat;

			# First repetition: create the offset list with the first occurrence
			$h{$pat} = [ $i ] unless exists($h{$pat});

			# $j strictly increases and a pattern is scanned only once, so an
			# offset can never be recorded twice.
			# NOTE: the former "push($h{...}, $j)" on a scalar expression does
			# not compile on Perl >= 5.24; the reference is dereferenced explicitly.
			push @{ $h{$pat} }, $j;
		}
	}

	my $sum = 0;
	my $nb = 0;
	foreach my $offsets (values %h){
		$sum += scalar(@{$offsets});
		$nb ++;
	}

	# Standard deviation of the occurrence counts
	my $moyenne = 0; # mean
	my $var = 0;     # variance
	my $et = 0;      # standard deviation

	if($nb > 0){
		$moyenne = $sum/$nb;
	}
	print "100% => $sum :: cpt =>  $cpt :: m => $moyenne \n" unless $DEBUG eq "no";

	foreach my $offsets (values %h){
		my $rep = scalar(@{$offsets});
		$var += ($rep-$moyenne)*($rep-$moyenne);
	}

	if($nb > 0){
		$var = $var/$nb;
		$et = sqrt($var);
	}

	print "moyenne = $moyenne :: nb = $nb :: variance = $var :: ecartype = $et\n" unless $DEBUG eq "no";

	# Report the patterns that are both frequent and statistical outliers
	while (my ($key, $offsets) = each %h)  {

		my $rep = scalar(@{$offsets});

		if($rep > 2*$et  && $rep > 30){
			print "FOUND = $key => $rep\n\n" unless $DEBUG eq "yes";
			$result ++ ;
		}
	}

	return $result;

}
370 | 
371 | 
372 | 
373 | 
# Detects a shellcode or a long run of hexadecimal/decimal values inserted in
# an object's content.
#
# The content is stripped of its whitespace and tested against three
# independent patterns; each pattern that matches increments the result.
#
# Parameter : the content to analyse (string).
# Returns   : the number of patterns that matched (0 to 3); 0 when the content
#             is empty or false.
#
# Test2  files	:: shellcode or hexa insertion
# 5c08ea688165940008949a86805ff1d0
# 5f27adfa55628ea4674348351e241be8
# 73b0e8c5a7e5814c723295313ce0262d
# 75c1ae242d07bb738a5d9a9766c2a7de
# 7bcb4c9c35e01bd985f74aec66c19876
# 84d860a4c9e8d2baec983ef35789449a
# ab3f72df228715e6265cb222c586254e
# b823473c7206d64fa3ce20c4669b707d
# d785f43c523bf36d1678da84fa84617f
# edab6ed2809f739b67667e8fed689992
sub Shellcode_Detection{

	my $objcontent = shift;
	my $res = 0;

	if(!$objcontent){
		return 0;
	}

	# Remove white space for a better processing
	$objcontent =~ s/\s//g;

	# NOTE: the matches below deliberately do NOT use the /g modifier. In scalar
	# context /g makes a match start at the position where the previous
	# successful match on the same scalar ended, so a hit of one pattern used to
	# hide the hits of the following patterns located earlier in the content.

	# Shellcode detection, or repetition of 1-2 digit numbers separated by an element (,_/...)
	# 73b0e8c5a7e5814c723295313ce0262d
	# 5f27adfa55628ea4674348351e241be8
	# 5c08ea688165940008949a86805ff1d0
	# 7bcb4c9c35e01bd985f74aec66c19876
	# d785f43c523bf36d1678da84fa84617f
	# 75c1ae242d07bb738a5d9a9766c2a7de
	# ab3f72df228715e6265cb222c586254e
	# b823473c7206d64fa3ce20c4669b707d
	if( $objcontent =~ /(([\d]{1,2}[\/,%\$@^_]{1,2}){100})/i){
		print "\n\n:::TEST 2:::\n" unless $DEBUG eq "no";
		print "DANGEROUS PATTERN 1 FOUND !!\n" unless $DEBUG eq "no";
		$res ++;

		# TODO look for "split" pattern (or medium dangerous pattern)
	}

	# Same as above with numbers of any length and '-' accepted as separator
	if( $objcontent =~ /(([\d]{1,}[\/,%\$@^_-]{1,2}){100})/i){
		print "\n\n:::TEST 2:::\n" unless $DEBUG eq "no";
		print "DANGEROUS PATTERN 1.1 FOUND !!\n" unless $DEBUG eq "no";
		$res ++;
		print "$1\n" unless $DEBUG eq "no";

		# TODO look for "split" pattern (or medium dangerous pattern)
	}

	# Dash-separated hexadecimal values, e.g.
	# pat = 9804c-9686c7351c-7254c27757c-27643c18532c-18500c32447c-32352c28309c-28201c10773c-10724c12582c-12521c
	# 84d860a4c9e8d2baec983ef35789449a
	if( $objcontent =~ /(([\dABCDEF]{2,}[-]){100})/i){
		print "\n\n:::TEST 2:::\n" unless $DEBUG eq "no";
		print "DANGEROUS PATTERN 2 FOUND !!\n" unless $DEBUG eq "no";
		$res ++;
		print "$1\n" unless $DEBUG eq "no";
	}

	# edab6ed2809f739b67667e8fed689992
	#if( $objcontent =~ /([\d\/A-z,]{100})/ig){

	return $res;

}
457 | 
458 | 
459 | 
460 | 1;
461 | 


--------------------------------------------------------------------------------
/tools/perl_poc/lib/analysis/DocumentStruct.pm:
--------------------------------------------------------------------------------
  1 | package DocumentStruct;
  2 | 
  3 | use strict;
  4 | use MIME::Base64 ();
  5 | 
  6 | my $DEBUG = "no";
  7 | 
  8 | 
  9 | # Check the magic number of a PDF file 
 10 | sub CheckMagicNumber{
 11 | 
 12 | 	my $file_ref= shift;
 13 | 	my $file = $file_ref;
 14 | 	
 15 | 	my $len=8;
 16 | 	my $offset=0;
 17 | 	my $ver="undef";
 18 | 	
 19 | 	
 20 | 	
 21 | 	seek ($file, 0, 0);
 22 | 	read $file, $ver, $len, $offset or print "read failed :: $!\n";
 23 | 	
 24 | 	
 25 | 	if( $ver =~ /\%PDF-\d\.\d/){
 26 | 		print "PDF header : OK\n" unless $DEBUG eq "no";
 27 | 		
 28 | 		
 29 | 		# Check if there is several headers in file
 30 | 		seek ($file, 0, 0);
 31 | 		my $content = do { local $/; <$file>};
 32 | 		
 33 | 		my @pdf_headers = $content =~ /\%PDF-\d\.\d/sg;
 34 | 		my $num = @pdf_headers;
 35 | 		
 36 | 		if($num > 1){
 37 | 			print "Warning :: CheckMagicNumber :: There are $num pdf headers in this file\n";
 38 | 			$main::TESTS_CAT_1{"Multiple Headers"} = $num;
 39 | 		}
 40 | 		
 41 | 		return ($ver,"OK");
 42 | 	}
 43 | 	
 44 | 	# Check string <?xml version="1.0"?><?xfa ?><xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/"><pdf xmlns="http://ns.adobe.com/xdp/pdf/"><document><chunk>
 45 | 	seek ($file, 0, 0); 	# rewind file
 46 | 	my $content = do { local $/; <$file>};
 47 | 	#print "$content\n";
 48 | 	if($content =~ /<xdp:xdp\sxmlns:xdp=\"http:\/\/ns\.adobe\.com\/xdp\/"><pdf\sxmlns=\"http:\/\/ns\.adobe\.com\/xdp\/pdf\/\"><document><chunk>(.*)<\/chunk><\/document><\/pdf>/si){
 49 | 	#if($content =~ /<chunk>(.*)<\/chunk>/si){
 50 | 		print "This document is an XML Data Package (XDP)\n" unless $DEBUG eq "no";
 51 | 		my $chunkContent = $1;
 52 | 		#print "chunkContent = $chunkContent\n";
 53 | 		
 54 | 		#decode base64 content
 55 | 		my $decodedContent = MIME::Base64::decode($chunkContent) or print "Error while decoding base64 :: $!\n";
 56 | 		#print "decoded content = $decodedContent\n";
 57 | 		
 58 | 		# write content in a new file
 59 | 		close($file);
 60 | 		open $file, ">tmp.pdf" or die "open failed in tmp.pdf : $! ";
 61 | 		binmode $file;
 62 | 		print $file $decodedContent;
 63 | 		#print "file handle = $file\n";
 64 | 		close($file);
 65 | 		open $file, "<tmp.pdf" or die "open failed in tmp.pdf : $! ";
 66 | 		binmode $file;
 67 | 		#print "file handle 2 = $file\n";
 68 | 		#print "$file;
 69 | 		
 70 | 		seek ($file, 0, 0);
 71 | 		read $file, $ver, $len, $offset or print "read failed :: $!\n";
 72 | 		
 73 | 		if( $ver =~ /\%PDF-\d\.\d/){
 74 | 			print "PDF header : OK\n" unless $DEBUG eq "no";
 75 | 			return ($ver,"XDP_FILE");
 76 | 		}
 77 | 		
 78 | 		return ($ver,"BAD_MAGIC");
 79 | 		
 80 | 		
 81 | 	}
 82 | 	
 83 | 	return ($ver, "BAD_MAGIC");
 84 | }
 85 | 
 86 | 
 87 | # This function check if the pdf document is empty with only active content (js,embedded_file,openaction etc.)
 88 | # TODO to improve
 89 | sub Empty_Pages_Document_detection__old{
 90 | 
 91 | 	#my $ref = shift;
 92 | 	my $pdfObjects = shift;
 93 | 	
 94 | 	#print "DEBUG = $ref\n";
 95 | 	
 96 | 	#my %pdfObjects = %{$ref};
 97 | 	
 98 | 	
 99 | 	my $ret=0;
100 | 	my $numPages =0; # Number of pages found
101 | 	my $active_content =0; # Number of js, embedded files
102 | 	
103 | 	print "\n\n ::: Empty Pages With Active Content detection ::: \n" unless $DEBUG eq "no";
104 | 	
105 | 	my @objs = values(%{$pdfObjects});
106 | 	foreach(@objs){
107 | 	
108 | 		
109 | 		if( exists($_->{"type"}) && $_->{"type"} eq "/Pages" ){
110 | 		
111 | 			print "FOUND Pages object :: $_->{ref} :: \n" unless $DEBUG eq "yes";
112 | 			
113 | 			# Get kid node pages
114 | 			my @pages = $_->{"kids"} =~ /(\d+\s\d\sR)/sg;
115 | 			#print @pages;
116 | 				
117 | 			foreach(@pages){
118 | 				my $page_ref = $_;
119 | 				$page_ref =~ s/R/obj/;
120 | 				print "page ref = $page_ref\n";
121 | 				
122 | 				# if the page exists and the /Content parameter is set
123 | 				if(exists($pdfObjects->{$page_ref}) && exists($pdfObjects->{$page_ref}->{"pagecontent"})  ){
124 | 				
125 | 					# Check if it's not an empty content
126 | 					#my $p_content = $pdfObjects{$page_ref}->{"pagecontent"};
127 | 					
128 | 					
129 | 					# If the Contents fiels is an array
130 | 					my @pcontents = $pdfObjects->{$page_ref}->{"pagecontent"} =~ /(\d+\s\d\sR)/sg;
131 | 					
132 | 					foreach (@pcontents){
133 | 					
134 | 						my $content_page_obj = $_;
135 | 						$content_page_obj =~ s/R/obj/;
136 | 						
137 | 						print ":: page content = $content_page_obj :: \n";#" $pdfObjects{$contentp}->{content}\n";
138 | 						
139 | 						if(exists($pdfObjects->{$content_page_obj}) && exists($pdfObjects->{$content_page_obj}->{"stream"}) && length($pdfObjects->{$content_page_obj}->{"stream"}) > 0  ){
140 | 							$ret ++;
141 | 							print "Page $page_ref is not empty => OK\n"unless $DEBUG eq "no";
142 | 						
143 | 						}elsif(! exists($pdfObjects->{$content_page_obj})){
144 | 							print "Warning : Content Object ($content_page_obj) of page $page_ref doesn\'t exist\n" unless $DEBUG eq "no";
145 | 							
146 | 						}elsif( exists($pdfObjects->{$content_page_obj}->{content}) ){
147 | 						
148 | 							# Trigger the case when the object represents an array of objects Ex: [422 0 R 423 0 R 424 0 R 425 0 R 426 0 R 427 0 R 428 0 R 429 0 R]
149 | 							
150 | 							my @content_page_array = $pdfObjects->{$content_page_obj}->{"content"} =~ /(\d+\s\d\sR)/sg;
151 | 							
152 | 							foreach(@content_page_array){
153 | 								
154 | 								my $content_page_obj_2 = $_;
155 | 								$content_page_obj_2 =~ s/R/obj/;
156 | 								#print " Found obj :: $content_page_obj_2\n";
157 | 								
158 | 								if(exists($pdfObjects->{$content_page_obj_2})){
159 | 								
160 | 									# TODEBUG print "";
161 | 									my $test = $pdfObjects->{$content_page_obj_2} ;
162 | 									
163 | 									print "DEBUG ::  ".$pdfObjects->{$content_page_obj_2}->{stream}."\n";
164 | 								
165 | 									if( exists($pdfObjects->{$content_page_obj_2}->{"stream"}) && length($pdfObjects->{$content_page_obj_2}->{"stream"}) > 0 ){
166 | 										$ret ++;
167 | 										print "Found content of the page $page_ref in obj $content_page_obj_2 => OK\n"unless $DEBUG eq "no";
168 | 									}else{
169 | 										print "Warning :: Page content Object ($content_page_obj_2) is empty !!!!\n" unless $DEBUG eq "yes";
170 | 									}
171 | 								
172 | 								}else{
173 | 									print "Warning :: Empty_Pages_Document_detection :: Page content Object ($content_page_obj_2) is not defined\n" unless $DEBUG eq "yes";
174 | 								}	
175 | 							}
176 | 							
177 | 						
178 | 						}else{
179 | 							print "Warning :: Empty_Pages_Document_detection :: The Stream of the Content Object is empty\n" unless $DEBUG eq "yes";
180 | 							
181 | 						}	
182 | 					
183 | 					}
184 | 				
185 | 					
186 | 				}elsif(! exists($pdfObjects->{$page_ref})){
187 | 					print "Warning :: Empty_Pages_Document_detection :: Page $page_ref does\'nt exist.\n" unless $DEBUG eq "o";
188 | 				}else{
189 | 					print "Warning :: Empty_Pages_Document_detection :: Page $page_ref is empty\n" unless $DEBUG eq "o";
190 | 				}
191 | 				
192 | 			
193 | 			}
194 | 			
195 | 		}
196 | 		
197 | 		# TODO Verify that the number of treated pages is the number of pages in the document.
198 | 	
199 | 	}
200 | 	
201 | 	return $ret;
202 | 	
203 | }
204 | 
205 | 
# Checks whether the pages of the document carry some content.
#
# Walks every /Pages object, follows its kids and, for every kid of type
# /Page, the objects referenced by its page content, counting the content
# objects that carry a non-empty stream. A content object without stream may
# itself hold an array of references (Ex: [422 0 R 423 0 R 424 0 R]) which is
# followed one level.
#
# Parameter : reference to the hash of the document objects ("N G obj" => object).
# Returns   : the number of non-empty page content streams found (0 means that
#             no page with content was found).
sub Empty_Pages_Document_detection{

	my $pdfObjects = shift;
	my $ret = 0;

	print "\n\n ::: Empty Pages With Active Content detection ::: \n" unless $DEBUG eq "no";

	foreach my $node (values(%{$pdfObjects})){

		# Only the page tree nodes are of interest
		next unless exists($node->{"type"}) && $node->{"type"} eq "/Pages";

		print "FOUND Pages object :: $node->{ref} :: \n" unless $DEBUG eq "no";

		# A page tree node without kids has nothing to follow
		next unless defined($node->{"kids"});

		# Get kid node pages
		my @kids = $node->{"kids"} =~ /(\d+\s\d\sR)/sg;

		foreach my $kid (@kids){

			(my $page_ref = $kid) =~ s/R/obj/;

			# The existence test MUST come before any access to the fields of
			# the page: reading $pdfObjects->{$page_ref}->{"type"} first used to
			# autovivify an empty object for every dangling reference, which
			# polluted the object table and made this warning unreachable.
			# NOTE(review): the "o" compared to $DEBUG here and below looks like
			# a typo ("no"?); kept as is so that these warnings are still printed.
			if(! exists($pdfObjects->{$page_ref})){
				print "Warning :: Empty_Pages_Document_detection :: Page $page_ref does\'nt exist.\n" unless $DEBUG eq "o";
				next;
			}

			# Kids that are not leaf pages (e.g. intermediate /Pages nodes) are ignored here
			my $kid_type = $pdfObjects->{$page_ref}->{"type"};
			next unless defined($kid_type) && $kid_type eq "/Page";

			# The page exists but its content parameter is not set
			if(! exists($pdfObjects->{$page_ref}->{"pagecontent"})){
				print "Warning :: Empty_Pages_Document_detection :: Page $page_ref is empty\n" unless $DEBUG eq "o";
				next;
			}

			# The page content may be a single reference or an array of references
			my @content_refs = $pdfObjects->{$page_ref}->{"pagecontent"} =~ /(\d+\s\d\sR)/sg;

			foreach my $content_ref (@content_refs){

				(my $content_obj = $content_ref) =~ s/R/obj/;

				if(! exists($pdfObjects->{$content_obj})){
					print "Warning : Content Object ($content_obj) of page $page_ref doesn\'t exist\n" unless $DEBUG eq "yes";
					next;
				}

				# Content object with a non-empty stream
				if(exists($pdfObjects->{$content_obj}->{"stream"}) && length($pdfObjects->{$content_obj}->{"stream"}) > 0){
					$ret ++;
					print "Page $page_ref is not empty => OK\n"unless $DEBUG eq "no";
					next;
				}

				if(! exists($pdfObjects->{$content_obj}->{content})){
					print "Warning :: Empty_Pages_Document_detection :: The Stream of the Content Object is empty\n" unless $DEBUG eq "yes";
					next;
				}

				# Case where the object represents an array of objects Ex: [422 0 R 423 0 R 424 0 R 425 0 R]
				my @sub_refs = $pdfObjects->{$content_obj}->{"content"} =~ /(\d+\s\d\sR)/sg;

				foreach my $sub_ref (@sub_refs){

					(my $sub_obj = $sub_ref) =~ s/R/obj/;

					if(! exists($pdfObjects->{$sub_obj})){
						print "Warning :: Empty_Pages_Document_detection :: Page content Object ($sub_obj) is not defined\n" unless $DEBUG eq "yes";
						next;
					}

					if( exists($pdfObjects->{$sub_obj}->{"stream"}) && length($pdfObjects->{$sub_obj}->{"stream"}) > 0 ){
						$ret ++;
						print "Found content of the page $page_ref in obj $sub_obj => OK\n"unless $DEBUG eq "no";
					}else{
						print "Warning :: Page content Object ($sub_obj) is empty \n" unless $DEBUG eq "yes";
					}
				}
			}
		}

		# TODO Verify that the number of treated pages is the number of pages in the document.
	}

	return $ret;

}
321 | 
322 | 
323 | 
324 | 
325 | 
326 | 
# Checks that the cross-reference table of the document is conform.
#
# The offset given after "startxref" in the trailer must point either to a
# classic "xref" table or to an object of type /XRef (cross-reference stream,
# not decoded here). For a classic table, every entry is checked: the object
# "<id> 0 obj" must be found at the offset given by its in-use entry. The
# offset of each verified object is stored in its "offset" field.
#
# Parameters :
#   $trailer        - text of the trailer (must contain startxref <n> %%EOF),
#   $fh             - file handle opened on the document,
#   $pdfObjects_ref - reference to the hash of the document objects.
# Returns    : 1 when the table is conform, 0 otherwise.
# TODO return 0 if failed and 1 if success, and the error status
sub Check_xref{

	my ($trailer, $fh, $pdfObjects_ref) = @_;
	my $res;

	# Shallow copy: the object hashes themselves are shared with the caller, so
	# the "offset" fields written below are visible to it.
	my %pdfObjects = %{$pdfObjects_ref};

	# Get the startxref offset in the trailer
	my $xref_offset;
	if ($trailer =~ /startxref\s*(\d+)\s*%%EOF/){
		$xref_offset = $1;
	}else{
		return 0;
	}
	print "\nxref_offset = $xref_offset\n" unless $DEBUG eq "no";

	# Test the "xref" keyword (4 characters) at that offset
	seek ($fh, $xref_offset, 0);
	read ($fh, $res, 4) or print "Check_xref :: read failed :: $!\n";
	$res = "" unless defined($res);
	print "res = $res\n" unless $DEBUG eq "no";

	if($res ne "xref"){ # Test for a cross-reference stream object

		seek ($fh, $xref_offset, 0);
		read ($fh, $res, 10) or print "Check_xref :: read failed :: $!\n";
		$res = "" unless defined($res);
		print "res2 = $res\n" unless $DEBUG eq "no";

		# TODO decode xref stream.
		if($res =~ /^(\d+\s\d\sobj)/){

			# Check that the object really is a XRef type object
			my $obj_ref = $1;
			my $is_xref = exists($pdfObjects{$obj_ref})
				&& defined($pdfObjects{$obj_ref}->{"type"})
				&& $pdfObjects{$obj_ref}->{"type"} eq "/XRef";

			return $is_xref ? 1 : 0;
		}

		# Bad xref offset
		return 0;
	}

	# Get the xref entries: read byte by byte up to the "trailer" keyword.
	# Reaching the end of the file first means that the table is not
	# terminated: stop instead of looping forever on a truncated document.
	my $xref_content = $res;
	until( $xref_content =~ /trailer$/ ){

		my $byte;
		if(! read ($fh, $byte, 1)){
			print "Check_xref :: read failed :: $!\n";
			return 0;
		}
		$xref_content .= $byte;
	}

	print "$xref_content\n" unless $DEBUG eq "no";

	# The table is made of subsections. Each one starts with a line
	#   <number of the first object> <number of entries>
	# followed by one entry per object:
	#   nnnnnnnnnn ggggg n eol
	# nnnnnnnnnn is a 10-digit byte offset
	# ggggg is a 5-digit generation number
	# n (in use) or f (free) is the type of the entry
	my $id = 0; # number of the object described by the current entry
	while($xref_content =~ /(\d{10})\s(\d{5})\s([fn])|(?<=[\r\n])(\d+)[ ]+(\d+)[ \t]*(?=[\r\n])/g){

		# Subsection header: the following entries are numbered from its first value
		if(! defined($1)){
			$id = $4 + 0; # numeric value, without any leading zero
			print "$4 :: $5\n\n" unless $DEBUG eq "no";
			next;
		}

		my ($off, $gen, $free) = ($1, $2, $3);

		# The length of "<id> 0 obj" depends on the number of digits of the id
		my $len = 6+length($id);

		seek ($fh, $off, 0);
		read ($fh, $res, $len) or print "Check_xref :: read failed :: off=$off :: len=$len\n";
		$res = "" unless defined($res);
		chomp $res;

		if($res =~ /($id\s0\sobj)/){

			# save the object's offset
			my $obj_ref = $1;
			if(exists($pdfObjects{$obj_ref}) ){
				print "object $obj_ref is at offset $off\n" unless $DEBUG eq "no";
				$pdfObjects{$obj_ref}->{"offset"} = $off ;
			}

		}elsif($free eq "n"){
			# An in-use entry must point to its object; free entries are not checked
			print "WRONG Object offset :: $id $gen obj :: offset $off\n"unless $DEBUG eq "yes";
			return 0;
		}

		$id ++;
	}

	return 1;
}
457 | 
458 | 
459 | 
460 | 
461 | 
462 | 1;
463 | __END__
464 | 


--------------------------------------------------------------------------------
/tools/perl_poc/lib/utils/CleanRewriting.pm:
--------------------------------------------------------------------------------
  1 | package CleanRewriting;
  2 | 
  3 | use strict;
  4 | 
  5 | use lib::utils::Filters;
  6 | use File::Basename;
  7 | 
  8 | #use bytes;
  9 | 
 10 | my $DEBUG = "no";
 11 | 
 12 | 
 13 | # This function remove the JavaScript content of an object (not an object stream)
 14 | sub RemoveJSContentFromObj{
 15 | 
 16 | 	my ($obj,$pdfObjects) = @_;
 17 | 	
 18 | 		
 19 | 	print "The object is at offset $pdfObjects->{$obj}->{objStmOff} in object stream\n";	
 20 | 		
 21 | 		
 22 | 	my $len = length($pdfObjects->{$obj}->{js});
 23 | 		
 24 | 	print "js len = $len\n";
 25 | 	
 26 | 	
 27 | 	my $comment = "";
 28 | 	my $pat= $pdfObjects->{$obj}->{content};
 29 | 	for(my $i =0; $i <$len-2 ; $i++){
 30 | 		$comment.=" ";
 31 | 	}
 32 | 	$comment="(".$comment.")";
 33 | 		
 34 | 	# Get the offset of the js content
 35 | 	my $off = index($pdfObjects->{$obj}->{content}, $pdfObjects->{$obj}->{js});
 36 | 		
 37 | 		
 38 | 	print "verif1 :: $pdfObjects->{$obj}->{content}\n";
 39 | 	
 40 | 	# Replace js content by empty string
 41 | 	substr($pdfObjects->{$obj}->{content},$off,$len,$comment);
 42 | 	
 43 | 	print "verif2 :: $pdfObjects->{$obj}->{content}\n";
 44 | 		
 45 | 	return 0;			
 46 | }
 47 | 
 48 | sub RemoveJSContentFromXFA{
 49 | 
 50 | 	my ($obj,$pdfObjects) = @_;
 51 | 	
 52 | 	#print " Content = $pdfObjects->{$obj}->{stream_d}\n\n" if $obj eq "26 0 obj";
 53 | 	
 54 | 	# <script contentType="application/x-javascript"></script>
 55 | 	# 
 56 | 	#my @js_content = $pdfObjects->{$obj}->{stream_d} =~ /(javascript)/gi ;
 57 | 	#my @js_content = $pdfObjects->{$obj}->{stream_d} =~ /(<script contentType="application\/x-javascript"\s*>(.*)<\/script\s*>)/gi ; 
 58 | 	#my @js_content = $pdfObjects->{$obj}->{stream_d} =~ /(<script contentType="application\/x-javascript"\s*>)/gi ; 
 59 | 	#my @js_content = $pdfObjects->{$obj}->{stream_d} =~ /(<script contentType="application\/x-javascript"\s*>(.*)<\/script\s*>)/ig;
 60 | 	#my @js_content = $pdfObjects->{$obj}->{stream_d} =~ /(<script contentType="application\/x-javascript"\s*>(.*?)<\/script)/sig;
 61 | 	#my @js_content = $pdfObjects->{$obj}->{content} =~ /(<script contentType="application\/x-javascript"\s*>.*?<\/script\s*>)/sig;
 62 | 	my @js_content; # = $pdfObjects->{$obj}->{content} =~ /(<script contentType="application\/x-javascript"\s*>.*?<\/script\s*>)/sig;
 63 | 	
 64 | 	if( exists($pdfObjects->{$obj}->{"stream_d"})){
 65 | 		#print "stream_d == $pdfObjects->{$obj}->{stream_d} \n";
 66 | 		@js_content = $pdfObjects->{$obj}->{"stream_d"} =~ /(<script contentType="application\/x-javascript"\s*>.*?<\/script\s*>)/sig;
 67 | 	}else{
 68 | 		@js_content = $pdfObjects->{$obj}->{"stream"} =~ /(<script contentType="application\/x-javascript"\s*>.*?<\/script\s*>)/sig;
 69 | 	}
 70 | 	
 71 | 		
 72 | 	print "js_content = $#js_content\n";
 73 | 	
 74 | 	if($#js_content < 0 ){
 75 | 		print "Error :: RemoveJSContentFromXFA :: No JavaScript balise found in XFA form\n";
 76 | 		return -1;
 77 | 	}
 78 | 	
 79 | 	foreach(@js_content){
 80 | 		
 81 | 		print "JS_CONTENT == $_\n\n";
 82 | 		
 83 | 		# locate de content in the stream
 84 | 		#my $off = index($pdfObjects->{$obj}->{$content}, $pdfObjects->{$obj}->{js});
 85 | 		
 86 | 		if( exists($pdfObjects->{$obj}->{"stream_d"})){
 87 | 		
 88 | 			# TODO if the object is encoded
 89 | 			print "TODO if the stream is encoded\n";
 90 | 			my $old_content = $pdfObjects->{$obj}->{content};
 91 | 			
 92 | 			$pdfObjects->{$obj}->{"stream_d"} =~ s/\Q$_\E//sig;
 93 | 			
 94 | 			my $stream = &Filters::FlateEncode($pdfObjects->{$obj}->{"stream_d"});
 95 | 			
 96 | 			$pdfObjects->{$obj}->{content} = $pdfObjects->{$obj}->{"ref"}."\r<<".$pdfObjects->{$obj}->{"dico"}.">>stream"."\r\n".$stream."\r\nendstream\rendobj";
 97 | 			
 98 | 		}else{
 99 | 		
100 | 			my $old_content = $pdfObjects->{$obj}->{content};
101 | 			$pdfObjects->{$obj}->{content} =~ s/\Q$_\E//sig;
102 | 
103 | 		}
104 | 
105 | 		#if($old_content eq $pdfObjects->{$obj}->{content}){
106 | 		#	print "ERR :: No modification on content :: $_ :: $old_content\n";
107 | 		#}
108 | 		
109 | 
110 | 	}
111 | 	
112 | 	
113 | 	
114 | 
115 | }
116 | 
117 | 
118 | # This function remove an object given in parameter from an object stream
119 | sub RemoveObjectFromObjStream{
120 | 
121 | 	my ($objStm,$obj,$pdfObjects) = @_;
122 | 	
123 | 	
124 | 	if(exists($pdfObjects->{$objStm}->{stream_d})){
125 | 	
126 | 		print "The object is at offset $pdfObjects->{$obj}->{objStmOff} in object stream\n";
127 | 		
128 | 			
129 | 		my $len = length($pdfObjects->{$obj}->{js});
130 | 		
131 | 		print "len = $len\n";
132 | 		
133 | 		my $comment = "";
134 | 		my $pat= $pdfObjects->{$obj}->{content};
135 | 		for(my $i =0; $i <$len-2 ; $i++){
136 | 			$comment.=" ";
137 | 		}
138 | 		$comment="(".$comment.")";
139 | 		my $com_len = length($comment);
140 | 		
141 | 		my $js = $pdfObjects->{$obj}->{js};
142 | 		#print "Replacing :: $js :: in :: $pat :: by :: $comment\n";
143 | 		
144 | 		# search the offset of the js code
145 | 		my $off = index($pdfObjects->{$objStm}->{stream_d}, $js ,$pdfObjects->{$obj}->{objStmOff} );
146 | 		
147 | 		print "js to replace = $pdfObjects->{$obj}->{js} :: $com_len \n";
148 | 		my $verif = substr($pdfObjects->{$objStm}->{stream_d},$off,$len );
149 | 		print "verif :: to replace  = $verif\n";
150 | 		$verif = substr( $pdfObjects->{$objStm}->{stream_d}, $pdfObjects->{$obj}->{objStmOff}, length($pdfObjects->{$obj}->{content}) );
151 | 		print "verif2 :: rplace in  = $verif\n";
152 | 		
153 | 		
154 | 		my $length_d = length($pdfObjects->{$objStm}->{stream_d});
155 | 		# Comment the content by =>  (//)
156 | 		substr($pdfObjects->{$objStm}->{stream_d},$off,$len,$comment);
157 | 		
158 | 		#$off = index($pdfObjects->{$obj}->{content}, $js);
159 | 		# 
160 | 		#substr($pdfObjects->{$obj}->{content},$off,$len,$comment);
161 | 		
162 | 		substr($pdfObjects->{$objStm}->{stream_d},$pdfObjects->{$obj}->{objStmOff},$len,$comment);
163 | 
164 | 
165 | 		
166 | 		$verif = substr($pdfObjects->{$objStm}->{stream_d},$pdfObjects->{$obj}->{objStmOff},length($pdfObjects->{$obj}->{content}) );
167 | 		print "verif3 :: res = $verif\n";
168 | 		print "verifx :: length_d = $length_d :: length_after ".length($pdfObjects->{$objStm}->{stream_d})."\n" if ($length_d != length($pdfObjects->{$objStm}->{stream_d}));
169 | 		# Apply filter mention in the content
170 | 		
171 | 		# Get filters
172 | 		print " Filters  :: :$pdfObjects->{$objStm}->{filters}:\n";
173 | 		if($pdfObjects->{$objStm}->{"filters"}  =~ /FlateDecode/){
174 | 		
175 | 			print "Encode stream_d with Flate encoding\n";
176 | 			
177 | 			###################
178 | 			#my $test = &Filters::FlateEncode("Hello");
179 | 			#my $test2 = &Filters::FlateEncode("He  o");
180 | 			
181 | 			#print "DEBUG_TEST :: $test :: $test2 :: ".length($test)." :: ".length($test2)."\n";
182 | 			###################
183 | 			
184 | 			my $stream = &Filters::FlateEncode($pdfObjects->{$objStm}->{stream_d});
185 | 			my $oldstream = $pdfObjects->{$objStm}->{stream};
186 | 			$pdfObjects->{$objStm}->{stream} = $stream; 
187 | 			
188 | 			print " verif4 :: stream length old : new ".length($oldstream)." :: ".length($stream)."\n";
189 | 						
190 | 			
191 | 			#replace stream in content;
192 | 			#$pdfObjects->{$objStm}->{stream} = s/\Q$oldstream\E/$stream\n/s;
193 | 			
194 | 			#$pdfObjects->{$objStm}->{content} =~ s///;
195 | 			
196 | 			# Get index of stream in content
197 | #			my $old_content = $pdfObjects->{$objStm}->{content} ;
198 | #			my $ind = bytes::index ($pdfObjects->{$objStm}->{content}, $pdfObjects->{$objStm}->{stream}, $pdfObjects->{$objStm}->{offset});
199 | #			print "verif z :: $ind ".substr ($pdfObjects->{$objStm}->{content} , $ind, bytes::length($pdfObjects->{$objStm}->{stream}))."\n";
200 | #			substr ($pdfObjects->{$objStm}->{content} , $ind, bytes::length($pdfObjects->{$objStm}->{stream}), $stream);
201 | #			
202 | #			my $diff = length($stream) - length($pdfObjects->{$objStm}->{stream});
203 | #			if($diff > 0){
204 | #				my @objs = values(%{$pdfObjects});
205 | #				#my $obj_off = $pdfObjects->{$objStm}->{offset};
206 | #				foreach(@objs){
207 | #					if(exists($_->{offset}) && $_->{offset} > $pdfObjects->{$objStm}->{offset} ){
208 | #						$_->{offset}+=($diff+1);
209 | #					}
210 | #				}
211 | #			}
212 | 			
213 | 			
214 | 			
215 | 			my $content = $pdfObjects->{$objStm}->{content};
216 | 			
217 | 			# Build new content
218 | 			$pdfObjects->{$objStm}->{content} = $pdfObjects->{$objStm}->{"ref"}."\r<<".$pdfObjects->{$objStm}->{"dico"}.">>stream"."\r\n".$stream."\r\nendstream\rendobj";
219 | 			
220 | 		
221 | 			#my $diff = bytes::length($stream) - bytes::length($oldstream);
222 | 			my $diff = bytes::length($pdfObjects->{$objStm}->{content}) - bytes::length($content);
223 | 			print "length new = ".length($pdfObjects->{$objStm}->{content})." :: ".length($content)." :: diff = $diff\n";
224 | 			if($diff > 0){
225 | 				my @objs = values(%{$pdfObjects});
226 | 				#my $obj_off = $pdfObjects->{$objStm}->{offset};
227 | 				foreach(@objs){
228 | 					if(exists($_->{offset}) && $_->{offset} > $pdfObjects->{$objStm}->{offset} ){
229 | 						$_->{offset}+=($diff);
230 | 					}
231 | 				}
232 | 			}
233 | 				
234 | 				
235 | 		}else{
236 | 			print "ERROR :: RemoveObjectFromObjStream :: another filter used :: $pdfObjects->{$objStm}->{filters} \n";
237 | 		}
238 | 		
239 | 		
240 | 		
241 | 	}
242 | 	
243 | 	
244 | 	
245 | 	
246 | 	
247 | }
248 | 
249 | 
250 | 
# RemoveModifyDangerousObjects() :: neutralise the active content found in the parsed objects.
#
# Parameters:
#   $pdfObjects - hash ref of the parsed objects, keyed by object reference ("N G obj").
# Returns:
#   the number of active-content items met (objects holding JavaScript,
#   /EmbeddedFile objects, XFA streams mentioning javascript).
#
# Handling:
#   - js_obj : the script lives in another object; the content of that object is
#              replaced by an empty "N G obj ... endobj" body.
#   - js     : inline script string; removed through RemoveJSContentFromObj() when the
#              object is packed in an object stream, otherwise cut out of the content.
#   - /EmbeddedFile objects are only counted, nothing is removed.
#   - xfa    : every referenced object whose stream mentions "javascript" is handed
#              to RemoveJSContentFromXFA().
sub RemoveModifyDangerousObjects{

	my $pdfObjects = shift;
	my $active_content = 0;

	# Work on a snapshot of the values: the hash entries are modified while looping.
	my @objs = values(%{$pdfObjects});

	foreach my $obj (@objs){

		# --- JavaScript ---
		if( exists($obj->{"js"}) or exists($obj->{"javascript"}) or exists($obj->{"js_obj"}) ){

			print "Warning :: RemoveModifyDangerousObjects :: Found javascript in  $obj->{ref} :: \n" unless $DEBUG eq "no";

			if(exists($obj->{"js_obj"})){

				# The script is described in another object: turn "N G R" into "N G obj".
				my $js_obj = $obj->{"js_obj"};
				$js_obj =~ s/R/obj/;

				# Only erase an object that really exists; assigning through a missing
				# key would autovivify a phantom object that gets written to the output.
				if(exists($pdfObjects->{$js_obj})){
					print "Deleting javascript content of object = $js_obj :: \n";
					$pdfObjects->{$js_obj}->{content} = "$js_obj\nendobj";
				}else{
					print "Warning :: RemoveModifyDangerousObjects :: javascript object $js_obj not found\n" unless $DEBUG eq "no";
				}

			}elsif(exists($obj->{"js"})){ # the script is an inline string

				print "javascript content :: $obj->{js} \n" unless $DEBUG eq "no";

				if( exists($obj->{"objStm"}) ){

					# The object is packed in an object stream.
					print "javascript object $obj->{ref} in Object stream $obj->{objStm}\n" unless $DEBUG eq "no";
					&RemoveJSContentFromObj($obj->{"ref"}, $pdfObjects);

				}else{
					$pdfObjects->{$obj->{"ref"}}->{content} =~ s/\Q$obj->{js}\E//;
				}
			}

			$active_content ++;
			print "\n\n";
		}

		# --- Embedded files (counted only) ---
		if( exists($obj->{"type"}) && $obj->{"type"} eq "/EmbeddedFile" ){
			$active_content ++;
		}

		# --- XFA forms ---
		if(exists($obj->{"xfa"}) ){

			# The xfa entry may reference several objects.
			my @xfas = $obj->{"xfa"} =~ /(\d+\s\d\sR)/sg;

			foreach my $xfa_ref (@xfas){

				my $xfa = $xfa_ref;
				$xfa =~ s/R/obj/;
				print "found XFA obj :: $xfa\n" unless $DEBUG eq "no";

				next unless exists($pdfObjects->{$xfa});
				my $xfa_obj = $pdfObjects->{$xfa};

				# Prefer the decoded stream; fall back on the raw stream.
				if(exists($xfa_obj->{"stream_d"}) && length($xfa_obj->{"stream_d"})>0 ){

					# e.g. <script contentType='application/x-javascript'>
					if($xfa_obj->{"stream_d"} =~ /(javascript)/si){
						print "Warning :: $1 :: RemoveModifyDangerousObjects :: Found javaScript in XFA (stream_d): $xfa\n" unless $DEBUG eq "no";
						$active_content ++;
						&RemoveJSContentFromXFA($xfa,$pdfObjects);
					}

				}elsif(exists($xfa_obj->{"stream"}) && length($xfa_obj->{"stream"})>0){

					if($xfa_obj->{"stream"} =~ /javascript/si){
						print "Warning :: RemoveModifyDangerousObjects :: found javaScript in XFA (stream):: $xfa\n" unless $DEBUG eq "no";
						$active_content ++;
						&RemoveJSContentFromXFA($xfa,$pdfObjects);
					}
				}
			}
		}
	}

	return $active_content;

}
369 | 
# Rewrite_clean() :: write a sanitised copy of the document into
# "Cleaned_PDF/clean_<basename of $filename>".
#
# Parameters:
#   $filename   - path of the analysed PDF (only its basename is used).
#   $version    - header line written first in the output file.
#   $pdfObjects - hash ref of the parsed objects; RemoveModifyDangerousObjects() is
#                 applied to it before anything is written.
#   @trailers   - accepted for interface compatibility; not used, a new trailer is generated.
#
# The output holds: the header, every object except the /XRef and /ObjStm ones,
# a freshly built cross-reference table and a trailer.
# Dies when the output file cannot be created (the "Cleaned_PDF" directory is not
# created here).
sub Rewrite_clean{

	my ($filename, $version, $pdfObjects, @trailers) = @_;
	my @xref_table;	# $xref_table[N+1] holds the entry of object number N

	my $clean_filename = "Cleaned_PDF/clean_".basename($filename);
	print "cleaned file = $clean_filename\n";

	# Three-argument open: the name is never interpreted as a mode.
	open (my $clean_pdf, '>', $clean_filename ) or die "Rewrite_clean :: failed to open file :: $clean_filename :: $!\n";

	# A PDF is binary data: no end-of-line translation, so recorded offsets stay exact.
	binmode($clean_pdf);

	# write header
	print $clean_pdf "$version\n";
	print $clean_pdf "%äüöß\n"; # add binary data for PDF interpretation

	&RemoveModifyDangerousObjects($pdfObjects);

	# TODO Remove dangerous embedded files (executables);

	my @objs = values(%{$pdfObjects});

	my $high = 0;	# highest object number met
	my $root = 0;	# reference of the /Catalog object

	foreach my $obj (@objs){

		my $num = -1;
		my $gen = 0;
		if($obj->{ref} =~ /(\d+)\s(\d)\sobj/){
			($num, $gen) = ($1, $2);
			$high = $num if $num > $high;
		}

		if(exists($obj->{type})){
			$root = $obj->{"ref"} if $obj->{type} eq "/Catalog";

			# The old cross-reference streams and object streams are not copied:
			# their objects are written unpacked and a new table is generated.
			next if $obj->{type} eq "/XRef" or $obj->{type} eq "/ObjStm";
		}

		# TODO rewrite the Info object
		# modify metadata

		# Append at the end of the file and remember where the object starts.
		seek($clean_pdf,0,2);
		my $offset = tell($clean_pdf);
		print "writing object $obj->{ref} at $offset\n" unless $DEBUG eq "no";

		# Objects whose reference could not be parsed get no table entry.
		$xref_table[$num+1] = sprintf("%010d %05d n", $offset, $gen) if $num >= 0;

		# Objects extracted from an object stream only carry their body: wrap it.
		# NOTE(review): this rewrites the caller's object in place, as the previous
		# version did.
		if(exists($obj->{objStm})){
			$obj->{"content"} = $obj->{"ref"}."\r".$obj->{"content"}."\rendobj";
		}

		print $clean_pdf $obj->{"content"};
		print $clean_pdf "\n";
	}

	# Go to the end of the file
	seek($clean_pdf,0,2);

	$root =~ s/obj/R/;
	$high = $high+1;	# number of entries: objects 0 .. highest

	# Write the XRef: one subsection covering objects 0 .. $high-1.
	my $xref_offset = tell($clean_pdf);
	print $clean_pdf "xref\n";
	print $clean_pdf "0 $high\n";

	# Object 0 is always the head of the free list.
	$xref_table[1] = "0000000000 65535 f";

	# Emit exactly $high entries (missing objects become free entries); each entry
	# is 20 bytes long including its two-character end-of-line (space + LF).
	foreach my $index (1 .. $high){
		my $entry = $xref_table[$index] ? $xref_table[$index] : "0000000000 65535 f";
		print $clean_pdf "$entry \n";
	}

	# Write the trailer at the end of the file
	my $trailer = "trailer\n<</Size $high /Root $root>>\nstartxref\n$xref_offset\n\%\%EOF";
	print $clean_pdf $trailer;

	close($clean_pdf) or warn "Rewrite_clean :: failed to close file :: $clean_filename :: $!\n";

}
510 | 
511 | 
# Rewrite_clean_2() :: write the cleaned objects back at their original offsets.
#
# Parameters:
#   $filename   - ignored: the output is always written to "clean.pdf".
#   $version    - header line written first in the output file.
#   $pdfObjects - hash ref of the parsed objects; RemoveModifyDangerousObjects() is
#                 applied to it before anything is written.
#   @trailers   - trailer chunks appended verbatim at the end of the file.
#
# Objects having an {offset} are written at that offset, the others are appended.
# Dies when the output file cannot be created.
sub Rewrite_clean_2{

	my ($filename, $version, $pdfObjects, @trailers) = @_;

	# The caller-supplied name is overridden (kept from the previous version).
	$filename ="clean.pdf";

	# Three-argument open, and binary mode so that offsets are not shifted by
	# end-of-line translation.
	open (my $clean_pdf, '>', $filename ) or die "Rewrite_clean_2 :: failed to open file :: $filename :: $!\n";
	binmode($clean_pdf);

	# write header
	print $clean_pdf "$version\n";
	print $clean_pdf "%äüöß\n"; # add binary data for PDF interpretation

	&RemoveModifyDangerousObjects($pdfObjects);

	my @objs = values(%{$pdfObjects});

	foreach my $obj (@objs){

		# Only contents carrying their own "N G obj" header are written.
		next unless $obj->{"content"} =~ /\d+\s\d\sobj/;

		if(exists($obj->{offset}) ){
			seek($clean_pdf,$obj->{offset},0);
		}else{
			seek($clean_pdf,0,2);
		}

		print $clean_pdf $obj->{"content"};
		print $clean_pdf "\n";
	}

	# write the trailers
	seek($clean_pdf,0,2);
	foreach my $trailer (@trailers){
		print $clean_pdf $trailer;
	}

	close($clean_pdf) or warn "Rewrite_clean_2 :: failed to close file :: $filename :: $!\n";

}
579 | 
# Rewrite_clean_1() :: dump the objects back at their original offsets, unmodified.
#
# Parameters:
#   $filename   - ignored: the output is always written to "clean.pdf".
#   $version    - header line written first in the output file.
#   $pdfObjects - hash ref of the parsed objects.
#   @trailers   - trailer chunks appended verbatim at the end of the file.
#
# Unlike Rewrite_clean(), no object is removed or modified here. Each content is
# followed by "endobj". Dies when the output file cannot be created.
sub Rewrite_clean_1{

	my ($filename, $version, $pdfObjects, @trailers) = @_;

	# The caller-supplied name is overridden (kept from the previous version).
	$filename ="clean.pdf";

	# Three-argument open, and binary mode so that offsets are not shifted by
	# end-of-line translation.
	open (my $clean_pdf, '>', $filename ) or die "Rewrite_clean_1 :: failed to open file :: $filename :: $!\n";
	binmode($clean_pdf);

	# write header
	print $clean_pdf "$version\n";
	print $clean_pdf "%äüöß\n"; # add binary data for PDF interpretation

	my @objs = values(%{$pdfObjects});

	foreach my $obj (@objs){

		if(exists($obj->{offset})){
			seek($clean_pdf,$obj->{offset},0);
		}else{
			seek($clean_pdf,0,2);
		}

		print "Writing object $obj->{ref} at ".tell($clean_pdf)."\n" unless $DEBUG eq "no";

		print $clean_pdf $obj->{"content"};
		print $clean_pdf "endobj\n\n";
	}

	# write the trailers
	seek($clean_pdf,0,2);
	foreach my $trailer (@trailers){
		print $clean_pdf $trailer;
	}

	close($clean_pdf) or warn "Rewrite_clean_1 :: failed to close file :: $filename :: $!\n";

}
634 | 
635 | 
636 | 1;;
637 | 
638 | __END__
639 | 


--------------------------------------------------------------------------------
/lib/src/utils.c:
--------------------------------------------------------------------------------
   1 | /***
   2 | 
   3 | Copyright (C) 2015, 2016 Teclib'
   4 | 
   5 | This file is part of Armadito module PDF.
   6 | 
   7 | Armadito module PDF is free software: you can redistribute it and/or modify
   8 | it under the terms of the GNU General Public License as published by
   9 | the Free Software Foundation, either version 3 of the License, or
  10 | (at your option) any later version.
  11 | 
  12 | Armadito module PDF is distributed in the hope that it will be useful,
  13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 | GNU General Public License for more details.
  16 | 
  17 | You should have received a copy of the GNU General Public License
  18 | along with Armadito module PDF.  If not, see <http://www.gnu.org/licenses/>.
  19 | 
  20 | ***/
  21 | 
  22 | 
#include <limits.h>

#include "utils.h"
#include "osdeps.h"
#include "log.h"
  26 | 
  27 | 
  28 | /*
  29 | getPDFObjectByRef() :: return the object corresponding to the reference given in parameter
  30 | parameters:
  31 | - struct pdfDocument * pdf (the pdf document pointer).
  32 | - char * ref (the reference of the object to search)
  33 | returns: (struct pdfObject *)
  34 | - the pointer of the pdf object on success
  35 | - NULL on error or if not found
  36 | */
  37 | struct pdfObject * getPDFObjectByRef(struct pdfDocument * pdf, char * ref){
  38 | 
  39 | 	struct pdfObject * obj = NULL;
  40 | 
  41 | 	if(pdf == NULL  || ref == NULL){		
  42 | 		err_log("getPDFObjectByRef :: invalid parameter\n");
  43 | 		return NULL;
  44 | 	}
  45 | 
  46 | 	obj = pdf->objects;
  47 | 		
  48 | 	while(obj != NULL){
  49 | 	
  50 | 		if( strncmp(ref,obj->reference,strlen(ref)) == 0 ){
  51 | 			return obj;
  52 | 		}
  53 | 		
  54 | 		obj = obj->next;	
  55 | 	}	
  56 | 
  57 | 	return NULL;
  58 | }
  59 | 
  60 | 
  61 | /*
  62 | getPDFNextObjectByRef() :: return the object within the reference given in parameters (get the next object in the list, starting from obj, with the reference given in parameter)
  63 | parameters:
  64 | - struct pdfDocument * pdf (the pdf document pointer).
  65 | - char * ref (the reference of the object to search)
  66 | returns: (struct pdfObject *)
  67 | - the pointer of the pdf object on success
  68 | - NULL on error or if not found
  69 | */
  70 | struct pdfObject * getPDFNextObjectByRef(struct pdfDocument * pdf, struct pdfObject * obj, char * ref){
  71 | 
  72 | 	struct pdfObject * tmp = NULL;
  73 | 
  74 | 	if (pdf == NULL || ref == NULL || obj == NULL){
  75 | 		err_log("getPDFNextObjectByRef :: invalid parameter\n");
  76 | 		return NULL;
  77 | 	}
  78 | 
  79 | 	tmp = obj->next;
  80 | 
  81 | 	while(tmp != NULL){
  82 | 	
  83 | 		if( strncmp(ref,tmp->reference,strlen(ref)) == 0 ){
  84 | 			return tmp;
  85 | 		}
  86 | 		
  87 | 		tmp = tmp->next;	
  88 | 	}
  89 | 
  90 | 	return NULL;
  91 | }
  92 | 
  93 | 
  94 | /*
  95 | searchPattern() :: search a pattern in a stream
  96 | parameters:
  97 | - char * src (the source stream).
  98 | - char * pat (the pattern to search)
  99 | - int pat_size (the size of the pattern)
 100 | - int size (the size of the src stream)
 101 | returns: (char*)
 102 | - a pointer to the pattern string on success.
 103 | - NULL if not found or on error.
 104 | */
 105 | void * searchPattern(char* src, char* pat , int pat_size ,  int size){
 106 | 
 107 | 	char* res = NULL;
 108 | 	char * tmp = NULL;
 109 | 	char * end = NULL;
 110 | 	int len = 0;
 111 | 	int len_verif = 0;
 112 | 	
 113 | 	
 114 | 	if( size < pat_size || src == NULL || pat == NULL || pat_size == 0 || size == 0){
 115 | 		err_log("searchPattern :: invalid parameters\n");
 116 | 		//dbg_log("searchPattern :: src = %s :: pat = %s :: pat_size = %d :: size = %d\n", src, pat, pat_size, size);
 117 | 		return NULL;
 118 | 	}
 119 | 		
 120 | 	tmp =  (char*)calloc(pat_size+1,sizeof(char));
 121 | 	tmp[pat_size] = '\0';
 122 | 	
 123 | 
 124 | 	len = size; 
 125 | 	end = src;
 126 | 	while(len >= pat_size){
 127 | 	
 128 | 		
 129 | 		res = memchr(end,pat[0],len);		
 130 | 		if(res == NULL){
 131 | 			free(tmp);			
 132 | 			return NULL;
 133 | 		}
 134 | 
 135 | 
 136 | 		len_verif = (int)(res-end);
 137 | 		len_verif = len - len_verif;
 138 | 
 139 | 		if(len_verif < pat_size){
 140 | 			free(tmp);
 141 | 			return NULL;
 142 | 		}
 143 | 
 144 | 		memcpy(tmp,res,pat_size);
 145 | 		
 146 | 		if( memcmp(tmp,pat,pat_size) == 0){
 147 | 			free(tmp);
 148 | 			return res;
 149 | 		}
 150 | 		
 151 | 		end = res +1;
 152 | 
 153 | 		len=(int)(end-src);
 154 | 		len = size - len;
 155 | 
 156 | 	}
 157 | 	
 158 | 	free(tmp);
 159 | 	
 160 | 	return NULL;
 161 | }
 162 | 
 163 | 
 164 | // Print object in a debug file (debug.txt)
 165 | void printObjectInFile(struct pdfObject * obj){
 166 | 
 167 | 
 168 | 	FILE * debug = NULL;
 169 | 
 170 | 	// Open en file
 171 | 	if(!(debug = os_fopen("debug.txt","wb+"))){
 172 | 		printf("open failed\n");
 173 | 		return ;
 174 | 	}
 175 | 
 176 | 	//printf("DEBUG ::: \n");
 177 | 
 178 | 
 179 | 	fputc('\n',debug);
 180 | 	fputc('\n',debug);
 181 | 	fputc('\n',debug);
 182 | 	fwrite("---------------------------------------------",sizeof(char),45,debug);
 183 | 	fputc('\n',debug);
 184 | 
 185 | 	// Reference
 186 | 	fwrite(obj->reference,sizeof(char),strlen(obj->reference),debug);
 187 | 
 188 | 	
 189 | 	
 190 | 
 191 | 	// Dictionary
 192 | 	if(obj->dico != NULL){
 193 | 		fputc('\n',debug);
 194 | 		fputc('\n',debug);
 195 | 		fputc('\n',debug);
 196 | 		fwrite(obj->dico,sizeof(char),strlen(obj->dico),debug);
 197 | 	}
 198 | 	
 199 | 
 200 | 	
 201 | 	// Type
 202 | 	if(obj->type != NULL){
 203 | 		fputc('\n',debug);
 204 | 		fputc('\n',debug);
 205 | 		fputc('\n',debug);
 206 | 		fwrite(obj->type,sizeof(char),strlen(obj->type),debug);
 207 | 	}
 208 | 	
 209 | 
 210 | 
 211 | 
 212 | 	// Filters
 213 | 	if(obj->filters != NULL){
 214 | 		fputc('\n',debug);
 215 | 		fputc('\n',debug);
 216 | 		fputc('\n',debug);
 217 | 		fwrite(obj->filters,sizeof(char),strlen(obj->filters),debug);
 218 | 	}
 219 | 
 220 | 	
 221 | 
 222 | 	// Stream 
 223 | 	if(obj->stream != NULL){
 224 | 		fputc('\n',debug);
 225 | 		fputc('\n',debug);
 226 | 		fputc('\n',debug);
 227 | 		fwrite(obj->stream, sizeof(char),obj->stream_size,debug);
 228 | 	}
 229 | 		
 230 | 	//printf("\n\n::: Object :::\n\n");
 231 | 
 232 | 
 233 | 	fputc('\n',debug);
 234 | 	fputc('\n',debug);
 235 | 	fputc('\n',debug);
 236 | 
 237 | 	// Decoded Stream 
 238 | 	if(obj->decoded_stream != NULL){
 239 | 		printf("------------------\n");
 240 | 		fwrite(obj->decoded_stream, sizeof(char),obj->decoded_stream_size,debug);	
 241 | 	}
 242 | 	
 243 | 	//printf("Reference = %s\n",);
 244 | 
 245 | 
 246 | 	fclose(debug);
 247 | 
 248 | 	return;
 249 | }
 250 | 
 251 | 
 252 | // Print object in a debug file (debug.txt)
 253 | void printObject(struct pdfObject * obj){
 254 | 
 255 | 	
 256 | 	if(obj == NULL){
 257 | 		return ;
 258 | 	}
 259 | 
 260 | 
 261 | 
 262 | 	// Reference
 263 | 	printf("\n\nObject :: %s\n", obj->reference);
 264 | 	
 265 | 
 266 | 	// Dictionary
 267 | 	if(obj->dico != NULL)
 268 | 		printf("\tDictionary = %s\n",obj->dico);
 269 | 	
 270 | 
 271 | 	
 272 | 	// Type
 273 | 	if(obj->type != NULL)
 274 | 		printf("\tType = %s\n",obj->type);
 275 | 
 276 | 
 277 | 
 278 | 	// Filters
 279 | 	if(obj->filters != NULL)
 280 | 		printf("\tFilters = %s\n",obj->filters);
 281 | 	
 282 | 
 283 | 	// Stream 
 284 | 	if(obj->stream != NULL){
 285 | 		printf("\tStream = %s\n", obj->stream); // Print in debug file
 286 | 		printf("\tStream size = %d\n",obj->stream_size);
 287 | 	}
 288 | 	
 289 | 	if(obj->decoded_stream != NULL){
 290 | 		printf("\tDecoded Stream = %s\n", obj->decoded_stream);	// Print in debug file
 291 | 		printf("\tDecoded Stream size = %d\n",obj->decoded_stream_size);
 292 | 	}
 293 | 		
 294 | 
 295 | 	return;
 296 | }
 297 | 
 298 | 
 299 | void printObjectByRef(struct pdfDocument * pdf, char * ref){
 300 | 
 301 | 	struct pdfObject * obj = NULL;
 302 | 
 303 | 
 304 | 	if(pdf == NULL || ref == NULL ){
 305 | 		err_log("printObjectByRef :: invalid parameter\n");
 306 | 		return;
 307 | 	}
 308 | 
 309 | 	obj = pdf->objects;
 310 | 	
 311 | 	while(obj != NULL){
 312 | 	
 313 | 		if( strncmp(ref,obj->reference,strlen(ref)) == 0 ){
 314 | 			printObject(obj);
 315 | 			return;
 316 | 		}
 317 | 		
 318 | 		obj = obj->next;	
 319 | 	}
 320 | 
 321 | 	return;
 322 | 
 323 | 
 324 | }
 325 | 
 326 | 
 327 | /*
 328 | printPDFObjects() :: Prints all object described in the PDF Document
 329 | parameters:
 330 | - struct pdfDocument * pdf (pdf document structure)
 331 | returns: (void)
 332 | - none.
 333 | */
 334 | void printPDFObjects(struct pdfDocument * pdf){
 335 | 
 336 | 
 337 | 	struct pdfObject * obj =  NULL;
 338 | 
 339 | 	if(pdf == NULL || pdf->objects == NULL)
 340 | 		return;
 341 | 
 342 | 	printf("\n::: Objects Lists :::\n");
 343 | 
 344 | 	obj = pdf->objects;
 345 | 
 346 | 	while(obj){
 347 | 
 348 | 		printObject(obj);
 349 | 		obj = obj->next;
 350 | 		printf("------------------------------------\n");
 351 | 		printf("------------------------------------\n\n");
 352 | 	}
 353 | 
 354 | 	return;
 355 | 
 356 | 
 357 | }
 358 | 
 359 | 
 360 | /*
 361 | getNumber() :: Return a number (int) in a string or stream at a given pointer
 362 | parameters:
 363 | - char * ptr (the pointer of the string)
 364 | - int size (the size of the string)
 365 | returns: (int)
 366 | - A digit number.
 367 | - An error code (<0) on error.
 368 | */
 369 | int getNumber(char* ptr, int size){
 370 | 
 371 | 	int num;
 372 | 	char * num_a = NULL;
 373 | 	char * end = NULL;
 374 | 	int len = 0;
 375 | 
 376 | 	end = ptr;
 377 | 
 378 | 	if (ptr == NULL || size <= 0){
 379 | 		err_log("getNumber :: invalid parameters\n");
 380 | 		return -1;
 381 | 	}
 382 | 
 383 | 	while( (end[0] >= 48 && end[0] <= 57) &&  len < size ){
 384 | 		len ++;
 385 | 		end ++;
 386 | 	}
 387 | 
 388 | 	if(len == 0){
 389 | 		return -1;
 390 | 	}
 391 | 
 392 | 	num_a = (char*)calloc(len+1,sizeof(char));
 393 | 	num_a[len]='\0';
 394 | 	memcpy(num_a,ptr,len);
 395 | 
 396 | 	num = atoi(num_a);
 397 | 	free(num_a);
 398 | 
 399 | 	if (num < 0)
 400 | 		return -1;
 401 | 
 402 | 	return num;
 403 | }
 404 | 
 405 | 
 406 | /*
 407 | getNumber_s() :: Return a number (in ascii string) in a string or stream at a given pointer
 408 | parameters:
 409 | - char * ptr (the pointer of the string)
 410 | - int size (the size of the string)
 411 | returns: (char *)
 412 | - A digit string.
 413 | - NULL on error.
 414 | */
 415 | char* getNumber_s(char* ptr, int size){
 416 | 	
 417 | 	char * num_a = NULL;
 418 | 	char * end = NULL;
 419 | 	int len = 0;
 420 | 
 421 | 	if (ptr == NULL || size <= 0) {
 422 | 		err_log("getNumber_s :: invalid parameters\n");
 423 | 		return NULL;
 424 | 	}
 425 | 
 426 | 	end = ptr;
 427 | 
 428 | 	while( len < size && (end[0] >= 48 && end[0] <= 57)  ){
 429 | 		len ++;
 430 | 		end ++;
 431 | 	}
 432 | 
 433 | 	if(len == 0){
 434 | 		return NULL;
 435 | 	}
 436 | 
 437 | 	num_a = (char*)calloc(len+1,sizeof(char));
 438 | 	num_a[len]='\0';
 439 | 	memcpy(num_a,ptr,len);
 440 | 
 441 | 	return num_a;
 442 | 
 443 | }
 444 | 
 445 | 
 446 | /*
 447 | getIndirectRef() :: Get the indirect reference string at a given pointer
 448 | parameters:
 449 | - char * ptr (the pointer of the string)
 450 | - int size (the size of the string)
 451 | returns: (char *)
 452 | - the indirect reference string. (Ex: 1 0 R)
 453 | - NULL on error.
 454 | */
 455 | char * getIndirectRef(char * ptr, int size){
 456 | 
 457 | 	char * ref = NULL;
 458 | 	char * obj_num = NULL; // object number
 459 | 	char * gen_num = NULL; // generation number	
 460 | 	char * end = NULL;
 461 | 	int len = 0;
 462 | 
 463 | 	if (ptr == NULL || size <= 0) {
 464 | 		err_log("getIndirectRef :: invalid parameters\n");
 465 | 		return NULL;
 466 | 	}
 467 | 
 468 | 	end = ptr;
 469 | 	len = size;
 470 | 
 471 | 	if(size < 5){
 472 | 		return NULL;
 473 | 	}
 474 | 
 475 | 	// Get the object number
 476 | 	if ((obj_num = getNumber_s(end, len)) == NULL)
 477 | 		return NULL;
 478 | 
 479 | 	end += strlen(obj_num);
 480 | 	len -=  strlen(obj_num);
 481 | 
 482 | 	// Move ptr for white space
 483 | 	end ++ ;
 484 | 
 485 | 	gen_num = getNumber_s(end,len);
 486 | 	if(gen_num == NULL) {
 487 | 		free(obj_num);
 488 | 		return NULL;
 489 | 	}
 490 | 
 491 | 	end += strlen(gen_num);
 492 | 
 493 | 
 494 | 	// Move ptr for white space
 495 | 	end ++ ;
 496 | 	
 497 | 	// Check the presence of R => 12 0 R 
 498 | 	if(end[0] != 'R'){
 499 | 		free(gen_num);
 500 | 		free(obj_num);
 501 | 		return NULL;
 502 | 	}
 503 | 
 504 | 	len = strlen(obj_num) + strlen(gen_num) + 5 ;
 505 | 	ref = (char*)calloc(len+1,sizeof(char));
 506 | 	ref[len] = '\0';
 507 | 
 508 | 	os_strncat(ref,len+1, obj_num, strlen(obj_num));
 509 | 	os_strncat(ref,len+1, " ", strlen(obj_num));
 510 | 	os_strncat(ref,len+1, gen_num, strlen(gen_num));
 511 | 	os_strncat(ref,len+1, " obj", 4);
 512 | 
 513 | 	free(gen_num);
 514 | 	free(obj_num);
 515 | 
 516 | 	return ref;
 517 | 
 518 | }
 519 | 
 520 | 
/*
getDelimitedStringContent() :: get a string delimited by a given pair of patterns, taking nested pairs into account. Ex : << foo << bar >> >>
parameters:
- char * src (the buffer to search)
- char * delimiter1 (the opening delimiter, NUL-terminated)
- char * delimiter2 (the closing delimiter, NUL-terminated)
- int src_len (the size of src)
returns: (char *)
- a newly allocated, NUL-terminated copy of the text going from the first delimiter1 up to and
  including its matching delimiter2 (the outer delimiters are part of the result).
- NULL on error: invalid parameters, delimiter1 not found, or unbalanced delimiters.
Note: a delimiter directly preceded by a backslash is ignored (escaped delimiter).
*/
char * getDelimitedStringContent(char * src, char * delimiter1, char * delimiter2, int src_len){

	char * content = NULL;
	int sub = 1;		// nesting depth; 1 = inside the first delimiter1
	char * start = NULL;	// position of the first delimiter1
	char * end = NULL;	// scan position
	int len = 0;
	int lim = src_len;	// bytes left while looking for the first delimiter1
	char * tmp = NULL;	// scratch copy compared with delimiter1
	char * tmp2 = NULL;	// scratch copy compared with delimiter2
	char * echap = NULL; // bug fix when Ex: (string = "parenthesis =\) " )  ;where delimiters are "(" and ")"	
	//int found = NULL;

	if (src == NULL || src_len <= 0 || delimiter1 == NULL || delimiter2 == NULL){
		err_log("getDelimitedStringContent :: invalid parameters\n");
		return NULL;
	}

	// NOTE(review): the results of these allocations are used without a NULL check.
	tmp = (char*)calloc(strlen(delimiter1) +1,sizeof(char));
	tmp2 = (char*)calloc(strlen(delimiter2) +1,sizeof(char));

	tmp[strlen(delimiter1)] = '\0';
	tmp2[strlen(delimiter2)] = '\0';


	start = src;


	// NOTE(review): strlen(delimiter1) bytes are read here and in the loop below without
	// comparing against src_len - presumably callers guarantee enough data; TODO confirm.
	memcpy(tmp,start,strlen(delimiter1));

	// find start point
	while (memcmp(tmp, delimiter1, strlen(delimiter1)) != 0 && lim > 0){

		start ++;
		lim--;
		if (lim > 0)
			memcpy(tmp,start,strlen(delimiter1));

	}

	// delimiter1 was not found in the src_len bytes
	if (lim <= 0){
		free(tmp);
		free(tmp2);
		return NULL;
	}
		

	// len starts at the offset of delimiter1, while end starts right after it
	len = (int)(start - src);

	end = start + strlen(delimiter1);

	// (this copy is overwritten at the first iteration of the loop below)
	memcpy(tmp2,start,strlen(delimiter2));

	
	// Walk the buffer until the delimiter opened at start is closed (sub == 0)
	while( sub > 0  && len <= src_len-2){ // TODO :: why? src_len-2 or src_len -1;

		memcpy(tmp,end,strlen(delimiter1));
		memcpy(tmp2,end,strlen(delimiter2));
		echap = end-1;	// previous byte, to detect an escaped delimiter


		if( memcmp(tmp,delimiter1,strlen(delimiter1)) == 0 && echap[0]!='\\'){

			// nested opening delimiter
			sub ++;
			end += strlen(delimiter1);
			len += strlen(delimiter1);
		}else{

			if( memcmp(tmp2,delimiter2,strlen(delimiter2)) == 0 && echap[0]!='\\'){

				// closing delimiter
				sub --;
				end += strlen(delimiter2);
				len += strlen(delimiter2);

			}else{
				// ordinary byte
				end ++;
				len++;
			}
		}

	}
	
	
	// The end of the buffer was reached with delimiters still open
	if( sub > 0){
		
		warn_log("getDelimitedStringContent :: Odd number of delimiters :: %d :: src = %s :: delimiter1 = %s :: delimiter2 = %s\n",sub,src,delimiter1,delimiter2);
		
		free(tmp);
		free(tmp2);
		return NULL;
	}
	

	if(len > src_len){

		err_log("getDelimitedStringContent :: delimiter2 (%s) not found :: len > src_len\n", delimiter2);		

		free(tmp);
		free(tmp2);
		return NULL;
	}

	// Copy everything from the first delimiter1 to the end of the matching delimiter2
	len = (int)(end - start);
	
	// NOTE(review): allocation result not checked before use.
	content = (char*)calloc(len+1,sizeof(char));
	content[len] = '\0';

	memcpy(content,start,len);

	free(tmp);
	free(tmp2);

	return content;
}
 646 | 
 647 | 
 648 | /*
 649 | getIndirectRefInString() :: search an object indirect reference in a string starting in "ptr"
 650 | parameters:
 651 | - char * ptr
 652 | - int size
 653 | returns: (char *)
 654 | - string between delimiters
 655 | - NULL on error.
 656 | */
 657 | char * getIndirectRefInString(char * ptr, int size){
 658 | 
 659 | 	char * ref = NULL;
 660 | 	char * start = NULL;
 661 | 	int len = 0;
 662 | 
 663 | 	if( ptr == NULL || size <= 0){
 664 | 		err_log("getIndirectRefInString :: invalid parameter!\n");
 665 | 		return NULL;
 666 | 	}
 667 | 
 668 | 	start = ptr;
 669 | 	len = size;
 670 | 
 671 | 	while(ref == NULL && len >= 5 ){
 672 | 
 673 | 		ref = getIndirectRef(start, len);
 674 | 		start ++;
 675 | 		len --;
 676 | 
 677 | 	}
 678 | 
 679 | 	return ref;
 680 | }
 681 | 
 682 | 
 683 | // get a pattern of length (size)  in ptr and skip white spaces
 684 | char * getPattern(char * ptr, int size, int len){
 685 | 
 686 | 	char * pattern = NULL;
 687 | 	int i = 0;
 688 | 	//int white_spaces = 0;
 689 | 	//int tmp = 0;
 690 | 	//int tmp_len = 0;
 691 | 
 692 | 
 693 | 	if(len < size){
 694 | 		return NULL;
 695 | 	}
 696 | 
 697 | 	//tmp = len;
 698 | 
 699 | 	pattern = (char*)calloc(size+1,sizeof(char));
 700 | 	pattern[size]='\0';
 701 | 
 702 | 	for(i=0; i< size ; i++){
 703 | 
 704 | 		// Skip white spaces
 705 | 		/*
 706 | 		while(ptr[0] == '\n' || ptr[0] == '\r' || ptr[0] == ' '){
 707 | 			ptr ++;
 708 | 			white_spaces ++;
 709 | 			len--;
 710 | 			if( (size - i)  > len )
 711 | 				return NULL;
 712 | 		}*/
 713 | 
 714 | 		/*
 715 | 		len --;
 716 | 		if( (size - i)  > len )
 717 | 				return NULL;
 718 | 		*/
 719 | 
 720 | 		pattern[i] = ptr[0];
 721 | 
 722 | 		ptr++;
 723 | 	}
 724 | 
 725 | 
 726 | 	return pattern;
 727 | 
 728 | }
 729 | 
 730 | 
 731 | /*
 732 | getUnicodeInString() :: Return the first unicode string if present in the stream given in parameters
 733 | parameters:
 734 | - char * stream
 735 | - int size
 736 | returns: (char*)
 737 | - the unicode string if found.
 738 | - NULL if not found or on error.
 739 | */
 740 | char * getUnicodeInString(char * stream, int size){
 741 | 
 742 | 	char * unicode = NULL;
 743 | 	char * start = NULL;
 744 | 	char * end = NULL;
 745 | 	int len = 0;
 746 | 
 747 | 	if (stream  == NULL || size <= 0) {
 748 | 		err_log("getUnicodeInString :: invalid parameters\n");
 749 | 		return NULL;
 750 | 	}
 751 | 	
 752 | 	len = size ;
 753 | 	end = stream;
 754 | 
 755 | 
 756 | 	while( unicode == NULL && len > 6){
 757 | 
 758 | 		start = searchPattern(end, "%u", 2, len);
 759 | 		if(start == NULL){
 760 | 			//printf("No unicode detected\n");
 761 | 			return NULL;
 762 | 		}
 763 | 
 764 | 		end = start +2 ;
 765 | 
 766 | 		len = 0;
 767 | 		while( ((end[0] >= 65 && end[0] <=70) || (end[0] >= 97 && end[0] <= 102) || (end[0] >= 48 && end[0] <= 57)) && len != 4 ){
 768 | 			len ++;
 769 | 			end ++;
 770 | 		}
 771 | 
 772 | 		if(len == 4){			
 773 | 			unicode = start;
 774 | 			return unicode;
 775 | 		}
 776 | 
 777 | 
 778 | 		len = (int)(start - stream);
 779 | 		len = size - len;
 780 | 
 781 | 
 782 | 	}
 783 | 
 784 | 	return NULL;
 785 | }
 786 | 
 787 | 
 788 | /*
 789 | replaceInString() ::  replace all occurrences of the pattern in the stream by another pattern
 790 | parameters:
 791 | - char * src (the source entry).
 792 | - char * toReplace (the string to replace).
 793 | - char * pat (the pattern which replace the string).
 794 | returns: (char*)
 795 | - the new string with the pattern replaced.
 796 | - NULL if not found or on error.
 797 | TODO :: replaceString :: replace all occurrences. :: in function replaceAll.
 798 | */
 799 | char * replaceInString(char * src, char * toReplace , char * pat){
 800 | 
 801 | 	char * dest = NULL;
 802 | 	char * start = NULL;
 803 | 	char * end = NULL;
 804 | 	int len = 0;
 805 | 	int len_alloc = 0;
 806 | 	int off = 0;
 807 | 
 808 | 	if (src == NULL || toReplace == NULL || pat == NULL){
 809 | 		err_log("replaceInString :: invalid parameter\n");
 810 | 		return NULL;
 811 | 	}
 812 | 
 813 | 	// TODO: calc the number of occurrencies of the pattern to replace
 814 | 
 815 | 	// get the positions
 816 | 	start = searchPattern(src,toReplace,strlen(toReplace),strlen(src));
 817 | 
 818 | 	if(start == NULL){
 819 | 		err_log("String to replace (%s) not found in src \n",toReplace);
 820 | 		return src;
 821 | 	}
 822 | 
 823 | 
 824 | 	// calc the new length = len - diff(pat et pat2)
 825 | 	len = strlen(src) - (strlen(toReplace) - strlen(pat));
 826 | 	len_alloc = len;
 827 | 
 828 | 	dest = (char*)calloc(len+1,sizeof(char));
 829 | 	dest[len] = '\0';
 830 | 
 831 | 
 832 | 	// get the position
 833 | 	off = (int)(start - src);
 834 | 
 835 | 	memcpy(dest, src, off);
 836 | 
 837 | 	// replace
 838 | 	os_strncat(dest,len_alloc+1,pat,strlen(pat));
 839 | 
 840 | 	end = start + strlen(toReplace);
 841 | 
 842 | 	len = strlen(src) - off - strlen(toReplace);
 843 | 	
 844 | 	os_strncat(dest,len_alloc+1,end,len);
 845 | 	
 846 | 	return dest;
 847 | }
 848 | 
 849 | 
 850 | /*
 851 | getHexa() ::  return a pointer to the first hexa string (#F6) or NULL if any;
 852 | parameters:
 853 | - char * dico (object dictionnary).
 854 | - int size, the size of the dico
 855 | returns: (char*)
 856 | - the hex string found
 857 | - NULL if not found or on error.
 858 | */
 859 | char * getHexa(char * dico, int size){
 860 | 
 861 | 	char *  start = NULL;
 862 | 	char * end = NULL;
 863 | 	char * hexa = NULL;
 864 | 	int len = 0;
 865 | 
 866 | 	len = size ;
 867 | 	end = dico;
 868 | 
 869 | 	if (dico == NULL || size <= 0){
 870 | 		err_log("getHexa :: invalid parameter\n");
 871 | 		return NULL;
 872 | 	}
 873 | 
 874 | 	while( hexa == NULL && len >= 3  ){
 875 | 
 876 | 		start = searchPattern(end,"#",1,len);		
 877 | 		if(start == NULL){
 878 | 			return NULL;
 879 | 		}
 880 | 
 881 | 		end = start +1 ;
 882 | 
 883 | 		// test the two next characters
 884 | 		if( ((end[0] >= 65 && end[0] <=70) || (end[0] >= 97 && end[0] <= 102) || (end[0] >= 48 && end[0] <= 57)) && ((end[1] >= 65 && end[1] <=70) || (end[1] >= 97 && end[1] <= 102) || (end[1] >= 48 && end[1] <= 57)) ){
 885 | 			//dbg_log("getHexa :: hex found\n");
 886 | 			return start;
 887 | 		}		
 888 | 
 889 | 		len = (int)(end - dico);
 890 | 		len = size - len;
 891 | 
 892 | 
 893 | 	}
 894 | 
 895 | 
 896 | 	return NULL;
 897 | }
 898 | 
 899 | // print objects references
 900 | void printObjectReferences(struct pdfDocument* pdf){
 901 | 
 902 | 	if(pdf->objects == NULL)
 903 | 		return;
 904 | 
 905 | 
 906 | 	while(pdf->objects != NULL){
 907 | 		dbg_log("object = %s\n",pdf->objects->reference);
 908 | 
 909 | 		pdf->objects = pdf->objects->next;
 910 | 	}
 911 | 
 912 | 	return;
 913 | 
 914 | }
 915 | 
 916 | 
 917 | void debugPrint(char * stream, int len){
 918 | 
 919 | 	FILE * debug = NULL;
 920 | 
 921 | 	if(stream == NULL || len <= 0){
 922 | 		err_log("debugPrint :: invalid parameter\n");
 923 | 		return;
 924 | 	}
 925 | 
 926 | 	// Open en file
 927 | 	if(!(debug = os_fopen("debug","wb+"))){
 928 | 		err_log("debugPrint :: open failed\n");
 929 | 		return ;
 930 | 	}
 931 | 
 932 | 	// Reference
 933 | 	fwrite(stream,sizeof(char),len,debug);
 934 | 
 935 | 
 936 | 	fclose(debug);
 937 | 
 938 | 
 939 | 	return;
 940 | }
 941 | 
 942 | 
 943 | // This function convert a string into binary.
 944 | char * toBinary(char * stream, int size){
 945 | 
 946 | 
 947 | 	char * binary = NULL;
 948 | 	int len = 0;
 949 | 	//char * byte = NULL;
 950 | 	int bit = 0;
 951 | 	int i = 0, j = 0;
 952 | 	char bit_s = 0;
 953 | 	int off = 0;
 954 | 
 955 | 	len = 8*size;
 956 | 
 957 | 	binary = (char*)calloc(len+1,sizeof(char));
 958 | 	binary[len] = '\0';
 959 | 
 960 | 
 961 | 	for(i = 0; i < size; i++){
 962 | 
 963 | 
 964 | 		for(j = 0; j < 8; j++){
 965 | 
 966 | 			bit = stream[i] & (1 << (7-j));
 967 | 			bit = bit >> (7-j);
 968 | 			//printf("%d ",bit);
 969 | 
 970 | 			bit_s = bit - '\0' + 48;
 971 | 
 972 | 			binary[off] = bit_s;
 973 | 			off ++;
 974 | 		}
 975 | 
 976 | 	}
 977 | 
 978 | 	//printf("binary = %s\n",binary);
 979 | 
 980 | 
 981 | 	return binary;
 982 | 
 983 | }
 984 | 
 985 | 
 986 | // Converts a binary string to a char string
 987 | char * binarytoChar(char * binary, int size, int * returned_size){
 988 | 
 989 | 	char * string = NULL;
 990 | 	int i =0,j=0;
 991 | 	int len = 0;
 992 | 	int off = 0;
 993 | 	char * byte = NULL;
 994 | 	char * ptr;
 995 | 	char res = 0;
 996 | 	int mod = 0;
 997 | 
 998 | 
 999 | 	len = size/8;
1000 | 	mod = size%8;
1001 | 	if(mod != 0){
1002 | 		warn_log("binarytoChar :: len not a multiple of 8 :: padding with zero :: size %d :: len = %d :: mod8 = %d\n",size,len,mod);
1003 | 		//TODO Padd with 0
1004 | 	}
1005 | 
1006 | 	ptr = binary;
1007 | 
1008 | 	byte = (char*)calloc(9,sizeof(char));
1009 | 	byte[8]='\0';
1010 | 
1011 | 	//printf("len = %d :: size = %d\n",len, size);
1012 | 	string = (char*)calloc(len+1,sizeof(char));
1013 | 	string[len] = '\0';
1014 | 
1015 | 	for(i= 0; i<len; i++){
1016 | 
1017 | 		for(j=0;j<8;j++){		
1018 | 			//printf("%c",binary[j]);
1019 | 			byte[j]=ptr[j];
1020 | 		}
1021 | 		res = strtol(byte,NULL,2);
1022 | 		//printf("%s ==> %c\n\n",byte,res);
1023 | 		string[off] = res;
1024 | 		off ++;
1025 | 		ptr+=8;
1026 | 
1027 | 	}
1028 | 
1029 | 	*returned_size = len;
1030 | 
1031 | 	//printf("string = %s\n",string);
1032 | 
1033 | 	free(byte);
1034 | 
1035 | 	return string;
1036 | }
1037 | 
1038 | 
// Print "size" bytes of "stream" on stdout, rendering each NUL byte as "<NUL>".
// Printing stops early (without the final newline) as soon as 5 consecutive
// NUL bytes have been printed; the number of bytes consumed is then logged.
void printStream(char * stream, int size){

	int pos = 0;
	int nul_run = 0;	// length of the current run of consecutive NUL bytes

	if(stream == NULL || size <= 0){
		return;
	}

	while(pos < size){

		if(stream[pos] != '\0'){
			printf("%c",stream[pos]);
			nul_run = 0;
		}else{
			printf("<NUL>");
			nul_run ++;
		}

		pos ++;

		if(nul_run >= 5){
			dbg_log("printStream :: len = %d\n",pos);
			return;
		}

	}

	printf("\n");

}
1082 | 


--------------------------------------------------------------------------------
/lib/src/pdfStructAnalysis.c:
--------------------------------------------------------------------------------
  1 | /***
  2 | 
  3 | Copyright (C) 2015, 2016 Teclib'
  4 | 
  5 | This file is part of Armadito module PDF.
  6 | 
  7 | Armadito module PDF is free software: you can redistribute it and/or modify
  8 | it under the terms of the GNU General Public License as published by
  9 | the Free Software Foundation, either version 3 of the License, or
 10 | (at your option) any later version.
 11 | 
 12 | Armadito module PDF is distributed in the hope that it will be useful,
 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 15 | GNU General Public License for more details.
 16 | 
 17 | You should have received a copy of the GNU General Public License
 18 | along with Armadito module PDF.  If not, see <http://www.gnu.org/licenses/>.
 19 | 
 20 | ***/
 21 | 
 22 | 
 23 | 
 24 | #include "armaditopdf.h"
 25 | #include "pdfAnalysis.h"
 26 | #include "utils.h"
 27 | #include "osdeps.h"
 28 | #include "log.h"
 29 | 
 30 | 
 31 | // Check the trailer content
 32 | int checkTrailer(struct pdfDocument * pdf){
 33 | 
 34 | 
 35 | 	char * start = NULL;
 36 | 	char * xref_obj_ref = NULL;
 37 | 	struct pdfTrailer * trailer = NULL;
 38 | 	int len = 0;
 39 | 	int xref_offset;
 40 | 
 41 | 	if(pdf->trailers == NULL){
 42 | 		err_log("checkXTrailer :: No trailer found in pdfDocument\n");		
 43 | 		return -1;
 44 | 	}
 45 | 
 46 | 	trailer = pdf->trailers;
 47 | 
 48 | 	while(trailer != NULL){
 49 | 
 50 | 
 51 | 		// trailer with a dico 
 52 | 		if(trailer->dico != NULL){
 53 | 
 54 | 			// TODO
 55 | 
 56 | 		}else{
 57 | 
 58 | 			// get the offset of the XRef object
 59 | 			start = searchPattern(trailer->content, "startxref", 9 , strlen(trailer->content));
 60 | 			if(start == NULL){
 61 | 				err_log("checkTrailer :: XRef offset not found in trailer\n");				
 62 | 				return -1;
 63 | 			}
 64 | 
 65 | 			start += 9;
 66 | 
 67 | 			while(start[0] == '\r' || start[0] == '\n' || start[0] == ' '){
 68 | 				start ++;
 69 | 			}
 70 | 
 71 | 			len = strlen(trailer->content) - (int)(start - trailer->content);
 72 | 
 73 | 			xref_offset = getNumber(start,len);
 74 | 
 75 | 			dbg_log("checkTrailer :: Xref object offset = %d\n",xref_offset);
 76 | 
 77 | 			if(xref_offset <= 0){
 78 | 				trailer = trailer->next;
 79 | 				continue;
 80 | 			}
 81 | 
 82 | 			// go to xref object offset
 83 | 			//start = pdf->content + xref_offset;
 84 | 
 85 | 
 86 | 			// if the offset is higher than th PDF size
 87 | 			if(xref_offset > pdf->size){
 88 | 				warn_log("Warning :: checkTrailer :: Wrong xref object offset %d\n",xref_offset);				
 89 | 				trailer = trailer->next;
 90 | 				continue;
 91 | 			}
 92 | 
 93 | 
 94 | 		}
 95 | 
 96 | 		trailer = trailer->next;
 97 | 
 98 | 
 99 | 	}
100 | 
101 | 
102 | 
103 | 	return 0; 
104 | }
105 | 
106 | 
/*
checkXRef() :: check the consistency of the cross-reference information of the document.
For every trailer, the offset following the "startxref" keyword is read, then:
- if the 4 bytes found at that offset in the file are "xref", the classic xref table is
  parsed and the offset of each in-use ('n') entry is compared with the offset of the
  matching parsed object (pdf->testStruct->bad_obj_offset counts the mismatches);
- otherwise the offset is expected to point to an object of type /XRef
  (pdf->testStruct->bad_xref_offset counts the failures, pdf->testStruct->encrypted is
  incremented when the /XRef dictionary contains "/Encrypt").
parameters:
- struct pdfDocument * pdf (pdf document pointer)
returns: (int)
- 1 (initial value of ret) when no inconsistency was recorded.
- 0 when a wrong object offset or a wrong xref offset was recorded.
- bad_xref_format when the xref table is malformed (value defined outside this file;
  presumably a negative error code -- TODO confirm).
- -1 on invalid parameter, missing trailer, missing "startxref" keyword, unreadable
  offset, missing file handle/descriptor, or when the xref table cannot be extracted.
*/
int checkXRef(struct pdfDocument * pdf){

	int ret = 1;
	int xref_offset = 0;
	int len = 0;
	int num_entries = 0;
	int obj_num = 0;
	int first_obj_num = 0;
	int i = 0;
	int ref_size = 12; // xxxx 0 obj
	int off = 0;
	int gen = 0; // generation number

	char * off_s = NULL;
	char * gen_s = NULL;
	char * start = NULL;
	char * xref = NULL;
	char * xref_orig = NULL;
	char * num_entries_a = NULL;
	char * obj_num_a = NULL;
	char * first_obj_num_a = NULL;
	char * ref = NULL;

	char free_obj;

	struct pdfObject * obj = NULL;
	struct pdfTrailer * trailer = NULL;
	

	if(pdf == NULL){
		err_log("checkXRef :: invalid parameter\n");
		return -1;
	}


	if(pdf->trailers == NULL){
		err_log("checkXRef :: No trailer found in pdfDocument\n");
		return -1;
	}

	trailer = pdf->trailers;

	// One pass per trailer of the document.
	while(trailer != NULL){

		// Get xref offset
		if(trailer->content == NULL){
			err_log("checkXRef :: Empty trailer content\n");
			trailer = trailer->next;
			continue;
		}
		
		start = searchPattern(trailer->content, "startxref", 9 , strlen(trailer->content));
		if(start == NULL){
			dbg_log("checkXRef :: XRef offset not found in trailer\n");
			return -1;
		}

		start += 9; // 9 => startxref.
		// skip the white spaces between the keyword and the number
		while(start[0] == '\r' || start[0] == '\n' || start[0] == ' '){
			start ++;
		}

		// len = number of bytes left in the trailer content from "start"
		len =  (int)(start - trailer->content);
		len = strlen(trailer->content) - len;

		xref_offset = getNumber(start,len);
		if (xref_offset < 0){
			err_log("checkXRef :: get xref offset failed!\n");
			return -1;
		}
		
		// Goto the xref offset and check the "xref" keyword.
		// 4 bytes for the keyword + terminator.
		// NOTE(review): the calloc() result is not checked before being written.
		xref = (char*)calloc(5, sizeof(char));
		xref[4] = '\0';

		// Read the 4 bytes located at xref_offset, through the FILE handle if there is
		// one, through the file descriptor otherwise.
		// NOTE(review): the results of the seek/read calls are not checked; on failure
		// the buffer keeps the zeroes written by calloc() and the memcmp() below fails.
		if(pdf->fh != NULL){

			fseek(pdf->fh, 0, SEEK_SET);
			fseek(pdf->fh, xref_offset, SEEK_SET);
			fread(xref, 1, 4, pdf->fh);

		}
		else if (pdf->fd >= 0){

			os_lseek(pdf->fd, 0, SEEK_SET);
			os_lseek(pdf->fd, xref_offset, SEEK_SET);
			os_read(pdf->fd, xref, 4);
			
		}
		else{
			err_log("checkXRef :: invalid file handle or file descriptor\n");
			free(xref);
			return -1;
		}

		
		//dbg_log("checkXRef :: xref keyword = %s\n",xref);


		// Case 1: the offset points to a classic cross-reference table.
		if(memcmp(xref,"xref",4) == 0){


			//dbg_log("checkXRef :: Good xref table offset : %d\n",xref_offset);
			// NOTE(review): xref_offset is not compared with pdf->size in this branch
			// before being added to pdf->content -- confirm that the successful read
			// above is enough to guarantee the offset lies inside pdf->content.
			start = pdf->content;
			start += xref_offset;

			// len = number of bytes left in the document from the table start
			len = (int)(start - pdf->content);
			len = pdf->size -len ;

			// the 5-byte keyword buffer is no longer needed; "xref" is reused below
			free(xref);

			// Get xref table content from pdf document content
			// xref_orig keeps the allocated pointer for free(); xref is the moving cursor.
			xref_orig = getDelimitedStringContent( start, "xref" , "trailer" , len);
			xref = xref_orig;


			if(xref == NULL){
				err_log("checkXRef :: Error while getting the xref table\n");				
				return -1;
			}

			//dbg_log("checkXRef :: xref table content = \n%s\n",xref);

			// shift "xref" keyword
			xref += 4;

			// NOTE(review): xref has already been advanced by 4, so strlen(xref) no longer
			// counts the keyword; subtracting 4 again makes len 4 bytes shorter than the
			// data really available (conservative) -- confirm whether this is intended.
			len = strlen(xref) - 4;

			while(xref[0] == '\r' || xref[0] == '\n' || xref[0] == ' '){
				xref++;
				len --;
			}


			// Get the object number of the first object described in xref table
			// (returned as a string that is released at the "clean" label or on early return)
			first_obj_num_a = getNumber_s(xref,len);
			if (first_obj_num_a == NULL){
				err_log("checkXRef :: can't get first object number\n");
				//ret = unexpected_error;
				// goto_clean;
				free(xref_orig);
				return bad_xref_format;				
			}

			first_obj_num = atoi(first_obj_num_a);


			//dbg_log("checkXRef :: first_obj_num = %d\n",first_obj_num);

			// move the cursor past the digits just read
			len -= strlen(first_obj_num_a);
			xref += strlen(first_obj_num_a);

			// move for white space
			len --;
			xref ++;


			// get the number of entries in the xref table
			num_entries_a = getNumber_s(xref,len);
			if (num_entries_a == NULL){
				err_log("checkXRef :: can't get number of entries\n");
				free(xref_orig);
				free(first_obj_num_a);
				return bad_xref_format;
			}

			num_entries = atoi(num_entries_a);

			//dbg_log("checkXRef :: num_entries = %d\n",num_entries);

			len -= strlen(num_entries_a);
			xref += strlen(num_entries_a);

			// move for white space
			// Check xref format.
			

			// hint after the number of entries it should be '\r' or '\n' not a space.
			if (*xref != '\r' && *xref != '\n'){
				err_log("checkXref :: bad xref format!\n");
				free(xref_orig);
				free(num_entries_a);
				free(first_obj_num_a);
				return bad_xref_format;
			}

			// skip white spaces
			while (*xref == ' ' || *xref == '\r' || *xref == '\n'){
				len--;
				xref++;
			}
			

			// For each entry of table
			// Expected layout of an entry, as enforced below:
			// 10-digit offset, ' ', 5-digit generation number, ' ', one flag character.
			for(i = 0; i< num_entries ; i++){


				// TODO :: check offset length. :: use get_Number_s first.
				// The offset is first read as a string only to validate its width (10 digits).
				off_s = getNumber_s(xref, len);
				if (off_s == NULL || strlen(off_s) != 10){					
					err_log("chackXref :: bad offset format in xref table! :: offset = %s :: xref = %s\n", off_s,xref);
					ret = bad_xref_format;
					// off_s (when not NULL) is released at the "clean" label
					goto clean;
				}

				free(off_s);
				off_s = NULL;

				// numeric value of the same offset
				off = getNumber(xref,len);

				// skip 10 bytes corresponding to obj offset.
				xref += 10;
				len -= 10;

				// check white space between offset and generation number.
				if (*xref != ' '){
					err_log("chackXref :: bad xref format!\n");
					ret = bad_xref_format;
					goto clean;
				}

				// skip white space between offset and generation number..
				xref++;
				len--;


				// Get generation number.
				// Only its width (5 digits) is validated; its value is not used.
				gen_s = getNumber_s(xref, len);
				if(gen_s == NULL){
					err_log("checkXref :: bad generation number format in xref table!\n");
					ret = bad_xref_format;
					goto clean;
				}

				if (strlen(gen_s) != 5){
					err_log("checkXref :: bad generation number format in xref table! :: gen_number = %s\n",gen_s);
					ret = bad_xref_format;
					free(gen_s);
					goto clean;
				}

				free(gen_s);
				//gen = getNumber(xref, len);

				// skip 5 bytes corresponding to obj gen number.
				xref += 5;
				len -= 5;

				// check white space between generation number and free flag.
				if (*xref != ' '){
					err_log("chackXref :: bad xref format!\n");
					ret = bad_xref_format;
					goto clean;
				}

				// skip white space
				xref++;
				len--;

				
				//xref += 17;

				// flag character of the entry ('f' or 'n')
				free_obj = xref[0];

				// entries are consecutive: entry i describes object first_obj_num + i
				obj_num = first_obj_num + i;

				dbg_log("checkXRef :: object number = %d :: offset = %d :: free = %c\n",obj_num,off,free_obj);

				// Build the reference string used to look the object up.
				// The generation number written in the reference is always 0.
				// NOTE(review): ref_size is 12, which is enough for "%d 0 obj" only while
				// obj_num has at most 6 digits; behaviour beyond that depends on
				// os_sprintf() -- confirm. The calloc() result is not checked.
				ref = (char*)calloc(ref_size+1,sizeof(char));
				
				os_sprintf(ref,ref_size+1,"%d 0 obj",obj_num);
				
				//dbg_log("checkXRef :: ref = %s at %d\n",ref, off);

				// get object by ref
				// (object number 0 is never looked up; obj then keeps its previous value,
				// which is why every test below also checks obj_num > 0)
				if(obj_num > 0){
					obj = getPDFObjectByRef(pdf,ref);
				}

				if( obj_num > 0 && obj == NULL){

					warn_log("checkXRef :: object not found %s--\n",ref);

				}else{

					
					// in-use entry whose declared offset differs from the parsed object offset
					if(obj_num > 0 && free_obj == 'n' && obj != NULL && off != obj->offset ){
						
						
						//warn_log("checkXRef :: Bad offset for object %s :: %d differs from %d\n",ref,off,obj->offset);

						// the object could be defined more than once
						if(pdf->testStruct->object_collision > 0){

							// Try the other objects carrying the same reference.
							// ret ends up reflecting the LAST object examined by this loop
							// (0 on mismatch, 1 on match).
							while( (obj = getPDFNextObjectByRef(pdf,obj,ref)) != NULL ){

								if(obj_num > 0 && free_obj == 'n' && off != obj->offset ){
						
									//warn_log("Warning :: checkXRef :: Bad offset for object %s :: %d differs from %d\n",ref,off,obj->offset);
									
									pdf->testStruct->bad_obj_offset ++;
									ret = 0;
								}else{
									ret = 1;
								}
							}

						}else{
							
							//warn_log("checkXRef :: Bad offset for object %s :: %d differs from %d\n",ref,off,obj->offset);

							pdf->testStruct->bad_obj_offset ++;
							ret = 0;
						}
						

					}

				}

				// skip "free object" flag. ( 'f' or 'n' )
				xref++;
				len--;

				// skip white spaces.				
				while (*xref == '\r' || *xref == '\n' || *xref == ' '){
					xref += 1;
					len -= 1;
				}
				//dbg_log("xref = %s\n", xref);

				free(ref);
				ref = NULL;

			}

			free(xref_orig);
			xref_orig = NULL;
			

		}else{

			// Case 2: no "xref" keyword at the offset; look for an object of type /XRef.
			// In this branch "xref" still points to the 5-byte keyword buffer, which is
			// released on every exit path.

			// get the xref object.

			// if the offset is higher than the PDF size
			if(xref_offset > pdf->size){
				warn_log("checkXRef :: Wrong Xref table (or xref object) offset %d\n",xref_offset);
				ret = 0;
				pdf->testStruct->bad_xref_offset ++;
				trailer = trailer->next;
				free(xref);
				continue;
			}
			
			//dbg_log("checkXRef :: xref keyword = %s\n",xref);

			start = pdf->content;
			start += xref_offset;


			// len = number of bytes left in the document from the offset
			len = (int)(start - pdf->content);
			len = pdf->size -len ;

			//  Check if the offset point to a Xref Object type /XRef
			// The number found at the offset is taken as the object number.
			obj_num_a = getNumber_s(start,len);

			if(obj_num_a == NULL){
				pdf->testStruct->bad_xref_offset ++;
				trailer = trailer->next;
				free(xref);
				continue;
			}
			
			obj_num = atoi(obj_num_a);

			// digits of the object number + 7 (" 0 obj" is 6 characters, plus the terminator)
			len = strlen(obj_num_a) + 7;

			

			// NOTE(review): the calloc() result is not checked before being written.
			ref = (char*)calloc(len+1,sizeof(char));
			ref[len] = '\0';

			os_sprintf(ref,len,"%d 0 obj",obj_num);
			//dbg_log("xref object = %s\n",ref);

			free(obj_num_a);
			obj_num_a = NULL;

			obj = getPDFObjectByRef(pdf,ref);



			if(obj != NULL){
				
				// the object found at the offset is not an /XRef object
				if(obj->type == NULL || memcmp(obj->type,"/XRef",5) != 0){

					warn_log("checkXRef :: Wrong Xref table (or xref object) offset %d\n",xref_offset);					

					//dbg_log("checkXRef ::type = %s\n",obj->type);

					ret = 0;
					pdf->testStruct->bad_xref_offset ++;


				}else{ // if the xref object is found

					//dbg_log("checkXRef :: XRef obj Dico =  %s\n",obj->dico);

					// Check if the document is encrypted 
					// NOTE(review): obj->dico is passed to strlen() without a NULL check.
					if( searchPattern(obj->dico, "/Encrypt",8,strlen(obj->dico)) != NULL  ){
						pdf->testStruct->encrypted ++;
					}

					// a valid /XRef object resets the counter (not only for this trailer)
					pdf->testStruct->bad_xref_offset = 0;
				}

			}else{

				warn_log("checkXRef :: checkXRef :: object not found %s\n",ref);
				//pdf->testStruct->bad_xref_offset ++;

			}

			free(ref);
			ref = NULL;
			free(xref);

		}

	// Per-trailer cleanup: reached by falling out of either branch above or by
	// "goto clean" on a malformed xref entry. Releases what is still allocated,
	// then moves on to the next trailer (a malformed table does not stop the loop).
	clean:

		if (off_s != NULL){
			free(off_s);
			off_s = NULL;
		}

		if (xref_orig != NULL){
			free(xref_orig);
			xref_orig = NULL;
		}

		if (num_entries_a != NULL){
			free(num_entries_a);
			num_entries_a = NULL;
		}
		
		if (first_obj_num_a != NULL){
			free(first_obj_num_a);
			first_obj_num_a = NULL;
		}		


		trailer = trailer->next;

	}



	return ret;

}
576 | 
577 | 
578 | /*
579 | checkEmptyDocument() :: check if pages in the document are not all empty. => returns 1 if not empty
580 | parameters:
581 | - struct pdfDocument * pdf (pdf document pointer)
582 | returns: (int)
583 | - 1 if an non empty page is found on success.
584 | - 0 if the document is empty.
585 | - an error code (<0) on error.
586 | 
587 | TODO :: checkEmptyDocument :: Improve this function by creating a function getPagesKids which could be called recursively when the Kids objects reffers also to a /Pages object.
588 | */
589 | int checkEmptyDocument(struct pdfDocument * pdf){
590 | 
591 | 	int ret = 0;
592 | 	int len = 0;
593 | 	int len2 = 0;
594 | 	
595 | 	char * start = NULL;
596 | 	char * end = NULL;
597 | 	char * kids = NULL;
598 | 	char * kid_obj_ref = NULL;
599 | 	char * content_array = NULL;
600 | 	char * pageContents = NULL;
601 | 	char * pageContent_obj_ref = NULL;
602 | 
603 | 	struct pdfObject * pageContent_obj = NULL;
604 | 	struct pdfObject * obj = NULL;
605 | 	struct pdfObject * kid_obj = NULL;
606 | 	
607 | 	
608 | 	if(pdf == NULL){
609 | 		err_log("checkEmptyDocument :: invalid parameter\n");		
610 | 		return -1;
611 | 	}
612 | 
613 | 
614 | 	obj = pdf->objects;
615 | 
616 | 	while(obj != NULL){
617 | 
618 | 		if(obj->type != NULL && strncmp(obj->type,"/Pages",6) == 0){
619 | 
620 | 			dbg_log("checkEmptyDocument :: Found /Pages object :: %s\n", obj->reference);
621 | 			//dbg_log("Dico = %s\n",obj->dico);
622 | 
623 | 			// get kids pages
624 | 			start = searchPattern(obj->dico, "/Kids", 5 , strlen(obj->dico));
625 | 			if(start == NULL){
626 | 				warn_log("checkEmptyDocument :: no kids entry in pages dictionary %s\n",obj->reference);				
627 | 				obj = obj->next; // go to the next object
628 | 				continue;
629 | 			}
630 | 
631 | 			start += 5;
632 | 
633 | 			len = (int)(start - obj->dico);
634 | 			len = strlen(obj->dico) -len;
635 | 
636 | 			//while
637 | 			kids = getDelimitedStringContent(start,"[","]",len);
638 | 			dbg_log("kids = %s\n",kids);
639 | 
640 | 			if (kids == NULL){
641 | 				// on error, got to next obj.
642 | 				goto next;
643 | 			}
644 | 
645 | 			len = strlen(kids);
646 | 			end = kids;
647 | 
648 | 			// get kids pages object references.
649 | 			while( (kid_obj_ref = getIndirectRefInString(end,len)) != NULL){
650 | 
651 | 				
652 | 				dbg_log("checkEmptyDocument :: kid ref = %s\n",kid_obj_ref);
653 | 
654 | 				end = searchPattern(end,kid_obj_ref,strlen(kid_obj_ref)-3,len);
655 | 				if(end == NULL){
656 | 					err_log("checkEmptyDocument :: end == NULL\n" );
657 | 					free(kids);
658 | 					free(kid_obj_ref);
659 | 					return -1;
660 | 				}
661 | 
662 | 				end += strlen(kid_obj_ref) - 2;
663 | 
664 | 				len = (int)(end - kids);
665 | 				len = strlen(kids) - len;
666 | 
667 | 				if((kid_obj = getPDFObjectByRef(pdf,kid_obj_ref)) == NULL){
668 | 					warn_log("checkEmptyDocument :: Object %s not found\n", kid_obj_ref);						
669 | 					continue;
670 | 				}
671 | 
672 | 				// check the type of the object
673 | 				if(kid_obj->dico != NULL && kid_obj->type != NULL && strncmp(kid_obj->type,"/Page",5) == 0 && strncmp(kid_obj->type,"/Page",strlen(kid_obj->type)) == 0 ){
674 | 
675 | 					
676 | 					// TODO :: checkEmptyDocument :: write function (checkEmptyPage) to split this function.
677 | 					// checkEmptyPages(pdf,kid_obj);
678 | 
679 | 					start = searchPattern(kid_obj->dico, "/Contents", 9 , strlen(kid_obj->dico));
680 | 					//dbg_log("checkEmptyDocument :: Page dico = %s\n",kid_obj->dico);
681 | 
682 | 					if(start == NULL){
683 | 						warn_log("Warning :: checkEmptyDocument :: No page content in object %s\n",kid_obj_ref);						
684 | 						continue;
685 | 					}
686 | 
687 | 					start += 9; // /Contents  => 9 strlen(Contents)
688 | 
689 | 					// skip white spaces
690 | 					while(start[0] == ' '){
691 | 						start ++;
692 | 					}
693 | 
694 | 
695 | 					// if there is serveral content objects
696 | 					if(start[0] == '['){
697 | 
698 | 						len2 = (int)(start - kid_obj->dico);
699 | 						len2 = strlen(kid_obj->dico) - len2;
700 | 						pageContents = getDelimitedStringContent(start,"[","]",len2);
701 | 
702 | 						if(pageContents == NULL){
703 | 							warn_log("checkEmptyDocument :: getting Page content array failed !\n");							
704 | 							continue;
705 | 						}
706 | 
707 | 						len2 = strlen(pageContents);
708 | 						start = pageContents;
709 | 
710 | 						// get page content objects ref
711 | 						while( (pageContent_obj_ref = getIndirectRefInString(start,len2)) != NULL){
712 | 
713 | 							dbg_log("checkEmptyDocument :: page content ref = %s\n",pageContent_obj_ref);
714 | 
715 | 							start = searchPattern(start,pageContent_obj_ref,strlen(pageContent_obj_ref)-3,len2);
716 | 
717 | 							if(start == NULL){
718 | 								err_log("checkEmptyDocument :: can't get page content object reference\n");
719 | 								free(pageContents);
720 | 								free(kids);
721 | 								free(kid_obj_ref);
722 | 								free(pageContent_obj_ref);
723 | 								return -1;
724 | 							}
725 | 
726 | 							start += strlen(pageContent_obj_ref) - 2;
727 | 
728 | 							len2 = strlen(pageContents) - (int)(end - pageContents);
729 | 
730 | 
731 | 							if((pageContent_obj = getPDFObjectByRef(pdf,pageContent_obj_ref)) == NULL){
732 | 								warn_log("Warning :: checkEmptyDocument :: Object %s not found \n", pageContent_obj_ref);
733 | 								free(pageContent_obj_ref);
734 | 								continue;
735 | 							}
736 | 
737 | 							// get the stream
738 | 							if(pageContent_obj->stream != NULL && pageContent_obj->stream_size > 0){
739 | 								//dbg_log("checkEmptyDocument :: Page %s is not empty\n",kid_obj_ref);
740 | 								//return 1; // TODO ::  when you find a non-empty page then return. + time
741 | 								/* Do no forget to free the allocated variables
742 | 								free(pageContent_obj_ref);
743 | 								free(kid_obj_ref);
744 | 								free(kids);
745 | 								*/
746 | 								if (pageContent_obj_ref != NULL) {
747 | 									free(pageContent_obj_ref);
748 | 								}
749 | 								if (kid_obj_ref != NULL) {
750 | 									free(kid_obj_ref);
751 | 								}
752 | 								if (kids != NULL) {
753 | 									free(kids);
754 | 								}
755 | 								free(pageContents);
756 | 								dbg_log("checkEmptyDocument :: non empty page found\n");
757 | 
758 | 								return 1;
759 | 
760 | 								
761 | 							}else{
762 | 
763 | 								warn_log("checkEmptyDocument :: Empty page content %s\n",pageContent_obj_ref);								
764 | 							}
765 | 
766 | 							free(pageContent_obj_ref);
767 | 
768 | 						}
769 | 
770 | 						free(pageContents);
771 | 
772 | 
773 | 					}else{
774 | 
775 | 
776 | 						len2 = (int)(start - kid_obj->dico);
777 | 						len2 = strlen(kid_obj->dico) - len2;
778 | 						pageContent_obj_ref = getIndirectRef(start, len2);
779 | 
780 | 						if(pageContent_obj_ref == NULL){
781 | 							warn_log("checkEmptyDocument :: Error while getting page content object reference \n");							
782 | 							continue;
783 | 						}
784 | 
785 | 						if((pageContent_obj = getPDFObjectByRef(pdf,pageContent_obj_ref)) == NULL){
786 | 							warn_log("checkEmptyDocument :: Object not found %s\n", pageContent_obj_ref);
787 | 							free(pageContent_obj_ref);
788 | 							continue;
789 | 						}
790 | 
791 | 
792 | 						// get the stream
793 | 						if(pageContent_obj->stream != NULL && pageContent_obj->stream_size > 0){
794 | 							
795 | 							//dbg_log("checkEmptyDocument :: Page %s is not empty\n",kid_obj_ref);
796 | 							free(kids);
797 | 							free(kid_obj_ref);
798 | 							free(pageContent_obj_ref);
799 | 							return 1;
800 | 
801 | 						}else{
802 | 
803 | 							// Case when the content is an array ::
804 | 							content_array = getDelimitedStringContent(pageContent_obj->content,"[","]",pageContent_obj->content_size);
805 | 
806 | 							//dbg_log("checkEmptyDocument :: content array = %s\n",content_array);
807 | 							if(content_array != NULL){
808 | 
809 | 								start = content_array;
810 | 								len2 = strlen(content_array);
811 | 								dbg_log("checkEmptyDoc :: content = %s\n", start);
812 | 								dbg_log("checkEmptyDoc :: content_len = %d\n",len2);
813 | 
814 | 								free(pageContent_obj_ref);
815 | 
816 | 								while((pageContent_obj_ref = getIndirectRefInString(start,len2) ) != NULL){
817 | 
818 | 									dbg_log("checkEmptyDocument :: page content ref = %s\n",pageContent_obj_ref );
819 | 
820 | 									start = searchPattern(start,pageContent_obj_ref,strlen(pageContent_obj_ref)-3,len2);
821 | 									if (start == NULL){
822 | 										err_log("checkEmptyDocument :: can't retrieve object reference in dico\n");
823 | 										free(pageContent_obj_ref);
824 | 										break;
825 | 									}
826 | 									start += strlen(pageContent_obj_ref) - 2;
827 | 
828 | 									len2 = (int)(start - content_array);
829 | 									len2 = strlen(content_array) - len2;
830 | 
831 | 
832 | 									if((pageContent_obj = getPDFObjectByRef(pdf,pageContent_obj_ref)) == NULL){
833 | 										warn_log("checkEmptyDocument :: Object not found %s\n", pageContent_obj_ref);										
834 | 										free(pageContent_obj_ref);
835 | 										continue;
836 | 									}
837 | 
838 | 									if(pageContent_obj->stream != NULL && pageContent_obj->stream_size > 0){
839 | 										
840 | 										//dbg_log("checkEmptyDocument :: Page %s is not empty\n",kid_obj_ref);
841 | 										free(kid_obj_ref);
842 | 										free(content_array);
843 | 										free(pageContent_obj_ref);
844 | 										free(kids);
845 | 										return 1;
846 | 
847 | 										// TODO :: 
848 | 										// ret = 1;
849 | 										// goto cleaning.
850 | 										
851 | 										
852 | 
853 | 									}else{
854 | 										warn_log("checkEmptyDocument :: Empty page content %s\n",pageContent_obj_ref);										
855 | 									}
856 | 
857 | 									free(pageContent_obj_ref);
858 | 
859 | 								}
860 | 
861 | 								free(content_array);
862 | 
863 | 							}else{
864 | 																
865 | 								warn_log("checkEmptyDocument :: Empty page content %s\n",pageContent_obj_ref);
866 | 								free(pageContent_obj_ref);
867 | 
868 | 							}
869 | 
870 | 
871 | 
872 | 						}
873 | 
874 | 					}
875 | 
876 | 
877 | 				}
878 | 				free(kid_obj_ref);
879 | 				
880 | 			}
881 | 
882 | 			free(kids);
883 | 		}
884 | 
885 | 		next:
886 | 		obj = obj->next;
887 | 
888 | 	}
889 | 
890 | 
891 | 	return ret;
892 | 
893 | }
894 | 
895 | 
896 | /*
897 | documentStructureAnalysis() :: check if the document respects the PDF reference recommendations
898 | parameters:
899 | - struct pdfDocument * pdf (pdf document pointer)
900 | returns: (int)
901 | - 0 on success.
902 | - an error code (<0) on error.
903 | 
904 | TODO :: documentStructureAnalysis :: check trailers.
905 | */
906 | int documentStructureAnalysis(struct pdfDocument * pdf){
907 | 
908 | 	int ret = 0;
909 | 
910 | 	dbg_log("documentStructureAnalysis ::\n");
911 | 
912 | 	// TODO.
913 | 	//res = checkTrailer(pdf);
914 | 
915 | 	if ((ret = checkXRef(pdf)) < 0){
916 | 		err_log("documentStructureAnalysis :: check reference table failed!\n");
917 | 		return ret;
918 | 	}
919 | 
920 | 	if ((ret = checkEmptyDocument(pdf)) < 0){
921 | 		err_log("documentStructureAnalysis :: check document pages content failed!\n");
922 | 		return ret;
923 | 	}
924 | 
925 | 	// TODO :: check if there is no error during pdf parsing.
926 | 	if(ret == 0){
927 | 		pdf->testStruct->empty_page_content ++;
928 | 	}
929 | 
930 | 
931 | 	return ret;
932 | }
933 | 


--------------------------------------------------------------------------------