├── .gitattributes
├── .gitignore
├── COPYING
├── COPYING.Boost
├── COPYING.GPLv2
├── Doxyfile.example
├── Makefile.am
├── NEWS
├── README
├── bootstrap.sh
├── configure.ac
├── examples
└── internal
│ ├── edit_distn.c
│ └── has_common_substring.c
├── ext
└── .gitignore
├── ffuzzy.h
├── ffuzzy_blocksize.c
├── ffuzzy_blocksize.h
├── ffuzzy_compare.c
├── ffuzzy_digest.c
├── ffuzzy_digest_conv.c
├── ffuzzy_digest_unnorm.c
├── ffuzzy_parse.c
├── ffuzzy_parse.h
├── ffuzzy_parse_unnorm.c
├── m4
└── .gitignore
├── str_base64.h
├── str_common_substr.h
├── str_edit_dist.h
├── str_hash_rolling.h
└── util.h
/.gitattributes:
--------------------------------------------------------------------------------
1 | /COPYING* -whitespace
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .*
2 | !.git?*
3 | *~
4 | *.a
5 | *.diff
6 | *.dll
7 | *.exe
8 | *.in
9 | *.la
10 | *.lo
11 | *.o
12 | *.obj
13 | *.out
14 | *.patch
15 | *.pdb
16 | *.swp
17 | *.tmp
18 | aclocal.m4
19 | autoscan.log
20 | autom4te.cache
21 | config.log
22 | config.status
23 | configure
24 | configure.scan
25 | confdefs*
26 | conftest*
27 | conf[0-9]*
28 | ffuzzy_config.h
29 | libtool
30 | Makefile
31 | so_locations
32 | stamp-h1
33 | tmp*
34 | _libs
35 |
36 | libffuzzy-*
37 |
38 | /Doxyfile
39 | /html/
40 | /latex/
41 | /doc/
42 |
--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
1 | libfuzzy-compare-fast : Fast ssdeep comparison library
2 | =======================================================
3 |
4 |
5 | License / Copying
6 | ------------------
7 |
8 | This program is free software; you can redistribute it and/or modify
9 | it under the terms of the GNU General Public License as published by
10 | the Free Software Foundation; either version 2 of the License, or
11 | (at your option) any later version.
12 |
13 | This program is distributed in the hope that it will be useful,
14 | but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | GNU General Public License for more details.
17 |
18 | You should have received a copy of the GNU General Public License
19 | along with this program. If not, see <http://www.gnu.org/licenses/>.
20 |
21 |
22 | License Files
23 | --------------
24 |
25 | * COPYING.GPLv2
26 | GNU General Public License, version 2.0
27 | * COPYING.Boost
28 | Boost Software License, version 1.0
29 |
30 |
31 | Credits
32 | --------
33 |
34 | Most of the source code was extracted from fuzzy.c
35 | from ssdeep version 2.11 but modified to keep compatibility
36 | with ssdeep version 2.10.
37 |
38 | Copyright (C) 2002 Andrew Tridgell
39 | Copyright (C) 2006 ManTech International Corporation
40 | Copyright (C) 2013 Helmut Grohne
41 |
42 | fuzzy.c (from ssdeep 2.11) is licensed under the terms of the
43 | GNU General Public License as published by the Free Software Foundation;
44 | either version 2 of the License, or (at your option) any later version.
45 |
46 | The Levenshtein distance code was contributed by kikairoya on GitHub.
47 |
48 | Copyright (C) 2014 kikairoya
49 |
50 | This portion of code is licensed under the terms of the
51 | Boost Software License, version 1.0 and linked with GPLv2+ code.
52 |
53 | Tsukasa OI (the original author of libffuzzy) modified
54 | them and wrote a fast implementation.
55 |
56 | Copyright (C) 2014 Tsukasa OI
57 |
58 | The license of the modified portion varies by place.
59 | See the source code for details.
60 |
--------------------------------------------------------------------------------
/COPYING.Boost:
--------------------------------------------------------------------------------
1 | Boost Software License - Version 1.0 - August 17th, 2003
2 |
3 | Permission is hereby granted, free of charge, to any person or organization
4 | obtaining a copy of the software and accompanying documentation covered by
5 | this license (the "Software") to use, reproduce, display, distribute,
6 | execute, and transmit the Software, and to prepare derivative works of the
7 | Software, and to permit third-parties to whom the Software is furnished to
8 | do so, all subject to the following:
9 |
10 | The copyright notices in the Software and this entire statement, including
11 | the above license grant, this restriction and the following disclaimer,
12 | must be included in all copies of the Software, in whole or in part, and
13 | all derivative works of the Software, unless such copies or derivative
14 | works are solely in the form of machine-executable object code generated by
15 | a source language processor.
16 |
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
20 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
21 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
22 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 | DEALINGS IN THE SOFTWARE.
24 |
--------------------------------------------------------------------------------
/COPYING.GPLv2:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 2, June 1991
3 |
4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6 | Everyone is permitted to copy and distribute verbatim copies
7 | of this license document, but changing it is not allowed.
8 |
9 | Preamble
10 |
11 | The licenses for most software are designed to take away your
12 | freedom to share and change it. By contrast, the GNU General Public
13 | License is intended to guarantee your freedom to share and change free
14 | software--to make sure the software is free for all its users. This
15 | General Public License applies to most of the Free Software
16 | Foundation's software and to any other program whose authors commit to
17 | using it. (Some other Free Software Foundation software is covered by
18 | the GNU Lesser General Public License instead.) You can apply it to
19 | your programs, too.
20 |
21 | When we speak of free software, we are referring to freedom, not
22 | price. Our General Public Licenses are designed to make sure that you
23 | have the freedom to distribute copies of free software (and charge for
24 | this service if you wish), that you receive source code or can get it
25 | if you want it, that you can change the software or use pieces of it
26 | in new free programs; and that you know you can do these things.
27 |
28 | To protect your rights, we need to make restrictions that forbid
29 | anyone to deny you these rights or to ask you to surrender the rights.
30 | These restrictions translate to certain responsibilities for you if you
31 | distribute copies of the software, or if you modify it.
32 |
33 | For example, if you distribute copies of such a program, whether
34 | gratis or for a fee, you must give the recipients all the rights that
35 | you have. You must make sure that they, too, receive or can get the
36 | source code. And you must show them these terms so they know their
37 | rights.
38 |
39 | We protect your rights with two steps: (1) copyright the software, and
40 | (2) offer you this license which gives you legal permission to copy,
41 | distribute and/or modify the software.
42 |
43 | Also, for each author's protection and ours, we want to make certain
44 | that everyone understands that there is no warranty for this free
45 | software. If the software is modified by someone else and passed on, we
46 | want its recipients to know that what they have is not the original, so
47 | that any problems introduced by others will not reflect on the original
48 | authors' reputations.
49 |
50 | Finally, any free program is threatened constantly by software
51 | patents. We wish to avoid the danger that redistributors of a free
52 | program will individually obtain patent licenses, in effect making the
53 | program proprietary. To prevent this, we have made it clear that any
54 | patent must be licensed for everyone's free use or not licensed at all.
55 |
56 | The precise terms and conditions for copying, distribution and
57 | modification follow.
58 |
59 | GNU GENERAL PUBLIC LICENSE
60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
61 |
62 | 0. This License applies to any program or other work which contains
63 | a notice placed by the copyright holder saying it may be distributed
64 | under the terms of this General Public License. The "Program", below,
65 | refers to any such program or work, and a "work based on the Program"
66 | means either the Program or any derivative work under copyright law:
67 | that is to say, a work containing the Program or a portion of it,
68 | either verbatim or with modifications and/or translated into another
69 | language. (Hereinafter, translation is included without limitation in
70 | the term "modification".) Each licensee is addressed as "you".
71 |
72 | Activities other than copying, distribution and modification are not
73 | covered by this License; they are outside its scope. The act of
74 | running the Program is not restricted, and the output from the Program
75 | is covered only if its contents constitute a work based on the
76 | Program (independent of having been made by running the Program).
77 | Whether that is true depends on what the Program does.
78 |
79 | 1. You may copy and distribute verbatim copies of the Program's
80 | source code as you receive it, in any medium, provided that you
81 | conspicuously and appropriately publish on each copy an appropriate
82 | copyright notice and disclaimer of warranty; keep intact all the
83 | notices that refer to this License and to the absence of any warranty;
84 | and give any other recipients of the Program a copy of this License
85 | along with the Program.
86 |
87 | You may charge a fee for the physical act of transferring a copy, and
88 | you may at your option offer warranty protection in exchange for a fee.
89 |
90 | 2. You may modify your copy or copies of the Program or any portion
91 | of it, thus forming a work based on the Program, and copy and
92 | distribute such modifications or work under the terms of Section 1
93 | above, provided that you also meet all of these conditions:
94 |
95 | a) You must cause the modified files to carry prominent notices
96 | stating that you changed the files and the date of any change.
97 |
98 | b) You must cause any work that you distribute or publish, that in
99 | whole or in part contains or is derived from the Program or any
100 | part thereof, to be licensed as a whole at no charge to all third
101 | parties under the terms of this License.
102 |
103 | c) If the modified program normally reads commands interactively
104 | when run, you must cause it, when started running for such
105 | interactive use in the most ordinary way, to print or display an
106 | announcement including an appropriate copyright notice and a
107 | notice that there is no warranty (or else, saying that you provide
108 | a warranty) and that users may redistribute the program under
109 | these conditions, and telling the user how to view a copy of this
110 | License. (Exception: if the Program itself is interactive but
111 | does not normally print such an announcement, your work based on
112 | the Program is not required to print an announcement.)
113 |
114 | These requirements apply to the modified work as a whole. If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works. But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 |
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 |
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 |
134 | 3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 |
138 | a) Accompany it with the complete corresponding machine-readable
139 | source code, which must be distributed under the terms of Sections
140 | 1 and 2 above on a medium customarily used for software interchange; or,
141 |
142 | b) Accompany it with a written offer, valid for at least three
143 | years, to give any third party, for a charge no more than your
144 | cost of physically performing source distribution, a complete
145 | machine-readable copy of the corresponding source code, to be
146 | distributed under the terms of Sections 1 and 2 above on a medium
147 | customarily used for software interchange; or,
148 |
149 | c) Accompany it with the information you received as to the offer
150 | to distribute corresponding source code. (This alternative is
151 | allowed only for noncommercial distribution and only if you
152 | received the program in object code or executable form with such
153 | an offer, in accord with Subsection b above.)
154 |
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it. For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable. However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 |
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 |
172 | 4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License. Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 |
180 | 5. You are not required to accept this License, since you have not
181 | signed it. However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works. These actions are
183 | prohibited by law if you do not accept this License. Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 |
189 | 6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions. You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 |
197 | 7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License. If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all. For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 |
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 |
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices. Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 |
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 |
229 | 8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded. In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 |
237 | 9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time. Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 |
242 | Each version is given a distinguishing version number. If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation. If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 |
250 | 10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission. For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this. Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 |
258 | NO WARRANTY
259 |
260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 |
270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 |
280 | END OF TERMS AND CONDITIONS
281 |
282 | How to Apply These Terms to Your New Programs
283 |
284 | If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 |
288 | To do so, attach the following notices to the program. It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 |
293 | <one line to give the program's name and a brief idea of what it does.>
294 | Copyright (C) <year> <name of author>
295 |
296 | This program is free software; you can redistribute it and/or modify
297 | it under the terms of the GNU General Public License as published by
298 | the Free Software Foundation; either version 2 of the License, or
299 | (at your option) any later version.
300 |
301 | This program is distributed in the hope that it will be useful,
302 | but WITHOUT ANY WARRANTY; without even the implied warranty of
303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304 | GNU General Public License for more details.
305 |
306 | You should have received a copy of the GNU General Public License along
307 | with this program; if not, write to the Free Software Foundation, Inc.,
308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 |
310 | Also add information on how to contact you by electronic and paper mail.
311 |
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 |
315 | Gnomovision version 69, Copyright (C) year name of author
316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 | This is free software, and you are welcome to redistribute it
318 | under certain conditions; type `show c' for details.
319 |
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License. Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 |
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary. Here is a sample; alter the names:
328 |
329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 |
332 | <signature of Ty Coon>, 1 April 1989
333 | Ty Coon, President of Vice
334 |
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs. If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library. If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 |
--------------------------------------------------------------------------------
/Doxyfile.example:
--------------------------------------------------------------------------------
1 | DOXYFILE_ENCODING = UTF-8
2 |
3 | PROJECT_NAME = "libffuzzy"
4 | PROJECT_NUMBER = "2.1.4"
5 | PROJECT_BRIEF = "Fast ssdeep comparison library"
6 | PROJECT_LOGO =
7 | OUTPUT_DIRECTORY = doc
8 | CREATE_SUBDIRS = NO
9 | ALLOW_UNICODE_NAMES = NO
10 | OUTPUT_LANGUAGE = English
11 |
12 | EXAMPLE_PATH = .
13 |
14 | # Brief Template for API documentation
15 | INTERNAL_DOCS = NO
16 | INPUT = ffuzzy.h
17 |
18 | # Brief Template for internal documentation
19 | #INTERNAL_DOCS = YES
20 | #INPUT =
21 |
22 |
23 | # See Doxygen's documentation for details.
24 | BRIEF_MEMBER_DESC = YES
25 | REPEAT_BRIEF = YES
26 | ABBREVIATE_BRIEF =
27 | ALWAYS_DETAILED_SEC = NO
28 | INLINE_INHERITED_MEMB = NO
29 | FULL_PATH_NAMES = YES
30 | STRIP_FROM_PATH =
31 | STRIP_FROM_INC_PATH =
32 | SHORT_NAMES = NO
33 | JAVADOC_AUTOBRIEF = NO
34 | QT_AUTOBRIEF = NO
35 | MULTILINE_CPP_IS_BRIEF = NO
36 |
37 | INHERIT_DOCS = YES
38 | SEPARATE_MEMBER_PAGES = NO
39 | TAB_SIZE = 4
40 | OPTIMIZE_OUTPUT_FOR_C = YES
41 | OPTIMIZE_OUTPUT_JAVA = NO
42 | OPTIMIZE_FOR_FORTRAN = NO
43 | OPTIMIZE_OUTPUT_VHDL = NO
44 |
45 | EXTENSION_MAPPING =
46 |
47 | MARKDOWN_SUPPORT = YES
48 | AUTOLINK_SUPPORT = YES
49 | BUILTIN_STL_SUPPORT = NO
50 | CPP_CLI_SUPPORT = NO
51 | SIP_SUPPORT = NO
52 | IDL_PROPERTY_SUPPORT = NO
53 |
54 | DISTRIBUTE_GROUP_DOC = NO
55 | SUBGROUPING = YES
56 | INLINE_GROUPED_CLASSES = NO
57 | INLINE_SIMPLE_STRUCTS = NO
58 | TYPEDEF_HIDES_STRUCT = NO
59 | LOOKUP_CACHE_SIZE = 0
60 |
61 | EXTRACT_ALL = NO
62 | EXTRACT_PRIVATE = YES
63 | EXTRACT_PACKAGE = YES
64 | EXTRACT_STATIC = YES
65 | EXTRACT_LOCAL_CLASSES = YES
66 | EXTRACT_LOCAL_METHODS = YES
67 | EXTRACT_ANON_NSPACES = NO
68 |
69 | HIDE_UNDOC_MEMBERS = NO
70 | HIDE_UNDOC_CLASSES = NO
71 | HIDE_FRIEND_COMPOUNDS = NO
72 | HIDE_IN_BODY_DOCS = NO
73 | CASE_SENSE_NAMES = YES
74 | HIDE_SCOPE_NAMES = NO
75 | SHOW_INCLUDE_FILES = YES
76 | SHOW_GROUPED_MEMB_INC = NO
77 | FORCE_LOCAL_INCLUDES = NO
78 | INLINE_INFO = YES
79 |
80 | SORT_MEMBER_DOCS = YES
81 | SORT_BRIEF_DOCS = NO
82 | SORT_MEMBERS_CTORS_1ST = NO
83 | SORT_GROUP_NAMES = NO
84 | SORT_BY_SCOPE_NAME = NO
85 | STRICT_PROTO_MATCHING = NO
86 | GENERATE_TODOLIST = YES
87 | GENERATE_TESTLIST = YES
88 | GENERATE_BUGLIST = YES
89 | GENERATE_DEPRECATEDLIST= YES
90 | ENABLED_SECTIONS =
91 | MAX_INITIALIZER_LINES = 30
92 | SHOW_USED_FILES = YES
93 | SHOW_FILES = YES
94 | SHOW_NAMESPACES = YES
95 | FILE_VERSION_FILTER =
96 | LAYOUT_FILE =
97 | CITE_BIB_FILES =
98 |
99 | QUIET = NO
100 | WARNINGS = YES
101 | WARN_IF_UNDOCUMENTED = YES
102 | WARN_IF_DOC_ERROR = YES
103 | WARN_NO_PARAMDOC = NO
104 | WARN_FORMAT = "$file:$line: $text"
105 | WARN_LOGFILE =
106 |
107 | INPUT_ENCODING = UTF-8
108 | FILE_PATTERNS =
109 | RECURSIVE = NO
110 | EXCLUDE = ffuzzy_config.h
111 | EXCLUDE_SYMLINKS = NO
112 | EXCLUDE_PATTERNS =
113 | EXCLUDE_SYMBOLS =
114 | EXAMPLE_PATTERNS =
115 | EXAMPLE_RECURSIVE = NO
116 | IMAGE_PATH =
117 | INPUT_FILTER =
118 | FILTER_PATTERNS =
119 | FILTER_SOURCE_FILES = NO
120 | FILTER_SOURCE_PATTERNS =
121 | USE_MDFILE_AS_MAINPAGE =
122 |
123 | SOURCE_BROWSER = NO
124 | INLINE_SOURCES = NO
125 | STRIP_CODE_COMMENTS = YES
126 | REFERENCED_BY_RELATION = NO
127 | REFERENCES_RELATION = NO
128 | REFERENCES_LINK_SOURCE = YES
129 | SOURCE_TOOLTIPS = YES
130 | USE_HTAGS = NO
131 | VERBATIM_HEADERS = YES
132 |
133 | ALPHABETICAL_INDEX = YES
134 | COLS_IN_ALPHA_INDEX = 5
135 | IGNORE_PREFIX =
136 |
137 | GENERATE_HTML = YES
138 | HTML_OUTPUT = html
139 | HTML_FILE_EXTENSION = .html
140 | HTML_HEADER =
141 | HTML_FOOTER =
142 | HTML_STYLESHEET =
143 | HTML_EXTRA_STYLESHEET =
144 | HTML_EXTRA_FILES =
145 | HTML_COLORSTYLE_HUE = 220
146 | HTML_COLORSTYLE_SAT = 100
147 | HTML_COLORSTYLE_GAMMA = 80
148 | HTML_TIMESTAMP = YES
149 | HTML_DYNAMIC_SECTIONS = NO
150 | HTML_INDEX_NUM_ENTRIES = 100
151 | GENERATE_DOCSET = NO
152 | DOCSET_FEEDNAME = "Doxygen generated docs"
153 | DOCSET_BUNDLE_ID = org.doxygen.Project
154 | DOCSET_PUBLISHER_ID = org.doxygen.Publisher
155 | DOCSET_PUBLISHER_NAME = Publisher
156 |
157 | GENERATE_HTMLHELP = NO
158 | CHM_FILE =
159 | HHC_LOCATION =
160 |
161 | GENERATE_CHI = NO
162 | CHM_INDEX_ENCODING =
163 | BINARY_TOC = NO
164 | TOC_EXPAND = NO
165 |
166 | GENERATE_QHP = NO
167 | QCH_FILE =
168 | QHP_NAMESPACE = org.doxygen.Project
169 | QHP_VIRTUAL_FOLDER = doc
170 | QHP_CUST_FILTER_NAME =
171 | QHP_CUST_FILTER_ATTRS =
172 | QHP_SECT_FILTER_ATTRS =
173 | QHG_LOCATION =
174 |
175 | GENERATE_ECLIPSEHELP = NO
176 | ECLIPSE_DOC_ID = org.doxygen.Project
177 | DISABLE_INDEX = NO
178 | GENERATE_TREEVIEW = NO
179 | ENUM_VALUES_PER_LINE = 4
180 | TREEVIEW_WIDTH = 250
181 | EXT_LINKS_IN_WINDOW = NO
182 | FORMULA_FONTSIZE = 10
183 | FORMULA_TRANSPARENT = YES
184 | USE_MATHJAX = NO
185 | MATHJAX_FORMAT = HTML-CSS
186 | MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest
187 | MATHJAX_EXTENSIONS =
188 | MATHJAX_CODEFILE =
189 | SEARCHENGINE = YES
190 | SERVER_BASED_SEARCH = NO
191 | EXTERNAL_SEARCH = NO
192 | SEARCHENGINE_URL =
193 | SEARCHDATA_FILE = searchdata.xml
194 | EXTERNAL_SEARCH_ID =
195 | EXTRA_SEARCH_MAPPINGS =
196 |
197 | GENERATE_LATEX = NO
198 | LATEX_OUTPUT = latex
199 | LATEX_CMD_NAME = latex
200 | MAKEINDEX_CMD_NAME = makeindex
201 | COMPACT_LATEX = NO
202 | PAPER_TYPE = a4
203 | EXTRA_PACKAGES =
204 | LATEX_HEADER =
205 | LATEX_FOOTER =
206 | LATEX_EXTRA_FILES =
207 | PDF_HYPERLINKS = YES
208 | USE_PDFLATEX = YES
209 | LATEX_BATCHMODE = NO
210 | LATEX_HIDE_INDICES = NO
211 | LATEX_SOURCE_CODE = NO
212 | LATEX_BIB_STYLE = plain
213 |
214 | GENERATE_RTF = NO
215 | RTF_OUTPUT = rtf
216 | COMPACT_RTF = NO
217 | RTF_HYPERLINKS = NO
218 | RTF_STYLESHEET_FILE =
219 | RTF_EXTENSIONS_FILE =
220 |
221 | GENERATE_MAN = NO
222 | MAN_OUTPUT = man
223 | MAN_EXTENSION = .3
224 | MAN_SUBDIR =
225 | MAN_LINKS = NO
226 |
227 | GENERATE_XML = NO
228 | XML_OUTPUT = xml
229 | XML_PROGRAMLISTING = YES
230 |
231 | GENERATE_DOCBOOK = NO
232 | DOCBOOK_OUTPUT = docbook
233 | DOCBOOK_PROGRAMLISTING = NO
234 |
235 | GENERATE_AUTOGEN_DEF = NO
236 |
237 | GENERATE_PERLMOD = NO
238 | PERLMOD_LATEX = NO
239 | PERLMOD_PRETTY = YES
240 | PERLMOD_MAKEVAR_PREFIX =
241 |
242 | ENABLE_PREPROCESSING = YES
243 | MACRO_EXPANSION = NO
244 | EXPAND_ONLY_PREDEF = NO
245 | SEARCH_INCLUDES = YES
246 | INCLUDE_PATH =
247 | INCLUDE_FILE_PATTERNS =
248 | PREDEFINED =
249 | EXPAND_AS_DEFINED =
250 | SKIP_FUNCTION_MACROS = YES
251 |
252 | TAGFILES =
253 | GENERATE_TAGFILE =
254 | ALLEXTERNALS = NO
255 | EXTERNAL_GROUPS = YES
256 | EXTERNAL_PAGES = YES
257 | PERL_PATH = /usr/bin/perl
258 |
259 | CLASS_DIAGRAMS = YES
260 | MSCGEN_PATH =
261 | DIA_PATH =
262 | HIDE_UNDOC_RELATIONS = YES
263 | HAVE_DOT = NO
264 | DOT_NUM_THREADS = 0
265 | DOT_FONTNAME = Helvetica
266 | DOT_FONTSIZE = 10
267 | DOT_FONTPATH =
268 | CLASS_GRAPH = YES
269 | COLLABORATION_GRAPH = YES
270 | GROUP_GRAPHS = YES
271 | UML_LOOK = NO
272 | UML_LIMIT_NUM_FIELDS = 10
273 | TEMPLATE_RELATIONS = NO
274 | INCLUDE_GRAPH = YES
275 | INCLUDED_BY_GRAPH = YES
276 | CALL_GRAPH = NO
277 | CALLER_GRAPH = NO
278 | GRAPHICAL_HIERARCHY = YES
279 | DIRECTORY_GRAPH = YES
280 | DOT_IMAGE_FORMAT = png
281 | INTERACTIVE_SVG = NO
282 | DOT_PATH =
283 | DOTFILE_DIRS =
284 | MSCFILE_DIRS =
285 | DIAFILE_DIRS =
286 | PLANTUML_JAR_PATH =
287 | DOT_GRAPH_MAX_NODES = 50
288 | MAX_DOT_GRAPH_DEPTH = 0
289 | DOT_TRANSPARENT = NO
290 | DOT_MULTI_TARGETS = NO
291 | GENERATE_LEGEND = YES
292 | DOT_CLEANUP = YES
293 |
--------------------------------------------------------------------------------
/Makefile.am:
--------------------------------------------------------------------------------
1 | ACLOCAL_AMFLAGS = -I m4
2 | lib_LTLIBRARIES = libffuzzy.la
3 | libffuzzy_la_LDFLAGS = -no-undefined -version-info 4:3:1
4 | libffuzzy_la_SOURCES = \
5 | ffuzzy_compare.c \
6 | ffuzzy_blocksize.c \
7 | ffuzzy_digest.c \
8 | ffuzzy_digest_unnorm.c \
9 | ffuzzy_digest_conv.c \
10 | ffuzzy_parse.c \
11 | ffuzzy_parse_unnorm.c
12 | include_HEADERS = ffuzzy.h
13 | EXTRA_DIST = \
14 | README NEWS \
15 | COPYING COPYING.GPLv2 COPYING.Boost \
16 | bootstrap.sh \
17 | ffuzzy_blocksize.h \
18 | ffuzzy_parse.h \
19 | str_base64.h \
20 | str_common_substr.h \
21 | str_edit_dist.h \
22 | str_hash_rolling.h \
23 | util.h \
24 | .gitignore .gitattributes ext/.gitignore m4/.gitignore \
25 | Doxyfile.example \
26 | examples/internal/has_common_substring.c \
27 | examples/internal/edit_distn.c
28 |
--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
1 | libffuzzy : Fast ssdeep comparison library
2 | ===========================================
3 |
4 |
5 | Version 2.1.4 - 2014-11-03
6 | ---------------------------
7 |
8 | * Added block size bounds from the original version of ssdeep
9 |
10 |
11 | Version 2.1.3 - 2014-10-28
12 | ---------------------------
13 |
14 | * Added bootstrap script to build from git repository
15 |
16 |
17 | Version 2.1.2 - 2014-10-28
18 | ---------------------------
19 |
20 | * libffuzzy now accepts all unsigned long values as block sizes
21 | * Optimized block size capping
22 | * Fixed implementation where it differed from the documentation
23 |
24 |
25 | Version 2.1.1 - 2014-10-25
26 | ---------------------------
27 |
28 | * Removed redundant debug code
29 |
30 |
31 | Version 2.1 - 2014-10-25
32 | -------------------------
33 |
34 | * Added interface for unnormalized form
35 |
36 |
37 | Version 2.0.1 - 2014-10-25
38 | ---------------------------
39 |
40 | * Changed internal interface for future extension
41 | * Fixed buffer overflow vulnerability on ffuzzy_pretty_digest function
42 |
43 |
44 | Version 2.0 - 2014-10-22
45 | -------------------------
46 |
47 | * Optimized the code (even further)
48 | * Added specialized comparison/clustering/manipulation interface
49 | * Fixed parser to prevent arithmetic overflow
50 | * Added debug/assertion code
51 | * Fixed documentation
52 | * Added documentation for Doxygen
53 |
54 |
55 | Version 1.1.1 - 2014-10-20
56 | ---------------------------
57 |
58 | * Optimized edit distance code
59 | * Fixed changelog on version 1.1
60 |
61 |
62 | Version 1.1 - 2014-10-20
63 | -------------------------
64 |
65 | * Added ffuzzy_digest structure
66 | (which holds ssdeep digest after parsing)
67 | * Optimized digest reader and edit distance code
68 | * Improved compatibility with ssdeep
69 |
70 |
71 | Version 1.0 - 2014-10-20
72 | -------------------------
73 |
74 | * The initial release (compatible with ssdeep 2.10 [not 2.11])
75 |
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | libffuzzy : Fast ssdeep comparison library
2 | ===========================================
3 |
4 | libffuzzy is a fuzzy hash comparison library compatible with
5 | ssdeep (<http://ssdeep.sourceforge.net/>) version 2.10.
6 |
7 | This library is designed to be fast and thread-safe. It does not even
8 | allocate memory at run time (which may increase performance
9 | on parallel computation).
10 |
11 | Another purpose of writing this library is to find implementation
12 | issues in ssdeep. During this re-implementation, the author found
13 | several issues which may affect robustness and portability of ssdeep.
14 |
15 |
16 | Installation
17 | -------------
18 |
19 | If you don't have `configure` script (e.g. on git work-tree),
20 | run `./bootstrap.sh` or `autoreconf -i` first.
21 | Then you can install the program by `./configure`, `make`
22 | and then `make install`.
23 |
24 |
25 | Functions
26 | ----------
27 |
28 | Global functions are named like ffuzzy_SOMETHING.
29 | You can use ffuzzy_compare function just like fuzzy_compare.
30 |
31 | You can also hold ssdeep digest after parsing.
32 | Use ffuzzy_read_digest to parse digest and use ffuzzy_compare_digest
33 | to compare against another digest (after parsing).
34 |
35 |
36 | Performance
37 | ------------
38 |
39 | Version 1.1 achieved expected performance gain.
40 | libffuzzy is now faster than libfuzzy in most cases.
41 |
42 | In version 2.0, interfaces for specialized comparison and clustering
43 | are added. Clustering applications will get faster if you use
44 | this optimized implementation correctly.
45 |
46 | In the large clustering job (about 100k hashes),
47 | libffuzzy 2.0 was about 60% faster than libfuzzy 2.11.
48 |
--------------------------------------------------------------------------------
/bootstrap.sh:
--------------------------------------------------------------------------------
1 | #! /bin/sh
2 | autoreconf -i
3 |
--------------------------------------------------------------------------------
/configure.ac:
--------------------------------------------------------------------------------
1 | AC_PREREQ([2.65])
2 | AC_INIT([libffuzzy], [2.1.4], [li@livegrid.org])
3 | AC_CONFIG_SRCDIR([ffuzzy_compare.c])
4 | AC_CONFIG_AUX_DIR([ext])
5 | AC_CONFIG_MACRO_DIR([m4])
6 | AC_CONFIG_HEADERS([ffuzzy_config.h])
7 | AM_INIT_AUTOMAKE([foreign dist-xz])
8 |
9 | AC_ARG_ENABLE([debug],AS_HELP_STRING([--enable-debug],[enable debugging code]),,[enable_debug=no])
10 | if test "x$enable_debug" = xno
11 | then
12 | AC_DEFINE([NDEBUG],[1],[Disable assertion code])
13 | fi
14 |
15 | AC_PROG_CC_C99
16 | LT_INIT
17 |
18 | AC_OUTPUT([Makefile])
19 |
--------------------------------------------------------------------------------
/examples/internal/edit_distn.c:
--------------------------------------------------------------------------------
1 | void examples() // usage examples: expected edit_distn results for short string pairs
2 | {
3 | // Example 1 ("123" to "1234"):
4 | // expect 1 (append '4' at the tail)
5 | assert(edit_distn("123", 3, "1234", 4) == 1);
6 | // Example 2 ("2034" to "234"):
7 | // expect 1 (remove '0' from the middle of the first string)
8 | assert(edit_distn("2034", 4, "234", 3) == 1);
9 | // Example 3 ("kiss" to "miss"):
10 | // expect 2 (remove 'k' and then insert 'm' at the same place)
11 | assert(edit_distn("kiss", 4, "miss", 4) == 2);
12 | // Example 4 ("kitten" to "sitting"):
13 | // expect 5 (remove 'k', insert 's', remove 'e', insert 'i' and append 'g' at the tail)
14 | assert(edit_distn("kitten", 6, "sitting", 7) == 5);
15 | }
16 |
--------------------------------------------------------------------------------
/examples/internal/has_common_substring.c:
--------------------------------------------------------------------------------
1 | // Usage examples for has_common_substring. Assume that FFUZZY_MIN_MATCH == 7.
2 | static void examples()
3 | {
4 | // Example 1 ("abcdefghijklmn" and "hijklmnopqrstu"):
5 | // expect true because they have the common substring "hijklmn" (7 characters).
6 | assert(has_common_substring("abcdefghijklmn", 14, "hijklmnopqrstu", 14) == true);
7 | // Example 2 ("commonstring" and "differentstring"):
8 | // expect false because the longest common substring ("string", 6 characters) is shorter than FFUZZY_MIN_MATCH.
9 | assert(has_common_substring("commonstring", 12, "differentstring", 15) == false);
10 | // Example 3 ("abcdefg" and "abcdefg"):
11 | // expect true because they have the common substring "abcdefg" (which is the whole string)
12 | assert(has_common_substring("abcdefg", 7, "abcdefg", 7) == true);
13 | // Example 4 ("abc" and "abc"):
14 | // expect false because they don't have a common substring of length FFUZZY_MIN_MATCH
15 | // (even though they are identical).
16 | assert(has_common_substring("abc", 3, "abc", 3) == false);
17 | }
18 |
--------------------------------------------------------------------------------
/ext/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !/.gitignore
3 |
--------------------------------------------------------------------------------
/ffuzzy.h:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | libffuzzy : Fast ssdeep comparison library
4 |
5 | ffuzzy.h
6 | Public API for libffuzzy
7 |
8 |
9 | CREDITS OF ORIGINAL VERSION OF SSDEEP
10 |
11 | Copyright (C) 2002 Andrew Tridgell
12 | Copyright (C) 2006 ManTech International Corporation
13 | Copyright (C) 2013 Helmut Grohne
14 |
15 | This program is free software; you can redistribute it and/or modify
16 | it under the terms of the GNU General Public License as published by
17 | the Free Software Foundation; either version 2 of the License, or
18 | (at your option) any later version.
19 |
20 | This program is distributed in the hope that it will be useful,
21 | but WITHOUT ANY WARRANTY; without even the implied warranty of
22 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 | GNU General Public License for more details.
24 |
25 | You should have received a copy of the GNU General Public License
26 | along with this program; if not, write to the Free Software
27 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 |
29 |
30 | CREDIT OF MODIFIED PORTIONS
31 |
32 | Copyright (C) 2014 Tsukasa OI
33 |
34 | */
35 | #ifndef FFUZZY_FFUZZY_H
36 | #define FFUZZY_FFUZZY_H
37 |
38 | /**
39 |
40 | \mainpage
41 |
42 | This is the documentation for the libffuzzy,
43 | a fast ssdeep comparison library.
44 |
45 | \author Tsukasa OI, li@livegrid.org and original ssdeep authors
46 | \version 2.1.4
47 |
48 | **/
49 |
50 | /**
51 | \file ffuzzy.h
52 | \brief Public API for libffuzzy
53 | **/
54 |
55 | #ifndef __cplusplus
56 | #include <stdbool.h>
57 | #endif
58 | #include <stddef.h>
59 |
60 | /** \brief Maximum length for the digest block **/
61 | #define FFUZZY_SPAMSUM_LENGTH 64
62 |
63 | /** \brief Minimum block size to start in ssdeep implementation **/
64 | #define FFUZZY_MIN_BLOCKSIZE 3ul
65 |
66 | /** \brief Number of digest blocks (block hashes in ssdeep) */
67 | #define FFUZZY_NUM_BLOCKHASHES 31
68 |
69 | /** \brief Maximum block size **/
70 | #define FFUZZY_MAX_BLOCKSIZE (FFUZZY_MIN_BLOCKSIZE << (FFUZZY_NUM_BLOCKHASHES-1))
71 |
72 | /**
73 | \brief Maximum size of buffer required for natural pretty printing
74 | \details
75 | This is the buffer size which is enough to store all
76 | fuzzy hashes generated from ssdeep / libfuzzy.
77 | This is the sum of the following components plus one extra byte (room for a terminating null character):
78 |
79 | - Two colons for "separator" token (2)
80 | - First digest block (max 64)
81 | - Second digest block (max 64)
82 | - Block size (maximum block size from libfuzzy is 3221225472 [length of 10])
83 |
84 | \see bool ffuzzy_pretty_digest(char*, size_t, const ffuzzy_digest*)
85 | \see bool ffuzzy_pretty_udigest(char*, size_t, const ffuzzy_udigest*)
86 | **/
87 | #define FFUZZY_PRETTY_LEN 141
88 |
89 | /** \brief
90 | The minimal match (length of common substring) required
91 | for (at least) one of the block digests
92 | **/
93 | #define FFUZZY_MIN_MATCH 7
94 |
95 |
96 | #ifdef __cplusplus
97 | extern "C" {
98 | #endif
99 |
100 | /**
101 |
102 | \struct ffuzzy_digest
103 | \brief The type to store ssdeep digest after parsing.
104 | \details
105 | This structure contains all information in the ssdeep digest
106 | in machine-friendly format. You can accelerate comparing
107 | fuzzy hashes by storing this type first.
108 | \see ffuzzy_read_digest(ffuzzy_digest*, const char*)
109 | \see ffuzzy_compare_digest(const ffuzzy_digest*, const ffuzzy_digest*)
110 |
111 | \var ffuzzy_digest::len1
112 | \brief Digest length for first block of the digest.
113 |
114 | \var ffuzzy_digest::len2
115 | \brief Digest length for second block of the digest.
116 |
117 | \var ffuzzy_digest::block_size
118 | \brief Block size of the ssdeep digest.
119 | \details
120 | Technically, this is the block size of the first block of the digest.
121 | The block size of the second block is twice this value.
122 |
123 | \var ffuzzy_digest::digest
124 | \brief Digest buffer for both blocks of the digest.
125 | \details
126 | This buffer stores the digest in a compressed form.
127 | From the beginning, the buffer is formed like this:
128 |
129 | - @link len1 len1@endlink-sized characters of the first block
130 | - @link len2 len2@endlink-sized characters of the second block
131 |
132 | Valid blocks in the buffer do not contain sequences of
133 | four or more identical characters.
134 |
135 | **/
136 | typedef struct
137 | {
138 | size_t len1, len2; /* lengths of the first and second digest blocks */
139 | unsigned long block_size; /* block size of the first block; the second block uses twice this */
140 | char digest[FFUZZY_SPAMSUM_LENGTH * 2]; /* len1 characters of the first block followed by len2 characters of the second */
141 | } ffuzzy_digest;
142 |
143 |
144 |
145 | /**
146 | \name Comparison and Parsing
147 | \{
148 | **/
149 |
150 | /**
151 | \fn bool ffuzzy_read_digest(ffuzzy_digest*, const char*)
152 | \brief Read ssdeep digest from the string
153 | \details
154 | On success, this function always stores a valid digest.
155 | \param [out] digest The pointer to the buffer to store valid digest after parsing.
156 | \param [in] s The string which contains a ssdeep digest.
157 | \return true if succeeds; false otherwise.
158 | **/
159 | bool ffuzzy_read_digest(ffuzzy_digest *digest, const char *s);
160 |
161 | /**
162 | \fn int ffuzzy_compare_digest(const ffuzzy_digest*, const ffuzzy_digest*)
163 | \brief Compare two fuzzy hashes and compute similarity score
164 | \param [in] d1 Valid digest 1
165 | \param [in] d2 Valid digest 2
166 | \return [0,100] values represent similarity score or negative values on failure.
167 | **/
168 | int ffuzzy_compare_digest(const ffuzzy_digest *d1, const ffuzzy_digest *d2);
169 |
170 | /**
171 | \fn int ffuzzy_compare(const char*, const char*)
172 | \brief Compute similarity score for given ssdeep hash strings
173 | \param [in] str1 ssdeep hash 1
174 | \param [in] str2 ssdeep hash 2
175 | \return [0,100] values represent similarity score or negative values on failure.
176 | **/
177 | int ffuzzy_compare(const char *str1, const char *str2);
178 |
179 | /** \} **/
180 |
181 |
182 |
183 | /**
184 | \name Optimized / Specialized Comparison
185 | \{
186 | **/
187 |
188 | /**
189 | \fn int ffuzzy_compare_digest_near(const ffuzzy_digest*, const ffuzzy_digest*)
190 | \brief Compare two fuzzy hashes assuming two block sizes of given hashes are "near"
191 | \details
192 | In this context, "near" means the two block sizes are equal or
193 | one of the block sizes is twice the other.
194 |
195 | This function assumes the two block sizes are "near"
196 | (ffuzzy_blocksize_is_near on the two block sizes returns true),
197 | which makes the computation slightly faster.
198 | \param [in] d1 Valid digest 1
199 | \param [in] d2 Valid digest 2
200 | \return [0,100] values represent similarity score or negative values on failure.
201 | **/
202 | int ffuzzy_compare_digest_near(const ffuzzy_digest *d1, const ffuzzy_digest *d2);
203 |
204 | /**
205 | \fn int ffuzzy_compare_digest_near_eq(const ffuzzy_digest*, const ffuzzy_digest*)
206 | \brief Compare two fuzzy hashes assuming the two block sizes are the same
207 | \details
208 | This function assumes the two block sizes are the same.
209 | \param [in] d1 Valid digest 1 (with same block size as d2)
210 | \param [in] d2 Valid digest 2 (with same block size as d1)
211 | \return [0,100] values represent similarity score or negative values on failure.
212 | \see int ffuzzy_compare_digest_near(const ffuzzy_digest*, const ffuzzy_digest*)
213 | **/
214 | int ffuzzy_compare_digest_near_eq(const ffuzzy_digest *d1, const ffuzzy_digest *d2);
215 |
216 | /**
217 | \fn int ffuzzy_compare_digest_near_lt(const ffuzzy_digest*, const ffuzzy_digest*)
218 | \brief Compare two fuzzy hashes assuming the second block size is double the first one
219 | \details
220 | This function assumes the second block size is double the first one.
221 | \param [in] d1 Valid digest 1
222 | \param [in] d2 Valid digest 2 (with double the block size of d1)
223 | \return [0,100] values represent similarity score or negative values on failure.
224 | \see int ffuzzy_compare_digest_near(const ffuzzy_digest*, const ffuzzy_digest*)
225 | **/
226 | int ffuzzy_compare_digest_near_lt(const ffuzzy_digest *d1, const ffuzzy_digest *d2);
227 |
228 | /** \} **/
229 |
230 |
231 |
232 | /**
233 | \name Block Size Utilities
234 | \{
235 | **/
236 |
237 | /**
238 | \fn bool ffuzzy_blocksize_is_valid(unsigned long)
239 | \brief Determines whether given block size is valid to use in libffuzzy
240 | \details
241 | Use of this function is now deprecated.
242 | libffuzzy now accepts all unsigned long values to compare.
243 | \param block_size Block size (which may not be valid)
244 | \return true if the given block size is valid; false otherwise.
245 | **/
246 | bool ffuzzy_blocksize_is_valid(unsigned long block_size);
247 |
248 | /**
249 | \fn bool ffuzzy_blocksize_is_natural(unsigned long)
250 | \brief Determines whether given block size is "natural"
251 | \details
252 | In this context, "natural" means given parameter of fuzzy hash
253 | may be generated by ssdeep or its backend, libfuzzy.
254 | Depending on the job, handling only "natural" digests
255 | may make your program efficient.
256 |
257 | This function not only checks whether the block size is valid,
258 | but also checks that the given size is a product of FFUZZY_MIN_BLOCKSIZE
259 | and a power of two.
260 | \param block_size Block size (which may not be valid or "natural")
261 | \return true if the given block size is valid and "natural"; false otherwise.
262 | **/
263 | bool ffuzzy_blocksize_is_natural(unsigned long block_size);
264 |
265 | /**
266 | \fn bool ffuzzy_blocksize_is_near(unsigned long, unsigned long)
267 | \brief Determines whether given block sizes are "near"
268 | \details
269 | In this context, "near" means the two block sizes are equal or
270 | one of the block sizes is twice the other.
271 |
272 | This function determines whether given block sizes are "near".
273 | If this function returns true, it is safe to use ffuzzy_compare_digest_near
274 | function for two digests which have given block sizes.
275 | \param block_size1 Valid block size 1
276 | \param block_size2 Valid block size 2
277 | \return true if the given block sizes are "near"; false otherwise.
278 | **/
279 | bool ffuzzy_blocksize_is_near(unsigned long block_size1, unsigned long block_size2);
280 |
281 | /**
282 | \fn bool ffuzzy_blocksize_is_far_le(unsigned long, unsigned long)
283 | \brief Determines whether given ordered block sizes are "far" enough
284 | \details
285 | In this context, "far" means
286 | the second block size is greater than double of the first block size.
287 |
288 | For block size-sorted digests, "far" means there are no
289 | subsequent entries which will match.
290 |
291 | This function determines whether given block sizes are "far".
292 |
293 | You may want to inline or reimplement this because
294 | this function is very simple. Nothing prevents you from doing that.
295 | \param block_size1 Valid block size 1
296 | \param block_size2 Valid block size 2 (must be equal to or greater than block_size1)
297 | \return true if the given block sizes are "far"; false otherwise.
298 | **/
299 | bool ffuzzy_blocksize_is_far_le(unsigned long block_size1, unsigned long block_size2);
300 |
301 | /** \} **/
302 |
303 |
304 |
305 | /**
306 | \name Digest Utilities
307 | \{
308 | **/
309 |
310 | /**
311 | \fn bool ffuzzy_digest_is_valid_lengths(const ffuzzy_digest*)
312 | \brief Determines whether block lengths of given digest are valid
313 | \param [in] digest Digest (which may not be valid)
314 | \return true if values of ffuzzy_digest::len1 and ffuzzy_digest::len2 are valid.
315 | **/
316 | bool ffuzzy_digest_is_valid_lengths(const ffuzzy_digest *digest);
317 |
318 | /**
319 | \fn bool ffuzzy_digest_is_valid_buffer(const ffuzzy_digest*)
320 | \brief Determines whether digest blocks are valid
321 | \details
322 | This function determines whether there are no sequences
323 | which consist of four or more identical characters.
324 |
325 | This function needs valid digest block lengths.
326 | If digest block lengths are not guaranteed to be valid,
327 | use ffuzzy_digest_is_valid_lengths first.
328 |
329 | You will not need to use this function if you use ffuzzy_read_digest function
330 | because it always returns valid digests on success.
331 | \param [in] digest Digest (which may not be valid but block lengths are valid)
332 | \return true if the digest blocks are valid; false otherwise.
333 | **/
334 | bool ffuzzy_digest_is_valid_buffer(const ffuzzy_digest *digest);
335 |
336 | /**
337 | \fn bool ffuzzy_digest_is_natural_buffer(const ffuzzy_digest*)
338 | \brief Determines whether digest blocks are valid and "natural"
339 | \details
340 | This function determines whether valid range of ffuzzy_digest::digest
341 | values consist of base64 characters (in other words, "natural").
342 |
343 | This function needs valid digest block lengths.
344 | If digest block lengths are not guaranteed to be valid,
345 | use ffuzzy_digest_is_valid_lengths first.
346 |
347 | You may need to use this function even after a successful call to
348 | ffuzzy_read_digest because ffuzzy_read_digest is not guaranteed to set
349 | digests with "natural" digest blocks.
350 |
351 | However, if you are just comparing, this check is not necessary because
352 | fuzzy hash comparison will not decode base64 characters (it just "compares").
353 |
354 | You will need this function ONLY if you need to verify
355 | whether given digest is truly "natural".
356 | \param [in] digest Digest (which may not be valid or natural but block lengths are valid)
357 | \return true if the digest blocks are valid and "natural"; false otherwise.
358 | **/
359 | bool ffuzzy_digest_is_natural_buffer(const ffuzzy_digest *digest);
360 |
361 | /**
362 | \fn bool ffuzzy_digest_is_valid(const ffuzzy_digest*)
363 | \brief Determines whether given digest is valid
364 | \param [in] digest Digest (which may not be valid)
365 | \return true if the digest is valid; false otherwise.
366 | **/
367 | bool ffuzzy_digest_is_valid(const ffuzzy_digest *digest);
368 |
369 | /**
370 | \fn bool ffuzzy_digest_is_natural(const ffuzzy_digest*)
371 | \brief Determines whether given digest is valid and "natural"
372 | \param [in] digest Digest (which may not be valid or natural)
373 | \return true if the digest is valid and natural; false otherwise.
374 | **/
375 | bool ffuzzy_digest_is_natural(const ffuzzy_digest *digest);
376 |
377 | /**
378 | \fn int ffuzzy_digestcmp(const ffuzzy_digest*, const ffuzzy_digest*)
379 | \brief Compare two ffuzzy_digest values
380 | \details
381 | This comparison has priorities.
382 |
383 | 1. Compare block sizes.
384 | 2. Compare block lengths of the first block.
385 | 3. Compare block lengths of the second block.
386 | 4. Compare block buffer contents (first and second).
387 |
388 | \param [in] d1 Valid digest 1
389 | \param [in] d2 Valid digest 2
390 | \return
391 | Positive value if d1 < d2, negative value if d2 > d1
392 | and 0 if d1 is equal to d2.
393 | **/
394 | int ffuzzy_digestcmp(const ffuzzy_digest *d1, const ffuzzy_digest *d2);
395 |
396 | /**
397 | \fn int ffuzzy_digestcmp_blocksize(const ffuzzy_digest*, const ffuzzy_digest*)
398 | \brief Compare two ffuzzy_digest values by block sizes
399 | \param [in] d1 Valid digest 1
400 | \param [in] d2 Valid digest 2
401 | \return
402 | Positive value if d1 < d2, negative value if d2 > d1
403 | and 0 if block size of d1 is equal to d2.
404 | \see int ffuzzy_digestcmp(const ffuzzy_digest*, const ffuzzy_digest*)
405 | **/
406 | int ffuzzy_digestcmp_blocksize(const ffuzzy_digest *d1, const ffuzzy_digest *d2);
407 |
408 | /**
409 | \fn int ffuzzy_digestcmp_blocksize_n(const ffuzzy_digest*, const ffuzzy_digest*)
410 | \brief Compare two ffuzzy_digest values by whether block sizes are "natural" and block size values
411 | \details
412 | This comparison has priorities.
413 |
414 | 1. Compare whether block sizes are "natural" (for ffuzzy_blocksize_is_natural return value, true comes first)
415 | 2. Compare block sizes.
416 |
417 | \param [in] d1 Valid digest 1
418 | \param [in] d2 Valid digest 2
419 | \return
420 | Positive value if d1 < d2, negative value if d2 > d1
421 | and 0 if block size of d1 is equal to d2.
422 | \see bool ffuzzy_blocksize_is_natural(unsigned long)
423 | \see int ffuzzy_digestcmp(const ffuzzy_digest*, const ffuzzy_digest*)
424 | **/
425 | int ffuzzy_digestcmp_blocksize_n(const ffuzzy_digest *d1, const ffuzzy_digest *d2);
426 |
427 | /**
428 | \fn bool ffuzzy_pretty_digest(char*, size_t, const ffuzzy_digest*)
429 | \brief Convert ffuzzy_digest to the string
430 | \param [out] buf Buffer to store string
431 | \param buflen Size of buf
432 | \param [in] digest A valid digest to convert
433 | \return true if succeeds; false otherwise.
434 | **/
435 | bool ffuzzy_pretty_digest(char *buf, size_t buflen, const ffuzzy_digest *digest);
436 |
437 | /** \} **/
438 |
439 |
440 |
441 | /**
442 | \name Unnormalized Digests
443 | \{
444 | **/
445 |
446 | /**
447 |
448 | \struct ffuzzy_udigest
449 | \brief The type to store unnormalized ssdeep digest after parsing.
450 | \details
451 | Unlike ffuzzy_digest type, this type (actually, identical to
452 | ffuzzy_digest) may include sequences of four or more identical
453 | characters. fuzzy_digest function (by default) does not eliminate
454 | such sequences and this type allows preserving such sequences.
455 |
456 | This type is easily convertible to ffuzzy_digest.
457 |
458 | \var ffuzzy_udigest::len1
459 | \brief Digest length for first block of the digest.
460 | \see ffuzzy_digest::len1
461 |
462 | \var ffuzzy_udigest::len2
463 | \brief Digest length for second block of the digest.
464 | \see ffuzzy_digest::len2
465 |
466 | \var ffuzzy_udigest::block_size
467 | \brief Block size of the ssdeep digest.
468 | \see ffuzzy_digest::block_size
469 |
470 | \var ffuzzy_udigest::digest
471 | \brief Digest buffer for both blocks of the unnormalized digest.
472 | \details
473 | This buffer is very similar to ffuzzy_digest::digest but
474 | allows long sequences of identical characters.
475 | \see ffuzzy_digest::digest
476 |
477 | **/
478 | typedef struct
479 | {
480 | size_t len1, len2; /* lengths of the first and second digest blocks */
481 | unsigned long block_size; /* block size of the ssdeep digest */
482 | char digest[FFUZZY_SPAMSUM_LENGTH * 2]; /* both blocks, unnormalized: long runs of identical characters are allowed */
483 | } ffuzzy_udigest;
484 |
485 |
486 | /**
487 | \fn bool ffuzzy_read_udigest(ffuzzy_udigest*, const char*)
488 | \brief Read unnormalized ssdeep digest from the string
489 | \details
490 | On success, this function always stores a valid, unnormalized digest.
491 | \param [out] udigest The pointer to the buffer to store valid unnormalized digest after parsing.
492 | \param [in] s The string which contains a ssdeep digest.
493 | \return true if succeeds; false otherwise.
494 | \see ffuzzy_udigest
495 | **/
496 | bool ffuzzy_read_udigest(ffuzzy_udigest *udigest, const char *s);
497 |
498 | /**
499 | \fn bool ffuzzy_udigest_is_valid_lengths(const ffuzzy_udigest*)
500 | \brief Determines whether block lengths of given digest are valid
501 | \param [in] udigest Unnormalized digest (which may not be valid)
502 | \return true if values of ffuzzy_udigest::len1 and ffuzzy_udigest::len2 are valid.
503 | **/
504 | bool ffuzzy_udigest_is_valid_lengths(const ffuzzy_udigest *udigest);
505 |
506 | /**
507 | \fn bool ffuzzy_udigest_is_natural_buffer(const ffuzzy_udigest*)
508 | \brief Determines whether digest blocks are "natural"
509 | \details
510 | This function determines whether valid range of ffuzzy_udigest::digest
511 | values consist of base64 characters (in other words, "natural").
512 |
513 | This function needs valid digest block lengths.
514 | If digest block lengths are not guaranteed to be valid,
515 | use ffuzzy_udigest_is_valid_lengths first.
516 |
517 | You will need this function ONLY if you need to verify
518 | whether given digest is truly "natural".
519 | \param [in] udigest Unnormalized digest (which may not be natural but block lengths are valid)
520 | \return true if the digest blocks are "natural"; false otherwise.
521 | **/
522 | bool ffuzzy_udigest_is_natural_buffer(const ffuzzy_udigest *udigest);
523 |
524 | /**
525 | \fn bool ffuzzy_udigest_is_valid(const ffuzzy_udigest*)
526 | \brief Determines whether given digest is valid
527 | \param [in] udigest Unnormalized digest (which may not be valid)
528 | \return true if the digest is valid; false otherwise.
529 | **/
530 | bool ffuzzy_udigest_is_valid(const ffuzzy_udigest *udigest);
531 |
532 | /**
533 | \fn bool ffuzzy_udigest_is_natural(const ffuzzy_udigest*)
534 | \brief Determines whether given digest is valid and "natural"
535 | \param [in] udigest Unnormalized digest (which may not be valid or natural)
536 | \return true if the digest is valid and "natural"; false otherwise.
537 | **/
538 | bool ffuzzy_udigest_is_natural(const ffuzzy_udigest *udigest);
539 |
540 | /**
541 | \fn int ffuzzy_udigestcmp(const ffuzzy_udigest*, const ffuzzy_udigest*)
542 | \brief Compare two ffuzzy_udigest values
543 | \details
544 | This comparison has priorities.
545 |
546 | 1. Compare block sizes.
547 | 2. Compare block lengths of the first block.
548 | 3. Compare block lengths of the second block.
549 | 4. Compare block buffer contents (first and second).
550 |
551 | \param [in] d1 Valid digest 1
552 | \param [in] d2 Valid digest 2
553 | \return
554 | Positive value if d1 < d2, negative value if d2 > d1
555 | and 0 if d1 is equal to d2.
556 | **/
557 | int ffuzzy_udigestcmp(const ffuzzy_udigest *d1, const ffuzzy_udigest *d2);
558 |
559 | /**
560 | \fn int ffuzzy_udigestcmp_blocksize(const ffuzzy_udigest*, const ffuzzy_udigest*)
561 | \brief Compare two ffuzzy_udigest values by block sizes
562 | \param [in] d1 Valid digest 1
563 | \param [in] d2 Valid digest 2
564 | \return
565 | Positive value if d1 < d2, negative value if d2 > d1
566 | and 0 if block size of d1 is equal to d2.
567 | \see int ffuzzy_udigestcmp(const ffuzzy_udigest*, const ffuzzy_udigest*)
568 | **/
569 | int ffuzzy_udigestcmp_blocksize(const ffuzzy_udigest *d1, const ffuzzy_udigest *d2);
570 |
571 | /**
572 | \fn int ffuzzy_udigestcmp_blocksize_n(const ffuzzy_udigest*, const ffuzzy_udigest*)
573 | \brief Compare two ffuzzy_udigest values by whether block sizes are "natural" and block size values
574 | \details
575 | This comparison has priorities.
576 |
577 | 1. Compare whether block sizes are "natural" (for ffuzzy_blocksize_is_natural return value, true comes first)
578 | 2. Compare block sizes.
579 |
580 | \param [in] d1 Valid digest 1
581 | \param [in] d2 Valid digest 2
582 | \return
583 | Positive value if d1 < d2, negative value if d2 > d1
584 | and 0 if block size of d1 is equal to d2.
585 | \see bool ffuzzy_blocksize_is_natural(unsigned long)
586 | \see int ffuzzy_udigestcmp(const ffuzzy_udigest*, const ffuzzy_udigest*)
587 | **/
588 | int ffuzzy_udigestcmp_blocksize_n(const ffuzzy_udigest *d1, const ffuzzy_udigest *d2);
589 |
590 | /**
591 | \fn bool ffuzzy_pretty_udigest(char*, size_t, const ffuzzy_udigest*)
592 | \brief Convert ffuzzy_udigest to the string
593 | \param [out] buf Buffer to store string
594 | \param buflen Size of buf
595 | \param [in] udigest A valid digest to convert
596 | \return true if succeeds; false otherwise.
597 | **/
598 | bool ffuzzy_pretty_udigest(char *buf, size_t buflen, const ffuzzy_udigest *udigest);
599 |
600 |
601 | /**
602 | \fn void ffuzzy_convert_digest_to_udigest(ffuzzy_udigest*, const ffuzzy_digest*)
603 | \brief Convert ffuzzy_digest to ffuzzy_udigest
604 | \param [out] udigest The pointer to buffer to the unnormalized digest
605 | \param [in] digest The pointer to the valid digest
606 | **/
607 | void ffuzzy_convert_digest_to_udigest(ffuzzy_udigest *udigest, const ffuzzy_digest *digest);
608 |
609 | /**
610 | \fn void ffuzzy_convert_udigest_to_digest(ffuzzy_digest*, const ffuzzy_udigest*)
611 | \brief Convert ffuzzy_udigest to ffuzzy_digest
612 | \param [out] digest The pointer to buffer to the normalized digest
613 | \param [in] udigest The pointer to the valid and unnormalized digest
614 | **/
615 | void ffuzzy_convert_udigest_to_digest(ffuzzy_digest *digest, const ffuzzy_udigest *udigest);
616 |
617 | /** \} **/
618 |
619 |
620 |
621 | /**
622 | \name Internal Comparison Utilities
623 | \{
624 | **/
625 |
626 | /**
627 | \fn int ffuzzy_score_cap(int, int, unsigned long)
628 | \brief Retrieve score cap for given block lengths and the block size
629 | \details
630 | The (partial) similarity score is capped when the block is short
631 | and the block size is small, to avoid exaggerating the match.
632 | This function returns this score cap for given block lengths and the block size.
633 | \param s1len Length of block 1
634 | \param s2len Length of block 2
635 | \param block_size Block size
636 | \return
637 | Maximum (partial) similarity score value.
638 | If the return value is greater than 100, the score cap is 100.
639 |
640 | If s1len or s2len is out of range [0,FFUZZY_SPAMSUM_LENGTH], the value is undefined.
641 | **/
642 | int ffuzzy_score_cap(int s1len, int s2len, unsigned long block_size);
643 |
644 | /**
645 | \fn int ffuzzy_score_cap_1(int, unsigned long)
646 | \brief Retrieve score cap for given block length and size
647 | \details
648 | ffuzzy_score_cap function computes the score cap by
649 | the block size and "minimum" length of the given blocks.
650 | This function exposes internal interface of ffuzzy_score_cap.
651 | \param minslen Minimum length of the blocks
652 | \param block_size Block size
653 | \return
654 | Maximum (partial) similarity score value.
655 | If the return value is greater than 100, the score cap is 100.
656 |
657 | If minslen is out of range [0,FFUZZY_SPAMSUM_LENGTH], the value is undefined.
658 | **/
659 | int ffuzzy_score_cap_1(int minslen, unsigned long block_size);
660 |
661 | /**
662 | \fn int ffuzzy_score_strings(const char*, size_t, const char*, size_t, unsigned long)
663 | \brief Compute partial similarity score for given two block strings and block size
664 | \details
665 | In the fuzzy computation, the digest block of the
666 | same block sizes are selected to compare.
667 | This is the internal interface for ffuzzy_compare and ffuzzy_compare_digest.
668 | \param [in] s1 Digest block 1
669 | \param s1len Length of s1
670 | \param [in] s2 Digest block 2
671 | \param s2len Length of s2
672 | \param block_size Block size for two digest blocks
673 | \return [0,100] values represent partial similarity score or negative values on failure.
674 | **/
675 | int ffuzzy_score_strings(
676 | const char *s1, size_t s1len,
677 | const char *s2, size_t s2len,
678 | unsigned long block_size
679 | );
680 |
681 | /** \} **/
682 |
683 |
684 |
685 | #ifdef __cplusplus
686 | }
687 | #endif
688 |
689 | #endif
690 |
--------------------------------------------------------------------------------
/ffuzzy_blocksize.c:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | libffuzzy : Fast ssdeep comparison library
4 |
5 | ffuzzy_blocksize.c
6 | Block size utility for fuzzy hashes
7 |
8 |
9 | Copyright (C) 2014 Tsukasa OI
10 |
11 |
12 | This program is free software; you can redistribute it and/or modify
13 | it under the terms of the GNU General Public License as published by
14 | the Free Software Foundation; either version 2 of the License, or
15 | (at your option) any later version.
16 |
17 | This program is distributed in the hope that it will be useful,
18 | but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 | GNU General Public License for more details.
21 |
22 | You should have received a copy of the GNU General Public License
23 | along with this program. If not, see <http://www.gnu.org/licenses/>.
24 |
25 | */
26 |
27 | /**
28 | \internal
29 | \file ffuzzy_blocksize.c
30 | \brief Block size utility for fuzzy hashes
31 | **/
32 |
33 | #include "ffuzzy_config.h"
34 |
35 | #include
36 | #include
37 | #include
38 |
39 | #include "ffuzzy.h"
40 | #include "ffuzzy_blocksize.h"
41 |
42 |
43 | bool ffuzzy_blocksize_is_valid(unsigned long block_size)
44 | {
45 | return true; // block_size is never inspected: every unsigned long value is reported as valid
46 | }
47 |
48 | bool ffuzzy_blocksize_is_natural(unsigned long block_size)
49 | {
50 | return ffuzzy_blocksize_is_natural_(block_size); // non-inline entry point; forwards to the static inline helper
51 | }
52 |
53 | bool ffuzzy_blocksize_is_near(unsigned long block_size1, unsigned long block_size2)
54 | {
55 | return ffuzzy_blocksize_is_near_(block_size1, block_size2); // non-inline entry point; forwards to the static inline helper
56 | }
57 |
58 | bool ffuzzy_blocksize_is_far_le(unsigned long block_size1, unsigned long block_size2)
59 | {
60 |     assert(block_size1 <= block_size2);
61 |     // block_size1 * 2 would overflow: report every block_size2 other than block_size1 itself as far
62 |     if (block_size1 > (ULONG_MAX / 2))
63 |         return block_size1 != block_size2;
64 |     return block_size1 * 2 < block_size2;
65 | }
66 |
--------------------------------------------------------------------------------
/ffuzzy_blocksize.h:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | libffuzzy : Fast ssdeep comparison library
4 |
5 | ffuzzy_blocksize.h
6 | Block size utility for fuzzy hashes (internal)
7 |
8 |
9 | Copyright (C) 2014 Tsukasa OI
10 |
11 |
12 | This program is free software; you can redistribute it and/or modify
13 | it under the terms of the GNU General Public License as published by
14 | the Free Software Foundation; either version 2 of the License, or
15 | (at your option) any later version.
16 |
17 | This program is distributed in the hope that it will be useful,
18 | but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 | GNU General Public License for more details.
21 |
22 | You should have received a copy of the GNU General Public License
23 | along with this program. If not, see <http://www.gnu.org/licenses/>.
24 |
25 | */
26 | #ifndef FFUZZY_FFUZZY_BLOCKSIZE_H
27 | #define FFUZZY_FFUZZY_BLOCKSIZE_H
28 |
29 | /**
30 | \internal
31 | \file ffuzzy_blocksize.h
32 | \brief Block size utility for fuzzy hashes
33 | **/
34 |
35 | #include "ffuzzy_config.h"
36 |
37 | #include
38 | #include
39 | #include
40 |
41 | #include "ffuzzy.h"
42 |
43 | /**
44 | \internal
45 | \fn bool ffuzzy_blocksize_is_natural_(unsigned long)
46 | \see bool ffuzzy_blocksize_is_natural(unsigned long)
47 | **/
48 | static inline bool ffuzzy_blocksize_is_natural_(unsigned long block_size)
49 | {
50 |     // anything outside [FFUZZY_MIN_BLOCKSIZE, FFUZZY_MAX_BLOCKSIZE] is not natural
51 |     if (block_size < FFUZZY_MIN_BLOCKSIZE || block_size > FFUZZY_MAX_BLOCKSIZE)
52 |         return false;
53 |     // halve while the value is even and has not reached the minimum yet;
54 |     // natural sizes are exactly those that end up at FFUZZY_MIN_BLOCKSIZE
55 |     while ((block_size & 1ul) == 0 && block_size != FFUZZY_MIN_BLOCKSIZE)
56 |         block_size /= 2;
57 |     return block_size == FFUZZY_MIN_BLOCKSIZE;
58 | }
58 |
59 | /**
60 | \internal
61 | \fn bool ffuzzy_blocksize_is_near_(unsigned long, unsigned long)
62 | \see bool ffuzzy_blocksize_is_near(unsigned long, unsigned long)
63 | **/
64 | static inline bool ffuzzy_blocksize_is_near_(unsigned long block_size1, unsigned long block_size2)
65 | {
66 |     // near means: equal, or one block size is exactly twice the other; both operands are
67 |     // unsigned long so that a large block size is never truncated before it is compared
68 |     return block_size1 == block_size2 ||
69 |         (block_size1 <= (ULONG_MAX / 2) && block_size1 * 2 == block_size2) || // doubling must not overflow
70 |         (!(block_size1 & 1ul) && block_size1 / 2 == block_size2); // an odd size has no exact half
71 | }
72 |
73 |
74 | /**
75 | \internal
76 | \fn int ffuzzy_blocksizecmp(unsigned long, unsigned long)
77 | \brief Three-way comparison of two block size values
78 | \param block_size1 Block size 1
79 | \param block_size2 Block size 2
80 | \return
81 | +1 if block_size1 > block_size2, -1 if block_size1 < block_size2
82 | and 0 if block_size1 is equal to block_size2.
83 | **/
84 | static inline int ffuzzy_blocksizecmp(unsigned long block_size1, unsigned long block_size2)
85 | {
86 |     // equal sizes compare as 0
87 |     if (block_size1 == block_size2)
88 |         return 0;
89 |     // the sizes differ here, so exactly one strict ordering holds
90 |     return (block_size1 < block_size2) ? -1 : +1;
91 | }
92 |
93 | #endif
94 |
--------------------------------------------------------------------------------
/ffuzzy_compare.c:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | libffuzzy : Fast ssdeep comparison library
4 |
5 | ffuzzy_compare.c
6 | Fuzzy hash comparison implementation
7 |
8 |
9 | CREDITS OF ORIGINAL VERSION OF SSDEEP
10 |
11 | Copyright (C) 2002 Andrew Tridgell
12 | Copyright (C) 2006 ManTech International Corporation
13 | Copyright (C) 2013 Helmut Grohne
14 |
15 | This program is free software; you can redistribute it and/or modify
16 | it under the terms of the GNU General Public License as published by
17 | the Free Software Foundation; either version 2 of the License, or
18 | (at your option) any later version.
19 |
20 | This program is distributed in the hope that it will be useful,
21 | but WITHOUT ANY WARRANTY; without even the implied warranty of
22 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 | GNU General Public License for more details.
24 |
25 | You should have received a copy of the GNU General Public License
26 | along with this program; if not, write to the Free Software
27 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 |
29 |
30 | CREDIT OF MODIFIED PORTIONS
31 |
32 | Copyright (C) 2014 Tsukasa OI
33 |
34 | */
35 |
36 | /**
37 | \internal
38 | \file ffuzzy_compare.c
39 | \brief Fuzzy hash comparison implementation
40 | **/
41 |
42 | #include "ffuzzy_config.h"
43 |
44 | #include
45 | #include
46 | #include
47 | #include
48 | #include
49 | #include
50 | #include
51 | #include
52 |
53 | #include "ffuzzy.h"
54 | #include "ffuzzy_blocksize.h"
55 | #include "ffuzzy_parse.h"
56 |
57 | #include "str_common_substr.h"
58 | #include "str_edit_dist.h"
59 | #include "util.h"
60 |
61 | #if FFUZZY_SPAMSUM_LENGTH > EDIT_DISTN_MAXLEN
62 | #error EDIT_DISTN_MAXLEN must be large enough to contain FFUZZY_SPAMSUM_LENGTH string
63 | #endif
64 | #if FFUZZY_SPAMSUM_LENGTH > HAS_COMMON_SUBSTR_MAXLEN
65 | #error HAS_COMMON_SUBSTR_MAXLEN must be large enough to contain FFUZZY_SPAMSUM_LENGTH string
66 | #endif
67 |
68 |
69 | /**
70 | \internal
71 | \fn int ffuzzy_score_cap_1_(int, unsigned long)
72 | \see int ffuzzy_score_cap_1(int, unsigned long)
73 | **/
74 | static inline int ffuzzy_score_cap_1_(int minslen, unsigned long block_size)
75 | {
76 |     assert(minslen > 0 && minslen <= FFUZZY_SPAMSUM_LENGTH); // a full-length block (FFUZZY_SPAMSUM_LENGTH) is a legal input
77 |     if (block_size >= FFUZZY_MIN_BLOCKSIZE * 100)
78 |         return 100; // from this block size on the formula below would yield at least 100 anyway
79 |     return (int)block_size / FFUZZY_MIN_BLOCKSIZE * minslen; // integer division happens before the multiplication
80 | }
81 |
82 |
83 | inline int ffuzzy_score_cap_1(int minslen, unsigned long block_size)
84 | {
85 |     // an empty block caps the score at 0; this case is answered here because
86 |     // the internal helper asserts that minslen is positive
87 |     return (minslen == 0) ? 0 : ffuzzy_score_cap_1_(minslen, block_size);
88 | }
89 |
90 |
91 | int ffuzzy_score_cap(int s1len, int s2len, unsigned long block_size)
92 | {
93 | return ffuzzy_score_cap_1(MIN(s1len, s2len), block_size); // only the shorter of the two block lengths enters the cap
94 | }
95 |
96 |
97 | /**
98 | \internal
99 | \fn int ffuzzy_score_strings_unsafe(const char*, size_t, const char*, size_t, unsigned long)
100 | \brief Compute partial similarity score for given two block strings and block size (unsafe version: the lengths are not range-checked here)
101 | \param [in] s1 Digest block 1
102 | \param s1len Length of s1
103 | \param [in] s2 Digest block 2
104 | \param s2len Length of s2
105 | \param block_size Block size for two digest blocks
106 | \return [0,100] values represent partial similarity score or negative values on failure.
107 | \see ffuzzy_score_strings(const char*, size_t, const char*, size_t, unsigned long)
108 | **/
109 | static inline int ffuzzy_score_strings_unsafe(
110 | const char *s1, size_t s1len,
111 | const char *s2, size_t s2len,
112 | unsigned long block_size
113 | )
114 | {
115 | // the two strings must have a common substring
116 | // of length FFUZZY_MIN_MATCH to be candidates
117 | if (!has_common_substring(s1, s1len, s2, s2len))
118 | return 0;
119 | // compute the score by scaling edit distance by
120 | // the lengths of the two strings, and then
121 | // scale it to [0,100] scale (0 is the worst match)
122 | int score = edit_distn_norm(s1, s1len, s2, s2len) * FFUZZY_SPAMSUM_LENGTH / ((int)s1len + (int)s2len); // integer arithmetic: the division truncates
123 | score = 100 - (100 * score) / FFUZZY_SPAMSUM_LENGTH;
124 | // when the blocksize is small we don't want to exaggerate the match size
125 | if (block_size >= FFUZZY_MIN_BLOCKSIZE * 100)
126 | {
127 | // large block sizes are not capped; returning here also keeps the capping arithmetic below away from large values
128 | return score;
129 | }
130 | int score_cap = (int)block_size / FFUZZY_MIN_BLOCKSIZE * MIN((int)s1len, (int)s2len); // same formula as ffuzzy_score_cap_1_
131 | return MIN(score, score_cap);
132 | }
133 |
134 |
135 | int ffuzzy_score_strings(
136 |     const char *s1, size_t s1len,
137 |     const char *s2, size_t s2len,
138 |     unsigned long block_size
139 | )
140 | {
141 |     // cannot score long signatures: a block over FFUZZY_SPAMSUM_LENGTH yields 0
142 |     const bool too_long = s1len > FFUZZY_SPAMSUM_LENGTH || s2len > FFUZZY_SPAMSUM_LENGTH;
143 |     // once the lengths are known to be in range the unchecked scorer may be used
144 |     return too_long ? 0 : ffuzzy_score_strings_unsafe(s1, s1len, s2, s2len, block_size);
145 | }
146 |
147 |
148 | inline int ffuzzy_compare_digest_near(const ffuzzy_digest *d1, const ffuzzy_digest *d2)
149 | {
150 | assert(ffuzzy_blocksize_is_near_(d1->block_size, d2->block_size)); // precondition: block sizes are equal or differ by a factor of two
151 | assert(ffuzzy_digest_is_valid(d1));
152 | assert(ffuzzy_digest_is_valid(d2));
153 | // special case if two signatures are identical
154 | if (
155 | d1->block_size == d2->block_size &&
156 | d1->len1 == d2->len1 &&
157 | d1->len2 == d2->len2 &&
158 | !memcmp(d1->digest, d2->digest, d1->len1 + d1->len2) // both blocks are stored back to back in digest[]
159 | )
160 | {
161 | // cap scores (same as ffuzzy_score_strings)
162 | int score_cap;
163 | if (d1->len2 >= FFUZZY_MIN_MATCH) // a block shorter than FFUZZY_MIN_MATCH contributes no score
164 | {
165 | if (d1->block_size > FFUZZY_MIN_BLOCKSIZE * 50) // the doubled block size used for block 2 is then past the capping range
166 | return 100;
167 | score_cap = ffuzzy_score_cap_1_((int)d1->len2, d1->block_size * 2); // block 2 is scored at twice the block size
168 | if (score_cap >= 100)
169 | return 100;
170 | }
171 | else
172 | score_cap = 0;
173 | if (d1->len1 >= FFUZZY_MIN_MATCH)
174 | {
175 | int tmp = ffuzzy_score_cap_1_((int)d1->len1, d1->block_size);
176 | score_cap = MAX(score_cap, tmp); // the better of the two block caps is the result
177 | }
178 | return MIN(100, score_cap);
179 | }
180 | // each signature has a string for two block sizes. We now
181 | // choose how to combine the two block sizes. We checked above
182 | // that they have at least one block size in common
183 | if (d1->block_size <= (ULONG_MAX / 2)) // d1->block_size * 2 cannot overflow in this branch
184 | {
185 | if (d1->block_size == d2->block_size)
186 | {
187 | int score1 = ffuzzy_score_strings_unsafe(d1->digest, d1->len1, d2->digest, d2->len1, d1->block_size);
188 | int score2 = ffuzzy_score_strings_unsafe(d1->digest+d1->len1, d1->len2, d2->digest+d2->len1, d2->len2, d1->block_size * 2);
189 | return MAX(score1, score2);
190 | }
191 | else if (d1->block_size * 2 == d2->block_size) // d1's second block and d2's first block share a block size
192 | return ffuzzy_score_strings_unsafe(d1->digest + d1->len1, d1->len2, d2->digest, d2->len1, d2->block_size);
193 | else // remaining near case: d1's first block and d2's second block share a block size
194 | return ffuzzy_score_strings_unsafe(d1->digest, d1->len1, d2->digest + d2->len1, d2->len2, d1->block_size);
195 | }
196 | else // d1->block_size * 2 is not representable as unsigned long
197 | {
198 | if (d1->block_size == d2->block_size) // second digest block is empty or invalid
199 | return ffuzzy_score_strings_unsafe(d1->digest, d1->len1, d2->digest, d2->len1, d1->block_size);
200 | else if (!(d1->block_size & 1ul) && (d1->block_size / 2 == d2->block_size))
201 | return ffuzzy_score_strings_unsafe(d1->digest, d1->len1, d2->digest + d2->len1, d2->len2, d1->block_size);
202 | else
203 | return 0;
204 | }
205 | }
206 |
207 |
208 | int ffuzzy_compare_digest_near_eq(const ffuzzy_digest *d1, const ffuzzy_digest *d2)
209 | {
210 | assert(ffuzzy_digest_is_valid(d1));
211 | assert(ffuzzy_digest_is_valid(d2));
212 | assert(d1->block_size == d2->block_size); // precondition: this variant handles equal block sizes only
213 | // special case if two signatures are identical
214 | if (
215 | d1->len1 == d2->len1 &&
216 | d1->len2 == d2->len2 &&
217 | !memcmp(d1->digest, d2->digest, d1->len1 + d1->len2) // both blocks are stored back to back in digest[]
218 | )
219 | {
220 | // cap scores (same as ffuzzy_score_strings)
221 | int score_cap;
222 | if (d1->len2 >= FFUZZY_MIN_MATCH) // a block shorter than FFUZZY_MIN_MATCH contributes no score
223 | {
224 | if (d1->block_size > FFUZZY_MIN_BLOCKSIZE * 50) // the doubled block size used for block 2 is then past the capping range
225 | return 100;
226 | score_cap = ffuzzy_score_cap_1_((int)d1->len2, d1->block_size * 2); // block 2 is scored at twice the block size
227 | if (score_cap >= 100)
228 | return 100;
229 | }
230 | else
231 | score_cap = 0;
232 | if (d1->len1 >= FFUZZY_MIN_MATCH)
233 | {
234 | int tmp = ffuzzy_score_cap_1_((int)d1->len1, d1->block_size);
235 | score_cap = MAX(score_cap, tmp); // the better of the two block caps is the result
236 | }
237 | return MIN(100, score_cap);
238 | }
239 | if (d1->block_size <= (ULONG_MAX / 2)) // d1->block_size * 2 cannot overflow in this branch
240 | {
241 | int score1 = ffuzzy_score_strings_unsafe(d1->digest, d1->len1, d2->digest, d2->len1, d1->block_size);
242 | int score2 = ffuzzy_score_strings_unsafe(d1->digest+d1->len1, d1->len2, d2->digest+d2->len1, d2->len2, d1->block_size * 2);
243 | return MAX(score1, score2);
244 | }
245 | else
246 | {
247 | // second digest block is empty or invalid
248 | return ffuzzy_score_strings_unsafe(d1->digest, d1->len1, d2->digest, d2->len1, d1->block_size);
249 | }
250 | }
251 |
252 |
253 | int ffuzzy_compare_digest_near_lt(const ffuzzy_digest *d1, const ffuzzy_digest *d2)
254 | {
255 | assert(ffuzzy_digest_is_valid(d1));
256 | assert(ffuzzy_digest_is_valid(d2));
257 | assert(d1->block_size <= (ULONG_MAX / 2)); // so that the doubling in the next assertion cannot overflow
258 | assert(d1->block_size * 2 == d2->block_size); // precondition: d2 uses exactly twice the block size of d1
259 | return ffuzzy_score_strings_unsafe(d1->digest + d1->len1, d1->len2, d2->digest, d2->len1, d2->block_size); // d1's second block against d2's first block
260 | }
261 |
262 |
263 | int ffuzzy_compare_digest(const ffuzzy_digest *d1, const ffuzzy_digest *d2)
264 | {
265 |     int score = 0; // digests whose block sizes are not close score 0 without being compared
266 |     if (ffuzzy_blocksize_is_near_(d1->block_size, d2->block_size))
267 |         score = ffuzzy_compare_digest_near(d1, d2);
268 |     return score;
269 | }
270 |
271 |
272 | int ffuzzy_compare(const char *str1, const char *str2)
273 | {
274 | ffuzzy_digest d1, d2;
275 | char *p1, *p2; // set by ffuzzy_read_digests_blocksize, then handed to ffuzzy_read_digest_after_blocksize
276 | // read blocksize part first
277 | if (!ffuzzy_read_digests_blocksize(&(d1.block_size), &p1, str1) || !ffuzzy_read_digests_blocksize(&(d2.block_size), &p2, str2))
278 | return -1; // the block size part of either string could not be read
279 | // don't compare if the blocksizes are not close.
280 | if (!ffuzzy_blocksize_is_near_(d1.block_size, d2.block_size))
281 | return 0; // note: the remaining parts of both strings are never parsed on this path
282 | // read remaining parts
283 | if (!ffuzzy_read_digest_after_blocksize(&d1, p1) || !ffuzzy_read_digest_after_blocksize(&d2, p2))
284 | return -1; // the remaining part of either string could not be read
285 | // then compare without blocksize checks
286 | return ffuzzy_compare_digest_near(&d1, &d2);
287 | }
288 |
--------------------------------------------------------------------------------
/ffuzzy_digest.c:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | libffuzzy : Fast ssdeep comparison library
4 |
5 | ffuzzy_digest.c
6 | Digest utility for fuzzy hashes
7 |
8 |
9 | Copyright (C) 2014 Tsukasa OI
10 |
11 |
12 | This program is free software; you can redistribute it and/or modify
13 | it under the terms of the GNU General Public License as published by
14 | the Free Software Foundation; either version 2 of the License, or
15 | (at your option) any later version.
16 |
17 | This program is distributed in the hope that it will be useful,
18 | but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 | GNU General Public License for more details.
21 |
22 | You should have received a copy of the GNU General Public License
23 | along with this program. If not, see <http://www.gnu.org/licenses/>.
24 |
25 | */
26 |
27 | /**
28 | \internal
29 | \file ffuzzy_digest.c
30 | \brief Digest utility for fuzzy hashes
31 | **/
32 |
33 | #include "ffuzzy_config.h"
34 |
35 | #include
36 | #include
37 | #include
38 | #include
39 |
40 | #include "ffuzzy.h"
41 | #include "ffuzzy_blocksize.h"
42 | #include "str_base64.h"
43 |
44 |
45 | inline bool ffuzzy_digest_is_valid_lengths(const ffuzzy_digest *digest)
46 | {
47 |     // each block may hold at most FFUZZY_SPAMSUM_LENGTH characters
48 |     if (digest->len1 > FFUZZY_SPAMSUM_LENGTH || digest->len2 > FFUZZY_SPAMSUM_LENGTH)
49 |         return false;
50 |     return (digest->len1 + digest->len2) <= FFUZZY_SPAMSUM_LENGTH * 2;
51 | }
52 |
53 |
54 | bool ffuzzy_digest_is_valid_buffer(const ffuzzy_digest *digest)
55 | {
56 |     assert(ffuzzy_digest_is_valid_lengths(digest));
57 |     // four identical characters in a row make a block invalid; no window may straddle the two blocks
58 |     const char *blocks[2] = { digest->digest, digest->digest + digest->len1 };
59 |     const size_t lens[2] = { digest->len1, digest->len2 };
60 |     for (int b = 0; b < 2; b++)
61 |         for (size_t i = 3; i < lens[b]; i++) // i is the last index of the four-character window
62 |             if (blocks[b][i] == blocks[b][i - 1] && blocks[b][i] == blocks[b][i - 2] && blocks[b][i] == blocks[b][i - 3])
63 |                 return false;
64 |     return true;
65 | }
66 |
67 |
68 | bool ffuzzy_digest_is_natural_buffer(const ffuzzy_digest *digest)
69 | {
70 |     assert(ffuzzy_digest_is_valid_lengths(digest));
71 |     // natural buffer: every character is a base64 character and neither
72 |     // block contains four identical characters in a row
73 |     const char *blocks[2] = { digest->digest, digest->digest + digest->len1 };
74 |     const size_t lens[2] = { digest->len1, digest->len2 };
75 |     for (int b = 0; b < 2; b++)
76 |     {
77 |         // block 2 starts right after block 1 whatever len1 is (also when len1 < 3)
78 |         const char *blk = blocks[b];
79 |         for (size_t i = 0; i < lens[b]; i++)
80 |             if (!is_base64(blk[i]) ||
81 |                 (i >= 3 && blk[i] == blk[i - 1] && blk[i] == blk[i - 2] && blk[i] == blk[i - 3]))
82 |                 return false;
83 |     }
84 |     return true;
85 | }
86 |
87 |
88 | bool ffuzzy_digest_is_valid(const ffuzzy_digest *digest)
89 | {
90 |     if (!ffuzzy_digest_is_valid_lengths(digest)) // lengths first: the buffer check asserts that they are in range
91 |         return false;
92 |     return ffuzzy_digest_is_valid_buffer(digest);
93 | }
94 |
95 |
96 | bool ffuzzy_digest_is_natural(const ffuzzy_digest *digest)
97 | {
98 |     // cheap checks first; the buffer scan runs only once block size and lengths are fine
99 |     if (!ffuzzy_blocksize_is_natural_(digest->block_size) || !ffuzzy_digest_is_valid_lengths(digest))
100 |         return false;
101 |     return ffuzzy_digest_is_natural_buffer(digest);
102 | }
103 |
104 |
105 | int ffuzzy_digestcmp(const ffuzzy_digest *d1, const ffuzzy_digest *d2)
106 | {
107 |     // ordering over digests, decided by the first key that differs:
108 |     // block size, then length of block 1, then length of block 2, then the raw bytes
109 |     if (d1->block_size != d2->block_size)
110 |         return (d1->block_size > d2->block_size) ? +1 : -1;
111 |     // same block size: the shorter first block sorts first
112 |     if (d1->len1 != d2->len1)
113 |         return (d1->len1 > d2->len1) ? +1 : -1;
114 |     // same first-block length: the shorter second block sorts first
115 |     if (d1->len2 != d2->len2)
116 |         return (d1->len2 > d2->len2) ? +1 : -1;
117 |     // all lengths agree here, so both buffers hold the same number of
118 |     // bytes and can be compared directly
119 |     return memcmp(d1->digest, d2->digest, d1->len1 + d1->len2);
120 | }
121 |
122 |
123 | int ffuzzy_digestcmp_blocksize(const ffuzzy_digest *d1, const ffuzzy_digest *d2)
124 | {
125 | return ffuzzy_blocksizecmp(d1->block_size, d2->block_size); // only block_size is read; +1, -1 or 0 as returned by ffuzzy_blocksizecmp
126 | }
127 |
128 |
129 | int ffuzzy_digestcmp_blocksize_n(const ffuzzy_digest *d1, const ffuzzy_digest *d2)
130 | {
131 |     // two-level ordering: naturalness of the block size, then its value
132 |     const bool d1_natural = ffuzzy_blocksize_is_natural_(d1->block_size);
133 |     const bool d2_natural = ffuzzy_blocksize_is_natural_(d2->block_size);
134 |     if (d1_natural != d2_natural)
135 |     {
136 |         // primary key: a natural block size sorts before a non-natural one
137 |         return d1_natural ? -1 : +1;
138 |     }
139 |     // secondary key: both are natural or both are not,
140 |     // so the block size values themselves decide
141 |     return ffuzzy_blocksizecmp(d1->block_size, d2->block_size);
142 | }
143 |
144 |
145 | bool ffuzzy_pretty_digest(char *buf, size_t buflen, const ffuzzy_digest *digest)
146 | {
147 |     assert(ffuzzy_digest_is_valid_lengths(digest));
148 |     // output is "<block_size>:<block1>:<block2>"; it contains two colons and a trailing '\0'
149 |     if (buflen < 3)
150 |         return false;
151 |     // buf must be big enough to contain two colons and two buffers.
152 |     if ((buflen - 3) < (digest->len1 + digest->len2))
153 |         return false;
154 |     // write block size if possible
155 |     {
156 |         // bytes snprintf may use (digits plus its NUL) once both colons and both blocks are reserved
157 |         size_t bslen = (buflen - 2) - (digest->len1 + digest->len2);
158 |         int bsret = snprintf(buf, bslen, "%lu", digest->block_size);
159 |         if (bsret < 0 || (size_t)bsret >= bslen) // output error, or the digits did not fit
160 |             return false;
161 |         buf += bsret; // continue right after the last digit
162 |     }
163 |     // write blocks (the first colon overwrites the NUL written by snprintf)
164 |     buf[0] = ':';
165 |     memcpy(buf + 1, digest->digest, digest->len1);
166 |     buf[digest->len1 + 1] = ':';
167 |     // block 2 is stored right after block 1 inside digest->digest
168 |     memcpy(buf + digest->len1 + 2, digest->digest + digest->len1, digest->len2);
169 |     buf[digest->len1 + digest->len2 + 2] = '\0';
170 |     return true;
171 | }
172 |
--------------------------------------------------------------------------------
/ffuzzy_digest_conv.c:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | libffuzzy : Fast ssdeep comparison library
4 |
5 | ffuzzy_digest_conv.c
6 | Digest form converter (from/to unnormalized form)
7 |
8 |
9 | Copyright (C) 2014 Tsukasa OI
10 |
11 |
12 | This program is free software; you can redistribute it and/or modify
13 | it under the terms of the GNU General Public License as published by
14 | the Free Software Foundation; either version 2 of the License, or
15 | (at your option) any later version.
16 |
17 | This program is distributed in the hope that it will be useful,
18 | but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 | GNU General Public License for more details.
21 |
22 | You should have received a copy of the GNU General Public License
23 | along with this program. If not, see <http://www.gnu.org/licenses/>.
24 |
25 | */
26 |
27 | /**
28 | \internal
29 | \file ffuzzy_digest_conv.c
30 | \brief Digest form converter (from/to unnormalized form)
31 | **/
32 |
33 | #include "ffuzzy_config.h"
34 |
35 | #include
36 | #include
37 |
38 | #include "ffuzzy.h"
39 |
40 |
41 | void ffuzzy_convert_digest_to_udigest(ffuzzy_udigest *udigest, const ffuzzy_digest *digest)
42 | {
43 |     assert(ffuzzy_digest_is_valid(digest));
44 |     memcpy(udigest->digest, digest->digest, digest->len1 + digest->len2); // both blocks in one copy; no character is altered
45 |     udigest->block_size = digest->block_size;
46 |     udigest->len1 = digest->len1;
47 |     udigest->len2 = digest->len2;
48 | }
49 |
50 | void ffuzzy_convert_udigest_to_digest(ffuzzy_digest *digest, const ffuzzy_udigest *udigest)
51 | {
52 | assert(ffuzzy_udigest_is_valid(udigest));
53 | digest->block_size = udigest->block_size;
54 | char *o = digest->digest; // write cursor into the output buffer
55 | // eliminate sequences for digest block 1
56 | if (udigest->len1 <= 3) // at most three characters: nothing can be eliminated, copy as is
57 | {
58 | memcpy(o, udigest->digest, udigest->len1);
59 | digest->len1 = udigest->len1;
60 | o += digest->len1;
61 | }
62 | else
63 | {
64 | memcpy(o, udigest->digest, 3); // the first three characters are always kept
65 | digest->len1 = 3;
66 | o += 3;
67 | for (size_t i = 3; i < udigest->len1; i++)
68 | {
69 | if ( // keep the character unless it equals all of the last three characters written
70 | udigest->digest[i] != o[-3] ||
71 | udigest->digest[i] != o[-2] ||
72 | udigest->digest[i] != o[-1]
73 | )
74 | {
75 | *o++ = udigest->digest[i];
76 | digest->len1++;
77 | }
78 | }
79 | }
80 | // eliminate sequences for digest block 2
81 | if (udigest->len2 <= 3) // at most three characters: nothing can be eliminated, copy as is
82 | {
83 | memcpy(o, udigest->digest + udigest->len1, udigest->len2); // input block 2 starts right after input block 1
84 | digest->len2 = udigest->len2;
85 | }
86 | else
87 | {
88 | memcpy(o, udigest->digest + udigest->len1, 3); // output block 2 starts at o, right after output block 1
89 | digest->len2 = 3;
90 | o += 3;
91 | for (size_t i = 3; i < udigest->len2; i++)
92 | {
93 | if ( // the lookback o[-3..-1] stays inside output block 2, which already holds three characters
94 | udigest->digest[udigest->len1 + i] != o[-3] ||
95 | udigest->digest[udigest->len1 + i] != o[-2] ||
96 | udigest->digest[udigest->len1 + i] != o[-1]
97 | )
98 | {
99 | *o++ = udigest->digest[udigest->len1 + i];
100 | digest->len2++;
101 | }
102 | }
103 | }
104 | }
105 |
--------------------------------------------------------------------------------
/ffuzzy_digest_unnorm.c:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | libffuzzy : Fast ssdeep comparison library
4 |
5 | ffuzzy_digest_unnorm.c
6 | Digest utility for fuzzy hashes (unnormalized form)
7 |
8 |
9 | Copyright (C) 2014 Tsukasa OI
10 |
11 |
12 | This program is free software; you can redistribute it and/or modify
13 | it under the terms of the GNU General Public License as published by
14 | the Free Software Foundation; either version 2 of the License, or
15 | (at your option) any later version.
16 |
17 | This program is distributed in the hope that it will be useful,
18 | but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 | GNU General Public License for more details.
21 |
22 | You should have received a copy of the GNU General Public License
23 | along with this program. If not, see <http://www.gnu.org/licenses/>.
24 |
25 | */
26 |
27 | /**
28 | \internal
29 | \file ffuzzy_digest_unnorm.c
30 | \brief Digest utility for fuzzy hashes (unnormalized form)
31 | **/
32 |
33 | #include "ffuzzy_config.h"
34 |
35 | #include
36 | #include
37 | #include
38 | #include
39 |
40 | #include "ffuzzy.h"
41 | #include "ffuzzy_blocksize.h"
42 | #include "str_base64.h"
43 |
44 |
45 | inline bool ffuzzy_udigest_is_valid_lengths(const ffuzzy_udigest *udigest)
46 | {
47 |     // each block may hold at most FFUZZY_SPAMSUM_LENGTH characters
48 |     if (udigest->len1 > FFUZZY_SPAMSUM_LENGTH || udigest->len2 > FFUZZY_SPAMSUM_LENGTH)
49 |         return false;
50 |     return (udigest->len1 + udigest->len2) <= FFUZZY_SPAMSUM_LENGTH * 2;
51 | }
52 |
53 |
54 | bool ffuzzy_udigest_is_natural_buffer(const ffuzzy_udigest *udigest)
55 | {
56 |     assert(ffuzzy_udigest_is_valid_lengths(udigest));
57 |     for (size_t i = 0; i < udigest->len1 + udigest->len2; i++) // both blocks are scanned as one contiguous run
58 |         if (!is_base64(udigest->digest[i]))
59 |             return false; // a single non-base64 character makes the buffer non-natural
60 |     return true;
61 | }
62 |
63 |
64 | bool ffuzzy_udigest_is_valid(const ffuzzy_udigest *udigest)
65 | {
66 | return ffuzzy_udigest_is_valid_lengths(udigest); // validity of an unnormalized digest is decided by its block lengths alone
67 | }
68 |
69 |
70 | bool ffuzzy_udigest_is_natural(const ffuzzy_udigest *udigest)
71 | {
72 |     // cheap checks first; the buffer scan runs only once block size and lengths are fine
73 |     if (!ffuzzy_blocksize_is_natural_(udigest->block_size) || !ffuzzy_udigest_is_valid_lengths(udigest))
74 |         return false;
75 |     return ffuzzy_udigest_is_natural_buffer(udigest);
76 | }
77 |
78 |
79 | int ffuzzy_udigestcmp(const ffuzzy_udigest *d1, const ffuzzy_udigest *d2)
80 | {
81 |     // ordering over unnormalized digests, decided by the first key that differs:
82 |     // block size, then length of block 1, then length of block 2, then the raw bytes
83 |     if (d1->block_size != d2->block_size)
84 |         return (d1->block_size > d2->block_size) ? +1 : -1;
85 |     // same block size: the shorter first block sorts first
86 |     if (d1->len1 != d2->len1)
87 |         return (d1->len1 > d2->len1) ? +1 : -1;
88 |     // same first-block length: the shorter second block sorts first
89 |     if (d1->len2 != d2->len2)
90 |         return (d1->len2 > d2->len2) ? +1 : -1;
91 |     // all lengths agree here, so both buffers hold the same number of
92 |     // bytes and can be compared directly
93 |     return memcmp(d1->digest, d2->digest, d1->len1 + d1->len2);
94 | }
95 |
96 |
97 | int ffuzzy_udigestcmp_blocksize(const ffuzzy_udigest *d1, const ffuzzy_udigest *d2)
98 | {
99 | return ffuzzy_blocksizecmp(d1->block_size, d2->block_size); // only block_size is read; +1, -1 or 0 as returned by ffuzzy_blocksizecmp
100 | }
101 |
102 |
103 | int ffuzzy_udigestcmp_blocksize_n(const ffuzzy_udigest *d1, const ffuzzy_udigest *d2)
104 | {
105 |     // two-level ordering: naturalness of the block size, then its value
106 |     const bool d1_natural = ffuzzy_blocksize_is_natural_(d1->block_size);
107 |     const bool d2_natural = ffuzzy_blocksize_is_natural_(d2->block_size);
108 |     if (d1_natural != d2_natural)
109 |     {
110 |         // primary key: a natural block size sorts before a non-natural one
111 |         return d1_natural ? -1 : +1;
112 |     }
113 |     // secondary key: both are natural or both are not,
114 |     // so the block size values themselves decide
115 |     return ffuzzy_blocksizecmp(d1->block_size, d2->block_size);
116 | }
117 |
118 |
119 | bool ffuzzy_pretty_udigest(char *buf, size_t buflen, const ffuzzy_udigest *udigest)
120 | {
121 |     assert(ffuzzy_udigest_is_valid_lengths(udigest));
122 |     // output is "<block_size>:<block1>:<block2>"; it contains two colons and a trailing '\0'
123 |     if (buflen < 3)
124 |         return false;
125 |     // buf must be big enough to contain two colons and two buffers.
126 |     if ((buflen - 3) < (udigest->len1 + udigest->len2))
127 |         return false;
128 |     // write block size if possible
129 |     {
130 |         // bytes snprintf may use (digits plus its NUL) once both colons and both blocks are reserved
131 |         size_t bslen = (buflen - 2) - (udigest->len1 + udigest->len2);
132 |         int bsret = snprintf(buf, bslen, "%lu", udigest->block_size);
133 |         if (bsret < 0 || (size_t)bsret >= bslen) // output error, or the digits did not fit
134 |             return false;
135 |         buf += bsret; // continue right after the last digit
136 |     }
137 |     // write blocks (the first colon overwrites the NUL written by snprintf)
138 |     buf[0] = ':';
139 |     memcpy(buf + 1, udigest->digest, udigest->len1);
140 |     buf[udigest->len1 + 1] = ':';
141 |     // block 2 is stored right after block 1 inside udigest->digest
142 |     memcpy(buf + udigest->len1 + 2, udigest->digest + udigest->len1, udigest->len2);
143 |     buf[udigest->len1 + udigest->len2 + 2] = '\0';
144 |     return true;
145 | }
146 |
--------------------------------------------------------------------------------
/ffuzzy_parse.c:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | libffuzzy : Fast ssdeep comparison library
4 |
5 | ffuzzy_parse.c
6 | Fuzzy hash parser
7 |
8 |
9 | CREDITS OF ORIGINAL VERSION OF SSDEEP
10 |
11 | Copyright (C) 2002 Andrew Tridgell
12 | Copyright (C) 2006 ManTech International Corporation
13 | Copyright (C) 2013 Helmut Grohne
14 |
15 | This program is free software; you can redistribute it and/or modify
16 | it under the terms of the GNU General Public License as published by
17 | the Free Software Foundation; either version 2 of the License, or
18 | (at your option) any later version.
19 |
20 | This program is distributed in the hope that it will be useful,
21 | but WITHOUT ANY WARRANTY; without even the implied warranty of
22 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 | GNU General Public License for more details.
24 |
25 | You should have received a copy of the GNU General Public License
26 | along with this program; if not, write to the Free Software
27 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 |
29 |
30 | CREDIT OF MODIFIED PORTIONS
31 |
32 | Copyright (C) 2014 Tsukasa OI
33 |
34 | */
35 |
36 | /**
37 | \internal
38 | \file ffuzzy_parse.c
39 | \brief Fuzzy hash parser
40 | **/
41 |
42 | #include "ffuzzy_config.h"
43 |
44 | #include "ffuzzy_parse.h"
45 |
46 | bool ffuzzy_read_digest(ffuzzy_digest *digest, const char *s)
47 | {
48 | char *p;
49 | if (!ffuzzy_read_digests_blocksize(&(digest->block_size), &p, s))
50 | return false;
51 | return ffuzzy_read_digest_after_blocksize(digest, p);
52 | }
53 |
--------------------------------------------------------------------------------
/ffuzzy_parse.h:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | libffuzzy : Fast ssdeep comparison library
4 |
5 | ffuzzy_parse.h
6 | Fuzzy hash parser
7 |
8 |
9 | CREDITS OF ORIGINAL VERSION OF SSDEEP
10 |
11 | Copyright (C) 2002 Andrew Tridgell
12 | Copyright (C) 2006 ManTech International Corporation
13 | Copyright (C) 2013 Helmut Grohne
14 |
15 | This program is free software; you can redistribute it and/or modify
16 | it under the terms of the GNU General Public License as published by
17 | the Free Software Foundation; either version 2 of the License, or
18 | (at your option) any later version.
19 |
20 | This program is distributed in the hope that it will be useful,
21 | but WITHOUT ANY WARRANTY; without even the implied warranty of
22 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 | GNU General Public License for more details.
24 |
25 | You should have received a copy of the GNU General Public License
26 | along with this program; if not, write to the Free Software
27 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 |
29 |
30 | CREDIT OF MODIFIED PORTIONS
31 |
32 | Copyright (C) 2014 Tsukasa OI
33 |
34 | */
35 | #ifndef FFUZZY_FFUZZY_PARSE_H
36 | #define FFUZZY_FFUZZY_PARSE_H
37 |
38 | /**
39 | \internal
40 | \file ffuzzy_parse.h
41 | \brief Fuzzy hash parser
42 | **/
43 |
44 | #include "ffuzzy_config.h"
45 |
46 | #include
47 | #include
48 | #include
49 |
50 | #include "ffuzzy_blocksize.h"
51 |
52 |
/**
	\internal
	\fn     bool ffuzzy_read_digests_blocksize(unsigned long*, char**, const char*)
	\brief  Read block size from the string
	\details
		The block size must start with a decimal digit. Leading whitespace
		and a leading sign are rejected even though strtoul itself would
		accept them (a leading '-' would otherwise be converted to a huge
		unsigned value without any error being reported).
	\param [out]  block_size  The pointer to the block size.
	                          Set to 0 when the string does not start with a digit.
	\param [out]  srem        The value pointed by this parameter is set to the first non-numerical character
	                          (or to s itself when the string does not start with a digit).
	\param [in]   s           The string which contains an ssdeep digest.
	\return true if succeeds; false otherwise.
**/
static inline bool ffuzzy_read_digests_blocksize(unsigned long *block_size, char** srem, const char *s)
{
	// strtoul() skips leading whitespace and accepts '+' / '-';
	// none of those may start a digest, so insist on a digit up front.
	// The outputs mirror what strtoul() reports when nothing was converted.
	if (*s < '0' || *s > '9')
	{
		*block_size = 0;
		*srem = (char *)s;
		return false;
	}
	errno = 0;
	*block_size = strtoul(s, srem, 10);
	// arithmetic overflow occurred
	if (*block_size == ULONG_MAX && errno == ERANGE)
		return false;
	// at least one digit was consumed, so *srem is guaranteed to be past s here
	return true;
}
74 |
75 |
76 | /**
77 | \internal
78 | \fn bool ffuzzy_read_digest_after_blocksize(ffuzzy_digest*, const char*)
79 | \brief Read remaining digest parts (except block size) from the string
80 | \param [out] digest The pointer to the buffer to store valid digest after parsing.
81 | \param [in] s The pointer to the first non-numerical part of a ssdeep digest.
82 | \return true if succeeds; false otherwise.
83 | \see bool ffuzzy_read_digests_blocksize(unsigned long*, char**, const char*)
84 | **/
85 | static inline bool ffuzzy_read_digest_after_blocksize(ffuzzy_digest *digest, const char *s)
86 | {
87 | // ':' must follow after the number (which is block_size)
88 | if (*s != ':')
89 | return false;
90 | // read first block of ssdeep hash
91 | // (eliminating sequences of 4 or more identical characters)
92 | digest->len2 = 0;
93 | char *o = digest->digest;
94 | while (true)
95 | {
96 | char c = *++s;
97 | if (!c)
98 | return false;
99 | if (c == ':')
100 | break;
101 | if (digest->len2 < 3 || c != s[-1] || c != s[-2] || c != s[-3])
102 | {
103 | if (digest->len2 == FFUZZY_SPAMSUM_LENGTH)
104 | return false;
105 | digest->len2++;
106 | *o++ = c;
107 | }
108 | }
109 | // read second block of ssdeep hash
110 | // (eliminating sequences of 4 or more identical characters)
111 | digest->len1 = digest->len2;
112 | while (true)
113 | {
114 | char c = *++s;
115 | if (!c || c == ',')
116 | break;
117 | if (digest->len2 < 3 || c != s[-1] || c != s[-2] || c != s[-3])
118 | {
119 | if (digest->len2 == digest->len1 + FFUZZY_SPAMSUM_LENGTH)
120 | return false;
121 | digest->len2++;
122 | *o++ = c;
123 | }
124 | }
125 | digest->len2 -= digest->len1;
126 | return true;
127 | }
128 |
129 |
130 | /**
131 | \internal
132 | \fn bool ffuzzy_read_udigest_after_blocksize(ffuzzy_udigest*, const char*)
133 | \brief Read remaining unnormalized digest parts (except block size) from the string
134 | \param [out] udigest The pointer to the buffer to store valid unnormalized digest after parsing.
135 | \param [in] s The pointer to the first non-numerical part of a ssdeep digest.
136 | \return true if succeeds; false otherwise.
137 | \see bool ffuzzy_read_digests_blocksize(unsigned long*, char**, const char*)
138 | **/
139 | static inline bool ffuzzy_read_udigest_after_blocksize(ffuzzy_udigest *udigest, const char *s)
140 | {
141 | // ':' must follow after the number (which is block_size)
142 | if (*s != ':')
143 | return false;
144 | // read first block of ssdeep hash
145 | // (WITHOUT eliminating sequences)
146 | udigest->len1 = 0;
147 | char *o = udigest->digest;
148 | while (true)
149 | {
150 | char c = *++s;
151 | if (!c)
152 | return false;
153 | if (c == ':')
154 | break;
155 | if (udigest->len1++ == FFUZZY_SPAMSUM_LENGTH)
156 | return false;
157 | *o++ = c;
158 | }
159 | // read second block of ssdeep hash
160 | // (WITHOUT eliminating sequences)
161 | udigest->len2 = 0;
162 | while (true)
163 | {
164 | char c = *++s;
165 | if (!c || c == ',')
166 | break;
167 | if (udigest->len2++ == FFUZZY_SPAMSUM_LENGTH)
168 | return false;
169 | *o++ = c;
170 | }
171 | return true;
172 | }
173 |
174 | #endif
175 |
--------------------------------------------------------------------------------
/ffuzzy_parse_unnorm.c:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | libffuzzy : Fast ssdeep comparison library
4 |
5 | ffuzzy_parse_unnorm.c
6 | Fuzzy hash parser (unnormalized form)
7 |
8 |
9 | CREDITS OF ORIGINAL VERSION OF SSDEEP
10 |
11 | Copyright (C) 2002 Andrew Tridgell
12 | Copyright (C) 2006 ManTech International Corporation
13 | Copyright (C) 2013 Helmut Grohne
14 |
15 | This program is free software; you can redistribute it and/or modify
16 | it under the terms of the GNU General Public License as published by
17 | the Free Software Foundation; either version 2 of the License, or
18 | (at your option) any later version.
19 |
20 | This program is distributed in the hope that it will be useful,
21 | but WITHOUT ANY WARRANTY; without even the implied warranty of
22 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 | GNU General Public License for more details.
24 |
25 | You should have received a copy of the GNU General Public License
26 | along with this program; if not, write to the Free Software
27 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 |
29 |
30 | CREDIT OF MODIFIED PORTIONS
31 |
32 | Copyright (C) 2014 Tsukasa OI
33 |
34 | */
35 |
36 | /**
37 | \internal
38 | \file ffuzzy_parse_unnorm.c
39 | \brief Fuzzy hash parser (unnormalized form)
40 | **/
41 |
42 | #include "ffuzzy_config.h"
43 |
44 | #include "ffuzzy_parse.h"
45 |
46 | bool ffuzzy_read_udigest(ffuzzy_udigest *udigest, const char *s)
47 | {
48 | char *p;
49 | if (!ffuzzy_read_digests_blocksize(&(udigest->block_size), &p, s))
50 | return false;
51 | return ffuzzy_read_udigest_after_blocksize(udigest, p);
52 | }
53 |
--------------------------------------------------------------------------------
/m4/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !/.gitignore
3 |
--------------------------------------------------------------------------------
/str_base64.h:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | libffuzzy : Fast ssdeep comparison library
4 |
5 | str_base64.h
6 | Base64 utility
7 |
8 |
9 | Copyright (C) 2014 Tsukasa OI
10 |
11 |
12 | This program is free software; you can redistribute it and/or modify
13 | it under the terms of the GNU General Public License as published by
14 | the Free Software Foundation; either version 2 of the License, or
15 | (at your option) any later version.
16 |
17 | This program is distributed in the hope that it will be useful,
18 | but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 | GNU General Public License for more details.
21 |
22 | You should have received a copy of the GNU General Public License
23 | along with this program. If not, see <http://www.gnu.org/licenses/>.
24 |
25 | */
26 | #ifndef FFUZZY_STR_BASE64_H
27 | #define FFUZZY_STR_BASE64_H
28 |
29 | /**
30 | \internal
31 | \file str_base64.h
32 | \brief Base64 utility
33 | **/
34 |
35 | #include "ffuzzy_config.h"
36 |
37 |
/**
	\internal
	\fn     bool is_base64(char)
	\brief  Determine if given character is a base64 character
	\details
		The accepted set is the decimal digits, the upper- and lower-case
		Latin letters, '+' and '/'. The padding character '=' is not part of it.
	\param  c  The character to determine.
	\return true if c is one of the base64 characters; false otherwise.
**/
static inline bool is_base64(char c)
{
	static const char alphabet[] =
		"0123456789"
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
		"abcdefghijklmnopqrstuvwxyz"
		"+/";
	// the scan stops at the terminator, so '\0' itself is never reported as base64
	for (const char *p = alphabet; *p != '\0'; p++)
	{
		if (*p == c)
			return true;
	}
	return false;
}
65 |
66 | #endif
67 |
--------------------------------------------------------------------------------
/str_common_substr.h:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | libffuzzy : Fast ssdeep comparison library
4 |
5 | str_common_substr.h
6 | Common substring finder
7 |
8 |
9 | CREDITS OF ORIGINAL VERSION OF SSDEEP
10 |
11 | Copyright (C) 2002 Andrew Tridgell
12 | Copyright (C) 2006 ManTech International Corporation
13 | Copyright (C) 2013 Helmut Grohne
14 |
15 | This program is free software; you can redistribute it and/or modify
16 | it under the terms of the GNU General Public License as published by
17 | the Free Software Foundation; either version 2 of the License, or
18 | (at your option) any later version.
19 |
20 | This program is distributed in the hope that it will be useful,
21 | but WITHOUT ANY WARRANTY; without even the implied warranty of
22 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 | GNU General Public License for more details.
24 |
25 | You should have received a copy of the GNU General Public License
26 | along with this program; if not, write to the Free Software
27 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 |
29 |
30 | CREDIT OF MODIFIED PORTIONS
31 |
32 | Copyright (C) 2014 Tsukasa OI
33 |
34 | */
35 | #ifndef FFUZZY_STR_COMMON_SUBSTR_H
36 | #define FFUZZY_STR_COMMON_SUBSTR_H
37 |
38 | /**
39 | \internal
40 | \file str_common_substr.h
41 | \brief Common substring finder
42 | **/
43 |
44 | #include "ffuzzy_config.h"
45 |
46 | #include
47 | #include
48 | #include
49 | #include
50 |
51 | #include "str_hash_rolling.h"
52 |
53 | /** \internal \brief Maximum length for has_common_substring function **/
54 | #define HAS_COMMON_SUBSTR_MAXLEN 64
55 |
56 | #if FFUZZY_MIN_MATCH < ROLLING_WINDOW
57 | #error FFUZZY_MIN_MATCH must have at least ROLLING_WINDOW on current implementation.
58 | #endif
59 |
60 |
61 | /**
62 | \internal
63 | \fn bool has_common_substring(const char*, size_t, const char*, size_t)
64 | \brief Determine if given strings have common substring of length FFUZZY_MIN_MATCH
65 | \details
66 | We only accept a match if we have at least one common substring
67 | in the signature of length FFUZZY_MIN_MATCH.
68 | \return true if the given strings have a common substring of length FFUZZY_MIN_MATCH.
69 | \example examples/internal/has_common_substring.c
70 | **/
71 | static inline bool has_common_substring(
72 | const char *s1, size_t s1len,
73 | const char *s2, size_t s2len
74 | )
75 | {
76 | assert(s1len <= HAS_COMMON_SUBSTR_MAXLEN);
77 | assert(s2len <= HAS_COMMON_SUBSTR_MAXLEN);
78 | #if HAS_COMMON_SUBSTR_MAXLEN >= FFUZZY_MIN_MATCH
79 | // if (at least) one of two strings is shorter than
80 | // FFUZZY_MIN_MATCH length, it will never find substring
81 | if (s1len < FFUZZY_MIN_MATCH)
82 | return false;
83 | if (s2len < FFUZZY_MIN_MATCH)
84 | return false;
85 |
86 | uint_least32_t hashes[HAS_COMMON_SUBSTR_MAXLEN - (FFUZZY_MIN_MATCH - 1)];
87 | roll_state state;
88 |
89 | // compute FFUZZY_MIN_MATCH-width rolling hashes for each index of s1
90 | memset(hashes, 0, sizeof(hashes));
91 | roll_init(&state);
92 | for (size_t i = 0; i < FFUZZY_MIN_MATCH - 1; i++)
93 | roll_hash(&state, (unsigned char)s1[i]);
94 | for (size_t i = FFUZZY_MIN_MATCH - 1; i < s1len; i++)
95 | {
96 | roll_hash(&state, (unsigned char)s1[i]);
97 | hashes[i - (FFUZZY_MIN_MATCH - 1)] = roll_sum(&state);
98 | }
99 |
100 | // compute FFUZZY_MIN_MATCH-width rolling hashes for each index of s2
101 | roll_init(&state);
102 | for (size_t j = 0; j < FFUZZY_MIN_MATCH - 1; j++)
103 | roll_hash(&state, (unsigned char)s2[j]);
104 | for (size_t j = 0; j < s2len - (FFUZZY_MIN_MATCH - 1); j++)
105 | {
106 | roll_hash(&state, (unsigned char)s2[j + (FFUZZY_MIN_MATCH - 1)]);
107 | uint_least32_t h = roll_sum(&state);
108 | for (size_t i = 0; i < s1len - (FFUZZY_MIN_MATCH - 1); i++)
109 | {
110 | // make sure we actually have common substring if hash matches
111 | if (hashes[i] == h && !memcmp(s1 + i, s2 + j, FFUZZY_MIN_MATCH))
112 | {
113 | return true;
114 | }
115 | }
116 | }
117 | return false;
118 | #else
119 | return false;
120 | #endif
121 | }
122 |
123 | #endif
124 |
--------------------------------------------------------------------------------
/str_edit_dist.h:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | libffuzzy : Fast ssdeep comparison library
4 |
5 | str_edit_dist.h
6 | Edit distance (Levenshtein distance with no "replacement")
7 |
8 | Copyright (C) 2014 kikairoya
9 | Copyright (C) 2014 Tsukasa OI
10 |
11 |
12 | This program can be used, redistributed or modified under any of
13 | Boost Software License 1.0, GPL v2 or GPL v3
14 |
15 | */
16 | #ifndef FFUZZY_STR_EDIT_DIST_H
17 | #define FFUZZY_STR_EDIT_DIST_H
18 |
19 | /**
20 | \internal
21 | \file str_edit_dist.h
22 | \brief Edit distance (Levenshtein distance with no "replacement")
23 | **/
24 |
25 | #include "ffuzzy_config.h"
26 |
27 | #include
28 | #include "util.h"
29 |
/** \internal \brief Maximum string length accepted by edit_distn **/
#define EDIT_DISTN_MAXLEN 64


/**
	\internal
	\fn     int edit_distn(const char*, size_t, const char*, size_t)
	\brief  Compute edit distance between two strings with no replacement
	\details
		This function computes Levenshtein distance with no "replacement":
		the only cost 1 operations are "insertion" and "removal", so turning
		one character into a different one costs 2.

		Only two rows of the distance table are kept; they are swapped
		after every character of s1.
	\param  s1     String 1 (non-empty)
	\param  s1len  Length of s1 (1 to EDIT_DISTN_MAXLEN)
	\param  s2     String 2
	\param  s2len  Length of s2 (at most EDIT_DISTN_MAXLEN)
	\return The edit distance.
	\example examples/internal/edit_distn.c
**/
static inline int edit_distn(const char *s1, size_t s1len, const char *s2, size_t s2len)
{
	int rows[2][EDIT_DISTN_MAXLEN+1];
	int *prev = rows[0];
	int *curr = rows[1];

	// s1 must not be empty and both strings must fit into a table row
	assert(s1len > 0);
	assert(s1len <= EDIT_DISTN_MAXLEN);
	assert(s2len <= EDIT_DISTN_MAXLEN);

	// row for the first character of s1; the (implicit) row above it
	// would hold the plain column index, which is used for the diagonal here
	prev[0] = 1;
	for (size_t col = 0; col < s2len; col++)
	{
		int from_left = prev[col] + 1;
		int from_diag = col + (s1[0] == s2[col] ? 0 : 2);
		prev[col+1] = MIN(from_left, from_diag);
	}

	// rows for the remaining characters of s1
	for (size_t row = 1; row < s1len; row++)
	{
		curr[0] = row + 1;
		for (size_t col = 0; col < s2len; col++)
		{
			int from_above = prev[col+1] + 1;
			int from_left = curr[col] + 1;
			int from_diag = prev[col] + (s1[row] == s2[col] ? 0 : 2);
			int best_step = MIN(from_above, from_left);
			curr[col+1] = MIN(best_step, from_diag);
		}
		// the row just filled becomes the previous row of the next round
		int *finished = curr;
		curr = prev;
		prev = finished;
	}
	return prev[s2len];
}
78 |
79 |
/**
	\internal
	\fn     int edit_distn_norm(const char*, size_t, const char*, size_t)
	\brief  Compute edit distance between two strings with no replacement
	\details
		This function computes Levenshtein distance with no "replacement".
		It means, single cost 1 operations allowed are "insertion" and "removal".

		The arguments are handed to edit_distn ordered so that the string
		passed first is never longer than the string passed second.
		This may prevent stalls on modern processors.
	\param  s1     String 1 (non-empty)
	\param  s1len  Length of s1
	\param  s2     String 2 (non-empty)
	\param  s2len  Length of s2
	\return The edit distance.
**/
static inline int edit_distn_norm(const char *s1, size_t s1len, const char *s2, size_t s2len)
{
	assert(s1len > 0);
	assert(s2len > 0);
	// swap the operands when the first string is the longer one
	if (s1len > s2len)
		return edit_distn(s2, s2len, s1, s1len);
	return edit_distn(s1, s1len, s2, s2len);
}
106 |
107 | #endif
108 |
--------------------------------------------------------------------------------
/str_hash_rolling.h:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | libffuzzy : Fast ssdeep comparison library
4 |
5 | str_hash_rolling.h
6 | Rolling hash implementation
7 |
8 |
9 | CREDITS OF ORIGINAL VERSION OF SSDEEP
10 |
11 | Copyright (C) 2002 Andrew Tridgell
12 | Copyright (C) 2006 ManTech International Corporation
13 | Copyright (C) 2013 Helmut Grohne
14 |
15 | This program is free software; you can redistribute it and/or modify
16 | it under the terms of the GNU General Public License as published by
17 | the Free Software Foundation; either version 2 of the License, or
18 | (at your option) any later version.
19 |
20 | This program is distributed in the hope that it will be useful,
21 | but WITHOUT ANY WARRANTY; without even the implied warranty of
22 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 | GNU General Public License for more details.
24 |
25 | You should have received a copy of the GNU General Public License
26 | along with this program; if not, write to the Free Software
27 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
28 |
29 |
30 | CREDIT OF MODIFIED PORTIONS
31 |
32 | Copyright (C) 2014 Tsukasa OI
33 |
34 | */
35 | #ifndef FFUZZY_STR_HASH_ROLLING_H
36 | #define FFUZZY_STR_HASH_ROLLING_H
37 |
38 | /**
39 | \internal
40 | \file str_hash_rolling.h
41 | \brief Rolling hash implementation
42 | **/
43 |
44 | #include "ffuzzy_config.h"
45 |
46 | #include
47 | #include
48 |
/** \internal \brief Number of characters covered by the rolling hash window **/
#define ROLLING_WINDOW 7


/**
	\internal
	\struct roll_state
	\brief  State for rolling hash

	\internal
	\var    roll_state::h1
	\brief  The sum of characters in the window.
	\internal
	\var    roll_state::h2
	\brief  The sum of characters in the window (weighted by its state).
	\internal
	\var    roll_state::h3
	\brief  Shift and XOR-based hash.
	\internal
	\var    roll_state::n
	\brief  Next index to insert.
	\internal
	\var    roll_state::window
	\brief  Inserted characters.
**/
typedef struct
{
	uint_least32_t h1;
	uint_least32_t h2;
	uint_least32_t h3;
	uint_least32_t n;
	unsigned char window[ROLLING_WINDOW];
} roll_state;
80 |
81 |
82 | /**
83 | \internal
84 | \fn void roll_init(roll_state*)
85 | \brief Initialize rolling hash state
86 | \param [out] self The pointer to the rolling hash state to initialize.
87 | **/
88 | static inline void roll_init(roll_state *self)
89 | {
90 | memset(self, 0, sizeof(roll_state));
91 | }
92 |
93 |
94 | /**
95 | \internal
96 | \fn void roll_hash(roll_state*, unsigned char)
97 | \brief Insert a character to the rolling hash
98 | \param [in,out] self The pointer to the rolling hash state.
99 | \param c The character to insert.
100 | **/
101 | static inline void roll_hash(roll_state *self, unsigned char c)
102 | {
103 | self->h2 = self->h2 - self->h1 + ROLLING_WINDOW * (uint_least32_t)c;
104 | self->h1 = self->h1 + (uint_least32_t)c - (uint_least32_t)self->window[self->n];
105 | self->h3 = ((self->h3 << 5) ^ (uint_least32_t)c) & UINT32_C(0xffffffff);
106 | self->window[self->n] = c;
107 | // instead of modulo, branch will be faster on modern architectures
108 | self->n++;
109 | if (self->n == ROLLING_WINDOW)
110 | self->n = 0;
111 | }
112 |
113 |
114 | /**
115 | \internal
116 | \fn uint_least32_t roll_sum(const roll_state*)
117 | \brief Extract rolling hash from current state
118 | \param [in] self The pointer to the rolling hash state.
119 | \return A 32-bit rolling hash value.
120 | **/
121 | static inline uint_least32_t roll_sum(const roll_state *self)
122 | {
123 | return (self->h1 + self->h2 + self->h3) & UINT32_C(0xffffffff);
124 | }
125 |
126 | #endif
127 |
--------------------------------------------------------------------------------
/util.h:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | libffuzzy : Fast ssdeep comparison library
4 |
5 | util.h
6 | Miscellaneous Utilities
7 |
8 | Copyright (C) 2014 Tsukasa OI
9 |
10 |
11 | Permission to use, copy, modify, and/or distribute this software for
12 | any purpose with or without fee is hereby granted, provided that the
13 | above copyright notice and this permission notice appear in all copies.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
16 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
17 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
18 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
20 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
21 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22 |
23 | */
24 | #ifndef FFUZZY_UTIL_H
25 | #define FFUZZY_UTIL_H
26 |
27 | /**
28 | \internal
29 | \file util.h
30 | \brief Miscellaneous Utilities
31 | **/
32 |
/**
	\internal
	\brief  Take minimum value
	\details
		This is a plain macro: the argument that ends up being selected
		is evaluated twice, so do not pass expressions with side effects.
	\param  a  Value 1
	\param  b  Value 2
	\return Minimum value of two
**/
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
41 |
/**
	\internal
	\brief  Take maximum value
	\details
		This is a plain macro: the argument that ends up being selected
		is evaluated twice, so do not pass expressions with side effects.
	\param  a  Value 1
	\param  b  Value 2
	\return Maximum value of two
**/
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
50 |
51 | #endif
52 |
--------------------------------------------------------------------------------