├── tests
    ├── test_helper.h
    ├── dictionary_tests.h
    ├── test_dict.txt
    ├── test_helper.c
    ├── stem_plural_tests.h
    ├── stem_singular_tests.h
    ├── runtests.sh
    ├── remove_suffixes_tests.h
    ├── remove_prefixes_tests.h
    ├── minunit.h
    ├── dictionary_tests.c
    ├── precedence_adjustment_tests.c
    ├── stem_plural_tests.c
    ├── remove_suffixes_tests.c
    ├── stem_singular_tests.c
    └── remove_prefixes_tests.c
├── src
    ├── deps
    │   └── strndup
    │   │   ├── strndup.h
    │   │   ├── package.json
    │   │   └── strndup.c
    ├── sastrawi
    │   ├── stem_singular.h
    │   ├── text_util.h
    │   ├── dictionary.h
    │   ├── stem_plural.h
    │   ├── remove_suffixes.h
    │   ├── precedence_adjustment.c
    │   ├── stem_singular.c
    │   ├── remove_suffixes.c
    │   ├── text_util.c
    │   ├── remove_prefixes.h
    │   ├── dictionary.c
    │   ├── stem_plural.c
    │   └── remove_prefixes.c
    ├── libsastrawi.h
    ├── libsastrawi.c
    ├── regex
    │   ├── preg.h
    │   └── preg.c
    ├── dbg.h
    └── uthash
    │   ├── utringbuffer.h
    │   ├── utstring.h
    │   ├── utarray.h
    │   └── utlist.h
├── .gitignore
├── TODO
├── test.php
├── Makefile.ori
├── Makefile
└── README.md


/tests/test_helper.h:
--------------------------------------------------------------------------------
1 | void free_parts(int parts_count, char **parts[]);
2 | 


--------------------------------------------------------------------------------
/tests/dictionary_tests.h:
--------------------------------------------------------------------------------
1 | char *test_dictionary_load();
2 | char *test_dictionary_contains();
3 | char *test_dictionary_add();
4 | 


--------------------------------------------------------------------------------
/tests/test_dict.txt:
--------------------------------------------------------------------------------
 1 | aba
 2 | abad
 3 | abadi
 4 | abadiah
 5 | abai
 6 | zuhud
 7 | zuhur
 8 | zulfikar
 9 | zulhijah
10 | zulkaidah
11 | 


--------------------------------------------------------------------------------
/src/deps/strndup/strndup.h:
--------------------------------------------------------------------------------
1 | #ifndef HAVE_STRNDUP
2 | #define HAVE_STRNDUP
3 | 
4 | char *strndup(const char *s, size_t n);
5 | 
6 | #endif /* HAVE_STRNDUP */
7 | 


--------------------------------------------------------------------------------
/src/sastrawi/stem_singular.h:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <string.h>
3 | #include <stdlib.h>
4 | 
5 | int stem_singular_word(char *word, char **stemmed_word);
6 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .vagrant/
 2 | Vagrantfile
 3 | *.o
 4 | *.lo
 5 | pcre2*
 6 | test_text_cleanser
 7 | test_sastrawi
 8 | tags
 9 | *.dSYM
10 | build/*
11 | *.log
12 | tests/*_tests
13 | 


--------------------------------------------------------------------------------
/tests/test_helper.c:
--------------------------------------------------------------------------------
#include <stdlib.h>

/*
 * Releases an array of heap-allocated strings together with the array itself.
 *
 * parts_count: number of string slots in the array.
 * parts:       address of the array pointer; after the call *parts is NULL so
 *              the caller cannot accidentally reuse or double-free it.
 *
 * A NULL `parts` or a NULL `*parts` is accepted and treated as "nothing to do".
 */
void free_parts(int parts_count, char **parts[])
{
  if (parts == NULL || *parts == NULL) {
    return;
  }

  for (int i = 0; i < parts_count; i++) {
    free((*parts)[i]);
  }

  free(*parts);
  *parts = NULL;
}
9 | 


--------------------------------------------------------------------------------
/src/sastrawi/text_util.h:
--------------------------------------------------------------------------------
/*
 * Regex-based word splitting helpers implemented in text_util.c.
 * Each function stores heap-allocated copies in its output parameters; the
 * caller is responsible for freeing them.
 */

/* Two-way split; when the pattern does not match, the whole word lands in second_part. */
int prefix_split_word(char *pattern, char *word, char **first_part, char **second_part);

/* Two-way split; when the pattern does not match, the whole word lands in first_part. */
int suffix_split_word(char *pattern, char *word, char **first_part, char **second_part);

/* Older name kept so existing references still see a declaration. */
int split_word(char *pattern, char *word, char **first_part, char **second_part);

/* Three-way split; the outputs are left untouched when the pattern does not match. */
int split_word3(char *pattern, char *word, char **first_part, char **second_part, char **third_part);
3 | 


--------------------------------------------------------------------------------
/tests/stem_plural_tests.h:
--------------------------------------------------------------------------------
1 | char *test_is_plural();
2 | char *test_plural_parts();
3 | char *test_stem_plural_word_when_both_words_are_root_words_and_the_same();
4 | char *test_stem_plural_word_when_one_word_has_suffixes();
5 | 


--------------------------------------------------------------------------------
/src/libsastrawi.h:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <string.h>
 3 | #include <stdlib.h>
 4 | #include "regex/preg.h"
 5 | #include "sastrawi/stem_plural.h"
 6 | #include "sastrawi/stem_singular.h"
 7 | #include "sastrawi/dictionary.h"
 8 | 
 9 | void print_my_name();
10 | 


--------------------------------------------------------------------------------
/src/sastrawi/dictionary.h:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <string.h>
 3 | #include <stdlib.h>
 4 | 
 5 | char *dictionary_fullpath(char *relative_path);
 6 | int dictionary_load(char *path_to_dict);
 7 | int dictionary_contains(char *word);
 8 | int dictionary_add(char *word);
 9 | int dictionary_count();
10 | 


--------------------------------------------------------------------------------
/src/libsastrawi.c:
--------------------------------------------------------------------------------
 1 | #ifdef __linux
 2 |   #define _GNU_SOURCE 
 3 | #endif
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string.h>
 7 | #include "libsastrawi.h"
 8 | #include "sastrawi/stem_plural.h"
 9 | #include "sastrawi/stem_singular.h"
10 | 
/* Writes the literal text "mohan" to standard output, without a newline. */
void print_my_name()
{
  fputs("mohan", stdout);
}
15 | 


--------------------------------------------------------------------------------
/src/sastrawi/stem_plural.h:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <string.h>
 3 | #include <stdlib.h>
 4 | 
 5 | //Sastrawi\Stemmer::isPlural
 6 | int is_plural(char *word);
 7 | 
 8 | int plural_parts(char *word, char **parts[]);
 9 | 
10 | //Sastrawi\Stemmer::stemPluralWord
11 | int stem_plural_word(char *word, char **stemmed_word);
12 | 


--------------------------------------------------------------------------------
/src/deps/strndup/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "strndup",
 3 |   "version": "0.0.1",
 4 |   "repo": "clibs/strndup",
 5 |   "description": "strndup implementation. Useful when unavailable on your platform.",
 6 |   "keywords": [ "string" ],
 7 |   "license": "public domain",
 8 |   "src": [
 9 |     "strndup.c",
10 |     "strndup.h"
11 |   ]
12 | }
13 | 


--------------------------------------------------------------------------------
/src/deps/strndup/strndup.c:
--------------------------------------------------------------------------------
 1 | #ifndef HAVE_STRNDUP
 2 | 
 3 | #include <stdlib.h>
 4 | #include <string.h>
 5 | 
 6 | char *strndup(const char *s, size_t n)
 7 | {
 8 |     char* new = malloc(n+1);
 9 |     if (new) {
10 |         strncpy(new, s, n);
11 |         new[n] = '\0';
12 |     }
13 |     return new;
14 | }
15 | 
16 | #endif /* HAVE_STRNDUP */
17 | 


--------------------------------------------------------------------------------
/tests/stem_singular_tests.h:
--------------------------------------------------------------------------------
1 | char *test_stem_singular_word();
2 | char *test_stem_singular_word_removes_plain_prefixes();
3 | char *test_stem_singular_word_removes_suffixes();
4 | char *test_stem_singular_word_removes_complex_prefixes_1(); 
5 | char *test_stem_singular_word_removes_complex_prefixes_2(); 
6 | char *test_stem_singular_word_removes_complex_prefixes_3(); 
7 | 


--------------------------------------------------------------------------------
/src/regex/preg.h:
--------------------------------------------------------------------------------
 1 | #ifndef _preg_replace_h
 2 | #define _preg_replace_h
 3 | 
 4 | #define PCRE2_CODE_UNIT_WIDTH 8
 5 | #include <pcre2.h>
 6 | 
 7 | char *preg_replace(char *pattern, char *replacement, char *subject);
 8 | 
 9 | int preg_match(char *pattern, char *subject, char **matches[]);
10 | 
11 | void free_matches(int matches_count, char **matches[]);
12 | #endif
13 | 


--------------------------------------------------------------------------------
/tests/runtests.sh:
--------------------------------------------------------------------------------
 1 | echo "Running unit tests:"
 2 | 
 3 | for i in tests/*_tests
 4 | do
 5 |   if test -f $i
 6 |   then
 7 |     if $VALGRIND ./$i 2>> tests/tests.log
 8 |     then
 9 |       echo $i PASS
10 |     else
11 |       echo "ERROR in test $i: here's tests/tests.log"
12 |       echo "------"
13 |       tail tests/tests.log
14 |       exit 1
15 |     fi
16 |   fi
17 | done
18 | 
19 | echo ""
20 | 


--------------------------------------------------------------------------------
/TODO:
--------------------------------------------------------------------------------
1 | - Add test case for mengira - should get kira, currently ira
2 | - mengeri => ngeri not keri
3 | - memproteksi, mempatroli - proteksi, patroli not protek, patrol
4 | - Check remove_prefixes_rule6 for cases where it stems but it's not in the dict
5 | - When no remove_prefixes rule matches, stem_singular returns an empty string; it should return the original string
6 | - remove_suffixes should return 1 if it's in the dict
7 | 


--------------------------------------------------------------------------------
/src/sastrawi/remove_suffixes.h:
--------------------------------------------------------------------------------
1 | void remove_suffixes(char *word, char **stemmed_word);
2 | int remove_inflectional_particle(char *word, char **stemmed_word, char **removed_part);
3 | int remove_possessive_pronoun(char *word, char **stemmed_word, char **removed_part);
4 | int remove_derivational_suffix(char *word, char **stemmed_word, char **removed_part);
5 | int remove_suffix(char *suffixes, char *word, char **stemmed_word, char **removed_part);
6 | 


--------------------------------------------------------------------------------
/test.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | $plural = "malikat-malaikat-nya";
 3 | preg_match('/^(.*)-(.*)$/', $plural, $words);
 4 | 
 5 | if (!isset($words[1]) || !isset($words[2])) {
 6 |     return $plural;
 7 | }
 8 | 
 9 | // malaikat-malaikat-nya -> malaikat malaikat-nya
10 | print_r($words);
11 | $suffix = $words[2];
12 | if (in_array($suffix, array('ku', 'mu', 'nya', 'lah', 'kah', 'tah', 'pun')) &&
13 |     preg_match('/^(.*)-(.*)$/', $words[1], $words)) {
14 |     $words[2] .= '-' . $suffix;
15 | }
16 | 
17 | print "blah \n";
18 | print_r($words);
19 | print "blah2 \n";
20 | 


--------------------------------------------------------------------------------
/tests/remove_suffixes_tests.h:
--------------------------------------------------------------------------------
 1 | #ifdef __linux
 2 |   #define _GNU_SOURCE 
 3 | #endif
 4 | #include "minunit.h"
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <string.h>
 8 | #include "../sastrawi/remove_suffixes.h"
 9 | #include "../dbg.h"
10 | 
11 | char *test_remove_inflectional_particle_with_dash();
12 | char *test_remove_inflectional_particle_without_dash();
13 | char *test_remove_inflectional_particle_no_match();
14 | char *test_remove_possessive_pronoun_with_dash();
15 | char *test_remove_possessive_pronoun_without_dash();
16 | char *test_remove_derivational_suffix_with_dash();
17 | char *test_remove_derivational_suffix_without_dash();
18 | char *test_remove_suffixes();
19 | 


--------------------------------------------------------------------------------
/tests/remove_prefixes_tests.h:
--------------------------------------------------------------------------------
 1 | #ifdef __linux
 2 |   #define _GNU_SOURCE 
 3 | #endif
 4 | #include "minunit.h"
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <string.h>
 8 | #include "../sastrawi.h"
 9 | #include "../dbg.h"
10 | 
11 | char *test_remove_plain_prefix_di();
12 | char *test_remove_plain_prefix_ke(); 
13 | char *test_remove_plain_prefix_se();
14 | char *test_remove_complex_prefix_rule1_a();
15 | char *test_remove_complex_prefix_rule1_b();
16 | char *test_remove_complex_prefix_rule2();
17 | char *test_remove_complex_prefix_rule2_excludes_er();
18 | char *test_remove_complex_prefix_rule3_only_includes_er();
19 | char *test_remove_plain_prefix_returns_0_if_word_notin_dictionary();
20 | 


--------------------------------------------------------------------------------
/src/sastrawi/precedence_adjustment.c:
--------------------------------------------------------------------------------
 1 | #ifdef __linux
 2 |   #define _GNU_SOURCE
 3 | #endif
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string.h>
 7 | #include "../dbg.h"
 8 | 
 9 | 
/*
 * Reports whether a word has one of the prefix/suffix shapes checked below.
 *
 * Returns 1 for:
 *   - "be" ... "lah"   (word longer than 4 characters)
 *   - "be" ... "an"    (word longer than 3 characters)
 *   - "me" / "di" / "pe" / "ter" ... "i"   (word longer than 2 characters)
 * and 0 for everything else.
 */
int is_precedence_adjustment_satisfied(char *original_word)
{
  static const char *const i_suffix_prefixes[] = { "me", "di", "pe", "ter" };
  int word_length = strlen(original_word);
  const char *word_end = original_word + word_length; /* the terminating NUL */

  if (strncmp(original_word, "be", 2) == 0) {
    if (word_length > 4 && strcmp(word_end - 3, "lah") == 0) {
      return 1;
    }
    if (word_length > 3 && strcmp(word_end - 2, "an") == 0) {
      return 1;
    }
    return 0;
  }

  for (size_t k = 0; k < sizeof i_suffix_prefixes / sizeof i_suffix_prefixes[0]; k++) {
    const char *prefix = i_suffix_prefixes[k];

    if (strncmp(original_word, prefix, strlen(prefix)) == 0) {
      /* Every prefix in this group shares the same "-i" ending test. */
      return word_length > 2 && word_end[-1] == 'i';
    }
  }

  return 0;
}
37 | 


--------------------------------------------------------------------------------
/tests/minunit.h:
--------------------------------------------------------------------------------
 1 | #undef NDEBUG
 2 | #ifndef _minunit_h
 3 | #define _minunit_h
 4 | 
 5 | #include <stdio.h>
 6 | #include "dbg.h"
 7 | #include <stdlib.h>
 8 | 
 9 | #define mu_suite_start() char *message = NULL
10 | 
11 | 
12 | #define mu_assert(test, message) if(!(test)) { log_err(message); return message; }
13 | #define mu_run_test(test) debug("\n------%s", " " #test); \
14 |   message = test(); tests_run++; if (message) return message;
15 | 
16 | #define RUN_TESTS(name) int main(int argc, char *argv[]) {\
17 |   argc = 1; \
18 |   debug("----- RUNNING: %s", argv[0]);\
19 |     printf("-----\n RUNNING: %s\n", argv[0]);\
20 |     char *result = name();\
21 |     if(result != 0) { \
22 |       printf("FAILED: %s\n", result);\
23 |     }\
24 |     else {\
25 |       printf("ALL TESTS PASSED\n"); \
26 |     }\
27 |   printf("Tests run: %d\n", tests_run);\
28 |     exit(result != 0);\
29 | }
30 | 
31 | int tests_run;
32 | 
33 | 
34 | #endif
35 | 


--------------------------------------------------------------------------------
/Makefile.ori:
--------------------------------------------------------------------------------
 1 | CC = gcc
 2 | CFLAGS = -O3 -Wall -g -std=c99
 3 | INCLUDES = -I/usr/local/include
 4 | LFLAGS = -L/usr/local/lib
 5 | LIBS = -lpcre2-8
 6 | SRCS = tests/test_sastrawi.c tests/test_dictionary.c tests/test_stem_singular.c tests/test_stem_plural.c tests/test_remove_prefixes.c tests/test_remove_suffixes.c sastrawi.c sastrawi/stem_plural.c sastrawi/stem_singular.c sastrawi/remove_prefixes.c sastrawi/remove_suffixes.c sastrawi/text_util.c sastrawi/dictionary.c regex/preg.c
 7 | OBJS = $(SRCS:.c=.o)
 8 | MAIN = test_sastrawi
 9 | 
10 | .PHONY: depend clean
11 | 
12 | all:    $(MAIN)
13 | 				@echo test_sastrawi compiled!
14 | 
15 | $(MAIN): $(OBJS) 
16 | 				 $(CC) $(CFLAGS) $(INCLUDES) -o $(MAIN) $(OBJS) $(LFLAGS) $(LIBS)
17 | .c.o:
18 | 				$(CC) $(CFLAGS) $(INCLUDES) -c $<  -o $@
19 | 
20 | clean:
21 | 				$(RM) *.o *~ regex/*.o tests/*.o sastrawi/*.o $(MAIN)
22 | 
23 | depend: $(SRCS)
24 | 				makedepend $(INCLUDES) $^
25 | 
26 | # DO NOT DELETE THIS LINE -- make depend needs it
27 | 
28 | 


--------------------------------------------------------------------------------
/src/dbg.h:
--------------------------------------------------------------------------------
 1 | #ifndef __dbg_h__
 2 | #define __dbg_h__
 3 | 
 4 | #include <stdio.h>
 5 | #include <errno.h>
 6 | #include <string.h>
 7 | 
 8 | #ifdef NDEBUG
 9 | #define debug(M, ...)
10 | #else
11 | #define debug(M, ...) fprintf(stderr, "DEBUG %s:%d: " M "\n", __FILE__, __LINE__, ##__VA_ARGS__)
12 | #endif
13 | 
14 | #define clean_errno() (errno == 0 ? "None" : strerror(errno))
15 | 
16 | #define log_err(M, ...) fprintf(stderr, "[ERROR] (%s:%d: errno: %s) " M "\n", __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__)
17 | 
18 | #define log_warn(M, ...) fprintf(stderr, "[WARN] (%s:%d: errno: %s) " M "\n", __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__)
19 | 
20 | #define log_info(M, ...) fprintf(stderr, "[INFO] (%s:%d) " M "\n", __FILE__, __LINE__, ##__VA_ARGS__)
21 | 
22 | #define check(A, M, ...) if(!(A)) { log_err(M, ##__VA_ARGS__); errno=0; goto error; }
23 | 
24 | #define sentinel(M, ...)  { log_err(M, ##__VA_ARGS__); errno=0; goto error; }
25 | 
26 | #define check_mem(A) check((A), "Out of memory.")
27 | 
28 | #define check_debug(A, M, ...) if(!(A)) { debug(M, ##__VA_ARGS__); errno=0; goto error; }
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/src/sastrawi/stem_singular.c:
--------------------------------------------------------------------------------
 1 | #ifdef __linux
 2 |   #define _GNU_SOURCE
 3 | #endif
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string.h>
 7 | #include "dictionary.h"
 8 | #include "remove_suffixes.h"
 9 | #include "remove_prefixes.h"
10 | #include "stem_singular.h"
11 | #include "../dbg.h"
12 | 
/* precedence_adjustment.c has no header of its own, so declare it here. */
int is_precedence_adjustment_satisfied(char *original_word);

/*
 * Stems a single (non-plural) word.
 *
 * word:         the word to stem.
 * stemmed_word: receives a heap-allocated result that the caller must free.
 *
 * Returns 1 as soon as a candidate found in the dictionary is produced;
 * otherwise returns whatever the final remove_prefixes() call returns.
 *
 * Order of attempts:
 *   1. the word itself is already in the dictionary;
 *   2. when the precedence adjustment applies: prefixes first, then suffixes;
 *   3. suffixes first, then prefixes.
 */
int stem_singular_word(char *word, char **stemmed_word)
{
  //step 1: word already in dictionary
  if(dictionary_contains(word)) {
    (*stemmed_word) = strndup(word, strlen(word));
    return 1;
  }

  if(is_precedence_adjustment_satisfied(word)) {

    //in dict and done - return
    if(remove_prefixes(word, stemmed_word)) {
      return 1;
    }

    // Take over the intermediate result instead of copying it, and release it
    // once the suffix pass has produced its own string.
    char *prefixless = *stemmed_word;
    *stemmed_word = NULL;

    remove_suffixes(prefixless, stemmed_word);
    free(prefixless);

    if(dictionary_contains(*stemmed_word)) {
      return 1;
    }

    // Not a dictionary word: discard and fall through to the default order.
    free(*stemmed_word);
    *stemmed_word = NULL;
  }

  //step 2 & 3: remove suffixes
  remove_suffixes(word, stemmed_word);
  if(dictionary_contains(*stemmed_word)) {
    return 1;
  }

  char *suffixless = *stemmed_word;
  *stemmed_word = NULL;

  int rc = remove_prefixes(suffixless, stemmed_word);
  free(suffixless);

  return rc;
}
57 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | CFLAGS=-Wall -g -O2 -Wextra -Isrc -I/usr/local/include -DNDEBUG $(OPTFLAGS)
 2 | LDFLAGS=-lpcre2-8  -L/usr/local/lib $(OPTLIBS)
 3 | PREFIX?=/usr/local
 4 | 
 5 | SOURCES=$(wildcard src/**/*.c src/*.c tests/test_helper.c)
 6 | OBJECTS=$(patsubst %.c,%.o,$(SOURCES))
 7 | 
 8 | TEST_SRC=$(wildcard tests/*_tests.c)
 9 | TESTS=$(patsubst %.c,%,$(TEST_SRC))
10 | 
11 | TARGET=build/libsastrawi.a
12 | SO_TARGET=$(patsubst %.a,%.so,$(TARGET))
13 | 
14 | all: $(TARGET)  tests
15 | 
16 | dev: CFLAGS=-Wall -g -Isrc -Wall -Wextra $(OPTFLAGS)
17 | dev: all
18 | 
19 | $(TARGET): CFLAGS += -fPIC
20 | $(TARGET): build $(OBJECTS)
21 | 	ar rcs $@ $(OBJECTS)
22 | 	ranlib $@
23 | 
24 | # $(SO_TARGET): $(TARGET) $(OBJECTS)
25 | # 	$(CC) -shared -o $@ $(OBJECTS)
26 | #
27 | build:
28 | 	@mkdir -p build
29 | 	@mkdir -p bin
30 | 
31 | .PHONY: tests
32 | tests: LDLIBS += $(TARGET) tests/test_helper.o
33 | tests: $(TESTS)
34 | 	$(TESTS)
35 | 	sh ./tests/runtests.sh
36 | 
37 | valgrind: all
38 | 	VALGRIND="valgrind --log-file=/tmp/valgrind-%p.log"
39 | 
40 | clean:
41 | 	rm -rf build $(OBJECTS) $(TESTS)
42 | 	rm -f tests/tests.log
43 | 	find . -name "*.gc*" -exec rm {} \;
44 | 	rm -rf `find . -name "*.dSYM" -print`
45 | 
46 | install: all
47 | 	install -d $(DESTDIR)/$(PREFIX)/lib/
48 | 	install $(TARGET) $(DESTDIR)/$(PREFIX)/lib/
49 | 
50 | BADFUNCS='[^_.>a-zA-Z0-9](str(n?cpy|n?cat|xfrm|n?dup|str|prrk|tok|_)stpm?cpy|a?sn?printf|byte_)'
51 | check:
52 | 	@echo Files with potentially dangerous functions.
53 | 	@egrep $(BADFUNCS) $(SOURCES) || true
54 | 
55 | list:
56 | 	sh -c "$(MAKE) -p no_targets__ | awk -F':' '/^[a-zA-Z0-9][^\$$#\/\\t=]*:([^=]|$$)/ {split(\$$1,A,/ /);for(i in A)print A[i]}' | grep -v '__\$$' | sort"
57 | 


--------------------------------------------------------------------------------
/src/sastrawi/remove_suffixes.c:
--------------------------------------------------------------------------------
 1 | #ifdef __linux
 2 |   #define _GNU_SOURCE
 3 | #endif
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include "text_util.h"
 7 | #include "remove_suffixes.h"
 8 | #include "../dbg.h"
 9 | 
/*
 * Strips suffixes from `word` in three consecutive passes and stores the final
 * heap-allocated result in *stemmed_word (the caller frees it):
 *   2a. inflectional particle
 *   2b. possessive pronoun
 *   3.  derivational suffix
 * The removed pieces reported by each pass are discarded.
 */
void remove_suffixes(char *word, char **stemmed_word)
{
  char *without_particle = NULL;
  char *without_pronoun = NULL;
  char *dropped_particle = NULL;
  char *dropped_pronoun = NULL;
  char *dropped_derivation = NULL;

  //step 2a
  remove_inflectional_particle(word, &without_particle, &dropped_particle);

  //step 2b
  remove_possessive_pronoun(without_particle, &without_pronoun, &dropped_pronoun);

  //step 3
  remove_derivational_suffix(without_pronoun, stemmed_word, &dropped_derivation);

  // Only the final stem is handed back; release every intermediate string.
  free(dropped_particle);
  free(dropped_pronoun);
  free(dropped_derivation);
  free(without_particle);
  free(without_pronoun);
}
31 | 
32 | 
/* Removes a trailing "lah", "kah", "tah" or "pun"; see remove_suffix(). */
int remove_inflectional_particle(char *word, char **stemmed_word, char **removed_part)
{
  char *particles = "lah|kah|tah|pun";

  return remove_suffix(particles, word, stemmed_word, removed_part);
}
37 | 
/* Removes a trailing "ku", "mu" or "nya"; see remove_suffix(). */
int remove_possessive_pronoun(char *word, char **stemmed_word, char **removed_part)
{
  char *pronouns = "ku|mu|nya";

  return remove_suffix(pronouns, word, stemmed_word, removed_part);
}
42 | 
/* Removes a trailing "is", "isme", "isasi", "i", "kan" or "an"; see remove_suffix(). */
int remove_derivational_suffix(char *word, char **stemmed_word, char **removed_part)
{
  char *derivations = "is|isme|isasi|i|kan|an";

  return remove_suffix(derivations, word, stemmed_word, removed_part);
}
47 | 
/* Defined in text_util.c; declared here because text_util.h does not list it. */
int suffix_split_word(char *pattern, char *word, char **first_part, char **second_part);

/*
 * Splits `word` into a stem and one trailing suffix.
 *
 * suffixes:     regex alternation of candidate suffixes, e.g. "ku|mu|nya".
 * stemmed_word: receives the part before the suffix (heap-allocated).
 * removed_part: receives the suffix that was cut off (heap-allocated).
 *
 * The pattern allows an optional dash between stem and suffix. Returns the
 * result of suffix_split_word(). If the pattern cannot be allocated the
 * process exits with status 1, as the dictionary code does on out-of-memory.
 */
int remove_suffix(char *suffixes, char *word, char **stemmed_word, char **removed_part)
{
  int rc;
  char *pattern = NULL;

  int pattern_rc = asprintf(&pattern, "(\\w+?)-?(%s)$", suffixes);
  // On failure the contents of `pattern` are undefined, so it must not be used.
  check(pattern_rc != -1, "Cannot allocate memory for suffix pattern");

  rc = suffix_split_word(pattern, word, stemmed_word, removed_part);

  free(pattern);
  return rc;

error:
  exit(1);
}
61 | 


--------------------------------------------------------------------------------
/tests/dictionary_tests.c:
--------------------------------------------------------------------------------
 1 | #ifdef __linux
 2 |   #define _GNU_SOURCE 
 3 | #endif
 4 | #include "minunit.h"
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <string.h>
 8 | #include "libsastrawi.h"
 9 | #include "dbg.h"
10 | #include "dictionary_tests.h"
11 | 
/* dictionary_load() reports success for an existing file and failure for a missing one. */
char *test_dictionary_load()
{
  int rc;
  char *path = NULL;

  // dictionary_fullpath() returns a heap string; free it before asserting,
  // because mu_assert returns early on failure.
  path = dictionary_fullpath("tests/test_dict.txt");
  rc = dictionary_load(path);
  free(path);
  mu_assert(rc, "when test_dict exists return truthy");

  path = dictionary_fullpath("tests/test_not_exists.txt");
  rc = dictionary_load(path);
  free(path);
  mu_assert(!rc, "when the dict file does not exist it should return falsy");

  return NULL;
}
24 | 
/* Words from tests/test_dict.txt are found; unknown words are not. */
char *test_dictionary_contains()
{
  // The path is heap-allocated by dictionary_fullpath(); release it after use.
  char *path = dictionary_fullpath("tests/test_dict.txt");
  dictionary_load(path);
  free(path);

  mu_assert(dictionary_contains("aba"), "test dict contains aba");
  mu_assert(!dictionary_contains("non-existent"), "test dict does not contain non-existent");

  return NULL;
}
33 | 
/* dictionary_add() makes a word findable and does not grow the count for a known word. */
char *test_dictionary_add()
{
  dictionary_add("nonexistent");

  mu_assert(dictionary_contains("nonexistent"), "dict should contain nonexistent");
  mu_assert(!dictionary_contains("nonexistent2"), "dict should not contain nonexistent2");

  // NOTE(review): presumably "bola" is already present via the word list that
  // all_tests() loads -- confirm against data/kata-dasar.txt.
  int entries_before = dictionary_count();
  dictionary_add("bola");
  int entries_after = dictionary_count();
  mu_assert(entries_before == entries_after, "dictionary_add ensures that entries are unique");

  return NULL;
}
48 | 
49 | char *all_tests()
50 | {
51 |   mu_suite_start();
52 | 
53 |   dictionary_load(dictionary_fullpath("data/kata-dasar.txt"));
54 | 
55 |   mu_run_test(test_dictionary_load);
56 |   mu_run_test(test_dictionary_add);
57 |   mu_run_test(test_dictionary_contains);
58 | 
59 |   return NULL;
60 | }
61 | 
62 | RUN_TESTS(all_tests);
63 | 


--------------------------------------------------------------------------------
/src/sastrawi/text_util.c:
--------------------------------------------------------------------------------
 1 | #include <string.h>
 2 | #include "../regex/preg.h"
 3 | int prefix_split_word(char *pattern, char *word, char **first_part, char **second_part)
 4 | {
 5 |   char **matches = NULL;
 6 |   int rc = 0;
 7 | 
 8 |   int match_count = preg_match(pattern, word, &matches);
 9 | 
10 |   if(match_count == 3) {
11 |     (*first_part) = strndup(matches[1], strlen(matches[1]));
12 |     (*second_part) = strndup(matches[2], strlen(matches[2]));
13 |     rc = 1;
14 |     free_matches(match_count, &matches);
15 |   } else {
16 |     (*first_part) = strndup("", 0);
17 |     (*second_part) = strndup(word, strlen(word));
18 |   }
19 | 
20 |   return rc;
21 | }
22 | 
/*
 * Splits `word` with a two-group regex.
 *
 * On a match (preg_match reports 3 entries -- presumably the whole match plus
 * two capture groups), *first_part and *second_part receive copies of entries
 * 1 and 2 and the function returns 1.
 * Otherwise *first_part is a copy of the whole word, *second_part is an empty
 * string, and the function returns 0.
 * Both outputs are heap-allocated; the caller frees them.
 */
int suffix_split_word(char *pattern, char *word, char **first_part, char **second_part)
{
  char **groups = NULL;
  int group_count = preg_match(pattern, word, &groups);

  if(group_count != 3) {
    (*first_part) = strndup(word, strlen(word));
    (*second_part) = strndup("", 0);
    return 0;
  }

  (*first_part) = strndup(groups[1], strlen(groups[1]));
  (*second_part) = strndup(groups[2], strlen(groups[2]));
  free_matches(group_count, &groups);

  return 1;
}
/*
 * Splits `word` with a three-group regex.
 *
 * When preg_match reports exactly 4 entries, the three output parameters
 * receive heap-allocated copies of entries 1..3 and the function returns 1.
 * In every other case it returns 0 and leaves the outputs untouched.
 */
int split_word3(char *pattern, char *word, char **first_part, char **second_part, char **third_part)
{
  char **groups = NULL;
  int group_count = preg_match(pattern, word, &groups);

  if(group_count != 4) {
    return 0;
  }

  (*first_part) = strndup(groups[1], strlen(groups[1]));
  (*second_part) = strndup(groups[2], strlen(groups[2]));
  (*third_part) = strndup(groups[3], strlen(groups[3]));
  free_matches(group_count, &groups);

  return 1;
}
60 | 


--------------------------------------------------------------------------------
/src/sastrawi/remove_prefixes.h:
--------------------------------------------------------------------------------
 1 | typedef int (*PREFIX_REMOVER)(char *word, char **stemmed_word, char **removed_part);
 2 | 
 3 | int remove_prefixes(char *word, char **stemmed_word);
 4 | int remove_plain_prefix(char *word, char **stemmed_word, char **removed_part);
 5 | int remove_complex_prefix_rule1(char *word, char **stemmed_word, char **removed_part);
 6 | int remove_complex_prefix_rule2(char *word, char **stemmed_word, char **removed_part);
 7 | int remove_complex_prefix_rule3(char *word, char **stemmed_word, char **removed_part);
 8 | int remove_complex_prefix_rule4(char *word, char **stemmed_word, char **removed_part);
 9 | int remove_complex_prefix_rule5(char *word, char **stemmed_word, char **removed_part);
10 | int remove_complex_prefix_rule6(char *word, char **stemmed_word, char **removed_part);
11 | int remove_complex_prefix_rule7(char *word, char **stemmed_word, char **removed_part);
12 | int remove_complex_prefix_rule8(char *word, char **stemmed_word, char **removed_part);
13 | int remove_complex_prefix_rule9(char *word, char **stemmed_word, char **removed_part);
14 | int remove_complex_prefix_rule10(char *word, char **stemmed_word, char **removed_part);
15 | int remove_complex_prefix_rule11(char *word, char **stemmed_word, char **removed_part);
16 | int remove_complex_prefix_rule12(char *word, char **stemmed_word, char **removed_part);
17 | int remove_complex_prefix_rule13(char *word, char **stemmed_word, char **removed_part);
18 | int remove_complex_prefix_rule14(char *word, char **stemmed_word, char **removed_part);
19 | int remove_complex_prefix_rule15(char *word, char **stemmed_word, char **removed_part);
20 | int remove_complex_prefix_rule16(char *word, char **stemmed_word, char **removed_part);
21 | int remove_complex_prefix_rule17(char *word, char **stemmed_word, char **removed_part);
22 | int remove_complex_prefix_rule18(char *word, char **stemmed_word, char **removed_part);
23 | int remove_complex_prefix_rule19(char *word, char **stemmed_word, char **removed_part);
24 | int remove_complex_prefix_rule20(char *word, char **stemmed_word, char **removed_part);
25 | 


--------------------------------------------------------------------------------
/src/sastrawi/dictionary.c:
--------------------------------------------------------------------------------
 1 | #ifdef __linux
 2 |   #define _GNU_SOURCE 
 3 | #endif
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <string.h>
 7 | #include <unistd.h>
 8 | #include "../uthash/uthash.h"
 9 | #include "dictionary.h"
10 | #include "../dbg.h"
11 | 
12 | struct dict_entry {
13 |   char *word;
14 |   UT_hash_handle hh;
15 | };
16 | 
17 | struct dict_entry *dict;
18 | 
/*
 * Strips one trailing line ending from the string *word in place.
 *
 * word:   address of the string pointer (the pointer itself is not changed).
 * length: number of characters currently in the string.
 *
 * Handles both "\n" and "\r\n"; the dictionary file is opened in binary mode,
 * so a carriage return would otherwise stay attached to the word. A
 * non-positive length or a NULL pointer is ignored rather than indexing
 * before the start of the buffer.
 */
void remove_newline(char **word, int length) {
  if(word == NULL || *word == NULL || length <= 0) {
    return;
  }

  char *line = *word;

  if(line[length-1] == '\n') {
    line[--length] = '\0';

    if(length > 0 && line[length-1] == '\r') {
      line[length-1] = '\0';
    }
  }
}
24 | 
/*
 * Builds "<current working directory>/<relative_path>".
 *
 * Returns a heap-allocated string that the caller must free().
 * Exits the process with status 1 when the working directory cannot be
 * obtained or the result cannot be allocated.
 */
char *dictionary_fullpath(char *relative_path) 
{
  char *cwd = NULL;
  char *full_path = NULL;
  int rc;

  // getcwd(NULL, 0) allocates the buffer itself; it has to be checked
  // (NULL must not reach "%s") and released afterwards.
  cwd = getcwd(NULL, 0);
  check(cwd != NULL, "Cannot determine the current working directory");

  rc = asprintf(&full_path, "%s/%s", cwd, relative_path);
  free(cwd);
  check(rc != -1, "Cannot allocate memory");

  return full_path;
error:
  exit(1);
}
37 | 
/*
 * Reads the file at dict_path line by line and adds every line to the
 * dictionary (after stripping its line ending).
 *
 * Returns 1 on success and 0 when the file cannot be opened.
 */
int dictionary_load(char *dict_path)
{
  FILE *dict_file = NULL;
  char *line = NULL;
  size_t linecap =  0;
  ssize_t linelen;

  dict_file = fopen(dict_path, "rb");
  check(dict_file, "Failed to open %s", dict_path);

  // One buffer is reused for every line: dictionary_add() stores its own copy
  // of the word, and getline() grows the buffer when a longer line appears.
  while((linelen = getline(&line, &linecap, dict_file)) > 0) {
    remove_newline(&line, linelen);
    dictionary_add(line);
  }

  // getline() may leave a buffer allocated even on the call that hits EOF.
  free(line);
  fclose(dict_file);

  return 1;
error:
  if(dict_file) fclose(dict_file);
  free(line);
  return 0;
}
63 | 
64 | int dictionary_add(char *word)
65 | {
66 |   if(!dictionary_contains(word)) {
67 |     struct dict_entry *dict_word = NULL;
68 |     dict_word = malloc(sizeof(struct dict_entry));
69 |     check_mem(dict_word);
70 | 
71 |     dict_word->word = strndup(word, strlen(word));
72 |     HASH_ADD_KEYPTR(hh, dict, dict_word->word, strlen(dict_word->word), dict_word);
73 |   }
74 |   return 1;
75 | 
76 | error:
77 |   log_err("Failed to  allocate memory for dictionary entry");
78 |   exit(1);
79 | }
80 | 
81 | int dictionary_count() 
82 | {
83 |   return HASH_COUNT(dict);
84 | }
85 | 
86 | int dictionary_contains(char *word)
87 | {
88 |   struct dict_entry *dict_word = NULL;
89 | 
90 |   HASH_FIND_STR(dict, word, dict_word);
91 | 
92 |   if(dict_word == NULL) 
93 |     return 0;
94 |   else
95 |     return 1;
96 | }
97 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # cSastrawi
 2 | 
 3 | ## Introduction
 4 | 
 5 | Bahasa Indonesia is one of the most spoken languages in the world.
 6 | [Stemming](https://en.wikipedia.org/wiki/Stemming) is important for many fields of computer science, from text search to machine learning.
 7 | This is an attempt at porting the high quality [PHP based Bahasa Indonesian stemmer - Sastrawi](http://github.com/sastrawi/sastrawi) by [Andy Librian](https://github.com/andylibrian) to the C programming language.
 8 | 
 9 | ## Why a port to C?
10 | 
11 | - Because it will allow for a more direct integration with PostgreSQL via its full text search dictionary support.
12 | - We would be able to write bindings for most languages that provide ways to wrap C libraries
13 | - Because we can :)
14 | 
15 | ## Caveat emptor
16 | 
17 | - This is still super early code. Pretty much useless for anyone who actually wants something to use today. For that please look at PHP Sastrawi instead. I am putting this out here so that people who are interested/smarter than me can get involved early if they are so inclined.
18 | - I am learning C as I go along so apologies for the crappy code and lack of proper setup.
19 | 
20 | 
21 | ## Installation
22 | 
23 | ### Mac OS X
24 | 
25 | 1. Install PCRE2 via homebrew <pre>
26 | brew install pcre2
27 | </pre>
28 | 
29 | ### Linux (tested on Ubuntu 14.04) 
30 | 
31 | 1. Download PCRE2 library from SourceForge  http://sourceforge.net/projects/pcre/files/latest/download?source=files
32 | 2. Uncompress and install <pre>
33 | $ tar -xvjf pcre2-10.20.tar.bz2
34 | $ ./configure --enable-jit --prefix=/usr
35 | $ make
36 | $ make install
37 | </pre>
38 | 
39 | ## Run tests
40 | 
41 | 1. For now just run `make` <br/><pre>
42 | ± |master ✓| → make
43 | gcc -O3 -Wall -g -I/usr/local/include -c tests/test_sastrawi.c  -o tests/test_sastrawi.o
44 | gcc -O3 -Wall -g -I/usr/local/include -c sastrawi.c  -o sastrawi.o
45 | gcc -O3 -Wall -g -I/usr/local/include -c regex/preg.c  -o regex/preg.o
46 | gcc -O3 -Wall -g -I/usr/local/include -o test_sastrawi tests/test_sastrawi.o sastrawi.o regex/preg.o -L/usr/local/lib -lpcre2-8
47 | test_sastrawi compiled!</pre>
48 | 1. A file called `test_sastrawi` will be created in the same folder. When you run it you should see something like <br>
49 | <pre>
50 | DEBUG tests/test_sastrawi.c:55: ----- RUNNING: ./test_sastrawi
51 | -----
52 |  RUNNING: ./test_sastrawi
53 | DEBUG tests/test_sastrawi.c:49:
54 | ------ test_is_plural
55 | DEBUG tests/test_sastrawi.c:50:
56 | ------ test_plural_parts
57 | ALL TESTS PASSED
58 | Tests run: 2
59 | </pre>
60 | 
61 | 
62 | 


--------------------------------------------------------------------------------
/src/sastrawi/stem_plural.c:
--------------------------------------------------------------------------------
  1 | #ifdef __linux
  2 |   #define _GNU_SOURCE 
  3 | #endif
  4 | #include <stdio.h>
  5 | #include <stdlib.h>
  6 | #include <string.h>
  7 | #include "stem_plural.h"
  8 | #include "stem_singular.h"
  9 | #include "../regex/preg.h"
 10 | #include "../dbg.h"
 11 | 
 12 | int is_plural(char *word)
 13 | {
 14 |   char **matches;
 15 | 
 16 |   int matches_count, dash_count;
 17 |     
 18 |   matches_count = preg_match("^(.*)-(ku|mu|nya)$", word, &matches);
 19 | 
 20 |   if(matches_count > 0) {
 21 |     dash_count = strchr(matches[1], '-') != NULL;
 22 |     free_matches(matches_count, &matches);
 23 |   } else {
 24 |     dash_count = strchr(word, '-') != NULL;
 25 |   }
 26 | 
 27 |   return dash_count;
 28 | 
 29 | }
 30 | 
 31 | int plural_parts(char *word, char **parts[])
 32 | {
 33 |   char **matches;
 34 |   int matches_count, parts_count, rc;
 35 | 
 36 |   matches_count = preg_match("^(.*)-(.*)-(ku|mu|nya)$", word, &matches);
 37 | 
 38 |   if(matches_count < 0) {
 39 |     matches_count = preg_match("^(.*)-(.*)$", word, &matches);
 40 |   }
 41 | 
 42 |   if(matches_count>0) {
 43 | 
 44 |     char *second_part;
 45 | 
 46 |     if(matches_count == 4) {
 47 |       rc = asprintf(&second_part, "%s-%s",matches[2], matches[3]);
 48 |       check_debug(rc != -1, "Cannot allocate memory");
 49 |     } else {
 50 |       second_part = strndup(matches[2], strlen(matches[2]));
 51 |     }
 52 | 
 53 |     *parts = malloc(2 * sizeof(char*));
 54 |     (*parts)[0] = strndup(matches[1], strlen(matches[1]));
 55 |     (*parts)[1] = second_part;
 56 | 
 57 |     parts_count = 2;
 58 | 
 59 |     free_matches(matches_count, &matches);
 60 |   } else {
 61 |     *parts = malloc(1 * sizeof(char*));
 62 |     (*parts)[0] = strndup(word, strlen(word));
 63 |     parts_count = 1;
 64 |   }
 65 | 
 66 | 
 67 |   return parts_count;
 68 | error:
 69 |   exit(1);
 70 | }
 71 | 
 72 | 
 73 | int stem_plural_word(char *word, char **stemmed_word)
 74 | {
 75 | 
 76 |   char **word_parts = NULL;
 77 |   char *root_word0 = NULL;
 78 |   char *root_word1 = NULL;
 79 | 
 80 |   int rc = plural_parts(word, &word_parts);
 81 | 
 82 |   stem_singular_word(word_parts[0], &root_word0);
 83 |   stem_singular_word(word_parts[1], &root_word1);
 84 | 
 85 | 
 86 |   debug("word parts %s => %s, %s => %s", word_parts[0], root_word0, word_parts[1], root_word1);
 87 | 
 88 |   if(strcmp(root_word0, root_word1) == 0) {
 89 |     (*stemmed_word) = strndup(word_parts[0], strlen(word_parts[0]));
 90 |   } else {
 91 |     (*stemmed_word) = strndup(word, strlen(word));
 92 |   }
 93 | 
 94 |   free_matches(rc, &word_parts);
 95 |   free(root_word0);
 96 |   free(root_word1);
 97 | 
 98 |   return 1;
 99 | }
100 | 


--------------------------------------------------------------------------------
/tests/precedence_adjustment_tests.c:
--------------------------------------------------------------------------------
 1 | #ifdef __linux
 2 |   #define _GNU_SOURCE 
 3 | #endif
 4 | #include "minunit.h"
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <string.h>
 8 | #include "libsastrawi.h"
 9 | #include "dbg.h"
10 | 
/* "belilah" must be reported as satisfying the precedence adjustment. */
char *test_precendence_adjustment_satisfied_be_lah()
{
  char *input = "belilah";
  int satisfied = is_precedence_adjustment_satisfied(input);

  debug("word: %s", input);
  mu_assert(satisfied == 1, "should be satisfied");

  return NULL;
}
19 | 
/* "belaan" must be reported as satisfying the precedence adjustment. */
char *test_precendence_adjustment_satisfied_be_an()
{
  char *input = "belaan";
  int satisfied = is_precedence_adjustment_satisfied(input);

  debug("word: %s", input);
  mu_assert(satisfied == 1, "should be satisfied");

  return NULL;
}
28 | 
/* "merangkumi" must be reported as satisfying the precedence adjustment. */
char *test_precendence_adjustment_satisfied_me_i()
{
  char *input = "merangkumi";
  int satisfied = is_precedence_adjustment_satisfied(input);

  debug("word: %s", input);
  mu_assert(satisfied == 1, "should be satisfied");

  return NULL;
}
37 | 
/* "dikahwini" must be reported as satisfying the precedence adjustment. */
char *test_precendence_adjustment_satisfied_di_i()
{
  char *input = "dikahwini";
  int satisfied = is_precedence_adjustment_satisfied(input);

  debug("word: %s", input);
  mu_assert(satisfied == 1, "should be satisfied");

  return NULL;
}
46 | 
/* "penyanyi" must be reported as satisfying the precedence adjustment. */
char *test_precendence_adjustment_satisfied_pe_i()
{
  char *input = "penyanyi";
  int satisfied = is_precedence_adjustment_satisfied(input);

  debug("word: %s", input);
  mu_assert(satisfied == 1, "should be satisfied");

  return NULL;
}
55 | 
/* "terkini" must be reported as satisfying the precedence adjustment. */
char *test_precendence_adjustment_satisfied_ter_i()
{
  char *input = "terkini";
  int satisfied = is_precedence_adjustment_satisfied(input);

  debug("word: %s", input);
  mu_assert(satisfied == 1, "should be satisfied");

  return NULL;
}
64 | 
/* "terjunam" must NOT be reported as satisfying the precedence adjustment. */
char *test_precendence_adjustment_not_satisfied()
{
  char *input = "terjunam";
  int satisfied = is_precedence_adjustment_satisfied(input);

  debug("word: %s", input);
  mu_assert(satisfied == 0, "should not be satisfied");

  return NULL;
}
73 | 
/*
 * Suite entry point: runs every precedence-adjustment test in order and
 * returns NULL at the end. mu_suite_start() and mu_run_test() are minunit
 * macros (see minunit.h).
 */
char *all_tests()
{
  mu_suite_start();
  /* Words expected to satisfy the adjustment. */
  mu_run_test(test_precendence_adjustment_satisfied_be_lah);
  mu_run_test(test_precendence_adjustment_satisfied_be_an);
  mu_run_test(test_precendence_adjustment_satisfied_me_i);
  mu_run_test(test_precendence_adjustment_satisfied_di_i);
  mu_run_test(test_precendence_adjustment_satisfied_pe_i);
  mu_run_test(test_precendence_adjustment_satisfied_ter_i);
  /* Word expected not to satisfy it. */
  mu_run_test(test_precendence_adjustment_not_satisfied);
  return NULL;
}

/* Hands the suite to the minunit runner macro. */
RUN_TESTS(all_tests);
88 | 


--------------------------------------------------------------------------------
/tests/stem_plural_tests.c:
--------------------------------------------------------------------------------
 1 | #ifdef __linux
 2 |   #define _GNU_SOURCE 
 3 | #endif
 4 | #include "minunit.h"
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | #include <string.h>
 8 | #include "libsastrawi.h"
 9 | #include "dbg.h"
10 | #include "stem_plural_tests.h"
11 | #include "test_helper.h"
12 | 
13 | 
/* is_plural() on "hati-ku", "test2" (both not plural) and "hati-hati" (plural). */
char *test_is_plural()
{
  int plural;

  plural = is_plural("hati-ku");
  mu_assert(!plural, "hati-ku is not plural");

  plural = is_plural("test2");
  mu_assert(!plural, "test2 is not plural");

  plural = is_plural("hati-hati");
  mu_assert(plural, "hati-hati is plural");

  return NULL;
}
23 | 
/* plural_parts() on a word with no dash, one dash, and a dash plus "-ku". */
char *test_plural_parts() {
  char **pieces = NULL;
  int count;

  /* No dash: the word itself is the only part. */
  count = plural_parts("beli", &pieces);
  mu_assert(count == 1, "beli has 1 part");
  mu_assert(strcmp("beli", pieces[0]) == 0, "beli is returned in the parts");
  free_parts(count, &pieces);

  /* One dash: split into both halves. */
  count = plural_parts("beli-beli", &pieces);
  mu_assert(count == 2, "beli-beli has 2 parts");
  mu_assert(strcmp("beli", pieces[0]) == 0, "beli-beli has 2 parts");
  mu_assert(strcmp("beli", pieces[1]) == 0, "beli-beli has 2 parts");
  free_parts(count, &pieces);

  /* Trailing "-ku" stays attached to the second part. */
  count = plural_parts("beli-beli-ku", &pieces);
  mu_assert(count == 2, "beli-beli-ku has 2 parts");
  mu_assert(strcmp("beli", pieces[0]) == 0, "For beli-beli-ku, first part should be beli");
  mu_assert(strcmp("beli-ku", pieces[1]) == 0, "For beli-beli-ku, second part should be beli-ku");
  free_parts(count, &pieces);

  return NULL;
}
50 | 
/* "malaikat-malaikat": both halves are the same root word, so the stem is that root. */
char *test_stem_plural_word_when_both_words_are_root_words_and_the_same() 
{
  char *word = "malaikat-malaikat";
  char *stemmed_word = NULL;

  int rc = stem_plural_word(word, &stemmed_word);
  mu_assert(rc == 1, "stem_plural_word should report success");
  /* Guard before strcmp so a missing result fails the test instead of crashing it. */
  mu_assert(stemmed_word != NULL, "stem_plural_word should produce a result");
  mu_assert(strcmp("malaikat", stemmed_word) == 0, "it stems to malaikat");
  free(stemmed_word);

  /* TODO: "berlari-lari" is expected to stem to "lari"; that case was
   * disabled in this test and is still not exercised. */

  return NULL;
}
70 | 
/* "malaikat-malaikatnya": the second half carries a suffix but shares the root. */
char *test_stem_plural_word_when_one_word_has_suffixes() 
{
  char *word = "malaikat-malaikatnya";
  char *stemmed_word = NULL;

  int rc = stem_plural_word(word, &stemmed_word);
  mu_assert(rc == 1, "stem_plural_word should report success");
  /* Guard before strcmp so a missing result fails the test instead of crashing it. */
  mu_assert(stemmed_word != NULL, "stem_plural_word should produce a result");
  mu_assert(strcmp("malaikat", stemmed_word) == 0, "it stems to malaikat");
  free(stemmed_word);

  return NULL;
}
81 | 
82 | 
83 | //TODO - create a test with berlarikah, to test return suffix
84 | char *all_tests()
85 | {
86 |   mu_suite_start();
87 | 
88 |   dictionary_load(dictionary_fullpath("data/kata-dasar.txt"));
89 | 
90 |   mu_run_test(test_is_plural);
91 |   mu_run_test(test_plural_parts);
92 |   mu_run_test(test_stem_plural_word_when_both_words_are_root_words_and_the_same);
93 |   mu_run_test(test_stem_plural_word_when_one_word_has_suffixes);
94 | 
95 |   return NULL;
96 | }
97 | 
98 | RUN_TESTS(all_tests);
99 | 


--------------------------------------------------------------------------------
/tests/remove_suffixes_tests.c:
--------------------------------------------------------------------------------
  1 | #ifdef __linux
  2 |   #define _GNU_SOURCE 
  3 | #endif
  4 | #include "minunit.h"
  5 | #include <stdio.h>
  6 | #include <stdlib.h>
  7 | #include <string.h>
  8 | #include "libsastrawi.h"
  9 | #include "dbg.h"
 10 | 
 11 | char *test_remove_inflectional_particle_with_dash() 
 12 | {
 13 |   char *stemmed_word = NULL; 
 14 |   char *removed_part = NULL;
 15 | 
 16 |   int rc = remove_inflectional_particle("penting-kah", &stemmed_word, &removed_part);
 17 |   mu_assert(rc, "successfully stems");
 18 |   mu_assert(strcmp("penting", stemmed_word) == 0, "we expect 'penting' as the stemmed word");
 19 |   mu_assert(strcmp("kah", removed_part) == 0, "we expect 'kah' as the removed part");
 20 | 
 21 |   return NULL;
 22 | }
 23 | 
 24 | 
 25 | 
 26 | char *test_remove_inflectional_particle_without_dash() 
 27 | {
 28 |   char *stemmed_word = NULL; 
 29 |   char *removed_part = NULL;
 30 | 
 31 |   int rc = remove_inflectional_particle("pentingkah", &stemmed_word, &removed_part);
 32 |   mu_assert(rc, "successfully stems");
 33 |   mu_assert(strcmp("penting", stemmed_word) == 0, "we expect 'penting' as the stemmed word");
 34 |   mu_assert(strcmp("kah", removed_part) == 0, "we expect 'kah' as the removed part");
 35 | 
 36 |   return NULL;
 37 | }
 38 | 
 39 | char *test_remove_inflectional_particle_no_match() 
 40 | {
 41 |   char *stemmed_word = NULL; 
 42 |   char *removed_part = NULL;
 43 | 
 44 |   int rc = remove_inflectional_particle("penting", &stemmed_word, &removed_part);
 45 |   mu_assert(!rc, "fails stem");
 46 |   mu_assert(strcmp("penting", stemmed_word) == 0, "we expect no change in the word passed in");
 47 |   mu_assert(strcmp("", removed_part) == 0, "we expect empty string in the removed_part");
 48 | 
 49 |   return NULL;
 50 | }
 51 | 
 52 | char *test_remove_possessive_pronoun_with_dash() 
 53 | {
 54 |   char *stemmed_word = NULL; 
 55 |   char *removed_part = NULL;
 56 | 
 57 |   int rc = remove_possessive_pronoun("cinta-ku", &stemmed_word, &removed_part);
 58 | 
 59 |   mu_assert(rc, "successfully stems");
 60 |   mu_assert(strcmp("cinta", stemmed_word) == 0, "we expect 'cinta' as the stemmed word");
 61 |   mu_assert(strcmp("ku", removed_part) == 0, "we expect 'ku' as the removed part");
 62 | 
 63 |   return NULL;
 64 | }
 65 | 
 66 | char *test_remove_possessive_pronoun_without_dash() 
 67 | {
 68 |   char *stemmed_word = NULL; 
 69 |   char *removed_part = NULL;
 70 | 
 71 |   int rc = remove_possessive_pronoun("cintaku", &stemmed_word, &removed_part);
 72 |   mu_assert(rc, "successfully stems");
 73 |   mu_assert(strcmp("cinta", stemmed_word) == 0, "we expect 'cinta' as the stemmed word");
 74 |   mu_assert(strcmp("ku", removed_part) == 0, "we expect 'ku' as the removed part");
 75 | 
 76 |   return NULL;
 77 | }
 78 | 
 79 | char *test_remove_derivational_suffix_with_dash() 
 80 | {
 81 |   char *stemmed_word = NULL; 
 82 |   char *removed_part = NULL;
 83 | 
 84 |   int rc = remove_derivational_suffix("cinta-kan", &stemmed_word, &removed_part);
 85 | 
 86 |   mu_assert(rc, "successfully stems");
 87 |   mu_assert(strcmp("cinta", stemmed_word) == 0, "we expect 'cinta' as the stemmed word");
 88 |   mu_assert(strcmp("kan", removed_part) == 0, "we expect 'kan' as the removed part");
 89 | 
 90 |   return NULL;
 91 | }
 92 | 
 93 | char *test_remove_derivational_suffix_without_dash() 
 94 | {
 95 |   char *stemmed_word = NULL; 
 96 |   char *removed_part = NULL;
 97 | 
 98 |   int rc = remove_derivational_suffix("cintakan", &stemmed_word, &removed_part);
 99 |   mu_assert(rc, "successfully stems");
100 |   mu_assert(strcmp("cinta", stemmed_word) == 0, "we expect 'cinta' as the stemmed word");
101 |   mu_assert(strcmp("kan", removed_part) == 0, "we expect 'kan' as the removed part");
102 | 
103 |   return NULL;
104 | }
105 | 
106 | 
107 | 
/* remove_suffixes() on "bajumukah" is expected to leave "baju". */
char *test_remove_suffixes() 
{
  char *input = "bajumukah";
  char *result = NULL;

  remove_suffixes(input, &result);
  debug("stem word: %s, expected: baju, actual: %s", input, result);

  mu_assert(strcmp("baju", result) == 0, "it stems to baju");

  free(result);
  return NULL;
}
119 | 
120 | char *all_tests()
121 | {
122 |   mu_suite_start();
123 | 
124 |   dictionary_load(dictionary_fullpath("data/kata-dasar.txt"));
125 | 
126 |   mu_run_test(test_remove_inflectional_particle_with_dash);
127 |   mu_run_test(test_remove_inflectional_particle_without_dash);
128 |   mu_run_test(test_remove_inflectional_particle_no_match);
129 | 
130 |   mu_run_test(test_remove_possessive_pronoun_with_dash);
131 |   mu_run_test(test_remove_possessive_pronoun_without_dash);
132 | 
133 |   mu_run_test(test_remove_derivational_suffix_with_dash);
134 |   mu_run_test(test_remove_derivational_suffix_without_dash);
135 | 
136 |   mu_run_test(test_remove_suffixes);
137 | 
138 |   return NULL;
139 | }
140 | 
141 | RUN_TESTS(all_tests);
142 | 


--------------------------------------------------------------------------------
/src/regex/preg.c:
--------------------------------------------------------------------------------
  1 | #define PCRE2_STATIC
  2 | #define PCRE2_CODE_UNIT_WIDTH 8
  3 | 
  4 | #include <stdio.h>
  5 | #include <string.h>
  6 | #include <pcre2.h>
  7 | #include "../uthash/uthash.h"
  8 | #include "preg.h"
  9 | #include "../dbg.h"
 10 | #ifdef __linux
 11 |   #include "../deps/strndup/strndup.h"
 12 | #endif
 13 | 
 14 | /**
 15 |  * TODO: 
 16 |  * - Do we need memory management for the cached regex ?
 17 |  */
 18 | 
/* One cached regular expression: the pattern text and its compiled form. */
struct re_cache {
  /* Heap copy of the pattern text; also the key of the hash entry. */
  char *re;
  /* Result of compile() for `re`. */
  pcre2_code *compiled_re;
  /* uthash handle that makes this struct storable in a hash table. */
  UT_hash_handle hh;
};

/* Head of the hash table of cached patterns; filled by get_compiled_re(). */
struct re_cache *active_re_cache;
 26 | 
 27 | pcre2_code *compile(char *pattern) {
 28 | 
 29 |   PCRE2_SPTR pcre2_pattern = (PCRE2_SPTR)pattern;
 30 | 
 31 |   pcre2_code *re;
 32 |   int errornumber;
 33 |   PCRE2_SIZE erroroffset;
 34 | 
 35 |   re = pcre2_compile(
 36 |     pcre2_pattern,               /* the pattern */
 37 |     PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminated */
 38 |     0,                     /* default options */
 39 |     &errornumber,          /* for error number */
 40 |     &erroroffset,          /* for error offset */
 41 |     NULL);                 /* use default compile context */
 42 | 
 43 |   pcre2_jit_compile(re, PCRE2_JIT_COMPLETE);
 44 | 
 45 |   if (re == NULL) {
 46 |     PCRE2_UCHAR buffer[256];
 47 |     pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
 48 |     printf("PCRE2 compilation failed at offset %d: %s\n for pattern: %s", (int)erroroffset,
 49 |       buffer, pattern);
 50 |     exit(1);
 51 |   }
 52 | 
 53 |   return re;
 54 | }
 55 | 
 56 | pcre2_code *get_compiled_re(char *re)
 57 | {
 58 |   struct re_cache *re_cache_item = NULL;
 59 |   HASH_FIND_STR(active_re_cache, re, re_cache_item);
 60 |   if(re_cache_item == NULL) {
 61 | 
 62 |     re_cache_item = malloc(sizeof(struct re_cache));
 63 |     check_mem(re_cache_item);
 64 | 
 65 |     re_cache_item->re = strndup(re, strlen(re));
 66 |     re_cache_item->compiled_re = compile(re_cache_item->re);
 67 |     HASH_ADD_KEYPTR(hh, active_re_cache, re_cache_item->re, strlen(re_cache_item->re), re_cache_item);
 68 |   } 
 69 | 
 70 |   return re_cache_item->compiled_re;
 71 | 
 72 | error:
 73 |   log_err("Failed to  allocate memory for regex cache");
 74 |   exit(1);
 75 | }
 76 | 
 77 | 
 78 | int preg_match(char *pattern, char *subject, char **matches[]) {
 79 | 
 80 |   int rc;
 81 |   PCRE2_SIZE *ovector;
 82 | 
 83 |   pcre2_code *compiled_re = get_compiled_re(pattern);
 84 | 
 85 |   PCRE2_SPTR pcre2_subject = (PCRE2_SPTR)subject;
 86 |   size_t subject_length = strlen((char *)subject);
 87 | 
 88 |   pcre2_match_data *match_data;
 89 | 
 90 |   match_data = pcre2_match_data_create_from_pattern(compiled_re, NULL);
 91 | 
 92 |   rc = pcre2_match(
 93 |               compiled_re,
 94 |               pcre2_subject,
 95 |               subject_length,
 96 |               0,
 97 |               0,
 98 |               match_data,
 99 |               NULL);
100 | 
101 | 
102 |   if (rc > 1) {
103 |     *matches = malloc(rc * sizeof(char*));
104 |     ovector = pcre2_get_ovector_pointer(match_data);
105 | 
106 |     check_mem(matches);
107 | 
108 |     for (int i = 0; i < rc; i++)
109 |     {
110 |       PCRE2_SPTR substring_start = pcre2_subject + ovector[2*i];
111 |       size_t substring_length = ovector[2*i+1] - ovector[2*i];
112 |       (*matches)[i] = strndup((char *)substring_start, (int)substring_length);
113 |     }
114 |   }
115 | 
116 |   pcre2_match_data_free(match_data);
117 |   return rc;
118 | error:
119 |   log_err("Failed to allocate memory for matches");
120 |   exit(1);
121 | }
122 | 
123 | char *preg_replace(char *re, char *replacement, char *subject) {
124 |   int rc;
125 | 
126 |   pcre2_code *compiled_re = get_compiled_re(re);
127 | 
128 |   PCRE2_SPTR pcre2_subject = (PCRE2_SPTR)subject;
129 |   size_t subject_length = strlen((char *)subject);
130 | 
131 |   PCRE2_SPTR pcre2_replacement = (PCRE2_SPTR)replacement;
132 |   size_t replacement_length = strlen((char *)replacement);
133 | 
134 |   PCRE2_UCHAR output[256];
135 |   size_t output_length = 256;
136 | 
137 |   rc = pcre2_substitute(
138 |     compiled_re,
139 |     pcre2_subject,
140 |     subject_length,
141 |     0,
142 |     PCRE2_SUBSTITUTE_GLOBAL,
143 |     NULL,
144 |     NULL,
145 |     pcre2_replacement,
146 |     replacement_length,
147 |     output,
148 |     &output_length
149 |     );
150 | 
151 | 
152 |   if (rc < 0) {
153 | 
154 |     switch(rc) {
155 |       case PCRE2_ERROR_NOMEMORY:
156 |         printf("Output buffer not large enough\n"); break;
157 |       case PCRE2_ERROR_BADREPLACEMENT:
158 |         printf("Invalid replacement string %s\n", replacement); break;
159 |       default:
160 |        printf("Unknown error %d \n", rc); break;
161 |     }
162 | 
163 |     exit(1);
164 |   }
165 | 
166 |   return strndup((char *)output, output_length);
167 | }
168 | 
/**
 * Release an array of strings such as the one produced by preg_match().
 *
 * Frees the first `matches_count` strings and the array itself, then sets
 * `*matches` to NULL so the caller is not left with a dangling pointer.
 * Does nothing when `matches` or `*matches` is NULL.
 */
void free_matches(int matches_count, char **matches[])
{
  if (matches == NULL || *matches == NULL) {
    return;
  }

  for (int i = 0; i < matches_count; i++)
  {
    free((*matches)[i]);
  }

  free(*matches);
  *matches = NULL;
}
177 | 
178 | 
179 | 


--------------------------------------------------------------------------------
/src/uthash/utringbuffer.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2008-2014, Troy D. Hanson   http://troydhanson.github.com/uthash/
  3 | All rights reserved.
  4 | 
  5 | Redistribution and use in source and binary forms, with or without
  6 | modification, are permitted provided that the following conditions are met:
  7 | 
  8 |     * Redistributions of source code must retain the above copyright
  9 |       notice, this list of conditions and the following disclaimer.
 10 | 
 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 12 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 13 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 14 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
 15 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 16 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 17 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 18 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 19 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 20 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 21 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 22 | */
 23 | 
 24 | /* a ring-buffer implementation using macros
 25 |  */
 26 | #ifndef UTRINGBUFFER_H
 27 | #define UTRINGBUFFER_H
 28 |  
 29 | #include <stdlib.h>
 30 | #include <string.h>
 31 | #include "utarray.h"  // for "UT_icd"
 32 |  
/* State of one ring buffer; all access goes through the utringbuffer_* macros below. */
typedef struct {
    unsigned i;       /* index of next available slot; wraps at n */
    unsigned n;       /* capacity */
    unsigned char f;  /* full: set to 1 by utringbuffer_push_back when i wraps to 0 */
    UT_icd icd;       /* initializer, copy and destructor functions */
    char *d;          /* n slots of size icd->sz */
} UT_ringbuffer;
 40 |  
/* Initialise the ring buffer at `a` in place: zero it, copy the element
 * descriptor *_icd, set the capacity to _n and, when _n is non-zero, allocate
 * _n slots. The malloc result is not checked here. */
#define utringbuffer_init(a, _n, _icd) do {                               \
  memset(a, 0, sizeof(UT_ringbuffer));                                    \
  (a)->icd = *(_icd);                                                     \
  (a)->n = (_n);                                                          \
  if ((a)->n) { (a)->d = malloc((a)->n * (_icd)->sz); }                   \
} while(0)
 
/* Empty the buffer: if a destructor is configured, call it on every stored
 * element (all n when full, otherwise the first i), then reset i and f.
 * The storage itself is kept. */
#define utringbuffer_clear(a) do {                                        \
  if ((a)->icd.dtor) {                                                    \
    if ((a)->f) {                                                         \
      for (unsigned _ut_i=0; _ut_i < (a)->n; _ut_i++) {                   \
        (a)->icd.dtor(utringbuffer_eltptr(a, _ut_i));                     \
      }                                                                   \
    } else {                                                              \
      for (unsigned _ut_i=0; _ut_i < (a)->i; _ut_i++) {                   \
        (a)->icd.dtor(utringbuffer_eltptr(a, _ut_i));                     \
      }                                                                   \
    }                                                                     \
  }                                                                       \
  (a)->i = 0;                                                             \
  (a)->f = 0;                                                             \
} while(0)
 
/* Counterpart of utringbuffer_init: clear the buffer, free the slot storage
 * and set the capacity to 0. The UT_ringbuffer struct itself is not freed. */
#define utringbuffer_done(a) do {                                         \
  utringbuffer_clear(a);                                                  \
  free((a)->d); (a)->d = NULL;                                            \
  (a)->n = 0;                                                             \
} while(0)
 
/* Allocate a UT_ringbuffer with malloc, assign it to `a` and initialise it
 * with capacity n. The malloc result is not checked here. */
#define utringbuffer_new(a,n,_icd) do {                                   \
  a = (UT_ringbuffer*)malloc(sizeof(UT_ringbuffer));                      \
  utringbuffer_init(a, n, _icd);                                          \
} while(0)
 
/* Counterpart of utringbuffer_new: utringbuffer_done followed by free(a). */
#define utringbuffer_free(a) do {                                         \
  utringbuffer_done(a);                                                   \
  free(a);                                                                \
} while(0)
 
/* Store a copy of the element at `p` in slot i. When the buffer is full and a
 * destructor is configured, the element being overwritten is destroyed
 * first. The copy uses icd.copy when set, otherwise memcpy of icd.sz bytes.
 * Afterwards i advances; on reaching n it wraps to 0 and f is set. */
#define utringbuffer_push_back(a,p) do {                                                \
  if ((a)->icd.dtor && (a)->f) { (a)->icd.dtor(_utringbuffer_internalptr(a,(a)->i)); }  \
  if ((a)->icd.copy) { (a)->icd.copy( _utringbuffer_internalptr(a,(a)->i), p); }        \
  else { memcpy(_utringbuffer_internalptr(a,(a)->i), p, (a)->icd.sz); };                \
  if (++(a)->i == (a)->n) { (a)->i = 0; (a)->f = 1; }                                   \
} while(0)
 
/* Number of stored elements: n when full, otherwise i. */
#define utringbuffer_len(a) ((a)->f ? (a)->n : (a)->i)
/* Non-zero when nothing is stored (i is 0 and the full flag is clear). */
#define utringbuffer_empty(a) ((a)->i == 0 && !(a)->f)
/* Non-zero once the full flag is set. */
#define utringbuffer_full(a) ((a)->f != 0)
 
/* Map element index j to a slot index: when full, index 0 is slot i (the
 * slot the next push will overwrite); otherwise the two are the same. */
#define _utringbuffer_real_idx(a,j) ((a)->f ? ((j) + (a)->i) % (a)->n : (j))
/* Address of slot j inside the storage d. */
#define _utringbuffer_internalptr(a,j) ((void*)((char*)((a)->d + ((a)->icd.sz * (j)))))
/* Pointer to element j, or NULL when j is outside [0, utringbuffer_len(a)). */
#define utringbuffer_eltptr(a,j) ((0 <= (j) && (j) < utringbuffer_len(a)) ? _utringbuffer_internalptr(a,_utringbuffer_real_idx(a,j)) : NULL)
 
/* Inverse of _utringbuffer_real_idx: map a slot index back to an element index. */
#define _utringbuffer_fake_idx(a,j) ((a)->f ? ((j) + (a)->n - (a)->i) % (a)->n : (j))
/* Slot index of the element pointer e, or -1 when e lies before the storage d. */
#define _utringbuffer_internalidx(a,e) (((char*)(e) >= (char*)(a)->d) ? (((char*)(e) - (char*)(a)->d)/(size_t)(a)->icd.sz) : -1)
/* Element index of the element pointer e. */
#define utringbuffer_eltidx(a,e) _utringbuffer_fake_idx(a, _utringbuffer_internalidx(a,e))
 
/* Pointer to element 0 (NULL when the buffer is empty). */
#define utringbuffer_front(a) utringbuffer_eltptr(a,0)
/* Element after e, or the front element when e is NULL; NULL past the end. */
#define utringbuffer_next(a,e) ((e)==NULL ? utringbuffer_front(a) : utringbuffer_eltptr(a, utringbuffer_eltidx(a,e)+1))
/* Element before e, or the back element when e is NULL. */
#define utringbuffer_prev(a,e) ((e)==NULL ? utringbuffer_back(a) : utringbuffer_eltptr(a, utringbuffer_eltidx(a,e)-1))
/* Pointer to the last element (index len-1), or NULL when the buffer is empty. */
#define utringbuffer_back(a) (utringbuffer_empty(a) ? NULL : utringbuffer_eltptr(a, utringbuffer_len(a) - 1))
103 | 
104 | #endif /* UTRINGBUFFER_H */
105 | 


--------------------------------------------------------------------------------
/tests/stem_singular_tests.c:
--------------------------------------------------------------------------------
  1 | #ifdef __linux
  2 |   #define _GNU_SOURCE 
  3 | #endif
  4 | #include "minunit.h"
  5 | #include <stdio.h>
  6 | #include <stdlib.h>
  7 | #include <string.h>
  8 | #include "libsastrawi.h"
  9 | #include "dbg.h"
 10 | 
 11 | char *test_stem_singular_word_for(char *word, char *expected_stem_word) 
 12 | {
 13 |   char *stemmed_word = NULL;
 14 |   int rc = stem_singular_word(word, &stemmed_word);
 15 |   debug("stem word: %s, expected: %s, actual: %s", word, expected_stem_word, stemmed_word);
 16 |   mu_assert(rc == 1, "failed to stem");
 17 |   mu_assert(strcmp(expected_stem_word, stemmed_word) == 0, "failed to stem correctly");
 18 |   free(stemmed_word);
 19 | 
 20 |   return NULL;
 21 | }
 22 | 
 23 | char *test_stem_singular_word_does_not_need_stemming() 
 24 | {
 25 |   return test_stem_singular_word_for("bola", "bola");
 26 | }
 27 | 
 28 | char *test_stem_singular_word_removes_plain_prefixes() 
 29 | {
 30 |   return test_stem_singular_word_for("kerajinannya", "rajin");
 31 | }
 32 | 
 33 | char *test_stem_singular_word_removes_suffixes() 
 34 | {
 35 |   return test_stem_singular_word_for("bajumukah", "baju");
 36 | }
 37 | 
 38 | char *test_stem_singular_word_removes_complex_prefixes_1() 
 39 | {
 40 |   return test_stem_singular_word_for("beria", "ia");
 41 | }
 42 | 
 43 | char *test_stem_singular_word_removes_complex_prefixes_2() 
 44 | {
 45 |   return test_stem_singular_word_for("bertabur", "tabur");
 46 | }
 47 | 
 48 | char *test_stem_singular_word_removes_complex_prefixes_3() 
 49 | {
 50 |   return test_stem_singular_word_for("berdaerah", "daerah");
 51 | }
 52 | 
 53 | char *test_stem_singular_word_removes_complex_prefixes_4() 
 54 | {
 55 |   return test_stem_singular_word_for("belajar", "ajar");
 56 | }
 57 | 
 58 | char *test_stem_singular_word_removes_complex_prefixes_5() 
 59 | {
 60 |   return test_stem_singular_word_for("bekerja", "kerja");
 61 | }
 62 | 
 63 | char *test_stem_singular_word_removes_complex_prefixes_6() 
 64 | {
 65 |   return test_stem_singular_word_for("teracun", "racun");
 66 | }
 67 | 
 68 | char *test_stem_singular_word_removes_complex_prefixes_7() 
 69 | {
 70 |   return test_stem_singular_word_for("terperuk", "peruk");
 71 | }
 72 | 
 73 | char *test_stem_singular_word_removes_complex_prefixes_8() 
 74 | {
 75 |   return test_stem_singular_word_for("tertangkap", "tangkap");
 76 | }
 77 | 
 78 | char *test_stem_singular_word_removes_complex_prefixes_9() 
 79 | {
 80 |   return test_stem_singular_word_for("teterbang", "terbang");
 81 | }
 82 | 
 83 | char *test_stem_singular_word_removes_complex_prefixes_10() 
 84 | {
 85 |   return test_stem_singular_word_for("mewarnai", "warna");
 86 | }
 87 | 
 88 | char *test_stem_singular_word_removes_complex_prefixes_11() 
 89 | {
 90 |   return test_stem_singular_word_for("memfasilitasi", "fasilitas");
 91 | }
 92 | 
 93 | char *test_stem_singular_word_removes_complex_prefixes_12() 
 94 | {
 95 |   return test_stem_singular_word_for("mempengaruhi", "pengaruh");
 96 | }
 97 | 
 98 | char *test_stem_singular_word_removes_complex_prefixes_13() 
 99 | {
100 |   return test_stem_singular_word_for("memasuki", "masuk");
101 | }
102 | 
/* Complex-prefix case 14: the stemmer is expected to reduce "mentaati" to "taat". */
char *test_stem_singular_word_removes_complex_prefixes_14()
{
  char *inflected = "mentaati";
  char *expected_root = "taat";

  return test_stem_singular_word_for(inflected, expected_root);
}
107 | 
/* Complex-prefix case 15: the stemmer is expected to reduce "menikmati" to "nikmat". */
char *test_stem_singular_word_removes_complex_prefixes_15()
{
  char *inflected = "menikmati";
  char *expected_root = "nikmat";

  return test_stem_singular_word_for(inflected, expected_root);
}
112 | 
/* Complex-prefix case 16: the stemmer is expected to reduce "mengqasar" to "qasar". */
char *test_stem_singular_word_removes_complex_prefixes_16()
{
  char *inflected = "mengqasar";
  char *expected_root = "qasar";

  return test_stem_singular_word_for(inflected, expected_root);
}
117 | 
/* Complex-prefix case 17: the stemmer is expected to reduce "mengecil" to "kecil". */
char *test_stem_singular_word_removes_complex_prefixes_17()
{
  char *inflected = "mengecil";
  char *expected_root = "kecil";

  return test_stem_singular_word_for(inflected, expected_root);
}
122 | 
/* Complex-prefix case 18: the stemmer is expected to reduce "menyapu" to "sapu". */
char *test_stem_singular_word_removes_complex_prefixes_18()
{
  char *inflected = "menyapu";
  char *expected_root = "sapu";

  return test_stem_singular_word_for(inflected, expected_root);
}
127 | 
/* Complex-prefix case 19: the stemmer is expected to reduce "memprotes" to "protes". */
char *test_stem_singular_word_removes_complex_prefixes_19()
{
  char *inflected = "memprotes";
  char *expected_root = "protes";

  return test_stem_singular_word_for(inflected, expected_root);
}
132 | 
/* Complex-prefix case 20: the stemmer is expected to reduce "peyoga" to "yoga". */
char *test_stem_singular_word_removes_complex_prefixes_20()
{
  char *inflected = "peyoga";
  char *expected_root = "yoga";

  return test_stem_singular_word_for(inflected, expected_root);
}
137 | 
/* Precedence-adjustment case 1: the stemmer is expected to reduce "memakai" to "pakai". */
char *test_stem_singular_word_uses_precedence_adjustment()
{
  char *inflected = "memakai";
  char *expected_root = "pakai";

  return test_stem_singular_word_for(inflected, expected_root);
}
142 | 
/* Precedence-adjustment case 2: the stemmer is expected to reduce "berbadankan" to "badan". */
char *test_stem_singular_word_uses_precedence_adjustment_2()
{
  char *inflected = "berbadankan";
  char *expected_root = "badan";

  return test_stem_singular_word_for(inflected, expected_root);
}
147 | 
/*
 * Suite entry point: loads the root-word dictionary once, then runs every
 * singular-stemming test of this file in a fixed order.
 *
 * Returns NULL after the last mu_run_test call.
 * NOTE(review): mu_suite_start / mu_run_test come from minunit.h, which is not
 * visible here; presumably mu_run_test returns early on the first failure --
 * confirm against the macro definition.
 */
char *all_tests()
{
  mu_suite_start();

  // The stemmer looks words up in the dictionary, so it has to be loaded
  // before any test runs.
  // NOTE(review): the string returned by dictionary_fullpath is not released
  // here -- confirm who owns it.
  dictionary_load(dictionary_fullpath("data/kata-dasar.txt"));

  mu_run_test(test_stem_singular_word_does_not_need_stemming);
  mu_run_test(test_stem_singular_word_removes_suffixes);

  mu_run_test(test_stem_singular_word_removes_plain_prefixes);
  mu_run_test(test_stem_singular_word_removes_complex_prefixes_1);
  mu_run_test(test_stem_singular_word_removes_complex_prefixes_2);
  mu_run_test(test_stem_singular_word_removes_complex_prefixes_3);
  mu_run_test(test_stem_singular_word_removes_complex_prefixes_4);
  mu_run_test(test_stem_singular_word_removes_complex_prefixes_5);
  mu_run_test(test_stem_singular_word_removes_complex_prefixes_6);
  mu_run_test(test_stem_singular_word_removes_complex_prefixes_7);
  mu_run_test(test_stem_singular_word_removes_complex_prefixes_8);
  mu_run_test(test_stem_singular_word_removes_complex_prefixes_9);
  mu_run_test(test_stem_singular_word_removes_complex_prefixes_10);
  mu_run_test(test_stem_singular_word_removes_complex_prefixes_11);
  mu_run_test(test_stem_singular_word_removes_complex_prefixes_12);
  mu_run_test(test_stem_singular_word_removes_complex_prefixes_13);
  mu_run_test(test_stem_singular_word_removes_complex_prefixes_14);
  mu_run_test(test_stem_singular_word_removes_complex_prefixes_15);
  mu_run_test(test_stem_singular_word_removes_complex_prefixes_16);
  mu_run_test(test_stem_singular_word_removes_complex_prefixes_17);
  mu_run_test(test_stem_singular_word_removes_complex_prefixes_18);
  mu_run_test(test_stem_singular_word_removes_complex_prefixes_19);
  mu_run_test(test_stem_singular_word_removes_complex_prefixes_20);
  mu_run_test(test_stem_singular_word_uses_precedence_adjustment);
  mu_run_test(test_stem_singular_word_uses_precedence_adjustment_2);

  return NULL;
}

// Hands the suite to the minunit runner macro.
RUN_TESTS(all_tests);
185 | 


--------------------------------------------------------------------------------
/src/sastrawi/remove_prefixes.c:
--------------------------------------------------------------------------------
  1 | #ifdef __linux
  2 |   #define _GNU_SOURCE
  3 | #endif
  4 | #include <stdio.h>
  5 | #include <stdlib.h>
  6 | #include <string.h>
  7 | #include "dictionary.h"
  8 | #include "text_util.h"
  9 | #include "remove_prefixes.h"
 10 | #include "../dbg.h"
 11 | 
 12 | const int prefix_remover_count = 21;
 13 | 
 14 | const PREFIX_REMOVER prefix_removers[prefix_remover_count] = {
 15 |   remove_plain_prefix, 
 16 |   remove_complex_prefix_rule1,
 17 |   remove_complex_prefix_rule2,
 18 |   remove_complex_prefix_rule3,
 19 |   remove_complex_prefix_rule4,
 20 |   remove_complex_prefix_rule5,
 21 |   remove_complex_prefix_rule6,
 22 |   remove_complex_prefix_rule7,
 23 |   remove_complex_prefix_rule8,
 24 |   remove_complex_prefix_rule9,
 25 |   remove_complex_prefix_rule10,
 26 |   remove_complex_prefix_rule11,
 27 |   remove_complex_prefix_rule12,
 28 |   remove_complex_prefix_rule13,
 29 |   remove_complex_prefix_rule14,
 30 |   remove_complex_prefix_rule15,
 31 |   remove_complex_prefix_rule16,
 32 |   remove_complex_prefix_rule17,
 33 |   remove_complex_prefix_rule18,
 34 |   remove_complex_prefix_rule19,
 35 |   remove_complex_prefix_rule20
 36 | };
 37 | 
 38 | 
 39 | int remove_prefixes(char *original_word, char **stemmed_word)
 40 | {
 41 |   int rc = 0;
 42 |   char *removed_parts = NULL;
 43 | 
 44 |   char *word = strndup(original_word, strlen(original_word));
 45 |   char *post_remove = NULL;
 46 | 
 47 |   for(int i =0; i < prefix_remover_count; i++) {
 48 | 
 49 |     free(post_remove);
 50 |     free(removed_parts);
 51 |     rc = (*prefix_removers[i])(word, &post_remove, &removed_parts);
 52 | 
 53 |     if(rc) {
 54 |       break;
 55 |     } else {
 56 |       free(word);
 57 |       word = strndup(post_remove, strlen(post_remove));
 58 |     }
 59 |   }
 60 | 
 61 |   *stemmed_word = strndup(post_remove, strlen(post_remove));
 62 | 
 63 |   //cleanup
 64 |   free(post_remove);
 65 |   free(removed_parts);
 66 |   free(word);
 67 | 
 68 |   return rc;
 69 | }
 70 | 
 71 | int remove_plain_prefix(char *word, char **stemmed_word, char **removed_part)
 72 | {
 73 |   int rc = 0;
 74 | 
 75 |   int split_rc =  prefix_split_word("^(di|ke|se)(\\w+)$", word, removed_part, stemmed_word);
 76 | 
 77 |   if(split_rc == 1 && dictionary_contains(*stemmed_word)) {
 78 |       rc = 1;
 79 |   }   
 80 |   
 81 |   return rc;
 82 | }
 83 | 
 84 | int remove_complex_prefix_rule1(char *word, char **stemmed_word, char **removed_part)
 85 | {
 86 |   int rc = 0;
 87 | 
 88 |   int split_rc = prefix_split_word("(^ber)([aiueo].*)$", word, removed_part, stemmed_word);
 89 | 
 90 |   //1a
 91 |   if(split_rc == 1) {
 92 |     if(dictionary_contains(*stemmed_word)) {
 93 |       rc = 1;
 94 |     } else {
 95 |       //1b
 96 |       char *alternative_stemmed_word;
 97 |       asprintf(&alternative_stemmed_word, "r%s", *stemmed_word);
 98 |       rc = assign_if_root_word(stemmed_word, alternative_stemmed_word, removed_part, "be");
 99 |     }
100 |   }
101 |   return rc;
102 | }
103 | 
/*
 * Rule 2: "ber" followed by a non-vowel and a letter, where the rest of the
 * word (third capture group) does not contain "er".
 *
 * On a match without "er", returns 1 if the remainder after "ber" is a
 * dictionary word and 0 otherwise; the split outputs are left in place.
 * In every other case the outputs are reset to "nothing removed":
 * *stemmed_word becomes a copy of word and *removed_part an empty string.
 */
int remove_complex_prefix_rule2(char *word, char **stemmed_word, char **removed_part)
{
  int rc = 0;
  char *partial_stemmed_word = NULL;

  int split_rc = split_word3("(^ber)([^aeiou][a-z](\\w*))", word, removed_part, stemmed_word, &partial_stemmed_word);

  if(split_rc == 1 && (strstr(partial_stemmed_word, "er") == NULL)) {
    if(dictionary_contains(*stemmed_word)) {
      rc = 1;
    }
  } else {
    if(split_rc == 1) {
      // The split matched but was rejected because of "er": release its
      // results before overwriting them, otherwise they leak.
      // Assumes split_word3 hands back heap strings on a match, as
      // prefix_split_word does -- TODO confirm.
      free(*stemmed_word);
      free(*removed_part);
    }
    (*stemmed_word) = strndup(word, strlen(word));
    (*removed_part) = strndup("", 0);
  }

  // NOTE(review): partial_stemmed_word is never released; if split_word3
  // allocates it, it leaks here -- confirm ownership.
  return rc;
}
123 | 
/*
 * Rule 3: "ber" followed by a non-vowel, a letter and "er".
 *
 * Returns 1 when the split succeeded and the remainder after "ber" is a
 * dictionary word, 0 otherwise.
 */
int remove_complex_prefix_rule3(char *word, char **stemmed_word, char **removed_part)
{
  int matched = prefix_split_word("(^ber)([^aeiou][a-z]er\\w*)", word, removed_part, stemmed_word);

  if(matched != 1) {
    return 0;
  }

  return dictionary_contains(*stemmed_word) ? 1 : 0;
}
137 | 
/*
 * Rule 4: the special case "bel" + "ajar".
 *
 * Returns 1 when the split succeeded and the remainder is a dictionary
 * word, 0 otherwise.
 */
int remove_complex_prefix_rule4(char *word, char **stemmed_word, char **removed_part)
{
  int matched = prefix_split_word("(^bel)(ajar)", word, removed_part, stemmed_word);

  if(matched != 1) {
    return 0;
  }

  return dictionary_contains(*stemmed_word) ? 1 : 0;
}
150 | 
/*
 * Rule 5: "be" followed by a character that is neither a vowel nor "r",
 * then "er" and a non-vowel.
 *
 * Returns 1 when the split succeeded and the remainder after "be" is a
 * dictionary word, 0 otherwise.
 */
int remove_complex_prefix_rule5(char *word, char **stemmed_word, char **removed_part)
{
  int matched = prefix_split_word("(^be)([^aeiour]er[^aeiou]\\w*)", word, removed_part, stemmed_word);

  if(matched != 1) {
    return 0;
  }

  return dictionary_contains(*stemmed_word) ? 1 : 0;
}
162 | 
/*
 * Rule 6: "ter" followed by a vowel.
 *   6a: drop "ter" and accept the remainder if it is a dictionary word.
 *   6b: otherwise drop only "te", i.e. try "r" + remainder.
 *
 * removed_part / stemmed_word are filled in by prefix_split_word and, in
 * case 6b, replaced by assign_if_root_word.
 * Returns 1 when a dictionary word was found, 0 otherwise.
 */
int remove_complex_prefix_rule6(char *word, char **stemmed_word, char **removed_part)
{
  int rc = 0;

  int split_rc = prefix_split_word("(^ter)([aiueo].*)$", word, removed_part, stemmed_word);

  if(split_rc == 1) {
    //6a
    if(dictionary_contains(*stemmed_word)) {
      rc = 1;
    } else {
      //6b
      char *alternative_stemmed_word = NULL;

      // asprintf leaves the pointer undefined on failure, so only use it
      // when the call succeeded.
      if(asprintf(&alternative_stemmed_word, "r%s", *stemmed_word) >= 0) {
        rc = assign_if_root_word(stemmed_word, alternative_stemmed_word, removed_part, "te");
        // assign_if_root_word stores its own copy; release the temporary.
        free(alternative_stemmed_word);
      }
    }
  }
  return rc;
}
182 | 
/*
 * Rule 7: "ter" followed by a character that is neither a vowel nor "r",
 * then "er" and a vowel.
 *
 * Returns 1 when the split succeeded and the remainder after "ter" is a
 * dictionary word, 0 otherwise.
 */
int remove_complex_prefix_rule7(char *word, char **stemmed_word, char **removed_part)
{
  int matched = prefix_split_word("(^ter)([^aeiour]er[aeiou]\\w*)", word, removed_part, stemmed_word);

  if(matched != 1) {
    return 0;
  }

  return dictionary_contains(*stemmed_word) ? 1 : 0;
}
195 | 
/*
 * Rule 8: "ter" followed by a character that is neither a vowel nor "r",
 * where the rest of the word (third capture group) does not contain "er".
 *
 * On a match without "er", returns 1 if the remainder after "ter" is a
 * dictionary word and 0 otherwise; the split outputs are left in place.
 * In every other case the outputs are reset to "nothing removed":
 * *stemmed_word becomes a copy of word and *removed_part an empty string.
 */
int remove_complex_prefix_rule8(char *word, char **stemmed_word, char **removed_part)
{
  int rc = 0;
  char *partial_stemmed_word = NULL;

  int split_rc = split_word3("(^ter)([^aeiour](\\w*))", word, removed_part, stemmed_word, &partial_stemmed_word);

  if(split_rc == 1 && (strstr(partial_stemmed_word, "er") == NULL)) {
    if(dictionary_contains(*stemmed_word)) {
      rc = 1;
    }
  } else {
    if(split_rc == 1) {
      // The split matched but was rejected because of "er": release its
      // results before overwriting them, otherwise they leak.
      // Assumes split_word3 hands back heap strings on a match, as
      // prefix_split_word does -- TODO confirm.
      free(*stemmed_word);
      free(*removed_part);
    }
    (*stemmed_word) = strndup(word, strlen(word));
    (*removed_part) = strndup("", 0);
  }

  // NOTE(review): partial_stemmed_word is never released; if split_word3
  // allocates it, it leaks here -- confirm ownership.
  return rc;
}
214 | 
/*
 * Rule 9: "te" followed by a character that is neither a vowel nor "r",
 * then "er" and a non-vowel.
 *
 * Returns 1 when the split succeeded and the remainder after "te" is a
 * dictionary word, 0 otherwise.
 */
int remove_complex_prefix_rule9(char *word, char **stemmed_word, char **removed_part)
{
  int rc = 0;

  int split_rc = prefix_split_word("(^te)([^aeiour]er[^aeiou]\\w*)", word, removed_part, stemmed_word);

  if(split_rc == 1 && dictionary_contains(*stemmed_word)) {
      rc = 1;
  }
  return rc;
}
227 | 
/*
 * Rule 10: "me" followed by one of l, r, w, y and a vowel.
 *
 * Returns 1 when the split succeeded and the remainder after "me" is a
 * dictionary word, 0 otherwise.
 */
int remove_complex_prefix_rule10(char *word, char **stemmed_word, char **removed_part)
{
  int matched = prefix_split_word("(^me)([lrwy][aeiou]\\w*)", word, removed_part, stemmed_word);

  if(matched != 1) {
    return 0;
  }

  return dictionary_contains(*stemmed_word) ? 1 : 0;
}
239 | 
/*
 * Rule 11: "mem" followed by f, b or v.
 *
 * Returns 1 when the split succeeded and the remainder after "mem" is a
 * dictionary word, 0 otherwise.
 */
int remove_complex_prefix_rule11(char *word, char **stemmed_word, char **removed_part)
{
  int matched = prefix_split_word("(^mem)([fbv]\\w*)", word, removed_part, stemmed_word);

  if(matched != 1) {
    return 0;
  }

  return dictionary_contains(*stemmed_word) ? 1 : 0;
}
252 | 
/*
 * Rule 12: "mem" followed by "pe".
 *
 * Returns 1 when the split succeeded and the remainder after "mem" is a
 * dictionary word, 0 otherwise.
 */
int remove_complex_prefix_rule12(char *word, char **stemmed_word, char **removed_part)
{
  int matched = prefix_split_word("(^mem)(pe\\w*)", word, removed_part, stemmed_word);

  if(matched != 1) {
    return 0;
  }

  return dictionary_contains(*stemmed_word) ? 1 : 0;
}
265 | 
/*
 * Rule 13: "me" followed by "m" and a vowel.
 *   First the remainder after "me" (starting with "m") is looked up.
 *   If it is not a dictionary word, the leading "m" is replaced by "p"
 *   and that alternative is tried.
 *
 * removed_part / stemmed_word are filled in by prefix_split_word and, for
 * the alternative, replaced by assign_if_root_word.
 * Returns 1 when a dictionary word was found, 0 otherwise.
 */
int remove_complex_prefix_rule13(char *word, char **stemmed_word, char **removed_part)
{
  int rc = 0;

  int split_rc = prefix_split_word("(^me)(m[aeiou]\\w*)", word, removed_part, stemmed_word);

  if(split_rc == 1 ) {
    if(dictionary_contains(*stemmed_word)) {
      rc = 1;
    } else {
      char *alternative_stemmed_word = NULL;

      // *stemmed_word+1 skips the leading "m" matched by the pattern.
      // asprintf leaves the pointer undefined on failure, so only use it
      // when the call succeeded.
      if(asprintf(&alternative_stemmed_word, "p%s", *stemmed_word+1) >= 0) {
        rc = assign_if_root_word(stemmed_word, alternative_stemmed_word, removed_part, "me");
        // assign_if_root_word stores its own copy; release the temporary.
        free(alternative_stemmed_word);
      }
    }
  }
  return rc;
}
283 | 
/*
 * Rule 14: "men" followed by one of c, d, j, s, t, z.
 *
 * Returns 1 when the split succeeded and the remainder after "men" is a
 * dictionary word, 0 otherwise.
 */
int remove_complex_prefix_rule14(char *word, char **stemmed_word, char **removed_part)
{
  int matched = prefix_split_word("(^men)([cdjstz]\\w*)", word, removed_part, stemmed_word);

  if(matched != 1) {
    return 0;
  }

  return dictionary_contains(*stemmed_word) ? 1 : 0;
}
295 | 
/*
 * Rule 15: "me" followed by "n" and a vowel.
 *   First the remainder after "me" (starting with "n") is looked up.
 *   If it is not a dictionary word, the leading "n" is replaced by "t"
 *   and that alternative is tried.
 *
 * removed_part / stemmed_word are filled in by prefix_split_word and, for
 * the alternative, replaced by assign_if_root_word.
 * Returns 1 when a dictionary word was found, 0 otherwise.
 */
int remove_complex_prefix_rule15(char *word, char **stemmed_word, char **removed_part)
{
  int rc = 0;

  int split_rc = prefix_split_word("(^me)(n[aeiou]\\w*)", word, removed_part, stemmed_word);

  if(split_rc == 1 ) {
    if(dictionary_contains(*stemmed_word)) {
      rc = 1;
    } else {
      char *alternative_stemmed_word = NULL;

      // *stemmed_word+1 skips the leading "n" matched by the pattern.
      // asprintf leaves the pointer undefined on failure, so only use it
      // when the call succeeded.
      if(asprintf(&alternative_stemmed_word, "t%s", *stemmed_word+1) >= 0) {
        rc = assign_if_root_word(stemmed_word, alternative_stemmed_word, removed_part, "me");
        // assign_if_root_word stores its own copy; release the temporary.
        free(alternative_stemmed_word);
      }
    }
  }
  return rc;
}
313 | 
/*
 * Rule 16: "meng" followed by one of g, h, q, k.
 *
 * Returns 1 when the split succeeded and the remainder after "meng" is a
 * dictionary word, 0 otherwise.
 */
int remove_complex_prefix_rule16(char *word, char **stemmed_word, char **removed_part)
{
  int matched = prefix_split_word("(^meng)([ghqk]\\w*)", word, removed_part, stemmed_word);

  if(matched != 1) {
    return 0;
  }

  return dictionary_contains(*stemmed_word) ? 1 : 0;
}
325 | 
/*
 * Rule 17: "meng" followed by a vowel. Candidates are tried in this order
 * until one is a dictionary word:
 *   1. the remainder after "meng"                   (removed part "meng")
 *   2. "k" + remainder                              (removed part "meng")
 *   3. the remainder without its first character    (removed part "menge")
 *   4. "ng" + remainder                             (removed part "me")
 *
 * removed_part / stemmed_word are filled in by prefix_split_word and, for
 * candidates 2-4, replaced by assign_if_root_word.
 * Returns 1 when a dictionary word was found, 0 otherwise.
 */
int remove_complex_prefix_rule17(char *word, char **stemmed_word, char **removed_part)
{
  int rc = 0;
  char *alternative_stemmed_word = NULL;

  int split_rc = prefix_split_word("(^meng)([aeiou]\\w*)", word, removed_part, stemmed_word);

  if(split_rc != 1) {
    return rc;
  }

  if(dictionary_contains(*stemmed_word)) {
    return 1;
  }

  // asprintf leaves the pointer undefined on failure, so each candidate is
  // only looked up (and freed) when its asprintf call succeeded.
  if(asprintf(&alternative_stemmed_word, "k%s", *stemmed_word) >= 0) {
    rc = assign_if_root_word(stemmed_word, alternative_stemmed_word, removed_part, "meng");
    free(alternative_stemmed_word);
  }

  if(rc == 0 && asprintf(&alternative_stemmed_word, "%s", *stemmed_word+1) >= 0) {
    rc = assign_if_root_word(stemmed_word, alternative_stemmed_word, removed_part, "menge");
    free(alternative_stemmed_word);
  }

  if(rc == 0 && asprintf(&alternative_stemmed_word, "ng%s", *stemmed_word) >= 0) {
    rc = assign_if_root_word(stemmed_word, alternative_stemmed_word, removed_part, "me");
    free(alternative_stemmed_word);
  }

  return rc;
}
358 | 
/*
 * Rule 18: "me" followed by "ny".
 *   First the remainder after "me" (starting with "ny") is looked up.
 *   If it is not a dictionary word, the leading "ny" is replaced by "s"
 *   and that alternative is tried with removed part "meny".
 *
 * removed_part / stemmed_word are filled in by prefix_split_word and, for
 * the alternative, replaced by assign_if_root_word.
 * Returns 1 when a dictionary word was found, 0 otherwise.
 */
int remove_complex_prefix_rule18(char *word, char **stemmed_word, char **removed_part)
{
  int rc = 0;

  int split_rc = prefix_split_word("(^me)(ny\\w*)", word, removed_part, stemmed_word);

  if(split_rc == 1 ) {
    if(dictionary_contains(*stemmed_word)) {
      rc = 1;
    } else {
      char *alternative_stemmed_word = NULL;

      // *stemmed_word+2 skips the leading "ny" matched by the pattern.
      // asprintf leaves the pointer undefined on failure, so only use it
      // when the call succeeded.
      if(asprintf(&alternative_stemmed_word, "s%s", *stemmed_word+2) >= 0) {
        rc = assign_if_root_word(stemmed_word, alternative_stemmed_word, removed_part, "meny");
        // assign_if_root_word stores its own copy; release the temporary.
        free(alternative_stemmed_word);
      }
    }
  }
  return rc;
}
376 | 
/*
 * Rule 19: "mem" followed by "p" and any character other than "e".
 *
 * Returns 1 when the split succeeded and the remainder after "mem" is a
 * dictionary word, 0 otherwise.
 */
int remove_complex_prefix_rule19(char *word, char **stemmed_word, char **removed_part)
{
  int matched = prefix_split_word("(^mem)(p[^e]\\w*)", word, removed_part, stemmed_word);

  if(matched != 1) {
    return 0;
  }

  return dictionary_contains(*stemmed_word) ? 1 : 0;
}
388 | 
/*
 * Rule 20: "pe" followed by w or y and a vowel.
 *
 * Returns 1 when the split succeeded and the remainder after "pe" is a
 * dictionary word, 0 otherwise.
 */
int remove_complex_prefix_rule20(char *word, char **stemmed_word, char **removed_part)
{
  int matched = prefix_split_word("(^pe)([wy][aeiou]\\w*)", word, removed_part, stemmed_word);

  if(matched != 1) {
    return 0;
  }

  return dictionary_contains(*stemmed_word) ? 1 : 0;
}
400 | 
/*
 * Accepts an alternative stem when it is a dictionary word.
 *
 * If alternative_stemmed_word is in the dictionary, the strings currently
 * held in *removed_part and *stemmed_word are freed and replaced by newly
 * allocated copies of alternative_removed_part and alternative_stemmed_word,
 * and 1 is returned. Otherwise nothing is changed and 0 is returned.
 * The alternative_* arguments themselves are only read.
 */
int assign_if_root_word(char **stemmed_word, char *alternative_stemmed_word, char **removed_part, char *alternative_removed_part) {
  if(!dictionary_contains(alternative_stemmed_word)) {
    return 0;
  }

  free(*removed_part);
  *removed_part = strndup(alternative_removed_part, strlen(alternative_removed_part));

  free(*stemmed_word);
  *stemmed_word = strndup(alternative_stemmed_word, strlen(alternative_stemmed_word));

  return 1;
}
415 | 


--------------------------------------------------------------------------------
/src/uthash/utstring.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2008-2014, Troy D. Hanson   http://troydhanson.github.com/uthash/
  3 | All rights reserved.
  4 | 
  5 | Redistribution and use in source and binary forms, with or without
  6 | modification, are permitted provided that the following conditions are met:
  7 | 
  8 |     * Redistributions of source code must retain the above copyright
  9 |       notice, this list of conditions and the following disclaimer.
 10 | 
 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 12 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 13 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 14 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
 15 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 16 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 17 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 18 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 19 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 20 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 21 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 22 | */
 23 | 
 24 | /* a dynamic string implementation using macros
 25 |  */
 26 | #ifndef UTSTRING_H
 27 | #define UTSTRING_H
 28 | 
 29 | #define UTSTRING_VERSION 1.9.9
 30 | 
/* Marker placed on the static functions of this header: expands to the
 * "unused" attribute when __GNUC__ is defined and to nothing otherwise. */
#ifdef __GNUC__
#define _UNUSED_ __attribute__ ((__unused__))
#else
#define _UNUSED_
#endif
 36 | 
 37 | #include <stdlib.h>
 38 | #include <string.h>
 39 | #include <stdio.h>
 40 | #include <stdarg.h>
/* Out-of-memory handler used by the allocation macros below: terminates the
 * process via exit(-1). */
#define oom() exit(-1)
 42 | 
/* Dynamic string: a heap buffer d of n bytes, of which the first i bytes are
 * in use. The macros below keep d NUL-terminated at index i. */
typedef struct {
    char *d;
    size_t n; /* allocd size */
    size_t i; /* index of first unused byte */
} UT_string;
 48 | 
/* Ensure at least amt unused bytes (n - i) are available in s. If fewer are
 * free, the buffer is grown by amt bytes with realloc; oom() is called when
 * realloc fails. Note that amt is evaluated more than once. */
#define utstring_reserve(s,amt)                            \
do {                                                       \
  if (((s)->n - (s)->i) < (size_t)(amt)) {                 \
     (s)->d = (char*)realloc((s)->d, (s)->n + (amt));      \
     if ((s)->d == NULL) oom();                            \
     (s)->n += (amt);                                      \
  }                                                        \
} while(0)
 57 | 
/* Initialise an existing UT_string in place: reset all fields, reserve an
 * initial 100 bytes and store an empty (NUL-terminated) string. */
#define utstring_init(s)                                   \
do {                                                       \
  (s)->n = 0; (s)->i = 0; (s)->d = NULL;                   \
  utstring_reserve(s,100);                                 \
  (s)->d[0] = '\0'; \
} while(0)
 64 | 
 65 | #define utstring_done(s)                                   \
 66 | do {                                                       \
 67 |   if ((s)->d != NULL) free((s)->d);                        \
 68 |   (s)->n = 0;                                              \
 69 | } while(0)
 70 | 
/* Release the buffer of s (utstring_done) and then free s itself. */
#define utstring_free(s)                                   \
do {                                                       \
  utstring_done(s);                                        \
  free(s);                                                 \
} while(0)
 76 | 
 77 | #define utstring_new(s)                                    \
 78 | do {                                                       \
 79 |    s = (UT_string*)calloc(sizeof(UT_string),1);            \
 80 |    if (!s) oom();                                          \
 81 |    utstring_init(s);                                       \
 82 | } while(0)
 83 | 
/* Make s an empty string: clear it when s is non-NULL, otherwise allocate a
 * new UT_string into s. */
#define utstring_renew(s)                                  \
do {                                                       \
   if (s) {                                                \
     utstring_clear(s);                                    \
   } else {                                                \
     utstring_new(s);                                      \
   }                                                       \
} while(0)
 92 | 
/* Reset s to the empty string. The used length becomes 0 and d[0] is set to
 * NUL; the allocated size n is left unchanged. */
#define utstring_clear(s)                                  \
do {                                                       \
  (s)->i = 0;                                              \
  (s)->d[0] = '\0';                                        \
} while(0)
 98 | 
/* Append l bytes from b to s. Room for l+1 bytes is reserved first, the
 * bytes are copied with memcpy (skipped when l is 0), the used length is
 * advanced by l and a terminating NUL is written. Note that l is evaluated
 * more than once. */
#define utstring_bincpy(s,b,l)                             \
do {                                                       \
  utstring_reserve((s),(l)+1);                               \
  if (l) memcpy(&(s)->d[(s)->i], b, l);                    \
  (s)->i += (l);                                           \
  (s)->d[(s)->i]='\0';                                         \
} while(0)
106 | 
/* Append the contents of UT_string src to UT_string dst. Room for the
 * src->i used bytes plus a terminator is reserved in dst, the bytes are
 * copied (skipped when src is empty) and dst is NUL-terminated again. */
#define utstring_concat(dst,src)                                 \
do {                                                             \
  utstring_reserve((dst),((src)->i)+1);                          \
  if ((src)->i) memcpy(&(dst)->d[(dst)->i], (src)->d, (src)->i); \
  (dst)->i += (src)->i;                                          \
  (dst)->d[(dst)->i]='\0';                                       \
} while(0)
114 | 
/* Number of bytes in use in s (the index i), converted to unsigned. */
#define utstring_len(s) ((unsigned)((s)->i))
116 | 
/* Pointer to the character buffer of s. */
#define utstring_body(s) ((s)->d)
118 | 
119 | _UNUSED_ static void utstring_printf_va(UT_string *s, const char *fmt, va_list ap) {
120 |    int n;
121 |    va_list cp;
122 |    while (1) {
123 | #ifdef _WIN32
124 |       cp = ap;
125 | #else
126 |       va_copy(cp, ap);
127 | #endif
128 |       n = vsnprintf (&s->d[s->i], s->n-s->i, fmt, cp);
129 |       va_end(cp);
130 | 
131 |       if ((n > -1) && ((size_t) n < (s->n-s->i))) {
132 |         s->i += n;
133 |         return;
134 |       }
135 | 
136 |       /* Else try again with more space. */
137 |       if (n > -1) utstring_reserve(s,n+1); /* exact */
138 |       else utstring_reserve(s,(s->n)*2);   /* 2x */
139 |    }
140 | }
#ifdef __GNUC__
/* support printf format checking (2=the format string, 3=start of varargs) */
static void utstring_printf(UT_string *s, const char *fmt, ...)
  __attribute__ (( format( printf, 2, 3) ));
#endif
/* Append printf-style formatted output to s. Variadic front end that
 * forwards its arguments to utstring_printf_va. */
_UNUSED_ static void utstring_printf(UT_string *s, const char *fmt, ...) {
   va_list ap;
   va_start(ap,fmt);
   utstring_printf_va(s,fmt,ap);
   va_end(ap);
}
152 | 
153 | /*******************************************************************************
154 |  * begin substring search functions                                            *
155 |  ******************************************************************************/
/* Build the KMP table used for left-to-right searches.
 *
 * P_Needle:    pattern bytes.
 * P_NeedleLen: number of bytes in P_Needle.
 * P_KMP_Table: output; entries 0 .. P_NeedleLen are written, so it must hold
 *              P_NeedleLen + 1 longs. Entry 0 is always -1.
 */
_UNUSED_ static void _utstring_BuildTable(
    const char *P_Needle,
    size_t P_NeedleLen,
    long *P_KMP_Table)
{
    long pos = 0;
    long border = pos - 1;

    P_KMP_Table[pos] = border;
    while (pos < (long) P_NeedleLen)
    {
        /* Fall back through the table until the current byte matches. */
        while ( (border > -1) && (P_Needle[pos] != P_Needle[border]) )
        {
            border = P_KMP_Table[border];
        }
        ++pos;
        ++border;

        /* When the next bytes are equal, the fallback of the shorter prefix
         * is reused; otherwise (or at the end of the needle) the current
         * border is stored. */
        if ( (pos < (long) P_NeedleLen) && (P_Needle[pos] == P_Needle[border]) )
        {
            P_KMP_Table[pos] = P_KMP_Table[border];
        }
        else
        {
            P_KMP_Table[pos] = border;
        }
    }
}
194 | 
195 | 
/* Build the KMP table used for right-to-left searches.
 *
 * P_Needle:    pattern bytes.
 * P_NeedleLen: number of bytes in P_Needle.
 * P_KMP_Table: output; entries 0 .. P_NeedleLen are written, so it must hold
 *              P_NeedleLen + 1 longs. The entry for needle index k is stored
 *              at k + 1; entry P_NeedleLen is always P_NeedleLen.
 */
_UNUSED_ static void _utstring_BuildTableR(
    const char *P_Needle,
    size_t P_NeedleLen,
    long *P_KMP_Table)
{
    long pos = P_NeedleLen - 1;
    long border = pos + 1;

    P_KMP_Table[pos + 1] = border;
    while (pos >= 0)
    {
        /* Fall back through the table until the current byte matches. */
        while ( (border < (long) P_NeedleLen) && (P_Needle[pos] != P_Needle[border]) )
        {
            border = P_KMP_Table[border + 1];
        }
        --pos;
        --border;

        /* When the next bytes are equal, the fallback of the shorter suffix
         * is reused; otherwise (or at the start of the needle) the current
         * border is stored. */
        if ( (pos >= 0) && (P_Needle[pos] == P_Needle[border]) )
        {
            P_KMP_Table[pos + 1] = P_KMP_Table[border + 1];
        }
        else
        {
            P_KMP_Table[pos + 1] = border;
        }
    }
}
234 | 
235 | 
/* Search data from left to right with a prebuilt table. ( Multiple search mode. )
 *
 * P_Haystack / P_HaystackLen: bytes to search and their count.
 * P_Needle / P_NeedleLen:     pattern and its length.
 * P_KMP_Table:                table produced by _utstring_BuildTable for
 *                             this needle.
 *
 * Returns the offset of the first match within P_Haystack, or -1 when the
 * needle does not occur.
 */
_UNUSED_ static long _utstring_find(
    const char *P_Haystack,
    size_t P_HaystackLen,
    const char *P_Needle,
    size_t P_NeedleLen,
    long *P_KMP_Table)
{
    long matched = 0;   /* needle bytes matched so far */
    long cursor = 0;    /* next haystack index to examine */

    /* Stop once the haystack is exhausted or too few bytes remain for the
     * rest of the needle. */
    while ( (cursor < (int)P_HaystackLen) && (((P_HaystackLen - cursor) + matched) >= P_NeedleLen) )
    {
        while ( (matched > -1) && (P_Needle[matched] != P_Haystack[cursor]) )
        {
            matched = P_KMP_Table[matched];
        }
        matched++;
        cursor++;
        if (matched >= (int)P_NeedleLen)
        {
            /* Found. */
            return cursor - matched;
        }
    }

    return -1;
}
267 | 
268 | 
/* Search data from right to left with a prebuilt table. ( Multiple search mode. )
 *
 * P_Haystack / P_HaystackLen: bytes to search and their count.
 * P_Needle / P_NeedleLen:     pattern and its length.
 * P_KMP_Table:                table produced by _utstring_BuildTableR for
 *                             this needle.
 *
 * Returns the offset within P_Haystack at which the match found first when
 * scanning from the end begins, or -1 when the needle does not occur.
 */
_UNUSED_ static long _utstring_findR(
    const char *P_Haystack,
    size_t P_HaystackLen,
    const char *P_Needle,
    size_t P_NeedleLen,
    long *P_KMP_Table)
{
    long cursor = (P_HaystackLen - 1);   /* haystack index being examined */
    long pending = (P_NeedleLen - 1);    /* needle index still to match */

    /* Stop once fewer haystack bytes remain than needle bytes to match. */
    while ( (cursor >= 0) && (cursor >= pending) )
    {
        while ( (pending < (int)P_NeedleLen) && (P_Needle[pending] != P_Haystack[cursor]) )
        {
            pending = P_KMP_Table[pending + 1];
        }
        pending--;
        cursor--;
        if (pending < 0)
        {
            /* Found. */
            return cursor + 1;
        }
    }

    return -1;
}
301 | 
302 | 
303 | /* Search data from left to right. ( One time search mode. ) */
304 | _UNUSED_ static long utstring_find(
305 |     UT_string *s,
306 |     long P_StartPosition,   /* Start from 0. -1 means last position. */
307 |     const char *P_Needle,
308 |     size_t P_NeedleLen)
309 | {
310 |     long V_StartPosition;
311 |     long V_HaystackLen;
312 |     long *V_KMP_Table;
313 |     long V_FindPosition = -1;
314 | 
315 |     if (P_StartPosition < 0)
316 |     {
317 |         V_StartPosition = s->i + P_StartPosition;
318 |     }
319 |     else
320 |     {
321 |         V_StartPosition = P_StartPosition;
322 |     }
323 |     V_HaystackLen = s->i - V_StartPosition;
324 |     if ( (V_HaystackLen >= (long) P_NeedleLen) && (P_NeedleLen > 0) )
325 |     {
326 |         V_KMP_Table = (long *)malloc(sizeof(long) * (P_NeedleLen + 1));
327 |         if (V_KMP_Table != NULL)
328 |         {
329 |             _utstring_BuildTable(P_Needle, P_NeedleLen, V_KMP_Table);
330 | 
331 |             V_FindPosition = _utstring_find(s->d + V_StartPosition,
332 |                                             V_HaystackLen,
333 |                                             P_Needle,
334 |                                             P_NeedleLen,
335 |                                             V_KMP_Table);
336 |             if (V_FindPosition >= 0)
337 |             {
338 |                 V_FindPosition += V_StartPosition;
339 |             }
340 | 
341 |             free(V_KMP_Table);
342 |         }
343 |     }
344 | 
345 |     return V_FindPosition;
346 | }
347 | 
348 | 
349 | /* Search data from right to left. ( One time search mode. ) */
350 | _UNUSED_ static long utstring_findR(
351 |     UT_string *s,
352 |     long P_StartPosition,   /* Start from 0. -1 means last position. */
353 |     const char *P_Needle,
354 |     size_t P_NeedleLen)
355 | {
356 |     long V_StartPosition;
357 |     long V_HaystackLen;
358 |     long *V_KMP_Table;
359 |     long V_FindPosition = -1;
360 | 
361 |     if (P_StartPosition < 0)
362 |     {
363 |         V_StartPosition = s->i + P_StartPosition;
364 |     }
365 |     else
366 |     {
367 |         V_StartPosition = P_StartPosition;
368 |     }
369 |     V_HaystackLen = V_StartPosition + 1;
370 |     if ( (V_HaystackLen >= (long) P_NeedleLen) && (P_NeedleLen > 0) )
371 |     {
372 |         V_KMP_Table = (long *)malloc(sizeof(long) * (P_NeedleLen + 1));
373 |         if (V_KMP_Table != NULL)
374 |         {
375 |             _utstring_BuildTableR(P_Needle, P_NeedleLen, V_KMP_Table);
376 | 
377 |             V_FindPosition = _utstring_findR(s->d,
378 |                                              V_HaystackLen,
379 |                                              P_Needle,
380 |                                              P_NeedleLen,
381 |                                              V_KMP_Table);
382 | 
383 |             free(V_KMP_Table);
384 |         }
385 |     }
386 | 
387 |     return V_FindPosition;
388 | }
389 | /*******************************************************************************
390 |  * end substring search functions                                              *
391 |  ******************************************************************************/
392 | 
393 | #endif /* UTSTRING_H */
394 | 


--------------------------------------------------------------------------------
/src/uthash/utarray.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2008-2014, Troy D. Hanson   http://troydhanson.github.com/uthash/
  3 | All rights reserved.
  4 | 
  5 | Redistribution and use in source and binary forms, with or without
  6 | modification, are permitted provided that the following conditions are met:
  7 | 
  8 |     * Redistributions of source code must retain the above copyright
  9 |       notice, this list of conditions and the following disclaimer.
 10 | 
 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 12 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 13 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 14 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
 15 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 16 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 17 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 18 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 19 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 20 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 21 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 22 | */
 23 | 
 24 | /* a dynamic array implementation using macros
 25 |  */
 26 | #ifndef UTARRAY_H
 27 | #define UTARRAY_H
 28 | 
 29 | #define UTARRAY_VERSION 1.9.9
 30 | 
 31 | #ifdef __GNUC__
 32 | #define _UNUSED_ __attribute__ ((__unused__))
 33 | #else
 34 | #define _UNUSED_
 35 | #endif
 36 | 
 37 | #include <stddef.h>  /* size_t */
 38 | #include <string.h>  /* memset, etc */
 39 | #include <stdlib.h>  /* exit */
 40 | 
 41 | #define oom() exit(-1)
 42 | 
 43 | typedef void (ctor_f)(void *dst, const void *src);
 44 | typedef void (dtor_f)(void *elt);
 45 | typedef void (init_f)(void *elt);
 46 | typedef struct {
 47 |     size_t sz;
 48 |     init_f *init;
 49 |     ctor_f *copy;
 50 |     dtor_f *dtor;
 51 | } UT_icd;
 52 | 
 53 | typedef struct {
 54 |     unsigned i,n;/* i: index of next available slot, n: num slots */
 55 |     UT_icd icd;  /* initializer, copy and destructor functions */
 56 |     char *d;     /* n slots of size icd->sz*/
 57 | } UT_array;
 58 | 
 59 | #define utarray_init(a,_icd) do {                                             \
 60 |   memset(a,0,sizeof(UT_array));                                               \
 61 |   (a)->icd=*_icd;                                                             \
 62 | } while(0)
 63 | 
 64 | #define utarray_done(a) do {                                                  \
 65 |   if ((a)->n) {                                                               \
 66 |     if ((a)->icd.dtor) {                                                      \
 67 |       size_t _ut_i;                                                           \
 68 |       for(_ut_i=0; _ut_i < (a)->i; _ut_i++) {                                 \
 69 |         (a)->icd.dtor(utarray_eltptr(a,_ut_i));                               \
 70 |       }                                                                       \
 71 |     }                                                                         \
 72 |     free((a)->d);                                                             \
 73 |   }                                                                           \
 74 |   (a)->n=0;                                                                   \
 75 | } while(0)
 76 | 
 77 | #define utarray_new(a,_icd) do {                                              \
 78 |   a=(UT_array*)malloc(sizeof(UT_array));                                      \
 79 |   utarray_init(a,_icd);                                                       \
 80 | } while(0)
 81 | 
 82 | #define utarray_free(a) do {                                                  \
 83 |   utarray_done(a);                                                            \
 84 |   free(a);                                                                    \
 85 | } while(0)
 86 | 
 87 | #define utarray_reserve(a,by) do {                                            \
 88 |   if (((a)->i+(by)) > ((a)->n)) {                                             \
 89 |     while(((a)->i+(by)) > ((a)->n)) { (a)->n = ((a)->n ? (2*(a)->n) : 8); }   \
 90 |     if ( ((a)->d=(char*)realloc((a)->d, (a)->n*(a)->icd.sz)) == NULL) oom();  \
 91 |   }                                                                           \
 92 | } while(0)
 93 | 
 94 | #define utarray_push_back(a,p) do {                                           \
 95 |   utarray_reserve(a,1);                                                       \
 96 |   if ((a)->icd.copy) { (a)->icd.copy( _utarray_eltptr(a,(a)->i++), p); }      \
 97 |   else { memcpy(_utarray_eltptr(a,(a)->i++), p, (a)->icd.sz); };              \
 98 | } while(0)
 99 | 
100 | #define utarray_pop_back(a) do {                                              \
101 |   if ((a)->icd.dtor) { (a)->icd.dtor( _utarray_eltptr(a,--((a)->i))); }       \
102 |   else { (a)->i--; }                                                          \
103 | } while(0)
104 | 
105 | #define utarray_extend_back(a) do {                                           \
106 |   utarray_reserve(a,1);                                                       \
107 |   if ((a)->icd.init) { (a)->icd.init(_utarray_eltptr(a,(a)->i)); }            \
108 |   else { memset(_utarray_eltptr(a,(a)->i),0,(a)->icd.sz); }                   \
109 |   (a)->i++;                                                                   \
110 | } while(0)
111 | 
112 | #define utarray_len(a) ((a)->i)
113 | 
114 | #define utarray_eltptr(a,j) (((j) < (a)->i) ? _utarray_eltptr(a,j) : NULL)
115 | #define _utarray_eltptr(a,j) ((char*)((a)->d + ((a)->icd.sz*(j) )))
116 | 
117 | #define utarray_insert(a,p,j) do {                                            \
118 |   if (j > (a)->i) utarray_resize(a,j);                                        \
119 |   utarray_reserve(a,1);                                                       \
120 |   if ((j) < (a)->i) {                                                         \
121 |     memmove( _utarray_eltptr(a,(j)+1), _utarray_eltptr(a,j),                  \
122 |              ((a)->i - (j))*((a)->icd.sz));                                   \
123 |   }                                                                           \
124 |   if ((a)->icd.copy) { (a)->icd.copy( _utarray_eltptr(a,j), p); }             \
125 |   else { memcpy(_utarray_eltptr(a,j), p, (a)->icd.sz); };                     \
126 |   (a)->i++;                                                                   \
127 | } while(0)
128 | 
129 | #define utarray_inserta(a,w,j) do {                                           \
130 |   if (utarray_len(w) == 0) break;                                             \
131 |   if (j > (a)->i) utarray_resize(a,j);                                        \
132 |   utarray_reserve(a,utarray_len(w));                                          \
133 |   if ((j) < (a)->i) {                                                         \
134 |     memmove(_utarray_eltptr(a,(j)+utarray_len(w)),                            \
135 |             _utarray_eltptr(a,j),                                             \
136 |             ((a)->i - (j))*((a)->icd.sz));                                    \
137 |   }                                                                           \
138 |   if ((a)->icd.copy) {                                                        \
139 |     size_t _ut_i;                                                             \
140 |     for(_ut_i=0;_ut_i<(w)->i;_ut_i++) {                                       \
141 |       (a)->icd.copy(_utarray_eltptr(a,j+_ut_i), _utarray_eltptr(w,_ut_i));    \
142 |     }                                                                         \
143 |   } else {                                                                    \
144 |     memcpy(_utarray_eltptr(a,j), _utarray_eltptr(w,0),                        \
145 |            utarray_len(w)*((a)->icd.sz));                                     \
146 |   }                                                                           \
147 |   (a)->i += utarray_len(w);                                                   \
148 | } while(0)
149 | 
150 | #define utarray_resize(dst,num) do {                                          \
151 |   size_t _ut_i;                                                               \
152 |   if (dst->i > (size_t)(num)) {                                               \
153 |     if ((dst)->icd.dtor) {                                                    \
154 |       for(_ut_i=num; _ut_i < dst->i; _ut_i++) {                               \
155 |         (dst)->icd.dtor(utarray_eltptr(dst,_ut_i));                           \
156 |       }                                                                       \
157 |     }                                                                         \
158 |   } else if (dst->i < (size_t)(num)) {                                        \
159 |     utarray_reserve(dst,num-dst->i);                                          \
160 |     if ((dst)->icd.init) {                                                    \
161 |       for(_ut_i=dst->i; _ut_i < num; _ut_i++) {                               \
162 |         (dst)->icd.init(utarray_eltptr(dst,_ut_i));                           \
163 |       }                                                                       \
164 |     } else {                                                                  \
165 |       memset(_utarray_eltptr(dst,dst->i),0,(dst)->icd.sz*(num-dst->i));       \
166 |     }                                                                         \
167 |   }                                                                           \
168 |   dst->i = num;                                                               \
169 | } while(0)
170 | 
171 | #define utarray_concat(dst,src) do {                                          \
172 |   utarray_inserta((dst),(src),utarray_len(dst));                              \
173 | } while(0)
174 | 
175 | #define utarray_erase(a,pos,len) do {                                         \
176 |   if ((a)->icd.dtor) {                                                        \
177 |     size_t _ut_i;                                                             \
178 |     for(_ut_i=0; _ut_i < len; _ut_i++) {                                      \
179 |       (a)->icd.dtor(utarray_eltptr((a),pos+_ut_i));                           \
180 |     }                                                                         \
181 |   }                                                                           \
182 |   if ((a)->i > (pos+len)) {                                                   \
183 |     memmove( _utarray_eltptr((a),pos), _utarray_eltptr((a),pos+len),          \
184 |             (((a)->i)-(pos+len))*((a)->icd.sz));                              \
185 |   }                                                                           \
186 |   (a)->i -= (len);                                                            \
187 | } while(0)
188 | 
189 | #define utarray_renew(a,u) do {                                               \
190 |   if (a) utarray_clear(a); \
191 |   else utarray_new((a),(u));   \
192 | } while(0)
193 | 
194 | #define utarray_clear(a) do {                                                 \
195 |   if ((a)->i > 0) {                                                           \
196 |     if ((a)->icd.dtor) {                                                      \
197 |       size_t _ut_i;                                                           \
198 |       for(_ut_i=0; _ut_i < (a)->i; _ut_i++) {                                 \
199 |         (a)->icd.dtor(utarray_eltptr(a,_ut_i));                               \
200 |       }                                                                       \
201 |     }                                                                         \
202 |     (a)->i = 0;                                                               \
203 |   }                                                                           \
204 | } while(0)
205 | 
206 | #define utarray_sort(a,cmp) do {                                              \
207 |   qsort((a)->d, (a)->i, (a)->icd.sz, cmp);                                    \
208 | } while(0)
209 | 
210 | #define utarray_find(a,v,cmp) bsearch((v),(a)->d,(a)->i,(a)->icd.sz,cmp)
211 | 
212 | #define utarray_front(a) (((a)->i) ? (_utarray_eltptr(a,0)) : NULL)
213 | #define utarray_next(a,e) (((e)==NULL) ? utarray_front(a) : ((((a)->i) > (utarray_eltidx(a,e)+1)) ? _utarray_eltptr(a,utarray_eltidx(a,e)+1) : NULL))
214 | #define utarray_prev(a,e) (((e)==NULL) ? utarray_back(a) : ((utarray_eltidx(a,e) > 0) ? _utarray_eltptr(a,utarray_eltidx(a,e)-1) : NULL))
215 | #define utarray_back(a) (((a)->i) ? (_utarray_eltptr(a,(a)->i-1)) : NULL)
216 | #define utarray_eltidx(a,e) (((char*)(e) >= (char*)((a)->d)) ? (((char*)(e) - (char*)((a)->d))/(size_t)(a)->icd.sz) : -1)
217 | 
218 | /* last we pre-define a few icd for common utarrays of ints and strings */
219 | static void utarray_str_cpy(void *dst, const void *src) {
220 |   char **_src = (char**)src, **_dst = (char**)dst;
221 |   *_dst = (*_src == NULL) ? NULL : strdup(*_src);
222 | }
223 | static void utarray_str_dtor(void *elt) {
224 |   char **eltc = (char**)elt;
225 |   if (*eltc) free(*eltc);
226 | }
227 | static const UT_icd ut_str_icd _UNUSED_ = {sizeof(char*),NULL,utarray_str_cpy,utarray_str_dtor};
228 | static const UT_icd ut_int_icd _UNUSED_ = {sizeof(int),NULL,NULL,NULL};
229 | static const UT_icd ut_ptr_icd _UNUSED_ = {sizeof(void*),NULL,NULL,NULL};
230 | 
231 | 
232 | #endif /* UTARRAY_H */
233 | 


--------------------------------------------------------------------------------
/tests/remove_prefixes_tests.c:
--------------------------------------------------------------------------------
  1 | #ifdef __linux
  2 |   #define _GNU_SOURCE 
  3 | #endif
  4 | #include "minunit.h"
  5 | #include <stdio.h>
  6 | #include <stdlib.h>
  7 | #include <string.h>
  8 | #include "libsastrawi.h"
  9 | #include "sastrawi/remove_prefixes.h"
 10 | #include "dbg.h"
 11 | 
 12 | char *test_remove_complex_prefix(char *stemable_word, char *expected_stemmed_word, char *expected_removed_part, PREFIX_REMOVER fn)
 13 | {  
 14 |   char *stemmed_word = NULL;
 15 |   char *removed_part = NULL;
 16 | 
 17 |   int rc = fn(stemable_word, &stemmed_word, &removed_part);
 18 |   debug("word: %s, expected stemmed word: %s, actual stemmed word: %s, expected removed part: %s, actual removed part: %s", stemable_word, expected_stemmed_word, stemmed_word, expected_removed_part, removed_part);
 19 |   mu_assert(rc == 1, "failed to stem");
 20 |   mu_assert(strcmp(expected_stemmed_word, stemmed_word) == 0, "failed while asserting stemmed word");
 21 |   mu_assert(strcmp(expected_removed_part, removed_part) == 0, "failed while asserting removed part");
 22 |   free(stemmed_word);
 23 |   free(removed_part);
 24 |   return NULL;
 25 | }
 26 | 
 27 | char *test_remove_plain_prefix_returns_0_if_word_notin_dictionary() 
 28 | {
 29 |   char *stemmed_word = NULL; 
 30 |   char *removed_part = NULL;
 31 | 
 32 |   int rc = remove_plain_prefix("dipertikai", &stemmed_word, &removed_part);
 33 | 
 34 |   mu_assert(rc == 0, "successfully stems but not in dictionary");
 35 |   mu_assert(strcmp("pertikai", stemmed_word) == 0, "we expect 'pertikai' as the stemmed word");
 36 |   mu_assert(strcmp("di", removed_part) == 0, "we expect 'di' as the removed part");
 37 | 
 38 |   return NULL;
 39 | }
 40 | 
 41 | char *test_remove_plain_prefix_di() 
 42 | {
 43 |   char *stemmed_word = NULL; 
 44 |   char *removed_part = NULL;
 45 | 
 46 |   int rc = remove_plain_prefix("dicinta", &stemmed_word, &removed_part);
 47 | 
 48 |   mu_assert(rc == 1, "successfully stems");
 49 |   mu_assert(strcmp("cinta", stemmed_word) == 0, "we expect 'sana' as the stemmed word");
 50 |   mu_assert(strcmp("di", removed_part) == 0, "we expect 'di' as the removed part");
 51 | 
 52 |   return NULL;
 53 | }
 54 | 
 55 | char *test_remove_plain_prefix_ke() 
 56 | {
 57 |   char *stemmed_word = NULL; 
 58 |   char *removed_part = NULL;
 59 | 
 60 |   int rc = remove_plain_prefix("kesana", &stemmed_word, &removed_part);
 61 | 
 62 |   mu_assert(rc == 1, "successfully stems");
 63 |   mu_assert(strcmp("sana", stemmed_word) == 0, "we expect 'sana' as the stemmed word");
 64 |   mu_assert(strcmp("ke", removed_part) == 0, "we expect 'ke' as the removed part");
 65 | 
 66 |   return NULL;
 67 | }
 68 | 
 69 | char *test_remove_plain_prefix_se() 
 70 | {
 71 |   char *stemmed_word = NULL; 
 72 |   char *removed_part = NULL;
 73 | 
 74 |   int rc = remove_plain_prefix("sejenis", &stemmed_word, &removed_part);
 75 | 
 76 |   mu_assert(rc == 1, "successfully stems");
 77 |   mu_assert(strcmp("jenis", stemmed_word) == 0, "we expect 'jenis' as the stemmed word");
 78 |   mu_assert(strcmp("se", removed_part) == 0, "we expect 'se' as the removed part");
 79 | 
 80 |   return NULL;
 81 | }
 82 | 
 83 | 
 84 | 
 85 | char *test_remove_complex_prefix_rule1_a() 
 86 | {
 87 |   char *stemmed_word = NULL;
 88 |   char *removed_part = NULL;
 89 | 
 90 |   int rc = remove_complex_prefix_rule1("beria", &stemmed_word, &removed_part);
 91 |   debug("stem word: beria, expected: ia, actual: %s", stemmed_word);
 92 |   mu_assert(rc == 1, "sucessfully stemmed");
 93 |   mu_assert(strcmp("ia", stemmed_word) == 0, "it stems to ia");
 94 |   mu_assert(strcmp("ber", removed_part) == 0, "remove part should be ber");
 95 |   free(stemmed_word);
 96 |   free(removed_part);
 97 | 
 98 |   return NULL;
 99 | }
100 | 
/* Rule 1 on "berakit": expects return 1, stem "rakit", removed part "be". */
char *test_remove_complex_prefix_rule1_b() 
{
  char *stem = NULL;
  char *removed = NULL;
  int status = remove_complex_prefix_rule1("berakit", &stem, &removed);

  debug("stem word: berakit, expected: rakit, actual: %s", stem);

  mu_assert(status == 1, "sucessfully stemmed");
  mu_assert(strcmp("rakit", stem) == 0, "it stems to rakit");
  mu_assert(strcmp("be", removed) == 0, "remove part should be be");

  free(removed);
  free(stem);
  return NULL;
}
116 | 
117 | char *test_remove_complex_prefix_rule2() 
118 | {
119 |   return test_remove_complex_prefix("berkop", "kop", "ber", remove_complex_prefix_rule2);
120 | }
121 | 
/* Rule 2 must leave "berdaerah" untouched: return 0 and stem equal to the input. */
char *test_remove_complex_prefix_rule2_excludes_er() 
{
  char *input = "berdaerah";
  char *stem = NULL;
  char *removed = NULL;
  int status = remove_complex_prefix_rule2(input, &stem, &removed);

  debug("stem word: %s, expected: berdaerah, actual: %s", input, stem);

  mu_assert(status == 0, "does not stem");
  mu_assert(strcmp("berdaerah", stem) == 0, "it does not stem it");

  free(removed);
  free(stem);
  return NULL;
}
137 | 
/*
 * Rule 3: "berdaerah" must stem to "daerah" with "ber" removed (return 1),
 * while "bertabur" must be rejected (return 0).
 */
char *test_remove_complex_prefix_rule3_only_includes_er() 
{
  char *accepted = "berdaerah";
  char *rejected = "bertabur";
  char *stem = NULL;
  char *removed = NULL;
  int status;

  /* Positive case. */
  status = remove_complex_prefix_rule3(accepted, &stem, &removed);
  debug("stem word: %s, expected: daerah, actual: %s", accepted, stem);
  mu_assert(status == 1, "sucessfully stemmed");
  mu_assert(strcmp("daerah", stem) == 0, "it stems to daerah");
  mu_assert(strcmp("ber", removed) == 0, "remove part should be ber");
  free(stem);
  free(removed);

  /* Negative case: only the return code is checked. */
  status = remove_complex_prefix_rule3(rejected, &stem, &removed);
  mu_assert(status == 0, "cannot stem");
  free(stem);
  free(removed);

  return NULL;
}
160 | 
/*
 * Rule 4: "belajar" must stem to "ajar" with "bel" removed (return 1),
 * while "bertabur" must be rejected (return 0).
 */
char *test_remove_complex_prefix_rule4() 
{
  char *accepted = "belajar";
  char *rejected = "bertabur";
  char *stem = NULL;
  char *removed = NULL;
  int status;

  /* Positive case. */
  status = remove_complex_prefix_rule4(accepted, &stem, &removed);
  debug("stem word: %s, expected: ajar, actual: %s", accepted, stem);
  mu_assert(status == 1, "sucessfully stemmed");
  mu_assert(strcmp("ajar", stem) == 0, "it stems to ajar");
  mu_assert(strcmp("bel", removed) == 0, "remove part should be bel");
  free(stem);
  free(removed);

  /* Negative case: only the return code is checked. */
  status = remove_complex_prefix_rule4(rejected, &stem, &removed);
  mu_assert(status == 0, "cannot stem");
  free(stem);
  free(removed);

  return NULL;
}
183 | 
/*
 * Rule 5: "bekerja" must stem to "kerja" with "be" removed (return 1),
 * while "berlari" must be rejected (return 0).
 */
char *test_remove_complex_prefix_rule5() 
{
  char *accepted = "bekerja";
  char *rejected = "berlari";
  char *stem = NULL;
  char *removed = NULL;
  int status;

  /* Positive case. */
  status = remove_complex_prefix_rule5(accepted, &stem, &removed);
  debug("stem word: %s, expected: kerja, actual: %s", accepted, stem);
  mu_assert(status == 1, "sucessfully stemmed");
  mu_assert(strcmp("kerja", stem) == 0, "it stems to kerja");
  mu_assert(strcmp("be", removed) == 0, "remove part should be be");
  free(stem);
  free(removed);

  /* Negative case: only the return code is checked. */
  status = remove_complex_prefix_rule5(rejected, &stem, &removed);
  mu_assert(status == 0, "cannot stem");
  free(stem);
  free(removed);

  return NULL;
}
206 | 
/*
 * Rule 6 (first variant): "terancam" must stem to "ancam" with "ter" removed
 * (return 1), while "terbalik" must be rejected (return 0).
 */
char *test_remove_complex_prefix_rule6a() 
{
  char *stemable_word = "terancam";
  char *nonstemable_word = "terbalik";
  char *stemmed_word = NULL;
  char *removed_part = NULL;

  int rc = remove_complex_prefix_rule6(stemable_word, &stemmed_word, &removed_part);
  debug("stem word: %s, expected: ancam, actual: %s", stemable_word, stemmed_word);
  mu_assert(rc == 1, "sucessfully stemmed");
  mu_assert(strcmp("ancam", stemmed_word) == 0, "it stems to ancam");
  /* Message now matches the asserted value ("ter"; it used to say "be"). */
  mu_assert(strcmp("ter", removed_part) == 0, "remove part should be ter");
  free(stemmed_word);
  free(removed_part);
 
  /* Negative case: only the return code is checked. */
  rc = remove_complex_prefix_rule6(nonstemable_word, &stemmed_word, &removed_part);
  mu_assert(rc == 0, "cannot stem");
  free(stemmed_word);
  free(removed_part);

  return NULL;
}
229 | 
/*
 * Rule 6 (second variant): "teracun" must stem to "racun" with "te" removed
 * (return 1), while "terbalik" must be rejected (return 0).
 */
char *test_remove_complex_prefix_rule6b() 
{
  char *accepted = "teracun";
  char *rejected = "terbalik";
  char *stem = NULL;
  char *removed = NULL;
  int status;

  /* Positive case. */
  status = remove_complex_prefix_rule6(accepted, &stem, &removed);
  debug("stem word: %s, expected: racun, actual: %s", accepted, stem);
  mu_assert(status == 1, "sucessfully stemmed");
  mu_assert(strcmp("racun", stem) == 0, "it stems to racun");
  mu_assert(strcmp("te", removed) == 0, "remove part should be te");
  free(stem);
  free(removed);

  /* Negative case: only the return code is checked. */
  status = remove_complex_prefix_rule6(rejected, &stem, &removed);
  mu_assert(status == 0, "cannot stem");
  free(stem);
  free(removed);

  return NULL;
}
252 | 
/* Rule 7 on "terperuk": expects return 1, stem "peruk", removed part "ter". */
char *test_remove_complex_prefix_rule7() 
{
  char *input = "terperuk";
  char *stem = NULL;
  char *removed = NULL;
  int status = remove_complex_prefix_rule7(input, &stem, &removed);

  debug("stem word: %s, expected: peruk, actual: %s", input, stem);

  mu_assert(status == 1, "sucessfully stemmed");
  mu_assert(strcmp("peruk", stem) == 0, "it stems to peruk");
  mu_assert(strcmp("ter", removed) == 0, "remove part should be ter");

  free(removed);
  free(stem);
  return NULL;
}
268 | 
/* Rule 8 on "tertangkap": expects return 1, stem "tangkap", removed part "ter". */
char *test_remove_complex_prefix_rule8() 
{
  char *input = "tertangkap";
  char *stem = NULL;
  char *removed = NULL;
  int status = remove_complex_prefix_rule8(input, &stem, &removed);

  debug("stem word: %s, expected: tangkap, actual: %s", input, stem);

  mu_assert(status == 1, "sucessfully stemmed");
  mu_assert(strcmp("tangkap", stem) == 0, "it stems to tangkap");
  mu_assert(strcmp("ter", removed) == 0, "remove part should be ter");

  free(removed);
  free(stem);
  return NULL;
}
284 | 
/* Rule 8 must leave "terterbang" untouched: return 0 and stem equal to the input. */
char *test_remove_complex_prefix_rule8_excludes_er() 
{
  char *input = "terterbang";
  char *stem = NULL;
  char *removed = NULL;
  int status = remove_complex_prefix_rule8(input, &stem, &removed);

  debug("stem word: %s, expected: terterbang, actual: %s", input, stem);

  mu_assert(status == 0, "does not stem");
  mu_assert(strcmp("terterbang", stem) == 0, "it does not stem it");

  free(removed);
  free(stem);
  return NULL;
}
300 | 
301 | char *test_remove_complex_prefix_rule9() 
302 | {
303 |   return test_remove_complex_prefix("teterbang", "terbang", "te",  remove_complex_prefix_rule9);
304 | }
305 | 
306 | char *test_remove_complex_prefix_rule10_l() 
307 | {
308 |   return test_remove_complex_prefix("melalu", "lalu", "me",  remove_complex_prefix_rule10);
309 | }
310 | 
311 | char *test_remove_complex_prefix_rule10_r() 
312 | {
313 |   return test_remove_complex_prefix("meracun", "racun", "me",  remove_complex_prefix_rule10);
314 | }
315 | 
316 | char *test_remove_complex_prefix_rule10_w() 
317 | {
318 |   return test_remove_complex_prefix("mewarna", "warna", "me",  remove_complex_prefix_rule10);
319 | }
320 | 
321 | char *test_remove_complex_prefix_rule10_y() 
322 | {
323 |   return test_remove_complex_prefix("meyakin", "yakin", "me",  remove_complex_prefix_rule10);
324 | }
325 | 
326 | char *test_remove_complex_prefix_rule11_f() 
327 | {
328 |   return test_remove_complex_prefix("memfasilitas", "fasilitas", "mem",  remove_complex_prefix_rule11);
329 | }
330 | 
331 | char *test_remove_complex_prefix_rule11_b() 
332 | {
333 |   return test_remove_complex_prefix("membantu", "bantu", "mem",  remove_complex_prefix_rule11);
334 | }
335 | 
336 | char *test_remove_complex_prefix_rule11_v() 
337 | {
338 |   return test_remove_complex_prefix("memvonis", "vonis", "mem",  remove_complex_prefix_rule11);
339 | }
340 | 
/*
 * Rule 11 on a word it does not apply to ("terbalik"): expects return 0,
 * the original word as the stem, and an empty removed part.
 */
char *test_remove_complex_prefix_rule11_unstemmable() 
{
  char *input = "terbalik";
  char *stem = NULL;
  char *removed = NULL;
  int status = remove_complex_prefix_rule11(input, &stem, &removed);

  debug("word: %s, expected: %s, actual: %s, expected removed: %s, actual removed : %s",
      input, input, stem, "", removed);

  mu_assert(status == 0, "should not stem");
  mu_assert(strcmp(input, stem) == 0, "it returns the original word");
  mu_assert(strcmp("", removed) == 0, "it returns an empty string as the removed part");

  free(removed);
  free(stem);
  return NULL;
}
361 | 
362 | char *test_remove_complex_prefix_rule12() 
363 | {
364 |   return test_remove_complex_prefix("mempengaruh", "pengaruh", "mem",  remove_complex_prefix_rule12);
365 | }
366 | 
367 | char *test_remove_complex_prefix_rule13a() 
368 | {
369 |   return test_remove_complex_prefix("memasuk", "masuk", "me",  remove_complex_prefix_rule13);
370 | }
371 | 
372 | char *test_remove_complex_prefix_rule13b() 
373 | {
374 |   return test_remove_complex_prefix("memakai", "pakai", "me",  remove_complex_prefix_rule13);
375 | }
376 | 
377 | char *test_remove_complex_prefix_rule14_c() 
378 | {
379 |   return test_remove_complex_prefix("mencantum", "cantum", "men",  remove_complex_prefix_rule14);
380 | }
381 | 
382 | char *test_remove_complex_prefix_rule14_d() 
383 | {
384 |   return test_remove_complex_prefix("menduduk", "duduk", "men",  remove_complex_prefix_rule14);
385 | }
386 | 
387 | char *test_remove_complex_prefix_rule14_j() 
388 | {
389 |   return test_remove_complex_prefix("menjemput", "jemput", "men",  remove_complex_prefix_rule14);
390 | }
391 | 
392 | char *test_remove_complex_prefix_rule14_s() 
393 | {
394 |   return test_remove_complex_prefix("mensyukur", "syukur", "men",  remove_complex_prefix_rule14);
395 | }
396 | 
397 | char *test_remove_complex_prefix_rule14_t() 
398 | {
399 |   return test_remove_complex_prefix("mentaat", "taat", "men",  remove_complex_prefix_rule14);
400 | }
401 | 
402 | char *test_remove_complex_prefix_rule14_z() 
403 | {
404 |   return test_remove_complex_prefix("menziarah", "ziarah", "men",  remove_complex_prefix_rule14);
405 | }
406 | 
407 | char *test_remove_complex_prefix_rule15a() 
408 | {
409 |   return test_remove_complex_prefix("menikmat", "nikmat", "me",  remove_complex_prefix_rule15);
410 | }
411 | 
412 | char *test_remove_complex_prefix_rule15b() 
413 | {
414 |   return test_remove_complex_prefix("menulis", "tulis", "me",  remove_complex_prefix_rule15);
415 | }
416 | 
417 | char *test_remove_complex_prefix_rule16_g() 
418 | {
419 |   return test_remove_complex_prefix("mengguna", "guna", "meng",  remove_complex_prefix_rule16);
420 | }
421 | 
422 | char *test_remove_complex_prefix_rule16_h() 
423 | {
424 |   return test_remove_complex_prefix("menghambat", "hambat", "meng",  remove_complex_prefix_rule16);
425 | }
426 | 
427 | char *test_remove_complex_prefix_rule16_q() 
428 | {
429 |   return test_remove_complex_prefix("mengqasar", "qasar", "meng",  remove_complex_prefix_rule16);
430 | }
431 | 
432 | char *test_remove_complex_prefix_rule16_k() 
433 | {
434 |   return test_remove_complex_prefix("mengkritik", "kritik", "meng",  remove_complex_prefix_rule16);
435 | }
436 | 
437 | char *test_remove_complex_prefix_rule17a() 
438 | {
439 |   return test_remove_complex_prefix("mengerat", "erat", "meng",  remove_complex_prefix_rule17);
440 | }
441 | 
442 | char *test_remove_complex_prefix_rule17b() 
443 | {
444 |   return test_remove_complex_prefix("mengecil", "kecil", "meng",  remove_complex_prefix_rule17);
445 | }
446 | 
447 | char *test_remove_complex_prefix_rule17c() 
448 | {
449 |   return test_remove_complex_prefix("mengecat", "cat", "menge",  remove_complex_prefix_rule17);
450 | }
451 | 
452 | char *test_remove_complex_prefix_rule17d() 
453 | {
454 |   return test_remove_complex_prefix("mengiang", "ngiang", "me",  remove_complex_prefix_rule17);
455 | }
456 | 
457 | char *test_remove_complex_prefix_rule18a() 
458 | {
459 |   return test_remove_complex_prefix("menyala", "nyala", "me",  remove_complex_prefix_rule18);
460 | }
461 | 
462 | char *test_remove_complex_prefix_rule18b() 
463 | {
464 |   return test_remove_complex_prefix("menyapu", "sapu", "meny",  remove_complex_prefix_rule18);
465 | }
466 | 
467 | char *test_remove_complex_prefix_rule19_1() 
468 | {
469 |   return test_remove_complex_prefix("memproteksi", "proteksi", "mem",  remove_complex_prefix_rule19);
470 | }
471 | 
472 | char *test_remove_complex_prefix_rule19_2() 
473 | {
474 |   return test_remove_complex_prefix("mempatroli", "patroli", "mem",  remove_complex_prefix_rule19);
475 | }
476 | 
477 | char *test_remove_complex_prefix_rule20_1() 
478 | {
479 |   return test_remove_complex_prefix("pewarna", "warna", "pe",  remove_complex_prefix_rule20);
480 | }
481 | 
482 | char *test_remove_complex_prefix_rule20_2() 
483 | {
484 |   return test_remove_complex_prefix("peyoga", "yoga", "pe",  remove_complex_prefix_rule20);
485 | }
486 | 
/*
 * remove_prefixes("mewarnai") must return 0 while still producing "warnai"
 * as the stemmed word.
 */
char *test_remove_prefixes_when_cannot_stem_to_word_in_dict()
{
  /* Start from NULL so the debug/strcmp below never read an indeterminate
     pointer if remove_prefixes leaves the output untouched. */
  char *stemmed_word = NULL;

  int rc = remove_prefixes("mewarnai", &stemmed_word);
  debug("word: mewarnai, expected stemmed word: warnai, actual stemmed word: %s", stemmed_word);
  mu_assert(rc == 0, "it changes the word, but its not done");
  mu_assert(strcmp("warnai", stemmed_word) == 0, "failed while asserting stemmed word");

  /* Release the output, as the rule tests in this file do
     (assumes remove_prefixes heap-allocates it like the rule functions). */
  free(stemmed_word);
  return NULL;
}
497 | 
498 | char *all_tests() /* runs every prefix-removal test of this file, in the order listed */
499 | {
500 |   mu_suite_start();
501 | 
502 |   dictionary_load(dictionary_fullpath("data/kata-dasar.txt")); /* NOTE(review): the load result is ignored and the path pointer is not kept -- confirm neither needs checking or freeing */
503 | 
504 |   mu_run_test(test_remove_plain_prefix_returns_0_if_word_notin_dictionary);
505 |   mu_run_test(test_remove_plain_prefix_di);
506 |   mu_run_test(test_remove_plain_prefix_ke);
507 |   mu_run_test(test_remove_plain_prefix_se);
508 |   mu_run_test(test_remove_complex_prefix_rule1_a);
509 |   mu_run_test(test_remove_complex_prefix_rule1_b);
510 |   mu_run_test(test_remove_complex_prefix_rule2);
511 |   mu_run_test(test_remove_complex_prefix_rule2_excludes_er);
512 |   mu_run_test(test_remove_complex_prefix_rule3_only_includes_er);
513 |   mu_run_test(test_remove_complex_prefix_rule4);
514 |   mu_run_test(test_remove_complex_prefix_rule5);
515 |   mu_run_test(test_remove_complex_prefix_rule6a);
516 |   mu_run_test(test_remove_complex_prefix_rule6b);
517 |   mu_run_test(test_remove_complex_prefix_rule7);
518 |   mu_run_test(test_remove_complex_prefix_rule8);
519 |   mu_run_test(test_remove_complex_prefix_rule8_excludes_er);
520 |   mu_run_test(test_remove_complex_prefix_rule9);
521 |   mu_run_test(test_remove_complex_prefix_rule10_l);
522 |   mu_run_test(test_remove_complex_prefix_rule10_r);
523 |   mu_run_test(test_remove_complex_prefix_rule10_w);
524 |   mu_run_test(test_remove_complex_prefix_rule10_y);
525 |   mu_run_test(test_remove_complex_prefix_rule11_f);
526 |   mu_run_test(test_remove_complex_prefix_rule11_b);
527 |   mu_run_test(test_remove_complex_prefix_rule11_v);
528 |   mu_run_test(test_remove_complex_prefix_rule11_unstemmable);
529 |   mu_run_test(test_remove_complex_prefix_rule12);
530 |   mu_run_test(test_remove_complex_prefix_rule13a);
531 |   mu_run_test(test_remove_complex_prefix_rule13b);
532 |   mu_run_test(test_remove_complex_prefix_rule14_c);
533 |   mu_run_test(test_remove_complex_prefix_rule14_d);
534 |   mu_run_test(test_remove_complex_prefix_rule14_j);
535 |   mu_run_test(test_remove_complex_prefix_rule14_s);
536 |   mu_run_test(test_remove_complex_prefix_rule14_t);
537 |   mu_run_test(test_remove_complex_prefix_rule14_z);
538 |   mu_run_test(test_remove_complex_prefix_rule15a);
539 |   mu_run_test(test_remove_complex_prefix_rule15b);
540 |   mu_run_test(test_remove_complex_prefix_rule16_g);
541 |   mu_run_test(test_remove_complex_prefix_rule16_h);
542 |   mu_run_test(test_remove_complex_prefix_rule16_q);
543 |   mu_run_test(test_remove_complex_prefix_rule16_k);
544 |   mu_run_test(test_remove_complex_prefix_rule17a);
545 |   mu_run_test(test_remove_complex_prefix_rule17b);
546 |   mu_run_test(test_remove_complex_prefix_rule17c);
547 |   mu_run_test(test_remove_complex_prefix_rule17d);
548 |   mu_run_test(test_remove_complex_prefix_rule18a);
549 |   mu_run_test(test_remove_complex_prefix_rule18b);
550 |   mu_run_test(test_remove_complex_prefix_rule19_1);
551 |   mu_run_test(test_remove_complex_prefix_rule19_2);
552 |   mu_run_test(test_remove_complex_prefix_rule20_1);
553 |   mu_run_test(test_remove_complex_prefix_rule20_2);
554 |   mu_run_test(test_remove_prefixes_when_cannot_stem_to_word_in_dict);
555 |   return NULL; /* reached only after every mu_run_test line above; presumably NULL means "no failure message" -- confirm in minunit.h */
556 | }
557 | 
558 | RUN_TESTS(all_tests); /* hands the suite function to the RUN_TESTS macro; presumably (minunit.h) this expands to the program entry point -- confirm */
559 | 


--------------------------------------------------------------------------------
/src/uthash/utlist.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2007-2014, Troy D. Hanson   http://troydhanson.github.com/uthash/
  3 | All rights reserved.
  4 | 
  5 | Redistribution and use in source and binary forms, with or without
  6 | modification, are permitted provided that the following conditions are met:
  7 | 
  8 |     * Redistributions of source code must retain the above copyright
  9 |       notice, this list of conditions and the following disclaimer.
 10 | 
 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 12 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 13 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 14 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
 15 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 16 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 17 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 18 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 19 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 20 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 21 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 22 | */
 23 | 
 24 | #ifndef UTLIST_H
 25 | #define UTLIST_H
 26 | 
 27 | #define UTLIST_VERSION 1.9.9
 28 | 
 29 | #include <assert.h>
 30 | 
 31 | /*
 32 |  * This file contains macros to manipulate singly and doubly-linked lists.
 33 |  *
 34 |  * 1. LL_ macros:  singly-linked lists.
 35 |  * 2. DL_ macros:  doubly-linked lists.
 36 |  * 3. CDL_ macros: circular doubly-linked lists.
 37 |  *
 38 |  * To use singly-linked lists, your structure must have a "next" pointer.
 39 |  * To use doubly-linked lists, your structure must have "prev" and "next" pointers.
 40 |  * Either way, the pointer to the head of the list must be initialized to NULL.
 41 |  *
 42 |  * ---------------- EXAMPLE -------------------------
 43 |  * struct item {
 44 |  *      int id;
 45 |  *      struct item *prev, *next;
 46 |  * };
 47 |  *
 48 |  * struct item *list = NULL;
 49 |  *
 50 |  * int main() {
 51 |  *      struct item *item;
 52 |  *      ... allocate and populate item ...
 53 |  *      DL_APPEND(list, item);
 54 |  * }
 55 |  * --------------------------------------------------
 56 |  *
 57 |  * For doubly-linked lists, the append and delete macros are O(1)
 58 |  * For singly-linked lists, append and delete are O(n) but prepend is O(1)
 59 |  * The sort macro is O(n log(n)) for all types of single/double/circular lists.
 60 |  */
 61 | 
 62 | /* These macros use decltype or the earlier __typeof GNU extension.
 63 |    As decltype is only available in newer compilers (VS2010 or gcc 4.3+
 64 |    when compiling c++ code), this code uses whatever method is needed
 65 |    or, for VS2008 where neither is available, uses casting workarounds. */
 66 | #ifdef _MSC_VER            /* MS compiler */
 67 | #if _MSC_VER >= 1600 && defined(__cplusplus)  /* VS2010 or newer in C++ mode */
 68 | #define LDECLTYPE(x) decltype(x)
 69 | #else                     /* VS2008 or older (or VS2010 in C mode) */
 70 | #define NO_DECLTYPE /* switches the helper macros further down to their char*-based variants */
 71 | #define LDECLTYPE(x) char* /* temporaries are declared as plain char* because the real type cannot be named */
 72 | #endif
 73 | #elif defined(__ICCARM__) /* presumably the IAR ARM compiler; given the same no-decltype treatment as old MSVC */
 74 | #define NO_DECLTYPE
 75 | #define LDECLTYPE(x) char*
 76 | #else                      /* GNU, Sun and other compilers */
 77 | #define LDECLTYPE(x) __typeof(x) /* LDECLTYPE(expr) names the type of expr, used to declare matching temporaries */
 78 | #endif
 79 | 
 80 | /* for VS2008 we use some workarounds to get around the lack of decltype,
 81 |  * namely, we always reassign our tmp variable to the list head if we need
 82 |  * to dereference its prev/next pointers, and save/restore the real head.*/
 83 | #ifdef NO_DECLTYPE /* temporaries are char*, so member access has to go through the (typed) list head */
 84 | #define _SV(elt,list) _tmp = (char*)(list); {char **_alias = (char**)&(list); *_alias = (elt); } /* save head in _tmp (the expanding macro must declare it), then make head point at elt */
 85 | #define _NEXT(elt,list,next) ((char*)((list)->next)) /* reads the link through the head, which _SV() has just pointed at elt */
 86 | #define _NEXTASGN(elt,list,to,next) { char **_alias = (char**)&((list)->next); *_alias=(char*)(to); } /* writes the head's next link through a char** alias */
 87 | /* #define _PREV(elt,list,prev) ((char*)((list)->prev)) */
 88 | #define _PREVASGN(elt,list,to,prev) { char **_alias = (char**)&((list)->prev); *_alias=(char*)(to); } /* writes the head's prev link through a char** alias */
 89 | #define _RS(list) { char **_alias = (char**)&(list); *_alias=_tmp; } /* restore the head that _SV() stashed in _tmp */
 90 | #define _CASTASGN(a,b) { char **_alias = (char**)&(a); *_alias=(char*)(b); } /* a = b without having to name a's type */
 91 | #else /* typed temporaries are available: work on elt directly, so save/restore expand to nothing */
 92 | #define _SV(elt,list)
 93 | #define _NEXT(elt,list,next) ((elt)->next)
 94 | #define _NEXTASGN(elt,list,to,next) ((elt)->next)=(to)
 95 | /* #define _PREV(elt,list,prev) ((elt)->prev) */
 96 | #define _PREVASGN(elt,list,to,prev) ((elt)->prev)=(to)
 97 | #define _RS(list)
 98 | #define _CASTASGN(a,b) (a)=(b)
 99 | #endif
100 | 
101 | /******************************************************************************
102 |  * The sort macro is an adaptation of Simon Tatham's O(n log(n)) mergesort    *
103 |  * Unwieldy variable names used here to avoid shadowing passed-in variables.  *
104 |  *****************************************************************************/
105 | #define LL_SORT(list, cmp)                                                                     \
106 |     LL_SORT2(list, cmp, next) /* short form of LL_SORT2 for structures whose link field is named next */
107 | 
108 | #define LL_SORT2(list, cmp, next)  /* merge sort; "next" names the link field */               \
109 | do {                                                                                           \
110 |   LDECLTYPE(list) _ls_p;                                                                       \
111 |   LDECLTYPE(list) _ls_q;                                                                       \
112 |   LDECLTYPE(list) _ls_e;                                                                       \
113 |   LDECLTYPE(list) _ls_tail;  /* NOTE(review): no _tmp declared for _SV() if NO_DECLTYPE */      \
114 |   int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping;                       \
115 |   if (list) {                                                                                  \
116 |     _ls_insize = 1;  /* run length; doubles after each pass */                                 \
117 |     _ls_looping = 1;                                                                           \
118 |     while (_ls_looping) {                                                                      \
119 |       _CASTASGN(_ls_p,list);                                                                   \
120 |       list = NULL;  /* rebuild the list from scratch on every pass */                          \
121 |       _ls_tail = NULL;                                                                         \
122 |       _ls_nmerges = 0;                                                                         \
123 |       while (_ls_p) {                                                                          \
124 |         _ls_nmerges++;                                                                         \
125 |         _ls_q = _ls_p;                                                                         \
126 |         _ls_psize = 0;                                                                         \
127 |         for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) {                                         \
128 |           _ls_psize++;                                                                         \
129 |           _SV(_ls_q,list); _ls_q = _NEXT(_ls_q,list,next); _RS(list);                          \
130 |           if (!_ls_q) break;                                                                   \
131 |         }                                                                                      \
132 |         _ls_qsize = _ls_insize;                                                                \
133 |         while (_ls_psize > 0 || (_ls_qsize > 0 && _ls_q)) {                                    \
134 |           if (_ls_psize == 0) {                                                                \
135 |             _ls_e = _ls_q; _SV(_ls_q,list); _ls_q =                                            \
136 |               _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--;                                  \
137 |           } else if (_ls_qsize == 0 || !_ls_q) {                                               \
138 |             _ls_e = _ls_p; _SV(_ls_p,list); _ls_p =                                            \
139 |               _NEXT(_ls_p,list,next); _RS(list); _ls_psize--;                                  \
140 |           } else if (cmp(_ls_p,_ls_q) <= 0) {  /* on a tie the p-run element goes first */     \
141 |             _ls_e = _ls_p; _SV(_ls_p,list); _ls_p =                                            \
142 |               _NEXT(_ls_p,list,next); _RS(list); _ls_psize--;                                  \
143 |           } else {                                                                             \
144 |             _ls_e = _ls_q; _SV(_ls_q,list); _ls_q =                                            \
145 |               _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--;                                  \
146 |           }                                                                                    \
147 |           if (_ls_tail) {                                                                      \
148 |             _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list);                \
149 |           } else {                                                                             \
150 |             _CASTASGN(list,_ls_e);                                                             \
151 |           }                                                                                    \
152 |           _ls_tail = _ls_e;                                                                    \
153 |         }                                                                                      \
154 |         _ls_p = _ls_q;                                                                         \
155 |       }                                                                                        \
156 |       if (_ls_tail) {                                                                          \
157 |         _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,NULL,next); _RS(list);                     \
158 |       }                                                                                        \
159 |       if (_ls_nmerges <= 1) {  /* at most one merge happened: stop looping */                  \
160 |         _ls_looping=0;                                                                         \
161 |       }                                                                                        \
162 |       _ls_insize *= 2;                                                                         \
163 |     }                                                                                          \
164 |   }                                                                                            \
165 | } while (0)
166 | 
167 | 
168 | #define DL_SORT(list, cmp)                                                                     \
169 |     DL_SORT2(list, cmp, prev, next) /* short form of DL_SORT2 for structures whose link fields are named prev and next */
170 | 
171 | #define DL_SORT2(list, cmp, prev, next)  /* merge sort that also rewrites "prev" links */      \
172 | do {                                                                                           \
173 |   LDECLTYPE(list) _ls_p;                                                                       \
174 |   LDECLTYPE(list) _ls_q;                                                                       \
175 |   LDECLTYPE(list) _ls_e;                                                                       \
176 |   LDECLTYPE(list) _ls_tail;  /* NOTE(review): no _tmp declared for _SV() if NO_DECLTYPE */      \
177 |   int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping;                       \
178 |   if (list) {                                                                                  \
179 |     _ls_insize = 1;  /* run length; doubles after each pass */                                 \
180 |     _ls_looping = 1;                                                                           \
181 |     while (_ls_looping) {                                                                      \
182 |       _CASTASGN(_ls_p,list);                                                                   \
183 |       list = NULL;  /* rebuild the list from scratch on every pass */                          \
184 |       _ls_tail = NULL;                                                                         \
185 |       _ls_nmerges = 0;                                                                         \
186 |       while (_ls_p) {                                                                          \
187 |         _ls_nmerges++;                                                                         \
188 |         _ls_q = _ls_p;                                                                         \
189 |         _ls_psize = 0;                                                                         \
190 |         for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) {                                         \
191 |           _ls_psize++;                                                                         \
192 |           _SV(_ls_q,list); _ls_q = _NEXT(_ls_q,list,next); _RS(list);                          \
193 |           if (!_ls_q) break;                                                                   \
194 |         }                                                                                      \
195 |         _ls_qsize = _ls_insize;                                                                \
196 |         while ((_ls_psize > 0) || ((_ls_qsize > 0) && _ls_q)) {                                \
197 |           if (_ls_psize == 0) {                                                                \
198 |             _ls_e = _ls_q; _SV(_ls_q,list); _ls_q =                                            \
199 |               _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--;                                  \
200 |           } else if ((_ls_qsize == 0) || (!_ls_q)) {                                           \
201 |             _ls_e = _ls_p; _SV(_ls_p,list); _ls_p =                                            \
202 |               _NEXT(_ls_p,list,next); _RS(list); _ls_psize--;                                  \
203 |           } else if (cmp(_ls_p,_ls_q) <= 0) {  /* on a tie the p-run element goes first */     \
204 |             _ls_e = _ls_p; _SV(_ls_p,list); _ls_p =                                            \
205 |               _NEXT(_ls_p,list,next); _RS(list); _ls_psize--;                                  \
206 |           } else {                                                                             \
207 |             _ls_e = _ls_q; _SV(_ls_q,list); _ls_q =                                            \
208 |               _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--;                                  \
209 |           }                                                                                    \
210 |           if (_ls_tail) {                                                                      \
211 |             _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list);                \
212 |           } else {                                                                             \
213 |             _CASTASGN(list,_ls_e);                                                             \
214 |           }                                                                                    \
215 |           _SV(_ls_e,list); _PREVASGN(_ls_e,list,_ls_tail,prev); _RS(list);                     \
216 |           _ls_tail = _ls_e;                                                                    \
217 |         }                                                                                      \
218 |         _ls_p = _ls_q;                                                                         \
219 |       }                                                                                        \
220 |       _CASTASGN(list->prev, _ls_tail);  /* head's prev is pointed at the final tail */         \
221 |       _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,NULL,next); _RS(list);                       \
222 |       if (_ls_nmerges <= 1) {  /* at most one merge happened: stop looping */                  \
223 |         _ls_looping=0;                                                                         \
224 |       }                                                                                        \
225 |       _ls_insize *= 2;                                                                         \
226 |     }                                                                                          \
227 |   }                                                                                            \
228 | } while (0)
229 | 
230 | #define CDL_SORT(list, cmp)                                                                    \
231 |     CDL_SORT2(list, cmp, prev, next) /* short form of CDL_SORT2 for structures whose link fields are named prev and next */
232 | 
233 | #define CDL_SORT2(list, cmp, prev, next)  /* merge sort for a circular list */                 \
234 | do {                                                                                           \
235 |   LDECLTYPE(list) _ls_p;                                                                       \
236 |   LDECLTYPE(list) _ls_q;                                                                       \
237 |   LDECLTYPE(list) _ls_e;                                                                       \
238 |   LDECLTYPE(list) _ls_tail;                                                                    \
239 |   LDECLTYPE(list) _ls_oldhead;                                                                 \
240 |   LDECLTYPE(list) _tmp;                                                                        \
241 |   int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping;                       \
242 |   if (list) {                                                                                  \
243 |     _ls_insize = 1;  /* run length; doubles after each pass */                                 \
244 |     _ls_looping = 1;                                                                           \
245 |     while (_ls_looping) {                                                                      \
246 |       _CASTASGN(_ls_p,list);                                                                   \
247 |       _CASTASGN(_ls_oldhead,list);  /* old head marks where a lap around the ring ends */      \
248 |       list = NULL;  /* rebuild the list from scratch on every pass */                          \
249 |       _ls_tail = NULL;                                                                         \
250 |       _ls_nmerges = 0;                                                                         \
251 |       while (_ls_p) {                                                                          \
252 |         _ls_nmerges++;                                                                         \
253 |         _ls_q = _ls_p;                                                                         \
254 |         _ls_psize = 0;                                                                         \
255 |         for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) {                                         \
256 |           _ls_psize++;                                                                         \
257 |           _SV(_ls_q,list);                                                                     \
258 |           if (_NEXT(_ls_q,list,next) == _ls_oldhead) {                                         \
259 |             _ls_q = NULL;                                                                      \
260 |           } else {                                                                             \
261 |             _ls_q = _NEXT(_ls_q,list,next);                                                    \
262 |           }                                                                                    \
263 |           _RS(list);                                                                           \
264 |           if (!_ls_q) break;                                                                   \
265 |         }                                                                                      \
266 |         _ls_qsize = _ls_insize;                                                                \
267 |         while (_ls_psize > 0 || (_ls_qsize > 0 && _ls_q)) {                                    \
268 |           if (_ls_psize == 0) {                                                                \
269 |             _ls_e = _ls_q; _SV(_ls_q,list); _ls_q =                                            \
270 |               _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--;                                  \
271 |             if (_ls_q == _ls_oldhead) { _ls_q = NULL; }                                        \
272 |           } else if (_ls_qsize == 0 || !_ls_q) {                                               \
273 |             _ls_e = _ls_p; _SV(_ls_p,list); _ls_p =                                            \
274 |               _NEXT(_ls_p,list,next); _RS(list); _ls_psize--;                                  \
275 |             if (_ls_p == _ls_oldhead) { _ls_p = NULL; }                                        \
276 |           } else if (cmp(_ls_p,_ls_q) <= 0) {  /* on a tie the p-run element goes first */     \
277 |             _ls_e = _ls_p; _SV(_ls_p,list); _ls_p =                                            \
278 |               _NEXT(_ls_p,list,next); _RS(list); _ls_psize--;                                  \
279 |             if (_ls_p == _ls_oldhead) { _ls_p = NULL; }                                        \
280 |           } else {                                                                             \
281 |             _ls_e = _ls_q; _SV(_ls_q,list); _ls_q =                                            \
282 |               _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--;                                  \
283 |             if (_ls_q == _ls_oldhead) { _ls_q = NULL; }                                        \
284 |           }                                                                                    \
285 |           if (_ls_tail) {                                                                      \
286 |             _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list);                \
287 |           } else {                                                                             \
288 |             _CASTASGN(list,_ls_e);                                                             \
289 |           }                                                                                    \
290 |           _SV(_ls_e,list); _PREVASGN(_ls_e,list,_ls_tail,prev); _RS(list);                     \
291 |           _ls_tail = _ls_e;                                                                    \
292 |         }                                                                                      \
293 |         _ls_p = _ls_q;                                                                         \
294 |       }                                                                                        \
295 |       _CASTASGN(list->prev,_ls_tail);                                                          \
296 |       _CASTASGN(_tmp,list);                                                                    \
297 |       _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_tmp,next); _RS(list);  /* close the ring */ \
298 |       if (_ls_nmerges <= 1) {  /* at most one merge happened: stop looping */                  \
299 |         _ls_looping=0;                                                                         \
300 |       }                                                                                        \
301 |       _ls_insize *= 2;                                                                         \
302 |     }                                                                                          \
303 |   }                                                                                            \
304 | } while (0)
305 | 
306 | /******************************************************************************
307 |  * singly linked list macros (non-circular)                                   *
308 |  *****************************************************************************/
309 | #define LL_PREPEND(head,add)                                                                   \
310 |     LL_PREPEND2(head,add,next) /* short form of LL_PREPEND2 for structures whose link field is named next */
311 | 
312 | #define LL_PREPEND2(head,add,next)  /* push add onto the front; add becomes the new head */    \
313 | do {                                                                                           \
314 |   (add)->next = (head);                                                                        \
315 |   (head) = (add);                                                                              \
316 | } while (0)
317 | 
318 | #define LL_CONCAT(head1,head2)                                                                 \
319 |     LL_CONCAT2(head1,head2,next) /* short form of LL_CONCAT2 for structures whose link field is named next */
320 | 
321 | #define LL_CONCAT2(head1,head2,next)  /* splice list head2 onto the end of list head1 */       \
322 | do {                                                                                           \
323 |   LDECLTYPE(head1) _tmp;                                                                       \
324 |   if (head1) {                                                                                 \
325 |     _tmp = (head1);                                                                            \
326 |     while (_tmp->next) { _tmp = _tmp->next; }  /* walk to the last node */                     \
327 |     _tmp->next=(head2);                                                                        \
328 |   } else {                                                                                     \
329 |     (head1)=(head2);  /* empty head1: head2 becomes the list */                                \
330 |   }                                                                                            \
331 | } while (0)
332 | 
333 | #define LL_APPEND(head,add)                                                                    \
334 |     LL_APPEND2(head,add,next) /* short form of LL_APPEND2 for structures whose link field is named next */
335 | 
336 | #define LL_APPEND2(head,add,next)  /* link add after the current last node */                  \
337 | do {                                                                                           \
338 |   LDECLTYPE(head) _tmp;                                                                        \
339 |   (add)->next=NULL;  /* add always ends up as the tail */                                      \
340 |   if (head) {                                                                                  \
341 |     _tmp = (head);                                                                             \
342 |     while (_tmp->next) { _tmp = _tmp->next; }  /* walk to the last node */                     \
343 |     _tmp->next=(add);                                                                          \
344 |   } else {                                                                                     \
345 |     (head)=(add);                                                                              \
346 |   }                                                                                            \
347 | } while (0)
348 | 
349 | #define LL_DELETE(head,del)                                                                    \
350 |     LL_DELETE2(head,del,next) /* short form of LL_DELETE2 for structures whose link field is named next */
351 | 
352 | #define LL_DELETE2(head,del,next)  /* unlink del; no-op if absent or list empty */             \
353 | do {                                                                                           \
354 |   LDECLTYPE(head) _tmp;                                                                        \
355 |   if ((head) && ((head) == (del))) {                                                           \
356 |     (head)=(head)->next;                                                                       \
357 |   } else if (head) {  /* never walk from a NULL head */                                        \
358 |     _tmp = (head);                                                                             \
359 |     while (_tmp->next && (_tmp->next != (del))) {                                              \
360 |       _tmp = _tmp->next;                                                                       \
361 |     }                                                                                          \
362 |     if (_tmp->next) {  /* del found right after _tmp */                                        \
363 |       _tmp->next = ((del)->next);                                                              \
364 |     }                                                                                          \
365 |   }                                                                                            \
366 | } while (0)
367 | 
368 | /* Here are VS2008 replacements for LL_APPEND and LL_DELETE */
/*
 * LL_APPEND_VS2008(head,add): append `add` to the end of the singly-linked
 * list at `head` without declaring any temporary, using link field `next`.
 * LL_APPEND2_VS2008(head,add,next): same, with the link field name passed in.
 *
 * An empty list (`head` is null) simply becomes `add`. Otherwise add->next is
 * borrowed as the traversal cursor until it reaches the last element, which
 * is then linked to `add`. In both cases add->next ends up NULL.
 */
#define LL_APPEND_VS2008(head,add) \
    LL_APPEND2_VS2008(head,add,next)

#define LL_APPEND2_VS2008(head,add,next) \
do { \
  if (!(head)) { \
    (head) = (add); \
  } else { \
    /* add->next is only a scratch cursor here; it is reset below */ \
    for ((add)->next = head; (add)->next->next; (add)->next = (add)->next->next) { } \
    (add)->next->next = (add); \
  } \
  (add)->next = NULL; \
} while (0)
383 | 
/*
 * LL_DELETE_VS2008(head,del): unlink `del` from the singly-linked list at
 * `head` without using LDECLTYPE, following link field `next`.
 * LL_DELETE2_VS2008(head,del,next): same, with the link field name passed in.
 *
 * When `del` is the first element, `head` moves to its successor. Otherwise
 * `head` itself is used as the traversal cursor: its original value is saved
 * as a char*, the walk stops on the element in front of `del` (or on the tail
 * if `del` is absent), the unlink is done, and finally the saved value is
 * written back into `head` through a char** alias.
 */
#define LL_DELETE_VS2008(head,del) \
    LL_DELETE2_VS2008(head,del,next)

#define LL_DELETE2_VS2008(head,del,next) \
do { \
  if ((head) != (del)) { \
    char *_tmp = (char*)(head); \
    for (; (head)->next && ((head)->next != (del)); head = (head)->next) { } \
    if ((head)->next) { \
      (head)->next = ((del)->next); \
    } \
    { \
      /* restore the caller's head, which was advanced during the walk */ \
      char **_head_alias = (char**)&(head); \
      *_head_alias = _tmp; \
    } \
  } else { \
    (head) = (head)->next; \
  } \
} while (0)
/*
 * When NO_DECLTYPE is defined, rebind the generic singly-linked append and
 * delete names to the *_VS2008 variants above, which are written without
 * LDECLTYPE. The CONCAT names have no such variant and are only undefined.
 */
#ifdef NO_DECLTYPE
#undef LL_APPEND
#define LL_APPEND LL_APPEND_VS2008
#undef LL_DELETE
#define LL_DELETE LL_DELETE_VS2008
#undef LL_DELETE2
#define LL_DELETE2 LL_DELETE2_VS2008
#undef LL_APPEND2
#define LL_APPEND2 LL_APPEND2_VS2008
#undef LL_CONCAT /* no LL_CONCAT_VS2008 */
/* NOTE(review): DL_CONCAT is #defined further down in this header, after this
 * point, so this #undef does not appear able to remove it -- confirm the
 * intended ordering. */
#undef DL_CONCAT /* no DL_CONCAT_VS2008 */
#endif
417 | /* end VS2008 replacements */
418 | 
/*
 * LL_COUNT(head,el,counter): store in `counter` the number of elements in the
 * singly-linked list at `head`, following link field `next`.
 * LL_COUNT2(head,el,counter,next): same, with the link field name passed in.
 *
 * `el` is a caller-supplied scratch pointer used as the loop cursor; it is
 * null once the traversal has finished. `counter` is reset to 0 first, so an
 * empty list yields 0.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement and can be followed by `;` inside an unbraced if/else.
 */
#define LL_COUNT(head,el,counter) \
    LL_COUNT2(head,el,counter,next)

#define LL_COUNT2(head,el,counter,next) \
do { \
  (counter) = 0; \
  LL_FOREACH2(head,el,next) { ++(counter); } \
} while (0)
427 | 
/*
 * LL_FOREACH(head,el): loop header that visits every element of the
 * singly-linked list at `head` in order, with `el` as the cursor, following
 * link field `next`. The statement or block after the macro is the loop body.
 * LL_FOREACH2(head,el,next): same, with the link field name passed in.
 *
 * `el` is null when the loop runs to completion. The successor is read from
 * `el` after each iteration, so the body must not invalidate the current
 * element.
 */
#define LL_FOREACH(head,el) \
    LL_FOREACH2(head,el,next)

#define LL_FOREACH2(head,el,next) \
    for ((el) = (head); (el); (el) = (el)->next)
433 | 
/*
 * LL_FOREACH_SAFE(head,el,tmp): like LL_FOREACH, but the successor of `el` is
 * copied into `tmp` before the loop body runs, so the body may unlink or
 * otherwise clobber the current element. Uses link field `next`.
 * LL_FOREACH_SAFE2(head,el,tmp,next): same, with the link field name passed in.
 */
#define LL_FOREACH_SAFE(head,el,tmp) \
    LL_FOREACH_SAFE2(head,el,tmp,next)

#define LL_FOREACH_SAFE2(head,el,tmp,next) \
  for ((el) = (head); \
       (el) && ((tmp) = (el)->next, 1); \
       (el) = (tmp))
439 | 
/*
 * LL_SEARCH_SCALAR(head,out,field,val): walk the singly-linked list at `head`
 * and leave `out` pointing at the first element whose member `field` compares
 * equal (==) to `val`. If no element matches, the traversal runs to the end
 * and `out` is null. Uses link field `next`.
 * LL_SEARCH_SCALAR2(head,out,field,val,next): same, with the link field name
 * passed in.
 */
#define LL_SEARCH_SCALAR(head,out,field,val) \
    LL_SEARCH_SCALAR2(head,out,field,val,next)

#define LL_SEARCH_SCALAR2(head,out,field,val,next) \
do { \
  LL_FOREACH2(head,out,next) { \
    if ((out)->field == (val)) { \
      break; \
    } \
  } \
} while (0)
449 | 
/*
 * LL_SEARCH(head,out,elt,cmp): walk the singly-linked list at `head` and leave
 * `out` pointing at the first element for which cmp(out,elt) evaluates to 0.
 * If no element matches, the traversal runs to the end and `out` is null.
 * Uses link field `next`.
 * LL_SEARCH2(head,out,elt,cmp,next): same, with the link field name passed in.
 */
#define LL_SEARCH(head,out,elt,cmp) \
    LL_SEARCH2(head,out,elt,cmp,next)

#define LL_SEARCH2(head,out,elt,cmp,next) \
do { \
  LL_FOREACH2(head,out,next) { \
    if ((cmp(out,elt)) == 0) { \
      break; \
    } \
  } \
} while (0)
459 | 
/*
 * LL_REPLACE_ELEM(head,el,add): put `add` in the place of `el` in the
 * singly-linked list at `head` (link field `next`).
 *
 * All three arguments are asserted to be non-NULL. `add` always takes over
 * el's successor. If `el` is the first element, `head` becomes `add`;
 * otherwise the element in front of `el` is located by walking the list and
 * relinked to `add`. If `el` is not found, no list link is changed.
 */
#define LL_REPLACE_ELEM(head, el, add) \
do { \
 LDECLTYPE(head) _tmp; \
 assert(head != NULL); \
 assert(el != NULL); \
 assert(add != NULL); \
 (add)->next = (el)->next; \
 if ((head) != (el)) { \
  /* stop on the predecessor of el, or on the tail if el is absent */ \
  for (_tmp = head; _tmp->next && (_tmp->next != (el)); _tmp = _tmp->next) { } \
  if (_tmp->next) { \
   _tmp->next = (add); \
  } \
 } else { \
  (head) = (add); \
 } \
} while (0)
479 | 
/*
 * LL_PREPEND_ELEM(head,el,add): insert `add` immediately in front of `el` in
 * the singly-linked list at `head` (link field `next`).
 *
 * All three arguments are asserted to be non-NULL. `add` is always pointed at
 * `el`. If `el` is the first element, `head` becomes `add`; otherwise the
 * element in front of `el` is located by walking the list and relinked to
 * `add`. If `el` is not found, no list link is changed.
 *
 * The definition ends at `while (0)` with no trailing line continuation, so
 * the line that follows the macro is never spliced into its body.
 */
#define LL_PREPEND_ELEM(head, el, add) \
do { \
 LDECLTYPE(head) _tmp; \
 assert(head != NULL); \
 assert(el != NULL); \
 assert(add != NULL); \
 (add)->next = (el); \
 if ((head) == (el)) { \
  (head) = (add); \
 } else { \
  _tmp = head; \
  /* stop on the predecessor of el, or on the tail if el is absent */ \
  while (_tmp->next && (_tmp->next != (el))) { \
   _tmp = _tmp->next; \
  } \
  if (_tmp->next) { \
    _tmp->next = (add); \
  } \
 } \
} while (0)
499 | 
500 | 
501 | /******************************************************************************
502 |  * doubly linked list macros (non-circular)                                   *
503 |  *****************************************************************************/
/*
 * DL_PREPEND(head,add): make `add` the new first element of the doubly-linked
 * list at `head`, using link fields `prev` and `next`.
 * DL_PREPEND2(head,add,prev,next): same, with the field names passed in.
 *
 * `add` inherits the old head's prev pointer, and the old head's prev is set
 * to `add`. On an empty list add->prev is set to `add` itself and add->next
 * is null.
 */
#define DL_PREPEND(head,add) \
    DL_PREPEND2(head,add,prev,next)

#define DL_PREPEND2(head,add,prev,next) \
do { \
  (add)->next = (head); \
  if (!(head)) { \
    (add)->prev = (add); \
  } else { \
    (add)->prev = (head)->prev; \
    (head)->prev = (add); \
  } \
  (head) = (add); \
} while (0)
518 | 
/*
 * DL_APPEND(head,add): add `add` after the current last element of the
 * doubly-linked list at `head`, using link fields `prev` and `next`.
 * DL_APPEND2(head,add,prev,next): same, with the field names passed in.
 *
 * The last element is reached through head->prev, so no traversal is needed;
 * afterwards head->prev points at `add` and add->next is NULL. On an empty
 * list `add` becomes the head with prev pointing at itself.
 */
#define DL_APPEND(head,add) \
    DL_APPEND2(head,add,prev,next)

#define DL_APPEND2(head,add,prev,next) \
do { \
  if (!(head)) { \
    (head) = (add); \
    (head)->prev = (head); \
    (head)->next = NULL; \
  } else { \
    (add)->prev = (head)->prev; \
    (head)->prev->next = (add); \
    (head)->prev = (add); \
    (add)->next = NULL; \
  } \
} while (0)
535 | 
/*
 * DL_CONCAT(head1,head2): attach the doubly-linked list at `head2` to the end
 * of the list at `head1`, using link fields `prev` and `next`.
 * DL_CONCAT2(head1,head2,prev,next): same, with the field names passed in.
 *
 * Nothing happens when `head2` is null. When `head1` is null it simply becomes
 * `head2`. Otherwise the last element of list 1 (head1->prev) is linked to
 * head2, and head1->prev is updated to the last element of list 2. `head2`
 * itself is not reassigned.
 */
#define DL_CONCAT(head1,head2) \
    DL_CONCAT2(head1,head2,prev,next)

#define DL_CONCAT2(head1,head2,prev,next) \
do { \
  LDECLTYPE(head1) _tmp; \
  if (head2) { \
    if (!(head1)) { \
      (head1) = (head2); \
    } else { \
      /* remember list 2's tail before head2->prev is overwritten */ \
      _tmp = (head2)->prev; \
      (head2)->prev = (head1)->prev; \
      (head1)->prev->next = (head2); \
      (head1)->prev = _tmp; \
    } \
  } \
} while (0)
553 | 
/*
 * DL_DELETE(head,del): unlink `del` from the doubly-linked list at `head`,
 * using link fields `prev` and `next`.
 * DL_DELETE2(head,del,prev,next): same, with the field names passed in.
 *
 * del->prev is asserted to be non-NULL. Three cases are handled:
 *   - `del` is the only element (its prev points at itself): head becomes NULL;
 *   - `del` is the first element: its successor inherits del's prev pointer
 *     and becomes the new head;
 *   - otherwise: del's neighbours are joined, and if `del` was the last
 *     element head->prev is moved back to del's predecessor.
 * `del` itself is not modified.
 */
#define DL_DELETE(head,del) \
    DL_DELETE2(head,del,prev,next)

#define DL_DELETE2(head,del,prev,next) \
do { \
  assert((del)->prev != NULL); \
  if ((del)->prev == (del)) { \
    (head) = NULL; \
  } else if ((del) == (head)) { \
    (del)->next->prev = (del)->prev; \
    (head) = (del)->next; \
  } else { \
    (del)->prev->next = (del)->next; \
    if (!((del)->next)) { \
      (head)->prev = (del)->prev; \
    } else { \
      (del)->next->prev = (del)->prev; \
    } \
  } \
} while (0)
574 | 
/*
 * DL_COUNT(head,el,counter): store in `counter` the number of elements in the
 * doubly-linked list at `head`, following link field `next`.
 * DL_COUNT2(head,el,counter,next): same, with the link field name passed in.
 *
 * `el` is a caller-supplied scratch pointer used as the loop cursor; it is
 * null once the traversal has finished. `counter` is reset to 0 first, so an
 * empty list yields 0.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement and can be followed by `;` inside an unbraced if/else.
 */
#define DL_COUNT(head,el,counter) \
    DL_COUNT2(head,el,counter,next)

#define DL_COUNT2(head,el,counter,next) \
do { \
  (counter) = 0; \
  DL_FOREACH2(head,el,next) { ++(counter); } \
} while (0)
583 | 
/*
 * DL_FOREACH(head,el): loop header that visits every element of the
 * doubly-linked list at `head` from first to last, with `el` as the cursor,
 * following link field `next`. The statement or block after the macro is the
 * loop body.
 * DL_FOREACH2(head,el,next): same, with the link field name passed in.
 *
 * `el` is null when the loop runs to completion.
 */
#define DL_FOREACH(head,el) \
    DL_FOREACH2(head,el,next)

#define DL_FOREACH2(head,el,next) \
    for ((el) = (head); (el); (el) = (el)->next)
589 | 
/*
 * DL_FOREACH_SAFE(head,el,tmp): traversal that is safe for deleting elements
 * during iteration. The successor of `el` is copied into `tmp` before the
 * loop body runs, so the body may unlink or clobber the current element.
 * Uses link field `next`.
 * DL_FOREACH_SAFE2(head,el,tmp,next): same, with the link field name passed in.
 */
#define DL_FOREACH_SAFE(head,el,tmp) \
    DL_FOREACH_SAFE2(head,el,tmp,next)

#define DL_FOREACH_SAFE2(head,el,tmp,next) \
  for ((el) = (head); \
       (el) && ((tmp) = (el)->next, 1); \
       (el) = (tmp))
596 | 
/* these are identical to their singly-linked list counterparts: each DL_SEARCH*
 * name below is a plain alias that expands to the LL_SEARCH* macro of the same
 * suffix, so searching a doubly-linked list only follows the `next` links */
#define DL_SEARCH_SCALAR LL_SEARCH_SCALAR
#define DL_SEARCH LL_SEARCH
#define DL_SEARCH_SCALAR2 LL_SEARCH_SCALAR2
#define DL_SEARCH2 LL_SEARCH2
602 | 
/*
 * DL_REPLACE_ELEM(head,el,add): put `add` in the place of `el` in the
 * doubly-linked list at `head` (link fields `prev` and `next`).
 *
 * All three arguments are asserted to be non-NULL.
 *   - `el` is the first element: `add` takes over el's successor; if there is
 *     none, add->prev points at `add` itself, otherwise add inherits el's prev
 *     and the successor is pointed back at `add`. `head` then becomes `add`.
 *   - otherwise: `add` takes over both of el's links and el's predecessor is
 *     pointed at `add`; if `el` was the last element head->prev is set to
 *     `add`, else el's successor is pointed back at `add`.
 * `el` itself is not modified.
 *
 * `head` is reassigned only after every read of `el`, so passing the same
 * expression for both (DL_REPLACE_ELEM(list, list, new_elem)) works.
 */
#define DL_REPLACE_ELEM(head, el, add) \
do { \
 assert(head != NULL); \
 assert(el != NULL); \
 assert(add != NULL); \
 if ((head) == (el)) { \
  (add)->next = (el)->next; \
  if ((el)->next == NULL) { \
   (add)->prev = (add); \
  } else { \
   (add)->prev = (el)->prev; \
   (add)->next->prev = (add); \
  } \
  /* assign head last: el may be the very same expression as head */ \
  (head) = (add); \
 } else { \
  (add)->next = (el)->next; \
  (add)->prev = (el)->prev; \
  (add)->prev->next = (add); \
  if ((el)->next == NULL) { \
   (head)->prev = (add); \
  } else { \
   (add)->next->prev = (add); \
  } \
 } \
} while (0)
628 | 
/*
 * DL_PREPEND_ELEM(head,el,add): insert `add` immediately in front of `el` in
 * the doubly-linked list at `head` (link fields `prev` and `next`).
 *
 * All three arguments are asserted to be non-NULL. `add` is linked forward to
 * `el` and inherits el's prev pointer, and el->prev is set to `add`. If `el`
 * was the first element, `head` becomes `add`; otherwise the element that used
 * to precede `el` is pointed forward at `add`.
 *
 * The definition ends at `while (0)` with no trailing line continuation, so
 * the line that follows the macro is never spliced into its body.
 */
#define DL_PREPEND_ELEM(head, el, add) \
do { \
 assert(head != NULL); \
 assert(el != NULL); \
 assert(add != NULL); \
 (add)->next = (el); \
 (add)->prev = (el)->prev; \
 (el)->prev = (add); \
 if ((head) == (el)) { \
  (head) = (add); \
 } else { \
  (add)->prev->next = (add); \
 } \
} while (0)
643 | 
644 | 
645 | /******************************************************************************
646 |  * circular doubly linked list macros                                         *
647 |  *****************************************************************************/
/*
 * CDL_PREPEND(head,add): insert `add` in front of the current head of the
 * circular doubly-linked list at `head` and make it the new head, using link
 * fields `prev` and `next`.
 * CDL_PREPEND2(head,add,prev,next): same, with the field names passed in.
 *
 * `add` is spliced between the old head and the element before it. On an
 * empty list both of add's links point at `add` itself.
 */
#define CDL_PREPEND(head,add) \
    CDL_PREPEND2(head,add,prev,next)

#define CDL_PREPEND2(head,add,prev,next) \
do { \
  if (!(head)) { \
    (add)->prev = (add); \
    (add)->next = (add); \
  } else { \
    (add)->prev = (head)->prev; \
    (add)->next = (head); \
    (head)->prev = (add); \
    (add)->prev->next = (add); \
  } \
  (head) = (add); \
} while (0)
664 | 
/*
 * CDL_DELETE(head,del): unlink `del` from the circular doubly-linked list at
 * `head`, using link fields `prev` and `next`.
 * CDL_DELETE2(head,del,prev,next): same, with the field names passed in.
 *
 * If `del` is the head and the head's next points back at the head (a
 * one-element ring), `head` becomes NULL. Otherwise del's two neighbours are
 * joined to each other, and if `del` was the head, `head` moves to del's
 * successor. `del` itself is not modified.
 */
#define CDL_DELETE(head,del) \
    CDL_DELETE2(head,del,prev,next)

#define CDL_DELETE2(head,del,prev,next) \
do { \
  if (((head) != (del)) || ((head)->next != (head))) { \
    (del)->next->prev = (del)->prev; \
    (del)->prev->next = (del)->next; \
    if ((del) == (head)) { \
      (head) = (del)->next; \
    } \
  } else { \
    (head) = NULL; \
  } \
} while (0)
678 | 
/*
 * CDL_COUNT(head,el,counter): store in `counter` the number of elements in the
 * circular doubly-linked list at `head`, following link field `next`.
 * CDL_COUNT2(head,el,counter,next): same, with the link field name passed in.
 *
 * `el` is a caller-supplied scratch pointer used as the loop cursor; it is
 * null once the traversal has finished. `counter` is reset to 0 first, so an
 * empty list yields 0.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement and can be followed by `;` inside an unbraced if/else.
 */
#define CDL_COUNT(head,el,counter) \
    CDL_COUNT2(head,el,counter,next)

#define CDL_COUNT2(head, el, counter,next) \
do { \
  (counter) = 0; \
  CDL_FOREACH2(head,el,next) { ++(counter); } \
} while (0)
687 | 
/*
 * CDL_FOREACH(head,el): loop header that visits every element of the circular
 * doubly-linked list at `head` exactly once, starting at the head, with `el`
 * as the cursor and following link field `next`. The statement or block after
 * the macro is the loop body.
 * CDL_FOREACH2(head,el,next): same, with the link field name passed in.
 *
 * The loop stops when the next element would be the head again; `el` is then
 * set to NULL. An empty list (null head) runs the body zero times.
 *
 * Every use of `head` and `el` is parenthesised so that an expression argument
 * (for example a conditional expression) is compared as a whole in the
 * wrap-around test. `head` is re-evaluated on every step, so the body must not
 * change what it evaluates to.
 */
#define CDL_FOREACH(head,el) \
    CDL_FOREACH2(head,el,next)

#define CDL_FOREACH2(head,el,next) \
    for ((el) = (head); (el); (el) = (((el)->next == (head)) ? NULL : (el)->next))
693 | 
/*
 * CDL_FOREACH_SAFE(head,el,tmp1,tmp2): traversal of the circular
 * doubly-linked list at `head` in which the loop body may unlink or clobber
 * the current element. Uses link fields `prev` and `next`.
 * CDL_FOREACH_SAFE2(head,el,tmp1,tmp2,prev,next): same, with the field names
 * passed in.
 *
 * Before the first iteration `tmp1` is set to the last element (head->prev),
 * or NULL for an empty list. Before each body execution `tmp2` receives the
 * successor of `el`. After the body, the loop ends (el becomes null) if the
 * element just visited was `tmp1`; otherwise it continues with `tmp2`.
 */
#define CDL_FOREACH_SAFE(head,el,tmp1,tmp2) \
    CDL_FOREACH_SAFE2(head,el,tmp1,tmp2,prev,next)

#define CDL_FOREACH_SAFE2(head,el,tmp1,tmp2,prev,next) \
  for ((el) = (head), ((tmp1) = (head) ? ((head)->prev) : NULL); \
       (el) && ((tmp2) = (el)->next, 1); \
       ((el) = (((el) != (tmp1)) ? (tmp2) : 0L)))
701 | 
/*
 * CDL_SEARCH_SCALAR(head,out,field,val): walk the circular doubly-linked list
 * at `head` once and leave `out` pointing at the first element whose member
 * `field` compares equal (==) to `val`. If no element matches, the traversal
 * completes and `out` is null. Uses link field `next`.
 * CDL_SEARCH_SCALAR2(head,out,field,val,next): same, with the link field name
 * passed in.
 */
#define CDL_SEARCH_SCALAR(head,out,field,val) \
    CDL_SEARCH_SCALAR2(head,out,field,val,next)

#define CDL_SEARCH_SCALAR2(head,out,field,val,next) \
do { \
  CDL_FOREACH2(head,out,next) { \
    if ((out)->field == (val)) { \
      break; \
    } \
  } \
} while (0)
711 | 
/*
 * CDL_SEARCH(head,out,elt,cmp): walk the circular doubly-linked list at `head`
 * once and leave `out` pointing at the first element for which cmp(out,elt)
 * evaluates to 0. If no element matches, the traversal completes and `out` is
 * null. Uses link field `next`.
 * CDL_SEARCH2(head,out,elt,cmp,next): same, with the link field name passed in.
 */
#define CDL_SEARCH(head,out,elt,cmp) \
    CDL_SEARCH2(head,out,elt,cmp,next)

#define CDL_SEARCH2(head,out,elt,cmp,next) \
do { \
  CDL_FOREACH2(head,out,next) { \
    if ((cmp(out,elt)) == 0) { \
      break; \
    } \
  } \
} while (0)
721 | 
/*
 * CDL_REPLACE_ELEM(head,el,add): put `add` in the place of `el` in the
 * circular doubly-linked list at `head` (link fields `prev` and `next`).
 *
 * All three arguments are asserted to be non-NULL. If `el` is linked to
 * itself (a one-element ring), `add` is linked to itself and becomes the
 * head. Otherwise `add` takes over both of el's links, el's neighbours are
 * pointed at `add`, and `head` is moved to `add` only if `el` was the head.
 * `el` itself is not modified.
 */
#define CDL_REPLACE_ELEM(head, el, add) \
do { \
 assert(head != NULL); \
 assert(el != NULL); \
 assert(add != NULL); \
 if ((el)->next != (el)) { \
  (add)->next = (el)->next; \
  (add)->prev = (el)->prev; \
  (add)->next->prev = (add); \
  (add)->prev->next = (add); \
  if ((head) == (el)) { \
   (head) = (add); \
  } \
 } else { \
  (add)->next = (add); \
  (add)->prev = (add); \
  (head) = (add); \
 } \
} while (0)
741 | 
/*
 * CDL_PREPEND_ELEM(head,el,add): insert `add` immediately in front of `el` in
 * the circular doubly-linked list at `head` (link fields `prev` and `next`).
 *
 * All three arguments are asserted to be non-NULL. `add` is linked forward to
 * `el` and inherits el's prev pointer; el->prev and the old predecessor's
 * next are both pointed at `add`. If `el` was the head, `head` becomes `add`.
 *
 * The definition ends at `while (0)` with no trailing line continuation, so
 * the line that follows the macro is never spliced into its body.
 */
#define CDL_PREPEND_ELEM(head, el, add) \
do { \
 assert(head != NULL); \
 assert(el != NULL); \
 assert(add != NULL); \
 (add)->next = (el); \
 (add)->prev = (el)->prev; \
 (el)->prev = (add); \
 (add)->prev->next = (add); \
 if ((head) == (el)) { \
  (head) = (add); \
 } \
} while (0)
755 | 
756 | #endif /* UTLIST_H */
757 | 
758 | 


--------------------------------------------------------------------------------