├── tests ├── test_helper.h ├── dictionary_tests.h ├── test_dict.txt ├── test_helper.c ├── stem_plural_tests.h ├── stem_singular_tests.h ├── runtests.sh ├── remove_suffixes_tests.h ├── remove_prefixes_tests.h ├── minunit.h ├── dictionary_tests.c ├── precedence_adjustment_tests.c ├── stem_plural_tests.c ├── remove_suffixes_tests.c ├── stem_singular_tests.c └── remove_prefixes_tests.c ├── src ├── deps │ └── strndup │ │ ├── strndup.h │ │ ├── package.json │ │ └── strndup.c ├── sastrawi │ ├── stem_singular.h │ ├── text_util.h │ ├── dictionary.h │ ├── stem_plural.h │ ├── remove_suffixes.h │ ├── precedence_adjustment.c │ ├── stem_singular.c │ ├── remove_suffixes.c │ ├── text_util.c │ ├── remove_prefixes.h │ ├── dictionary.c │ ├── stem_plural.c │ └── remove_prefixes.c ├── libsastrawi.h ├── libsastrawi.c ├── regex │ ├── preg.h │ └── preg.c ├── dbg.h └── uthash │ ├── utringbuffer.h │ ├── utstring.h │ ├── utarray.h │ └── utlist.h ├── .gitignore ├── TODO ├── test.php ├── Makefile.ori ├── Makefile └── README.md /tests/test_helper.h: -------------------------------------------------------------------------------- 1 | void free_parts(int parts_count, char **parts[]); 2 | -------------------------------------------------------------------------------- /tests/dictionary_tests.h: -------------------------------------------------------------------------------- 1 | char *test_dictionary_load(); 2 | char *test_dictionary_contains(); 3 | char *test_dictionary_add(); 4 | -------------------------------------------------------------------------------- /tests/test_dict.txt: -------------------------------------------------------------------------------- 1 | aba 2 | abad 3 | abadi 4 | abadiah 5 | abai 6 | zuhud 7 | zuhur 8 | zulfikar 9 | zulhijah 10 | zulkaidah 11 | -------------------------------------------------------------------------------- /src/deps/strndup/strndup.h: -------------------------------------------------------------------------------- 1 | #ifndef HAVE_STRNDUP 2 | #define 
HAVE_STRNDUP 3 | 4 | char *strndup(const char *s, size_t n); 5 | 6 | #endif /* HAVE_STRNDUP */ 7 | -------------------------------------------------------------------------------- /src/sastrawi/stem_singular.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int stem_singular_word(char *word, char **stemmed_word); 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vagrant/ 2 | Vagrantfile 3 | *.o 4 | *.lo 5 | pcre2* 6 | test_text_cleanser 7 | test_sastrawi 8 | tags 9 | *.dSYM 10 | build/* 11 | *.log 12 | tests/*_tests 13 | -------------------------------------------------------------------------------- /tests/test_helper.c: -------------------------------------------------------------------------------- 1 | void free_parts(int parts_count, char **parts[]) 2 | { 3 | for (int i = 0; i < parts_count; i++) 4 | { 5 | free((*parts)[i]); 6 | } 7 | free(*parts); 8 | } 9 | -------------------------------------------------------------------------------- /src/sastrawi/text_util.h: -------------------------------------------------------------------------------- 1 | int split_word(char *pattern, char *word, char **first_part, char **second_part); 2 | int split_word3(char *pattern, char *word, char **first_part, char **second_part, char **third_part); 3 | -------------------------------------------------------------------------------- /tests/stem_plural_tests.h: -------------------------------------------------------------------------------- 1 | char *test_is_plural(); 2 | char *test_plural_parts(); 3 | char *test_stem_plural_word_when_both_words_are_root_words_and_the_same(); 4 | char *test_stem_plural_word_when_one_word_has_suffixes(); 5 | -------------------------------------------------------------------------------- /src/libsastrawi.h: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "regex/preg.h" 5 | #include "sastrawi/stem_plural.h" 6 | #include "sastrawi/stem_singular.h" 7 | #include "sastrawi/dictionary.h" 8 | 9 | void print_my_name(); 10 | -------------------------------------------------------------------------------- /src/sastrawi/dictionary.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | char *dictionary_fullpath(char *relative_path); 6 | int dictionary_load(char *path_to_dict); 7 | int dictionary_contains(char *word); 8 | int dictionary_add(char *word); 9 | int dictionary_count(); 10 | -------------------------------------------------------------------------------- /src/libsastrawi.c: -------------------------------------------------------------------------------- 1 | #ifdef __linux 2 | #define _GNU_SOURCE 3 | #endif 4 | #include 5 | #include 6 | #include 7 | #include "libsastrawi.h" 8 | #include "sastrawi/stem_plural.h" 9 | #include "sastrawi/stem_singular.h" 10 | 11 | void print_my_name() 12 | { 13 | printf("mohan"); 14 | } 15 | -------------------------------------------------------------------------------- /src/sastrawi/stem_plural.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | //Sastrawi\Stemmer::isPlural 6 | int is_plural(char *word); 7 | 8 | int plural_parts(char *word, char **parts[]); 9 | 10 | //Sastrawi\Stemmer::stemPluralWord 11 | int stem_plural_word(char *word, char **stemmed_word); 12 | -------------------------------------------------------------------------------- /src/deps/strndup/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "strndup", 3 | "version": "0.0.1", 4 | "repo": "clibs/strndup", 5 | "description": "strndup implementation. 
Useful when unavailable on your platform.", 6 | "keywords": [ "string" ], 7 | "license": "public domain", 8 | "src": [ 9 | "strndup.c", 10 | "strndup.h" 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /src/deps/strndup/strndup.c: -------------------------------------------------------------------------------- 1 | #ifndef HAVE_STRNDUP 2 | 3 | #include 4 | #include 5 | 6 | char *strndup(const char *s, size_t n) 7 | { 8 | char* new = malloc(n+1); 9 | if (new) { 10 | strncpy(new, s, n); 11 | new[n] = '\0'; 12 | } 13 | return new; 14 | } 15 | 16 | #endif /* HAVE_STRNDUP */ 17 | -------------------------------------------------------------------------------- /tests/stem_singular_tests.h: -------------------------------------------------------------------------------- 1 | char *test_stem_singular_word(); 2 | char *test_stem_singular_word_removes_plain_prefixes(); 3 | char *test_stem_singular_word_removes_suffixes(); 4 | char *test_stem_singular_word_removes_complex_prefixes_1(); 5 | char *test_stem_singular_word_removes_complex_prefixes_2(); 6 | char *test_stem_singular_word_removes_complex_prefixes_3(); 7 | -------------------------------------------------------------------------------- /src/regex/preg.h: -------------------------------------------------------------------------------- 1 | #ifndef _preg_replace_h 2 | #define _preg_replace_h 3 | 4 | #define PCRE2_CODE_UNIT_WIDTH 8 5 | #include 6 | 7 | char *preg_replace(char *pattern, char *replacement, char *subject); 8 | 9 | int preg_match(char *pattern, char *subject, char **matches[]); 10 | 11 | void free_matches(int matches_count, char **matches[]); 12 | #endif 13 | -------------------------------------------------------------------------------- /tests/runtests.sh: -------------------------------------------------------------------------------- 1 | echo "Running unit tests:" 2 | 3 | for i in tests/*_tests 4 | do 5 | if test -f $i 6 | then 7 | if $VALGRIND ./$i 2>> 
tests/tests.log 8 | then 9 | echo $i PASS 10 | else 11 | echo "ERROR in test $i: here's tests/tests.log" 12 | echo "------" 13 | tail tests/tests.log 14 | exit 1 15 | fi 16 | fi 17 | done 18 | 19 | echo "" 20 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | - Add test case for mengira - should get kira, currently ira 2 | - mengeri => ngeri not keri 3 | - memproteksi, mempatroli - proteksi, patroli not protek, patrol 4 | - Check remove_prefixes_rule6 for cases where it stems but it's not in the dict 5 | - When no remove_prefixes match, stem_singular returns an empty string; it should return the original string 6 | - remove_suffixes should return 1 if it's in the dict 7 | -------------------------------------------------------------------------------- /src/sastrawi/remove_suffixes.h: -------------------------------------------------------------------------------- 1 | void remove_suffixes(char *word, char **stemmed_word); 2 | int remove_inflectional_particle(char *word, char **stemmed_word, char **removed_part); 3 | int remove_possessive_pronoun(char *word, char **stemmed_word, char **removed_part); 4 | int remove_derivational_suffix(char *word, char **stemmed_word, char **removed_part); 5 | int remove_suffix(char *suffixes, char *word, char **stemmed_word, char **removed_part); 6 | -------------------------------------------------------------------------------- /test.php: -------------------------------------------------------------------------------- 1 | malaikat malaikat-nya 10 | print_r($words); 11 | $suffix = $words[2]; 12 | if (in_array($suffix, array('ku', 'mu', 'nya', 'lah', 'kah', 'tah', 'pun')) && 13 | preg_match('/^(.*)-(.*)$/', $words[1], $words)) { 14 | $words[2] .= '-' . 
$suffix; 15 | } 16 | 17 | print "blah \n"; 18 | print_r($words); 19 | print "blah2 \n"; 20 | -------------------------------------------------------------------------------- /tests/remove_suffixes_tests.h: -------------------------------------------------------------------------------- 1 | #ifdef __linux 2 | #define _GNU_SOURCE 3 | #endif 4 | #include "minunit.h" 5 | #include 6 | #include 7 | #include 8 | #include "../sastrawi/remove_suffixes.h" 9 | #include "../dbg.h" 10 | 11 | char *test_remove_inflectional_particle_with_dash(); 12 | char *test_remove_inflectional_particle_without_dash(); 13 | char *test_remove_inflectional_particle_no_match(); 14 | char *test_remove_possessive_pronoun_with_dash(); 15 | char *test_remove_possessive_pronoun_without_dash(); 16 | char *test_remove_derivational_suffix_with_dash(); 17 | char *test_remove_derivational_suffix_without_dash(); 18 | char *test_remove_suffixes(); 19 | -------------------------------------------------------------------------------- /tests/remove_prefixes_tests.h: -------------------------------------------------------------------------------- 1 | #ifdef __linux 2 | #define _GNU_SOURCE 3 | #endif 4 | #include "minunit.h" 5 | #include 6 | #include 7 | #include 8 | #include "../sastrawi.h" 9 | #include "../dbg.h" 10 | 11 | char *test_remove_plain_prefix_di(); 12 | char *test_remove_plain_prefix_ke(); 13 | char *test_remove_plain_prefix_se(); 14 | char *test_remove_complex_prefix_rule1_a(); 15 | char *test_remove_complex_prefix_rule1_b(); 16 | char *test_remove_complex_prefix_rule2(); 17 | char *test_remove_complex_prefix_rule2_excludes_er(); 18 | char *test_remove_complex_prefix_rule3_only_includes_er(); 19 | char *test_remove_plain_prefix_returns_0_if_word_notin_dictionary(); 20 | -------------------------------------------------------------------------------- /src/sastrawi/precedence_adjustment.c: -------------------------------------------------------------------------------- 1 | #ifdef __linux 2 | #define 
_GNU_SOURCE 3 | #endif 4 | #include 5 | #include 6 | #include 7 | #include "../dbg.h" 8 | 9 | 10 | int is_precedence_adjustment_satisfied(char *original_word) 11 | { 12 | int len = strlen(original_word); 13 | 14 | if(strncmp(original_word, "be", 2) == 0) { 15 | if(len > 4 && strcmp(&original_word[len-3], "lah") == 0) { 16 | return 1; 17 | } 18 | 19 | if(len > 3 && strcmp(&original_word[len-2], "an") == 0) { 20 | return 1; 21 | } 22 | 23 | } else if(strncmp(original_word, "me", 2) == 0 || 24 | strncmp(original_word, "di", 2) == 0 || 25 | strncmp(original_word, "pe", 2) == 0 || 26 | strncmp(original_word, "ter", 3) == 0) { 27 | 28 | if(len > 2 && strcmp(&original_word[len-1], "i") == 0) { 29 | return 1; 30 | } 31 | 32 | } 33 | 34 | return 0; 35 | 36 | } 37 | -------------------------------------------------------------------------------- /tests/minunit.h: -------------------------------------------------------------------------------- 1 | #undef NDEBUG 2 | #ifndef _minunit_h 3 | #define _minunit_h 4 | 5 | #include 6 | #include "dbg.h" 7 | #include 8 | 9 | #define mu_suite_start() char *message = NULL 10 | 11 | 12 | #define mu_assert(test, message) if(!(test)) { log_err(message); return message; } 13 | #define mu_run_test(test) debug("\n------%s", " " #test); \ 14 | message = test(); tests_run++; if (message) return message; 15 | 16 | #define RUN_TESTS(name) int main(int argc, char *argv[]) {\ 17 | argc = 1; \ 18 | debug("----- RUNNING: %s", argv[0]);\ 19 | printf("-----\n RUNNING: %s\n", argv[0]);\ 20 | char *result = name();\ 21 | if(result != 0) { \ 22 | printf("FAILED: %s\n", result);\ 23 | }\ 24 | else {\ 25 | printf("ALL TESTS PASSED\n"); \ 26 | }\ 27 | printf("Tests run: %d\n", tests_run);\ 28 | exit(result != 0);\ 29 | } 30 | 31 | int tests_run; 32 | 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /Makefile.ori: -------------------------------------------------------------------------------- 1 | CC = gcc 2 
| CFLAGS = -O3 -Wall -g -std=c99 3 | INCLUDES = -I/usr/local/include 4 | LFLAGS = -L/usr/local/lib 5 | LIBS = -lpcre2-8 6 | SRCS = tests/test_sastrawi.c tests/test_dictionary.c tests/test_stem_singular.c tests/test_stem_plural.c tests/test_remove_prefixes.c tests/test_remove_suffixes.c sastrawi.c sastrawi/stem_plural.c sastrawi/stem_singular.c sastrawi/remove_prefixes.c sastrawi/remove_suffixes.c sastrawi/text_util.c sastrawi/dictionary.c regex/preg.c 7 | OBJS = $(SRCS:.c=.o) 8 | MAIN = test_sastrawi 9 | 10 | .PHONY: depend clean 11 | 12 | all: $(MAIN) 13 | @echo test_sastrawi compiled! 14 | 15 | $(MAIN): $(OBJS) 16 | $(CC) $(CFLAGS) $(INCLUDES) -o $(MAIN) $(OBJS) $(LFLAGS) $(LIBS) 17 | .c.o: 18 | $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@ 19 | 20 | clean: 21 | $(RM) *.o *~ regex/*.o tests/*.o sastrawi/*.o $(MAIN) 22 | 23 | depend: $(SRCS) 24 | makedepend $(INCLUDES) $^ 25 | 26 | # DO NOT DELETE THIS LINE -- make depend needs it 27 | 28 | -------------------------------------------------------------------------------- /src/dbg.h: -------------------------------------------------------------------------------- 1 | #ifndef __dbg_h__ 2 | #define __dbg_h__ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #ifdef NDEBUG 9 | #define debug(M, ...) 10 | #else 11 | #define debug(M, ...) fprintf(stderr, "DEBUG %s:%d: " M "\n", __FILE__, __LINE__, ##__VA_ARGS__) 12 | #endif 13 | 14 | #define clean_errno() (errno == 0 ? "None" : strerror(errno)) 15 | 16 | #define log_err(M, ...) fprintf(stderr, "[ERROR] (%s:%d: errno: %s) " M "\n", __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__) 17 | 18 | #define log_warn(M, ...) fprintf(stderr, "[WARN] (%s:%d: errno: %s) " M "\n", __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__) 19 | 20 | #define log_info(M, ...) fprintf(stderr, "[INFO] (%s:%d) " M "\n", __FILE__, __LINE__, ##__VA_ARGS__) 21 | 22 | #define check(A, M, ...) if(!(A)) { log_err(M, ##__VA_ARGS__); errno=0; goto error; } 23 | 24 | #define sentinel(M, ...) 
{ log_err(M, ##__VA_ARGS__); errno=0; goto error; } 25 | 26 | #define check_mem(A) check((A), "Out of memory.") 27 | 28 | #define check_debug(A, M, ...) if(!(A)) { debug(M, ##__VA_ARGS__); errno=0; goto error; } 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /src/sastrawi/stem_singular.c: -------------------------------------------------------------------------------- 1 | #ifdef __linux 2 | #define _GNU_SOURCE 3 | #endif 4 | #include 5 | #include 6 | #include 7 | #include "dictionary.h" 8 | #include "remove_suffixes.h" 9 | #include "remove_prefixes.h" 10 | #include "stem_singular.h" 11 | #include "../dbg.h" 12 | 13 | int stem_singular_word(char *word, char **stemmed_word) 14 | { 15 | 16 | //step 1: word already in dictionary 17 | if(dictionary_contains(word)) { 18 | (*stemmed_word) = strndup(word, strlen(word)); 19 | return 1; 20 | } 21 | 22 | if(is_precedence_adjustment_satisfied(word)) { 23 | 24 | int rc = remove_prefixes(word, stemmed_word); 25 | 26 | //in dict and done - return 27 | if(rc) { 28 | return 1; 29 | } 30 | 31 | char *post_remove = strndup(*stemmed_word, strlen(*stemmed_word)); 32 | free(*stemmed_word); 33 | *stemmed_word = NULL; 34 | 35 | remove_suffixes(post_remove, stemmed_word); 36 | if(dictionary_contains(*stemmed_word)) { 37 | return 1; 38 | } else { 39 | free(*stemmed_word); 40 | *stemmed_word = NULL; 41 | } 42 | } 43 | 44 | 45 | //step 2 & 3: remove suffixes 46 | remove_suffixes(word, stemmed_word); 47 | if(dictionary_contains(*stemmed_word)) { 48 | return 1; 49 | } 50 | 51 | char *post_suffix_removal_word = strndup(*stemmed_word, strlen(*stemmed_word)); 52 | free(*stemmed_word); 53 | *stemmed_word = NULL; 54 | 55 | return remove_prefixes(post_suffix_removal_word, stemmed_word); 56 | } 57 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CFLAGS=-Wall -g -O2 
-Wextra -Isrc -I/usr/local/include -DNDEBUG $(OPTFLAGS) 2 | LDFLAGS=-lpcre2-8 -L/usr/local/lib $(OPTLIBS) 3 | PREFIX?=/usr/local 4 | 5 | SOURCES=$(wildcard src/**/*.c src/*.c tests/test_helper.c) 6 | OBJECTS=$(patsubst %.c,%.o,$(SOURCES)) 7 | 8 | TEST_SRC=$(wildcard tests/*_tests.c) 9 | TESTS=$(patsubst %.c,%,$(TEST_SRC)) 10 | 11 | TARGET=build/libsastrawi.a 12 | SO_TARGET=$(patsubst %.a,%.so,$(TARGET)) 13 | 14 | all: $(TARGET) tests 15 | 16 | dev: CFLAGS=-Wall -g -Isrc -Wall -Wextra $(OPTFLAGS) 17 | dev: all 18 | 19 | $(TARGET): CFLAGS += -fPIC 20 | $(TARGET): build $(OBJECTS) 21 | ar rcs $@ $(OBJECTS) 22 | ranlib $@ 23 | 24 | # $(SO_TARGET): $(TARGET) $(OBJECTS) 25 | # $(CC) -shared -o $@ $(OBJECTS) 26 | # 27 | build: 28 | @mkdir -p build 29 | @mkdir -p bin 30 | 31 | .PHONY: tests 32 | tests: LDLIBS += $(TARGET) tests/test_helper.o 33 | tests: $(TESTS) 34 | $(TESTS) 35 | sh ./tests/runtests.sh 36 | 37 | valgrind: all 38 | VALGRIND="valgrind --log-file=/tmp/valgrind-%p.log" 39 | 40 | clean: 41 | rm -rf build $(OBJECTS) $(TESTS) 42 | rm -f tests/tests.log 43 | find . -name "*.gc*" -exec rm {} \; 44 | rm -rf `find . -name "*.dSYM" -print` 45 | 46 | install: all 47 | install -d $(DESTDIR)/$(PREFIX)/lib/ 48 | install $(TARGET) $(DESTDIR)/$(PREFIX)/lib/ 49 | 50 | BADFUNCS='[^_.>a-zA-Z0-9](str(n?cpy|n?cat|xfrm|n?dup|str|prrk|tok|_)stpm?cpy|a?sn?printf|byte_)' 51 | check: 52 | @echo Files with potentially dangerous functions. 
53 | @egrep $(BADFUNCS) $(SOURCES) || true 54 | 55 | list: 56 | sh -c "$(MAKE) -p no_targets__ | awk -F':' '/^[a-zA-Z0-9][^\$$#\/\\t=]*:([^=]|$$)/ {split(\$$1,A,/ /);for(i in A)print A[i]}' | grep -v '__\$$' | sort" 57 | -------------------------------------------------------------------------------- /src/sastrawi/remove_suffixes.c: -------------------------------------------------------------------------------- 1 | #ifdef __linux 2 | #define _GNU_SOURCE 3 | #endif 4 | #include 5 | #include 6 | #include "text_util.h" 7 | #include "remove_suffixes.h" 8 | #include "../dbg.h" 9 | 10 | void remove_suffixes(char *word, char **stemmed_word) 11 | { 12 | 13 | char *removed_parts = NULL; 14 | char *suffix_remove1 = NULL; 15 | char *suffix_remove2 = NULL; 16 | 17 | //step 2a 18 | remove_inflectional_particle(word, &suffix_remove1, &removed_parts); 19 | free(removed_parts); 20 | 21 | //step 2b 22 | remove_possessive_pronoun(suffix_remove1, &suffix_remove2, &removed_parts); 23 | free(removed_parts); 24 | 25 | //step 3 26 | remove_derivational_suffix(suffix_remove2, stemmed_word, &removed_parts); 27 | free(removed_parts); 28 | free(suffix_remove1); 29 | free(suffix_remove2); 30 | } 31 | 32 | 33 | int remove_inflectional_particle(char *word, char **stemmed_word, char **removed_part) 34 | { 35 | return remove_suffix("lah|kah|tah|pun", word, stemmed_word, removed_part); 36 | } 37 | 38 | int remove_possessive_pronoun(char *word, char **stemmed_word, char **removed_part) 39 | { 40 | return remove_suffix("ku|mu|nya", word, stemmed_word, removed_part); 41 | } 42 | 43 | int remove_derivational_suffix(char *word, char **stemmed_word, char **removed_part) 44 | { 45 | return remove_suffix("is|isme|isasi|i|kan|an", word, stemmed_word, removed_part); 46 | } 47 | 48 | int remove_suffix(char *suffixes, char *word, char **stemmed_word, char **removed_part) 49 | { 50 | char **matches = NULL; 51 | int rc; 52 | char *pattern = NULL; 53 | 54 | int pattern_rc = asprintf(&pattern, "(\\w+?)-?(%s)$", 
suffixes); 55 | 56 | rc = suffix_split_word(pattern, word, stemmed_word, removed_part); 57 | 58 | free(pattern); 59 | return rc; 60 | } 61 | -------------------------------------------------------------------------------- /tests/dictionary_tests.c: -------------------------------------------------------------------------------- 1 | #ifdef __linux 2 | #define _GNU_SOURCE 3 | #endif 4 | #include "minunit.h" 5 | #include 6 | #include 7 | #include 8 | #include "libsastrawi.h" 9 | #include "dbg.h" 10 | #include "dictionary_tests.h" 11 | 12 | char *test_dictionary_load() 13 | { 14 | int rc; 15 | 16 | rc = dictionary_load(dictionary_fullpath("tests/test_dict.txt")); 17 | mu_assert(rc, "when test_dict exists return truthy"); 18 | 19 | rc = dictionary_load(dictionary_fullpath("tests/test_not_exists.txt")); 20 | mu_assert(!rc, "when the dict file does not exist it should return falsy"); 21 | 22 | return NULL; 23 | } 24 | 25 | char *test_dictionary_contains() 26 | { 27 | dictionary_load(dictionary_fullpath("tests/test_dict.txt")); 28 | mu_assert(dictionary_contains("aba"), "test dict contains aba"); 29 | mu_assert(!dictionary_contains("non-existent"), "test dict does not contain non-existent"); 30 | 31 | return NULL; 32 | } 33 | 34 | char *test_dictionary_add() 35 | { 36 | dictionary_add("nonexistent"); 37 | 38 | mu_assert(dictionary_contains("nonexistent"), "dict should contain nonexistent"); 39 | mu_assert(!dictionary_contains("nonexistent2"), "dict should not contain nonexistent2"); 40 | 41 | int count = dictionary_count(); 42 | dictionary_add("bola"); 43 | int new_count = dictionary_count(); 44 | mu_assert(count == new_count, "dictionary_add ensures that entries are unique"); 45 | 46 | return NULL; 47 | } 48 | 49 | char *all_tests() 50 | { 51 | mu_suite_start(); 52 | 53 | dictionary_load(dictionary_fullpath("data/kata-dasar.txt")); 54 | 55 | mu_run_test(test_dictionary_load); 56 | mu_run_test(test_dictionary_add); 57 | mu_run_test(test_dictionary_contains); 58 | 59 | 
return NULL; 60 | } 61 | 62 | RUN_TESTS(all_tests); 63 | -------------------------------------------------------------------------------- /src/sastrawi/text_util.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "../regex/preg.h" 3 | int prefix_split_word(char *pattern, char *word, char **first_part, char **second_part) 4 | { 5 | char **matches = NULL; 6 | int rc = 0; 7 | 8 | int match_count = preg_match(pattern, word, &matches); 9 | 10 | if(match_count == 3) { 11 | (*first_part) = strndup(matches[1], strlen(matches[1])); 12 | (*second_part) = strndup(matches[2], strlen(matches[2])); 13 | rc = 1; 14 | free_matches(match_count, &matches); 15 | } else { 16 | (*first_part) = strndup("", 0); 17 | (*second_part) = strndup(word, strlen(word)); 18 | } 19 | 20 | return rc; 21 | } 22 | 23 | int suffix_split_word(char *pattern, char *word, char **first_part, char **second_part) 24 | { 25 | char **matches = NULL; 26 | int rc = 0; 27 | 28 | int match_count = preg_match(pattern, word, &matches); 29 | 30 | if(match_count == 3) { 31 | (*first_part) = strndup(matches[1], strlen(matches[1])); 32 | (*second_part) = strndup(matches[2], strlen(matches[2])); 33 | rc = 1; 34 | free_matches(match_count, &matches); 35 | } else { 36 | (*first_part) = strndup(word, strlen(word)); 37 | (*second_part) = strndup("", 0); 38 | } 39 | 40 | return rc; 41 | } 42 | int split_word3(char *pattern, char *word, char **first_part, char **second_part, char **third_part) 43 | { 44 | char **matches = NULL; 45 | int rc = 0; 46 | 47 | int match_count = preg_match(pattern, word, &matches); 48 | 49 | if(match_count == 4) { 50 | (*first_part) = strndup(matches[1], strlen(matches[1])); 51 | (*second_part) = strndup(matches[2], strlen(matches[2])); 52 | (*third_part) = strndup(matches[3], strlen(matches[3])); 53 | 54 | rc = 1; 55 | free_matches(match_count, &matches); 56 | } 57 | 58 | return rc; 59 | } 60 | 
-------------------------------------------------------------------------------- /src/sastrawi/remove_prefixes.h: -------------------------------------------------------------------------------- 1 | typedef int (*PREFIX_REMOVER)(char *word, char **stemmed_word, char **removed_part); 2 | 3 | int remove_prefixes(char *word, char **stemmed_word); 4 | int remove_plain_prefix(char *word, char **stemmed_word, char **removed_part); 5 | int remove_complex_prefix_rule1(char *word, char **stemmed_word, char **removed_part); 6 | int remove_complex_prefix_rule2(char *word, char **stemmed_word, char **removed_part); 7 | int remove_complex_prefix_rule3(char *word, char **stemmed_word, char **removed_part); 8 | int remove_complex_prefix_rule4(char *word, char **stemmed_word, char **removed_part); 9 | int remove_complex_prefix_rule5(char *word, char **stemmed_word, char **removed_part); 10 | int remove_complex_prefix_rule6(char *word, char **stemmed_word, char **removed_part); 11 | int remove_complex_prefix_rule7(char *word, char **stemmed_word, char **removed_part); 12 | int remove_complex_prefix_rule8(char *word, char **stemmed_word, char **removed_part); 13 | int remove_complex_prefix_rule9(char *word, char **stemmed_word, char **removed_part); 14 | int remove_complex_prefix_rule10(char *word, char **stemmed_word, char **removed_part); 15 | int remove_complex_prefix_rule11(char *word, char **stemmed_word, char **removed_part); 16 | int remove_complex_prefix_rule12(char *word, char **stemmed_word, char **removed_part); 17 | int remove_complex_prefix_rule13(char *word, char **stemmed_word, char **removed_part); 18 | int remove_complex_prefix_rule14(char *word, char **stemmed_word, char **removed_part); 19 | int remove_complex_prefix_rule15(char *word, char **stemmed_word, char **removed_part); 20 | int remove_complex_prefix_rule16(char *word, char **stemmed_word, char **removed_part); 21 | int remove_complex_prefix_rule17(char *word, char **stemmed_word, char **removed_part); 22 
| int remove_complex_prefix_rule18(char *word, char **stemmed_word, char **removed_part); 23 | int remove_complex_prefix_rule19(char *word, char **stemmed_word, char **removed_part); 24 | int remove_complex_prefix_rule20(char *word, char **stemmed_word, char **removed_part); 25 | -------------------------------------------------------------------------------- /src/sastrawi/dictionary.c: -------------------------------------------------------------------------------- 1 | #ifdef __linux 2 | #define _GNU_SOURCE 3 | #endif 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "../uthash/uthash.h" 9 | #include "dictionary.h" 10 | #include "../dbg.h" 11 | 12 | struct dict_entry { 13 | char *word; 14 | UT_hash_handle hh; 15 | }; 16 | 17 | struct dict_entry *dict; 18 | 19 | void remove_newline(char **word, int length) { 20 | if((*word)[length-1] == '\n') { 21 | (*word)[length-1] = '\0'; 22 | } 23 | } 24 | 25 | char *dictionary_fullpath(char *relative_path) 26 | { 27 | char *cwd = NULL; 28 | char *full_path = NULL; 29 | 30 | int rc = asprintf(&full_path, "%s/%s", getcwd(cwd,0), relative_path); 31 | check(rc != -1, "Cannot allocate memory"); 32 | 33 | return full_path; 34 | error: 35 | exit(1); 36 | } 37 | 38 | int dictionary_load(char *dict_path) 39 | { 40 | FILE *dict_file = NULL; 41 | char *word = NULL; 42 | size_t linecap = 0; 43 | ssize_t linelen; 44 | 45 | dict_file = fopen(dict_path, "rb"); 46 | check(dict_file, "Failed to open %s", dict_path); 47 | 48 | while((linelen = getline(&word, &linecap, dict_file)) > 0) { 49 | remove_newline(&word, linelen); 50 | dictionary_add(word); 51 | free(word); 52 | word = NULL; 53 | } 54 | 55 | fclose(dict_file); 56 | 57 | return 1; 58 | error: 59 | if(dict_file) fclose(dict_file); 60 | if(word) free(word); 61 | return 0; 62 | } 63 | 64 | int dictionary_add(char *word) 65 | { 66 | if(!dictionary_contains(word)) { 67 | struct dict_entry *dict_word = NULL; 68 | dict_word = malloc(sizeof(struct dict_entry)); 69 | 
check_mem(dict_word); 70 | 71 | dict_word->word = strndup(word, strlen(word)); 72 | HASH_ADD_KEYPTR(hh, dict, dict_word->word, strlen(dict_word->word), dict_word); 73 | } 74 | return 1; 75 | 76 | error: 77 | log_err("Failed to allocate memory for dictionary entry"); 78 | exit(1); 79 | } 80 | 81 | int dictionary_count() 82 | { 83 | return HASH_COUNT(dict); 84 | } 85 | 86 | int dictionary_contains(char *word) 87 | { 88 | struct dict_entry *dict_word = NULL; 89 | 90 | HASH_FIND_STR(dict, word, dict_word); 91 | 92 | if(dict_word == NULL) 93 | return 0; 94 | else 95 | return 1; 96 | } 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cSastrawi 2 | 3 | ## Introduction 4 | 5 | Bahasa Indonesia is one of the most spoken languages in the world. 6 | [Stemming](https://en.wikipedia.org/wiki/Stemming) is important for many fields of computer science, from text search to machine learning. 7 | This is an attempt at porting the high-quality [PHP-based Bahasa Indonesian stemmer - Sastrawi](http://github.com/sastrawi/sastrawi) by [Andy Librian](https://github.com/andylibrian) to the C programming language. 8 | 9 | ## Why a port to C? 10 | 11 | - Because it will allow for a more direct integration with PostgreSQL via its full text search dictionary support. 12 | - We would be able to write bindings for most languages that provide ways to wrap C libraries 13 | - Because we can :) 14 | 15 | ## Caveat emptor 16 | 17 | - This is still super early code. Pretty much useless for anyone who actually wants something to use today. For that please look at PHP Sastrawi instead. I am putting this out here so that people who are interested/smarter than me can get involved early if they are so inclined. 18 | - I am learning C as I go along so apologies for the crappy code and lack of proper setup. 19 | 20 | 21 | ## Installation 22 | 23 | ### Mac OS X 24 | 25 | 1. 
Install PCRE2 via homebrew
26 | brew install pcre2
27 | 
28 | 29 | ### Linux (tested on Ubuntu 14.04) 30 | 31 | 1. Download PCRE2 library from SourceForge http://sourceforge.net/projects/pcre/files/latest/download?source=files 32 | 2. Uncompress and install
33 | $ tar -xvjf pcre2-10.20.tar.bz2 && cd pcre2-10.20
34 | $ ./configure --enable-jit --prefix=/usr
35 | $ make
36 | $ make install
37 | 
38 | 39 | ## Run tests 40 | 41 | 1. For now just run `make`
42 | ± |master ✓| → make
43 | gcc -O3 -Wall -g -I/usr/local/include -c tests/test_sastrawi.c  -o tests/test_sastrawi.o
44 | gcc -O3 -Wall -g -I/usr/local/include -c sastrawi.c  -o sastrawi.o
45 | gcc -O3 -Wall -g -I/usr/local/include -c regex/preg.c  -o regex/preg.o
46 | gcc -O3 -Wall -g -I/usr/local/include -o test_sastrawi tests/test_sastrawi.o sastrawi.o regex/preg.o -L/usr/local/lib -lpcre2-8
47 | test_sastrawi compiled!
48 | 1. A file called `test_sastrawi` will be created in the same folder. When you run it you should see something like
49 |
50 | DEBUG tests/test_sastrawi.c:55: ----- RUNNING: ./test_sastrawi
51 | -----
52 |  RUNNING: ./test_sastrawi
53 | DEBUG tests/test_sastrawi.c:49:
54 | ------ test_is_plural
55 | DEBUG tests/test_sastrawi.c:50:
56 | ------ test_plural_parts
57 | ALL TESTS PASSED
58 | Tests run: 2
59 | 
60 | 
61 | 
62 | 
--------------------------------------------------------------------------------
/src/sastrawi/stem_plural.c:
--------------------------------------------------------------------------------
1 | #ifdef __linux
2 | #define _GNU_SOURCE
3 | #endif
4 | #include <stdio.h>
5 | #include <stdlib.h>
6 | #include <string.h>
7 | #include "stem_plural.h"
8 | #include "stem_singular.h"
9 | #include "../regex/preg.h"
10 | #include "../dbg.h"
11 | 
12 | /*
13 |  * Reports whether `word` looks like a reduplicated (plural) form.
14 |  *
15 |  * A trailing "-ku", "-mu" or "-nya" is set aside first, so "hati-ku" is not
16 |  * plural while "hati-hati" is. Returns 1 when a dash remains, 0 otherwise.
17 |  */
18 | int is_plural(char *word)
19 | {
20 |   char **matches;
21 |   int has_dash;
22 | 
23 |   int matches_count = preg_match("^(.*)-(ku|mu|nya)$", word, &matches);
24 | 
25 |   if(matches_count > 0) {
26 |     /* matches[1] is everything in front of the possessive suffix */
27 |     has_dash = strchr(matches[1], '-') != NULL;
28 |     free_matches(matches_count, &matches);
29 |   } else {
30 |     has_dash = strchr(word, '-') != NULL;
31 |   }
32 | 
33 |   return has_dash;
34 | }
35 | 
36 | /*
37 |  * Splits `word` at its dash into the parts that are stemmed separately.
38 |  *
39 |  *   "beli-beli"    -> { "beli", "beli" }     returns 2
40 |  *   "beli-beli-ku" -> { "beli", "beli-ku" }  returns 2
41 |  *   "beli"         -> { "beli" }             returns 1
42 |  *
43 |  * On return *parts is a newly allocated array of newly allocated strings that
44 |  * the caller must release (one free() per string, then the array).
45 |  * An allocation failure terminates the process with exit(1).
46 |  */
47 | int plural_parts(char *word, char **parts[])
48 | {
49 |   char **matches;
50 |   int parts_count;
51 | 
52 |   int matches_count = preg_match("^(.*)-(.*)-(ku|mu|nya)$", word, &matches);
53 | 
54 |   if(matches_count < 0) {
55 |     matches_count = preg_match("^(.*)-(.*)$", word, &matches);
56 |   }
57 | 
58 |   if(matches_count > 0) {
59 |     char *second_part = NULL;
60 | 
61 |     if(matches_count == 4) {
62 |       /* three groups matched: keep the possessive suffix on the second part */
63 |       int rc = asprintf(&second_part, "%s-%s", matches[2], matches[3]);
64 |       check_debug(rc != -1, "Cannot allocate memory");
65 |     } else {
66 |       second_part = strndup(matches[2], strlen(matches[2]));
67 |       check_mem(second_part);
68 |     }
69 | 
70 |     *parts = malloc(2 * sizeof(char*));
71 |     check_mem(*parts);
72 | 
73 |     (*parts)[0] = strndup(matches[1], strlen(matches[1]));
74 |     check_mem((*parts)[0]);
75 |     (*parts)[1] = second_part;
76 |     parts_count = 2;
77 | 
78 |     free_matches(matches_count, &matches);
79 |   } else {
80 |     *parts = malloc(sizeof(char*));
81 |     check_mem(*parts);
82 | 
83 |     (*parts)[0] = strndup(word, strlen(word));
84 |     check_mem((*parts)[0]);
85 |     parts_count = 1;
86 |   }
87 | 
88 |   return parts_count;
89 | 
90 | error:
91 |   exit(1);
92 | }
93 | 
94 | /*
95 |  * Stems a reduplicated word such as "malaikat-malaikatnya".
96 |  *
97 |  * Both halves are stemmed on their own. When they share the same root, that
98 |  * root is stored in *stemmed_word; otherwise, or when `word` has no dash at
99 |  * all, a copy of `word` is. The caller frees *stemmed_word. Returns 1.
100 |  */
101 | int stem_plural_word(char *word, char **stemmed_word)
102 | {
103 |   char **word_parts = NULL;
104 |   char *root_word0 = NULL;
105 |   char *root_word1 = NULL;
106 | 
107 |   int parts_count = plural_parts(word, &word_parts);
108 | 
109 |   if(parts_count < 2) {
110 |     /* no dash: word_parts has a single element, so there is no second half to stem */
111 |     (*stemmed_word) = strndup(word, strlen(word));
112 |   } else {
113 |     stem_singular_word(word_parts[0], &root_word0);
114 |     stem_singular_word(word_parts[1], &root_word1);
115 | 
116 |     debug("word parts %s => %s, %s => %s", word_parts[0], root_word0, word_parts[1], root_word1);
117 | 
118 |     if(strcmp(root_word0, root_word1) == 0) {
119 |       /* hand back the shared root itself (what the disabled "berlari-lari" => "lari" test expects), not the unstemmed first half */
120 |       (*stemmed_word) = strndup(root_word0, strlen(root_word0));
121 |     } else {
122 |       (*stemmed_word) = strndup(word, strlen(word));
123 |     }
124 |   }
125 |   check_mem(*stemmed_word);
126 | 
127 |   free_matches(parts_count, &word_parts);
128 |   free(root_word0);
129 |   free(root_word1);
130 | 
131 |   return 1;
132 | 
133 | error:
134 |   exit(1);
135 | }
136 | 
-------------------------------------------------------------------------------- /tests/precedence_adjustment_tests.c: -------------------------------------------------------------------------------- 1 | #ifdef __linux 2 | #define _GNU_SOURCE 3 | #endif 4 | #include "minunit.h" 5 | #include 6 | #include 7 | #include 8 | #include "libsastrawi.h" 9 | #include "dbg.h" 10 | 11 | char *test_precendence_adjustment_satisfied_be_lah() 12 | { 13 | char *word_to_stem = "belilah"; 14 | int rc = is_precedence_adjustment_satisfied(word_to_stem); 15 | debug("word: %s", word_to_stem); 16 | mu_assert(rc == 1, "should be satisfied"); 17 | return NULL; 18 | } 19 | 20 | char *test_precendence_adjustment_satisfied_be_an() 21 | { 22 | char *word_to_stem = "belaan"; 23 | int rc = is_precedence_adjustment_satisfied(word_to_stem); 24 | debug("word: %s", word_to_stem); 25 | mu_assert(rc == 1, "should be satisfied"); 26 | return NULL; 27 | } 28 | 29 | char *test_precendence_adjustment_satisfied_me_i() 30 | { 31 | char *word_to_stem = "merangkumi"; 32 | int rc = is_precedence_adjustment_satisfied(word_to_stem); 33 | debug("word: %s", word_to_stem); 34 | mu_assert(rc == 1, "should be satisfied"); 35 | return NULL; 36 | } 37 | 38 | char *test_precendence_adjustment_satisfied_di_i() 39 | { 40 | char *word_to_stem = "dikahwini"; 41 | int rc = is_precedence_adjustment_satisfied(word_to_stem); 42 
| debug("word: %s", word_to_stem); 43 | mu_assert(rc == 1, "should be satisfied"); 44 | return NULL; 45 | } 46 | 47 | char *test_precendence_adjustment_satisfied_pe_i() 48 | { 49 | char *word_to_stem = "penyanyi"; 50 | int rc = is_precedence_adjustment_satisfied(word_to_stem); 51 | debug("word: %s", word_to_stem); 52 | mu_assert(rc == 1, "should be satisfied"); 53 | return NULL; 54 | } 55 | 56 | char *test_precendence_adjustment_satisfied_ter_i() 57 | { 58 | char *word_to_stem = "terkini"; 59 | int rc = is_precedence_adjustment_satisfied(word_to_stem); 60 | debug("word: %s", word_to_stem); 61 | mu_assert(rc == 1, "should be satisfied"); 62 | return NULL; 63 | } 64 | 65 | char *test_precendence_adjustment_not_satisfied() 66 | { 67 | char *word_to_stem = "terjunam"; 68 | int rc = is_precedence_adjustment_satisfied(word_to_stem); 69 | debug("word: %s", word_to_stem); 70 | mu_assert(rc == 0, "should not be satisfied"); 71 | return NULL; 72 | } 73 | 74 | char *all_tests() 75 | { 76 | mu_suite_start(); 77 | mu_run_test(test_precendence_adjustment_satisfied_be_lah); 78 | mu_run_test(test_precendence_adjustment_satisfied_be_an); 79 | mu_run_test(test_precendence_adjustment_satisfied_me_i); 80 | mu_run_test(test_precendence_adjustment_satisfied_di_i); 81 | mu_run_test(test_precendence_adjustment_satisfied_pe_i); 82 | mu_run_test(test_precendence_adjustment_satisfied_ter_i); 83 | mu_run_test(test_precendence_adjustment_not_satisfied); 84 | return NULL; 85 | } 86 | 87 | RUN_TESTS(all_tests); 88 | -------------------------------------------------------------------------------- /tests/stem_plural_tests.c: -------------------------------------------------------------------------------- 1 | #ifdef __linux 2 | #define _GNU_SOURCE 3 | #endif 4 | #include "minunit.h" 5 | #include 6 | #include 7 | #include 8 | #include "libsastrawi.h" 9 | #include "dbg.h" 10 | #include "stem_plural_tests.h" 11 | #include "test_helper.h" 12 | 13 | 14 | char *test_is_plural() 15 | { 16 | 17 | 
mu_assert(!is_plural("hati-ku"), "hati-ku is not plural"); 18 | mu_assert(!is_plural("test2"), "test2 is not plural"); 19 | mu_assert(is_plural("hati-hati"), "hati-hati is plural"); 20 | 21 | return NULL; 22 | } 23 | 24 | char *test_plural_parts() { 25 | char **parts = NULL; 26 | int rc; 27 | 28 | rc = plural_parts("beli", &parts); 29 | mu_assert(rc == 1, "beli has 1 part"); 30 | mu_assert(strcmp("beli", parts[0]) == 0, "beli is returned in the parts"); 31 | 32 | free_parts(rc, &parts); 33 | 34 | rc = plural_parts("beli-beli", &parts); 35 | mu_assert(rc == 2, "beli-beli has 2 parts"); 36 | mu_assert(strcmp("beli", parts[0]) == 0, "beli-beli has 2 parts"); 37 | mu_assert(strcmp("beli", parts[1]) == 0, "beli-beli has 2 parts"); 38 | 39 | free_parts(rc, &parts); 40 | 41 | rc = plural_parts("beli-beli-ku", &parts); 42 | mu_assert(rc == 2, "beli-beli-ku has 2 parts"); 43 | mu_assert(strcmp("beli", parts[0]) == 0, "For beli-beli-ku, first part should be beli"); 44 | mu_assert(strcmp("beli-ku", parts[1]) == 0, "For beli-beli-ku, second part should be beli-ku"); 45 | 46 | free_parts(rc, &parts); 47 | 48 | return NULL; 49 | } 50 | 51 | char *test_stem_plural_word_when_both_words_are_root_words_and_the_same() 52 | { 53 | char *word = "malaikat-malaikat"; 54 | char *stemmed_word = NULL; 55 | int rc = stem_plural_word(word, &stemmed_word); 56 | mu_assert(strcmp("malaikat", stemmed_word) == 0, "it stems to malaikat"); 57 | free(stemmed_word); 58 | 59 | 60 | 61 | /* char *word3 = "berlari-lari"; */ 62 | /* char *stemmed_word3 = NULL; */ 63 | /* rc = stem_plural_word(word3, &stemmed_word3); */ 64 | /* debug("stem %s => %s, expected %s", word3, stemmed_word3, "lari"); */ 65 | /* mu_assert(strcmp("lari", stemmed_word3) == 0, "it stems to lari"); */ 66 | /* free(stemmed_word3); */ 67 | 68 | return NULL; 69 | } 70 | 71 | char *test_stem_plural_word_when_one_word_has_suffixes() 72 | { 73 | char *word = "malaikat-malaikatnya"; 74 | char *stemmed_word = NULL; 75 | int rc = 
stem_plural_word(word, &stemmed_word); 76 | mu_assert(strcmp("malaikat", stemmed_word) == 0, "it stems to malaikat"); 77 | free(stemmed_word); 78 | 79 | return NULL; 80 | } 81 | 82 | 83 | //TODO - create a test with berlarikah, to test return suffix 84 | char *all_tests() 85 | { 86 | mu_suite_start(); 87 | 88 | dictionary_load(dictionary_fullpath("data/kata-dasar.txt")); 89 | 90 | mu_run_test(test_is_plural); 91 | mu_run_test(test_plural_parts); 92 | mu_run_test(test_stem_plural_word_when_both_words_are_root_words_and_the_same); 93 | mu_run_test(test_stem_plural_word_when_one_word_has_suffixes); 94 | 95 | return NULL; 96 | } 97 | 98 | RUN_TESTS(all_tests); 99 | -------------------------------------------------------------------------------- /tests/remove_suffixes_tests.c: -------------------------------------------------------------------------------- 1 | #ifdef __linux 2 | #define _GNU_SOURCE 3 | #endif 4 | #include "minunit.h" 5 | #include 6 | #include 7 | #include 8 | #include "libsastrawi.h" 9 | #include "dbg.h" 10 | 11 | char *test_remove_inflectional_particle_with_dash() 12 | { 13 | char *stemmed_word = NULL; 14 | char *removed_part = NULL; 15 | 16 | int rc = remove_inflectional_particle("penting-kah", &stemmed_word, &removed_part); 17 | mu_assert(rc, "successfully stems"); 18 | mu_assert(strcmp("penting", stemmed_word) == 0, "we expect 'penting' as the stemmed word"); 19 | mu_assert(strcmp("kah", removed_part) == 0, "we expect 'kah' as the removed part"); 20 | 21 | return NULL; 22 | } 23 | 24 | 25 | 26 | char *test_remove_inflectional_particle_without_dash() 27 | { 28 | char *stemmed_word = NULL; 29 | char *removed_part = NULL; 30 | 31 | int rc = remove_inflectional_particle("pentingkah", &stemmed_word, &removed_part); 32 | mu_assert(rc, "successfully stems"); 33 | mu_assert(strcmp("penting", stemmed_word) == 0, "we expect 'penting' as the stemmed word"); 34 | mu_assert(strcmp("kah", removed_part) == 0, "we expect 'kah' as the removed part"); 35 | 36 | 
return NULL; 37 | } 38 | 39 | char *test_remove_inflectional_particle_no_match() 40 | { 41 | char *stemmed_word = NULL; 42 | char *removed_part = NULL; 43 | 44 | int rc = remove_inflectional_particle("penting", &stemmed_word, &removed_part); 45 | mu_assert(!rc, "fails stem"); 46 | mu_assert(strcmp("penting", stemmed_word) == 0, "we expect no change in the word passed in"); 47 | mu_assert(strcmp("", removed_part) == 0, "we expect empty string in the removed_part"); 48 | 49 | return NULL; 50 | } 51 | 52 | char *test_remove_possessive_pronoun_with_dash() 53 | { 54 | char *stemmed_word = NULL; 55 | char *removed_part = NULL; 56 | 57 | int rc = remove_possessive_pronoun("cinta-ku", &stemmed_word, &removed_part); 58 | 59 | mu_assert(rc, "successfully stems"); 60 | mu_assert(strcmp("cinta", stemmed_word) == 0, "we expect 'cinta' as the stemmed word"); 61 | mu_assert(strcmp("ku", removed_part) == 0, "we expect 'ku' as the removed part"); 62 | 63 | return NULL; 64 | } 65 | 66 | char *test_remove_possessive_pronoun_without_dash() 67 | { 68 | char *stemmed_word = NULL; 69 | char *removed_part = NULL; 70 | 71 | int rc = remove_possessive_pronoun("cintaku", &stemmed_word, &removed_part); 72 | mu_assert(rc, "successfully stems"); 73 | mu_assert(strcmp("cinta", stemmed_word) == 0, "we expect 'cinta' as the stemmed word"); 74 | mu_assert(strcmp("ku", removed_part) == 0, "we expect 'ku' as the removed part"); 75 | 76 | return NULL; 77 | } 78 | 79 | char *test_remove_derivational_suffix_with_dash() 80 | { 81 | char *stemmed_word = NULL; 82 | char *removed_part = NULL; 83 | 84 | int rc = remove_derivational_suffix("cinta-kan", &stemmed_word, &removed_part); 85 | 86 | mu_assert(rc, "successfully stems"); 87 | mu_assert(strcmp("cinta", stemmed_word) == 0, "we expect 'cinta' as the stemmed word"); 88 | mu_assert(strcmp("kan", removed_part) == 0, "we expect 'kan' as the removed part"); 89 | 90 | return NULL; 91 | } 92 | 93 | char *test_remove_derivational_suffix_without_dash() 94 | { 95 
| char *stemmed_word = NULL; 96 | char *removed_part = NULL; 97 | 98 | int rc = remove_derivational_suffix("cintakan", &stemmed_word, &removed_part); 99 | mu_assert(rc, "successfully stems"); 100 | mu_assert(strcmp("cinta", stemmed_word) == 0, "we expect 'cinta' as the stemmed word"); 101 | mu_assert(strcmp("kan", removed_part) == 0, "we expect 'kan' as the removed part"); 102 | 103 | return NULL; 104 | } 105 | 106 | 107 | 108 | char *test_remove_suffixes() 109 | { 110 | char *word = "bajumukah"; 111 | char *stemmed_word = NULL; 112 | remove_suffixes(word, &stemmed_word); 113 | debug("stem word: %s, expected: baju, actual: %s", word, stemmed_word); 114 | mu_assert(strcmp("baju", stemmed_word) == 0, "it stems to baju"); 115 | free(stemmed_word); 116 | 117 | return NULL; 118 | } 119 | 120 | char *all_tests() 121 | { 122 | mu_suite_start(); 123 | 124 | dictionary_load(dictionary_fullpath("data/kata-dasar.txt")); 125 | 126 | mu_run_test(test_remove_inflectional_particle_with_dash); 127 | mu_run_test(test_remove_inflectional_particle_without_dash); 128 | mu_run_test(test_remove_inflectional_particle_no_match); 129 | 130 | mu_run_test(test_remove_possessive_pronoun_with_dash); 131 | mu_run_test(test_remove_possessive_pronoun_without_dash); 132 | 133 | mu_run_test(test_remove_derivational_suffix_with_dash); 134 | mu_run_test(test_remove_derivational_suffix_without_dash); 135 | 136 | mu_run_test(test_remove_suffixes); 137 | 138 | return NULL; 139 | } 140 | 141 | RUN_TESTS(all_tests); 142 | 
--------------------------------------------------------------------------------
/src/regex/preg.c:
--------------------------------------------------------------------------------
1 | #define PCRE2_STATIC
2 | #define PCRE2_CODE_UNIT_WIDTH 8
3 | 
4 | #include <stdio.h>
5 | #include <stdlib.h>
6 | #include <string.h>
7 | #include <pcre2.h>
8 | #include "../uthash/uthash.h"
9 | #include "preg.h"
10 | #include "../dbg.h"
11 | #ifdef __linux
12 | #include "../deps/strndup/strndup.h"
13 | #endif
14 | 
15 | /**
16 |  * TODO:
17 |  * - Do we need memory management for the cached regex ?
18 |  */
19 | 
20 | /* Capacity, in code units, of the fixed output buffer used by preg_replace(). */
21 | #define PREG_REPLACE_BUFFER_SIZE 256
22 | 
23 | /* One compiled pattern, keyed by its source text in a uthash table. */
24 | struct re_cache {
25 |   char *re;
26 |   pcre2_code *compiled_re;
27 |   UT_hash_handle hh;
28 | };
29 | 
30 | /* Head of the pattern cache; uthash expects an empty table to be NULL. */
31 | struct re_cache *active_re_cache = NULL;
32 | 
33 | /*
34 |  * Compiles `pattern` with default options and requests JIT compilation.
35 |  * Prints the PCRE2 error message and exits when the pattern is invalid.
36 |  */
37 | pcre2_code *compile(char *pattern) {
38 | 
39 |   PCRE2_SPTR pcre2_pattern = (PCRE2_SPTR)pattern;
40 | 
41 |   int errornumber;
42 |   PCRE2_SIZE erroroffset;
43 | 
44 |   pcre2_code *re = pcre2_compile(
45 |       pcre2_pattern, /* the pattern */
46 |       PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminated */
47 |       0, /* default options */
48 |       &errornumber, /* for error number */
49 |       &erroroffset, /* for error offset */
50 |       NULL); /* use default compile context */
51 | 
52 |   /* bail out on a bad pattern before `re` is handed to the JIT compiler */
53 |   if (re == NULL) {
54 |     PCRE2_UCHAR buffer[256];
55 |     pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
56 |     printf("PCRE2 compilation failed at offset %d: %s\n for pattern: %s", (int)erroroffset,
57 |         buffer, pattern);
58 |     exit(1);
59 |   }
60 | 
61 |   /* JIT is an optimisation only, so its result is not checked */
62 |   pcre2_jit_compile(re, PCRE2_JIT_COMPLETE);
63 | 
64 |   return re;
65 | }
66 | 
67 | /*
68 |  * Returns the compiled form of `re`, compiling and caching it on first use.
69 |  * Cache entries are never removed (see the TODO at the top of the file).
70 |  */
71 | pcre2_code *get_compiled_re(char *re)
72 | {
73 |   struct re_cache *re_cache_item = NULL;
74 | 
75 |   HASH_FIND_STR(active_re_cache, re, re_cache_item);
76 | 
77 |   if(re_cache_item == NULL) {
78 |     re_cache_item = malloc(sizeof(struct re_cache));
79 |     check_mem(re_cache_item);
80 | 
81 |     re_cache_item->re = strndup(re, strlen(re));
82 |     check_mem(re_cache_item->re);
83 | 
84 |     re_cache_item->compiled_re = compile(re_cache_item->re);
85 |     HASH_ADD_KEYPTR(hh, active_re_cache, re_cache_item->re, strlen(re_cache_item->re), re_cache_item);
86 |   }
87 | 
88 |   return re_cache_item->compiled_re;
89 | 
90 | error:
91 |   log_err("Failed to allocate memory for regex cache");
92 |   exit(1);
93 | }
94 | 
95 | /*
96 |  * Matches `pattern` against `subject` and returns the pcre2_match() result:
97 |  * a negative PCRE2 error code (for instance when nothing matches), otherwise
98 |  * one more than the highest-numbered capture group that was set.
99 |  *
100 |  * *matches is written only when the result is greater than 1. It then points
101 |  * to a new array of that many new strings ([0] is the whole match, [1..] the
102 |  * groups) which the caller releases with free_matches(). For any other
103 |  * result *matches is left untouched.
104 |  */
105 | int preg_match(char *pattern, char *subject, char **matches[]) {
106 | 
107 |   int rc;
108 |   pcre2_code *compiled_re = get_compiled_re(pattern);
109 | 
110 |   PCRE2_SPTR pcre2_subject = (PCRE2_SPTR)subject;
111 |   size_t subject_length = strlen(subject);
112 | 
113 |   pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(compiled_re, NULL);
114 |   check_mem(match_data);
115 | 
116 |   rc = pcre2_match(
117 |       compiled_re,
118 |       pcre2_subject,
119 |       subject_length,
120 |       0,
121 |       0,
122 |       match_data,
123 |       NULL);
124 | 
125 |   if (rc > 1) {
126 |     PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(match_data);
127 | 
128 |     *matches = malloc(rc * sizeof(char*));
129 |     /* test the new array; `matches` itself is only the caller's out-parameter */
130 |     check_mem(*matches);
131 | 
132 |     for (int i = 0; i < rc; i++)
133 |     {
134 |       if (ovector[2*i] == PCRE2_UNSET) {
135 |         /* the group took no part in the match: report it as "" */
136 |         (*matches)[i] = strndup("", 0);
137 |       } else {
138 |         size_t substring_length = ovector[2*i+1] - ovector[2*i];
139 |         (*matches)[i] = strndup(subject + ovector[2*i], substring_length);
140 |       }
141 |       check_mem((*matches)[i]);
142 |     }
143 |   }
144 | 
145 |   pcre2_match_data_free(match_data);
146 |   return rc;
147 | 
148 | error:
149 |   log_err("Failed to allocate memory for matches");
150 |   exit(1);
151 | }
152 | 
153 | /*
154 |  * Replaces every match of `re` in `subject` with `replacement` and returns
155 |  * the result as a new string that the caller frees.
156 |  *
157 |  * The result is assembled in a fixed buffer of PREG_REPLACE_BUFFER_SIZE code
158 |  * units. A result that does not fit, an invalid replacement string or any
159 |  * other PCRE2 error prints a message and exits.
160 |  */
161 | char *preg_replace(char *re, char *replacement, char *subject) {
162 | 
163 |   pcre2_code *compiled_re = get_compiled_re(re);
164 | 
165 |   PCRE2_UCHAR output[PREG_REPLACE_BUFFER_SIZE];
166 |   /* in: capacity of `output`; out: length of the substituted string */
167 |   size_t output_length = PREG_REPLACE_BUFFER_SIZE;
168 | 
169 |   int rc = pcre2_substitute(
170 |       compiled_re,
171 |       (PCRE2_SPTR)subject,
172 |       strlen(subject),
173 |       0,
174 |       PCRE2_SUBSTITUTE_GLOBAL,
175 |       NULL,
176 |       NULL,
177 |       (PCRE2_SPTR)replacement,
178 |       strlen(replacement),
179 |       output,
180 |       &output_length
181 |       );
182 | 
183 |   if (rc < 0) {
184 | 
185 |     switch(rc) {
186 |       case PCRE2_ERROR_NOMEMORY:
187 |         printf("Output buffer not large enough\n"); break;
188 |       case PCRE2_ERROR_BADREPLACEMENT:
189 |         printf("Invalid replacement string %s\n", replacement); break;
190 |       default:
191 |         printf("Unknown error %d \n", rc); break;
192 |     }
193 | 
194 |     exit(1);
195 |   }
196 | 
197 |   return strndup((char *)output, output_length);
198 | }
199 | 
200 | /*
201 |  * Frees an array of strings such as the one preg_match() produces: the first
202 |  * `matches_count` strings, then the array itself. *matches is reset to NULL
203 |  * so a stale pointer cannot be freed twice.
204 |  */
205 | void free_matches(int matches_count, char **matches[])
206 | {
207 |   if (matches == NULL || *matches == NULL) {
208 |     return;
209 |   }
210 | 
211 |   for (int i = 0; i < matches_count; i++)
212 |   {
213 |     free((*matches)[i]);
214 |   }
215 |   free(*matches);
216 |   *matches = NULL;
217 | }
218 | 
219 | 
220 | 
-------------------------------------------------------------------------------- /src/uthash/utringbuffer.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2008-2014, Troy D. Hanson http://troydhanson.github.com/uthash/ 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 12 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 13 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 14 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 15 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 16 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 17 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 18 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 19 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 20 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 21 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
22 | */ 23 | 24 | /* a ring-buffer implementation using macros 25 | */ 26 | #ifndef UTRINGBUFFER_H 27 | #define UTRINGBUFFER_H 28 | 29 | #include 30 | #include 31 | #include "utarray.h" // for "UT_icd" 32 | 33 | typedef struct { 34 | unsigned i; /* index of next available slot; wraps at n */ 35 | unsigned n; /* capacity */ 36 | unsigned char f; /* full */ 37 | UT_icd icd; /* initializer, copy and destructor functions */ 38 | char *d; /* n slots of size icd->sz */ 39 | } UT_ringbuffer; 40 | 41 | #define utringbuffer_init(a, _n, _icd) do { \ 42 | memset(a, 0, sizeof(UT_ringbuffer)); \ 43 | (a)->icd = *(_icd); \ 44 | (a)->n = (_n); \ 45 | if ((a)->n) { (a)->d = malloc((a)->n * (_icd)->sz); } \ 46 | } while(0) 47 | 48 | #define utringbuffer_clear(a) do { \ 49 | if ((a)->icd.dtor) { \ 50 | if ((a)->f) { \ 51 | for (unsigned _ut_i=0; _ut_i < (a)->n; _ut_i++) { \ 52 | (a)->icd.dtor(utringbuffer_eltptr(a, _ut_i)); \ 53 | } \ 54 | } else { \ 55 | for (unsigned _ut_i=0; _ut_i < (a)->i; _ut_i++) { \ 56 | (a)->icd.dtor(utringbuffer_eltptr(a, _ut_i)); \ 57 | } \ 58 | } \ 59 | } \ 60 | (a)->i = 0; \ 61 | (a)->f = 0; \ 62 | } while(0) 63 | 64 | #define utringbuffer_done(a) do { \ 65 | utringbuffer_clear(a); \ 66 | free((a)->d); (a)->d = NULL; \ 67 | (a)->n = 0; \ 68 | } while(0) 69 | 70 | #define utringbuffer_new(a,n,_icd) do { \ 71 | a = (UT_ringbuffer*)malloc(sizeof(UT_ringbuffer)); \ 72 | utringbuffer_init(a, n, _icd); \ 73 | } while(0) 74 | 75 | #define utringbuffer_free(a) do { \ 76 | utringbuffer_done(a); \ 77 | free(a); \ 78 | } while(0) 79 | 80 | #define utringbuffer_push_back(a,p) do { \ 81 | if ((a)->icd.dtor && (a)->f) { (a)->icd.dtor(_utringbuffer_internalptr(a,(a)->i)); } \ 82 | if ((a)->icd.copy) { (a)->icd.copy( _utringbuffer_internalptr(a,(a)->i), p); } \ 83 | else { memcpy(_utringbuffer_internalptr(a,(a)->i), p, (a)->icd.sz); }; \ 84 | if (++(a)->i == (a)->n) { (a)->i = 0; (a)->f = 1; } \ 85 | } while(0) 86 | 87 | #define utringbuffer_len(a) ((a)->f ? 
(a)->n : (a)->i) 88 | #define utringbuffer_empty(a) ((a)->i == 0 && !(a)->f) 89 | #define utringbuffer_full(a) ((a)->f != 0) 90 | 91 | #define _utringbuffer_real_idx(a,j) ((a)->f ? ((j) + (a)->i) % (a)->n : (j)) 92 | #define _utringbuffer_internalptr(a,j) ((void*)((char*)((a)->d + ((a)->icd.sz * (j))))) 93 | #define utringbuffer_eltptr(a,j) ((0 <= (j) && (j) < utringbuffer_len(a)) ? _utringbuffer_internalptr(a,_utringbuffer_real_idx(a,j)) : NULL) 94 | 95 | #define _utringbuffer_fake_idx(a,j) ((a)->f ? ((j) + (a)->n - (a)->i) % (a)->n : (j)) 96 | #define _utringbuffer_internalidx(a,e) (((char*)(e) >= (char*)(a)->d) ? (((char*)(e) - (char*)(a)->d)/(size_t)(a)->icd.sz) : -1) 97 | #define utringbuffer_eltidx(a,e) _utringbuffer_fake_idx(a, _utringbuffer_internalidx(a,e)) 98 | 99 | #define utringbuffer_front(a) utringbuffer_eltptr(a,0) 100 | #define utringbuffer_next(a,e) ((e)==NULL ? utringbuffer_front(a) : utringbuffer_eltptr(a, utringbuffer_eltidx(a,e)+1)) 101 | #define utringbuffer_prev(a,e) ((e)==NULL ? utringbuffer_back(a) : utringbuffer_eltptr(a, utringbuffer_eltidx(a,e)-1)) 102 | #define utringbuffer_back(a) (utringbuffer_empty(a) ? 
NULL : utringbuffer_eltptr(a, utringbuffer_len(a) - 1)) 103 | 104 | #endif /* UTRINGBUFFER_H */ 105 | -------------------------------------------------------------------------------- /tests/stem_singular_tests.c: -------------------------------------------------------------------------------- 1 | #ifdef __linux 2 | #define _GNU_SOURCE 3 | #endif 4 | #include "minunit.h" 5 | #include 6 | #include 7 | #include 8 | #include "libsastrawi.h" 9 | #include "dbg.h" 10 | 11 | char *test_stem_singular_word_for(char *word, char *expected_stem_word) 12 | { 13 | char *stemmed_word = NULL; 14 | int rc = stem_singular_word(word, &stemmed_word); 15 | debug("stem word: %s, expected: %s, actual: %s", word, expected_stem_word, stemmed_word); 16 | mu_assert(rc == 1, "failed to stem"); 17 | mu_assert(strcmp(expected_stem_word, stemmed_word) == 0, "failed to stem correctly"); 18 | free(stemmed_word); 19 | 20 | return NULL; 21 | } 22 | 23 | char *test_stem_singular_word_does_not_need_stemming() 24 | { 25 | return test_stem_singular_word_for("bola", "bola"); 26 | } 27 | 28 | char *test_stem_singular_word_removes_plain_prefixes() 29 | { 30 | return test_stem_singular_word_for("kerajinannya", "rajin"); 31 | } 32 | 33 | char *test_stem_singular_word_removes_suffixes() 34 | { 35 | return test_stem_singular_word_for("bajumukah", "baju"); 36 | } 37 | 38 | char *test_stem_singular_word_removes_complex_prefixes_1() 39 | { 40 | return test_stem_singular_word_for("beria", "ia"); 41 | } 42 | 43 | char *test_stem_singular_word_removes_complex_prefixes_2() 44 | { 45 | return test_stem_singular_word_for("bertabur", "tabur"); 46 | } 47 | 48 | char *test_stem_singular_word_removes_complex_prefixes_3() 49 | { 50 | return test_stem_singular_word_for("berdaerah", "daerah"); 51 | } 52 | 53 | char *test_stem_singular_word_removes_complex_prefixes_4() 54 | { 55 | return test_stem_singular_word_for("belajar", "ajar"); 56 | } 57 | 58 | char *test_stem_singular_word_removes_complex_prefixes_5() 59 | { 60 | return 
test_stem_singular_word_for("bekerja", "kerja"); 61 | } 62 | 63 | char *test_stem_singular_word_removes_complex_prefixes_6() 64 | { 65 | return test_stem_singular_word_for("teracun", "racun"); 66 | } 67 | 68 | char *test_stem_singular_word_removes_complex_prefixes_7() 69 | { 70 | return test_stem_singular_word_for("terperuk", "peruk"); 71 | } 72 | 73 | char *test_stem_singular_word_removes_complex_prefixes_8() 74 | { 75 | return test_stem_singular_word_for("tertangkap", "tangkap"); 76 | } 77 | 78 | char *test_stem_singular_word_removes_complex_prefixes_9() 79 | { 80 | return test_stem_singular_word_for("teterbang", "terbang"); 81 | } 82 | 83 | char *test_stem_singular_word_removes_complex_prefixes_10() 84 | { 85 | return test_stem_singular_word_for("mewarnai", "warna"); 86 | } 87 | 88 | char *test_stem_singular_word_removes_complex_prefixes_11() 89 | { 90 | return test_stem_singular_word_for("memfasilitasi", "fasilitas"); 91 | } 92 | 93 | char *test_stem_singular_word_removes_complex_prefixes_12() 94 | { 95 | return test_stem_singular_word_for("mempengaruhi", "pengaruh"); 96 | } 97 | 98 | char *test_stem_singular_word_removes_complex_prefixes_13() 99 | { 100 | return test_stem_singular_word_for("memasuki", "masuk"); 101 | } 102 | 103 | char *test_stem_singular_word_removes_complex_prefixes_14() 104 | { 105 | return test_stem_singular_word_for("mentaati", "taat"); 106 | } 107 | 108 | char *test_stem_singular_word_removes_complex_prefixes_15() 109 | { 110 | return test_stem_singular_word_for("menikmati", "nikmat"); 111 | } 112 | 113 | char *test_stem_singular_word_removes_complex_prefixes_16() 114 | { 115 | return test_stem_singular_word_for("mengqasar", "qasar"); 116 | } 117 | 118 | char *test_stem_singular_word_removes_complex_prefixes_17() 119 | { 120 | return test_stem_singular_word_for("mengecil", "kecil"); 121 | } 122 | 123 | char *test_stem_singular_word_removes_complex_prefixes_18() 124 | { 125 | return test_stem_singular_word_for("menyapu", "sapu"); 126 | } 
127 | 128 | char *test_stem_singular_word_removes_complex_prefixes_19() 129 | { 130 | return test_stem_singular_word_for("memprotes", "protes"); 131 | } 132 | 133 | char *test_stem_singular_word_removes_complex_prefixes_20() 134 | { 135 | return test_stem_singular_word_for("peyoga", "yoga"); 136 | } 137 | 138 | char *test_stem_singular_word_uses_precedence_adjustment() 139 | { 140 | return test_stem_singular_word_for("memakai","pakai"); 141 | } 142 | 143 | char *test_stem_singular_word_uses_precedence_adjustment_2() 144 | { 145 | return test_stem_singular_word_for("berbadankan","badan"); 146 | } 147 | 148 | char *all_tests() 149 | { 150 | mu_suite_start(); 151 | 152 | dictionary_load(dictionary_fullpath("data/kata-dasar.txt")); 153 | 154 | mu_run_test(test_stem_singular_word_does_not_need_stemming); 155 | mu_run_test(test_stem_singular_word_removes_suffixes); 156 | 157 | mu_run_test(test_stem_singular_word_removes_plain_prefixes); 158 | mu_run_test(test_stem_singular_word_removes_complex_prefixes_1); 159 | mu_run_test(test_stem_singular_word_removes_complex_prefixes_2); 160 | mu_run_test(test_stem_singular_word_removes_complex_prefixes_3); 161 | mu_run_test(test_stem_singular_word_removes_complex_prefixes_4); 162 | mu_run_test(test_stem_singular_word_removes_complex_prefixes_5); 163 | mu_run_test(test_stem_singular_word_removes_complex_prefixes_6); 164 | mu_run_test(test_stem_singular_word_removes_complex_prefixes_7); 165 | mu_run_test(test_stem_singular_word_removes_complex_prefixes_8); 166 | mu_run_test(test_stem_singular_word_removes_complex_prefixes_9); 167 | mu_run_test(test_stem_singular_word_removes_complex_prefixes_10); 168 | mu_run_test(test_stem_singular_word_removes_complex_prefixes_11); 169 | mu_run_test(test_stem_singular_word_removes_complex_prefixes_12); 170 | mu_run_test(test_stem_singular_word_removes_complex_prefixes_13); 171 | mu_run_test(test_stem_singular_word_removes_complex_prefixes_14); 172 | 
mu_run_test(test_stem_singular_word_removes_complex_prefixes_15); 173 | mu_run_test(test_stem_singular_word_removes_complex_prefixes_16); 174 | mu_run_test(test_stem_singular_word_removes_complex_prefixes_17); 175 | mu_run_test(test_stem_singular_word_removes_complex_prefixes_18); 176 | mu_run_test(test_stem_singular_word_removes_complex_prefixes_19); 177 | mu_run_test(test_stem_singular_word_removes_complex_prefixes_20); 178 | mu_run_test(test_stem_singular_word_uses_precedence_adjustment); 179 | mu_run_test(test_stem_singular_word_uses_precedence_adjustment_2); 180 | 181 | return NULL; 182 | } 183 | 184 | RUN_TESTS(all_tests); 185 | 
--------------------------------------------------------------------------------
/src/sastrawi/remove_prefixes.c:
--------------------------------------------------------------------------------
1 | #ifdef __linux
2 | #define _GNU_SOURCE
3 | #endif
4 | #include <stdio.h>
5 | #include <stdlib.h>
6 | #include <string.h>
7 | #include "dictionary.h"
8 | #include "text_util.h"
9 | #include "remove_prefixes.h"
10 | #include "../dbg.h"
11 | 
12 | /* Prefix removers in the order remove_prefixes() tries them. */
13 | /* Sized by its initializer: a `const int` is not a constant expression in C, so it cannot be a file-scope array bound. */
14 | const PREFIX_REMOVER prefix_removers[] = {
15 |   remove_plain_prefix,
16 |   remove_complex_prefix_rule1,
17 |   remove_complex_prefix_rule2,
18 |   remove_complex_prefix_rule3,
19 |   remove_complex_prefix_rule4,
20 |   remove_complex_prefix_rule5,
21 |   remove_complex_prefix_rule6,
22 |   remove_complex_prefix_rule7,
23 |   remove_complex_prefix_rule8,
24 |   remove_complex_prefix_rule9,
25 |   remove_complex_prefix_rule10,
26 |   remove_complex_prefix_rule11,
27 |   remove_complex_prefix_rule12,
28 |   remove_complex_prefix_rule13,
29 |   remove_complex_prefix_rule14,
30 |   remove_complex_prefix_rule15,
31 |   remove_complex_prefix_rule16,
32 |   remove_complex_prefix_rule17,
33 |   remove_complex_prefix_rule18,
34 |   remove_complex_prefix_rule19,
35 |   remove_complex_prefix_rule20
36 | };
37 | const int prefix_remover_count = (int)(sizeof prefix_removers / sizeof prefix_removers[0]); /* 21 entries */
38 | /* Runs the removers in order until one returns non-zero; copies the last remover's output into *stemmed_word (caller frees) and returns that remover's result. */
39 | int remove_prefixes(char *original_word, char **stemmed_word)
40 | {
41 |   int rc = 0;
42 |   char *removed_parts = NULL;
43 | 
44 |   char *word = strndup(original_word, strlen(original_word));
45 |   char *post_remove = NULL;
46 | 
47 |   for(int i = 0; i < prefix_remover_count; i++) {
48 |     /* release the previous attempt's output; each remover is expected to set both pointers again */
49 |     free(post_remove);
50 |     free(removed_parts);
51 |     rc = (*prefix_removers[i])(word, &post_remove, &removed_parts);
52 |     /* on failure this remover's output becomes the input of the next one */
53 |     if(rc) {
54 |       break;
55 |     } else {
56 |       free(word);
57 |       word = strndup(post_remove, strlen(post_remove));
58 |     }
59 |   }
60 | 
61 |   *stemmed_word = strndup(post_remove, strlen(post_remove));
62 | 
63 |   //cleanup
64 |   free(post_remove);
65 |   free(removed_parts);
66 |   free(word);
67 | 
68 |   return rc;
69 | }
70 | /* Plain prefixes "di", "ke", "se": returns 1 when the split succeeds and the remaining stem is a dictionary word, else 0. */
71 | int remove_plain_prefix(char *word, char **stemmed_word, char **removed_part)
72 | {
73 |   int rc = 0;
74 | 
75 |   int split_rc = prefix_split_word("^(di|ke|se)(\\w+)$", word, removed_part, stemmed_word);
76 | 
77 |   if(split_rc == 1 && dictionary_contains(*stemmed_word)) {
78 |     rc = 1;
79 |   }
80 | 
81 |   return rc;
82 | }
83 | /* Rule 1: "ber" + vowel (1a); otherwise retry as "be" + "r<stem>" (1b). NOTE(review): the asprintf() buffer is handed to assign_if_root_word() - confirm who frees it. */
84 | int remove_complex_prefix_rule1(char *word, char **stemmed_word, char **removed_part)
85 | {
86 |   int rc = 0;
87 | 
88 |   int split_rc = prefix_split_word("(^ber)([aiueo].*)$", word, removed_part, stemmed_word);
89 | 
90 |   //1a
91 |   if(split_rc == 1) {
92 |     if(dictionary_contains(*stemmed_word)) {
93 |       rc = 1;
94 |     } else {
95 |       //1b
96 |       char *alternative_stemmed_word = NULL;
97 |       int printed = asprintf(&alternative_stemmed_word, "r%s", *stemmed_word);
98 |       rc = (printed == -1) ? 0 : assign_if_root_word(stemmed_word, alternative_stemmed_word, removed_part, "be");
99 |     }
100 |   }
101 |   return rc;
102 | }
103 | /* Rule 2: "ber" + consonant + letter, accepted only when the rest holds no "er". NOTE(review): partial_stemmed_word is never freed here - confirm whether split_word3() allocates it. */
104 | int remove_complex_prefix_rule2(char *word, char **stemmed_word, char **removed_part)
105 | {
106 |   int rc = 0;
107 |   char *partial_stemmed_word;
108 | 
109 |   int split_rc = split_word3("(^ber)([^aeiou][a-z](\\w*))", word, removed_part, stemmed_word, &partial_stemmed_word);
110 | 
111 |   /* partial_stemmed_word is only read when the split succeeded (short-circuit) */
112 |   if(split_rc == 1 && (strstr(partial_stemmed_word, "er") == NULL)) {
113 |     if(dictionary_contains(*stemmed_word)) {
114 |       rc = 1;
115 |     }
116 |   } else {
117 |     (*stemmed_word) = strndup(word, strlen(word));
118 |     (*removed_part) = strndup("", 0);
119 |   }
120 | 
121 |   return rc;
122 | }
123 | /* Rule 3: pattern (^ber)([^aeiou][a-z]er\w*); returns 1 when the split succeeds and the stem is a dictionary word. */
124 | int remove_complex_prefix_rule3(char *word, char **stemmed_word, char **removed_part)
125 | {
126 |   int rc = 0;
127 | 
128 |   int split_rc = prefix_split_word("(^ber)([^aeiou][a-z]er\\w*)", word, removed_part, stemmed_word);
129 | 
130 | 
131 |   if(split_rc == 1 && dictionary_contains(*stemmed_word)) {
132 |     rc = 1;
133 |   }
134 | 
135 |   return rc;
136 | }
137 | /* Rule 4: the single form "bel" + "ajar"; returns 1 when the split succeeds and "ajar" is a dictionary word. */
138 | int remove_complex_prefix_rule4(char *word, char **stemmed_word, char **removed_part)
139 | {
140 |   int rc = 0;
141 | 
142 |   int split_rc = prefix_split_word("(^bel)(ajar)", word, removed_part, stemmed_word);
143 | 
144 |   if(split_rc == 1 && dictionary_contains(*stemmed_word)) {
145 |     rc = 1;
146 |   }
147 | 
148 |   return rc;
149 | }
150 | /* Rule 5: pattern (^be)([^aeiour]er[^aeiou]\w*); returns 1 when the split succeeds and the stem is a dictionary word. */
151 | int remove_complex_prefix_rule5(char *word, char **stemmed_word, char **removed_part)
152 | {
153 |   int rc = 0;
154 | 
155 |   int split_rc = prefix_split_word("(^be)([^aeiour]er[^aeiou]\\w*)", word, removed_part, stemmed_word);
156 | 
157 |   if(split_rc == 1 && dictionary_contains(*stemmed_word)) {
158 |     rc = 1;
159 |   }
160 |   return rc;
161 | }
162 | /* Rule 6: "ter" + vowel (6a); otherwise retry as "te" + "r<stem>" (6b). NOTE(review): the asprintf() buffer is handed to assign_if_root_word() - confirm who frees it. */
163 | int remove_complex_prefix_rule6(char *word, char **stemmed_word, char **removed_part)
164 | {
165 |   int rc = 0;
166 | 
167 |   int split_rc = prefix_split_word("(^ter)([aiueo].*)$", word, removed_part, stemmed_word);
168 | 
169 |   //6a
170 |   if(split_rc == 1) {
171 |     if(dictionary_contains(*stemmed_word)) {
172 |       rc = 1;
173 |     } else {
174 |       //6b
175 |       char *alternative_stemmed_word = NULL;
176 |       int printed = asprintf(&alternative_stemmed_word, "r%s", *stemmed_word);
177 |       rc = (printed == -1) ? 0 : assign_if_root_word(stemmed_word, alternative_stemmed_word, removed_part, "te");
178 |     }
179 |   }
180 |   return rc;
181 | }
182 | /* Rule 7: pattern (^ter)([^aeiour]er[aeiou]\w*); returns 1 when the split succeeds and the stem is a dictionary word. */
183 | int remove_complex_prefix_rule7(char *word, char **stemmed_word, char **removed_part)
184 | {
185 |   int rc = 0;
186 | 
187 |   int split_rc = prefix_split_word("(^ter)([^aeiour]er[aeiou]\\w*)", word, removed_part, stemmed_word);
188 | 
189 |   if(split_rc == 1 && dictionary_contains(*stemmed_word)) {
190 |     rc = 1;
191 |   }
192 | 
193 |   return rc;
194 | }
195 | /* Rule 8: "ter" + consonant other than r, accepted only when the rest holds no "er". NOTE(review): partial_stemmed_word is never freed here - confirm whether split_word3() allocates it. */
196 | int remove_complex_prefix_rule8(char *word, char **stemmed_word, char **removed_part)
197 | {
198 |   int rc = 0;
199 |   char *partial_stemmed_word;
200 | 
201 |   int split_rc = split_word3("(^ter)([^aeiour](\\w*))", word, removed_part, stemmed_word, &partial_stemmed_word);
202 |   /* partial_stemmed_word is only read when the split succeeded (short-circuit) */
203 |   if(split_rc == 1 && (strstr(partial_stemmed_word, "er") == NULL)) {
204 |     if(dictionary_contains(*stemmed_word)) {
205 |       rc = 1;
206 |     }
207 |   } else {
208 |     (*stemmed_word) = strndup(word, strlen(word));
209 |     (*removed_part) = strndup("", 0);
210 |   }
211 | 
212 |   return rc;
213 | }
214 | /* Rule 9: pattern (^te)([^aeiour]er[^aeiou]\w*); returns 1 when the split succeeds and the stem is a dictionary word. */
215 | int remove_complex_prefix_rule9(char *word, char **stemmed_word, char **removed_part)
216 | {
217 |   int rc = 0;
218 | 
219 | 
220 |   int split_rc = prefix_split_word("(^te)([^aeiour]er[^aeiou]\\w*)", word, removed_part, stemmed_word);
221 | 
222 |   if(split_rc == 1 && dictionary_contains(*stemmed_word)) {
223 |     rc = 1;
224 |   }
225 |   return rc;
226 | }
227 | /* Rule 10: "me" + l/r/w/y + vowel; returns 1 when the split succeeds and the stem is a dictionary word. */
228 | int remove_complex_prefix_rule10(char *word, char **stemmed_word, char **removed_part)
229 | {
230 |   int rc = 0;
231 | 
232 |   int split_rc = prefix_split_word("(^me)([lrwy][aeiou]\\w*)", word, removed_part, stemmed_word);
233 | 
234 |   if(split_rc == 1 && dictionary_contains(*stemmed_word)) {
235 |     rc = 1;
236 |   }
237 |   return rc;
238 | }
239 | /* Rule 11: "mem" + f/b/v; returns 1 when the split succeeds and the stem is a dictionary word. */
240 | int remove_complex_prefix_rule11(char *word, char **stemmed_word, char **removed_part)
241 | {
242 |   int rc = 0;
243 | 
244 |   int split_rc = prefix_split_word("(^mem)([fbv]\\w*)", word, removed_part, stemmed_word);
245 | 
246 |   if(split_rc == 1 && dictionary_contains(*stemmed_word)) {
247 |     rc = 1;
248 |   }
249 | 
250 |   return rc;
251 | }
252 | 
253 | int remove_complex_prefix_rule12(char *word, char **stemmed_word, char **removed_part)
254 | {
255 |   int rc = 0;
256 | 
257 |   int split_rc = 
prefix_split_word("(^mem)(pe\\w*)", word, removed_part, stemmed_word); 258 | 259 | if(split_rc == 1 && dictionary_contains(*stemmed_word)) { 260 | rc = 1; 261 | } 262 | 263 | return rc; 264 | } 265 | 266 | int remove_complex_prefix_rule13(char *word, char **stemmed_word, char **removed_part) 267 | { 268 | int rc = 0; 269 | 270 | int split_rc = prefix_split_word("(^me)(m[aeiou]\\w*)", word, removed_part, stemmed_word); 271 | 272 | if(split_rc == 1 ) { 273 | if(dictionary_contains(*stemmed_word)) { 274 | rc = 1; 275 | } else { 276 | char *alternative_stemmed_word; 277 | asprintf(&alternative_stemmed_word, "p%s", *stemmed_word+1); 278 | rc = assign_if_root_word(stemmed_word, alternative_stemmed_word, removed_part, "me"); 279 | } 280 | } 281 | return rc; 282 | } 283 | 284 | int remove_complex_prefix_rule14(char *word, char **stemmed_word, char **removed_part) 285 | { 286 | int rc = 0; 287 | 288 | int split_rc = prefix_split_word("(^men)([cdjstz]\\w*)", word, removed_part, stemmed_word); 289 | 290 | if(split_rc == 1 && dictionary_contains(*stemmed_word)) { 291 | rc = 1; 292 | } 293 | return rc; 294 | } 295 | 296 | int remove_complex_prefix_rule15(char *word, char **stemmed_word, char **removed_part) 297 | { 298 | int rc = 0; 299 | 300 | int split_rc = prefix_split_word("(^me)(n[aeiou]\\w*)", word, removed_part, stemmed_word); 301 | 302 | if(split_rc == 1 ) { 303 | if(dictionary_contains(*stemmed_word)) { 304 | rc = 1; 305 | } else { 306 | char *alternative_stemmed_word; 307 | asprintf(&alternative_stemmed_word, "t%s", *stemmed_word+1); 308 | rc = assign_if_root_word(stemmed_word, alternative_stemmed_word, removed_part, "me"); 309 | } 310 | } 311 | return rc; 312 | } 313 | 314 | int remove_complex_prefix_rule16(char *word, char **stemmed_word, char **removed_part) 315 | { 316 | int rc = 0; 317 | 318 | int split_rc = prefix_split_word("(^meng)([ghqk]\\w*)", word, removed_part, stemmed_word); 319 | 320 | if(split_rc == 1 && dictionary_contains(*stemmed_word)) { 321 | rc = 
1; 322 | } 323 | return rc; 324 | } 325 | 326 | int remove_complex_prefix_rule17(char *word, char **stemmed_word, char **removed_part) 327 | { 328 | int rc = 0; 329 | char *alternative_stemmed_word; 330 | 331 | int split_rc = prefix_split_word("(^meng)([aeiou]\\w*)", word, removed_part, stemmed_word); 332 | 333 | if(split_rc == 1) { 334 | if(dictionary_contains(*stemmed_word)) { 335 | rc = 1; 336 | } 337 | 338 | if(rc == 0) { 339 | asprintf(&alternative_stemmed_word, "k%s", *stemmed_word); 340 | rc = assign_if_root_word(stemmed_word, alternative_stemmed_word, removed_part, "meng"); 341 | free(alternative_stemmed_word); 342 | } 343 | 344 | if(rc == 0) { 345 | asprintf(&alternative_stemmed_word, "%s", *stemmed_word+1); 346 | rc = assign_if_root_word(stemmed_word, alternative_stemmed_word, removed_part, "menge"); 347 | free(alternative_stemmed_word); 348 | } 349 | 350 | if(rc == 0) { 351 | asprintf(&alternative_stemmed_word, "ng%s", *stemmed_word); 352 | rc = assign_if_root_word(stemmed_word, alternative_stemmed_word, removed_part, "me"); 353 | free(alternative_stemmed_word); 354 | } 355 | } 356 | return rc; 357 | } 358 | 359 | int remove_complex_prefix_rule18(char *word, char **stemmed_word, char **removed_part) 360 | { 361 | int rc = 0; 362 | 363 | int split_rc = prefix_split_word("(^me)(ny\\w*)", word, removed_part, stemmed_word); 364 | 365 | if(split_rc == 1 ) { 366 | if(dictionary_contains(*stemmed_word)) { 367 | rc = 1; 368 | } else { 369 | char *alternative_stemmed_word; 370 | asprintf(&alternative_stemmed_word, "s%s", *stemmed_word+2); 371 | rc = assign_if_root_word(stemmed_word, alternative_stemmed_word, removed_part, "meny"); 372 | } 373 | } 374 | return rc; 375 | } 376 | 377 | int remove_complex_prefix_rule19(char *word, char **stemmed_word, char **removed_part) 378 | { 379 | int rc = 0; 380 | 381 | int split_rc = prefix_split_word("(^mem)(p[^e]\\w*)", word, removed_part, stemmed_word); 382 | 383 | if(split_rc == 1 && dictionary_contains(*stemmed_word)) { 
384 | rc = 1; 385 | } 386 | return rc; 387 | } 388 | 389 | int remove_complex_prefix_rule20(char *word, char **stemmed_word, char **removed_part) 390 | { 391 | int rc = 0; 392 | 393 | int split_rc = prefix_split_word("(^pe)([wy][aeiou]\\w*)", word, removed_part, stemmed_word); 394 | 395 | if(split_rc == 1 && dictionary_contains(*stemmed_word)) { 396 | rc = 1; 397 | } 398 | return rc; 399 | } 400 | 401 | int assign_if_root_word(char **stemmed_word, char *alternative_stemmed_word, char **removed_part, char *alternative_removed_part) { 402 | int rc = 0; 403 | 404 | if(dictionary_contains(alternative_stemmed_word)) { 405 | free(*removed_part); 406 | *removed_part = strndup(alternative_removed_part, strlen(alternative_removed_part)); 407 | 408 | free(*stemmed_word); 409 | *stemmed_word = strndup(alternative_stemmed_word, strlen(alternative_stemmed_word)); 410 | rc = 1; 411 | } 412 | 413 | return rc; 414 | } 415 | -------------------------------------------------------------------------------- /src/uthash/utstring.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2008-2014, Troy D. Hanson http://troydhanson.github.com/uthash/ 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 12 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 13 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 14 | PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER 15 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 16 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 17 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 18 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 19 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 20 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 21 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 22 | */ 23 | 24 | /* a dynamic string implementation using macros 25 | */ 26 | #ifndef UTSTRING_H 27 | #define UTSTRING_H 28 | 29 | #define UTSTRING_VERSION 1.9.9 30 | 31 | #ifdef __GNUC__ 32 | #define _UNUSED_ __attribute__ ((__unused__)) 33 | #else 34 | #define _UNUSED_ 35 | #endif 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | #define oom() exit(-1) 42 | 43 | typedef struct { 44 | char *d; 45 | size_t n; /* allocd size */ 46 | size_t i; /* index of first unused byte */ 47 | } UT_string; 48 | 49 | #define utstring_reserve(s,amt) \ 50 | do { \ 51 | if (((s)->n - (s)->i) < (size_t)(amt)) { \ 52 | (s)->d = (char*)realloc((s)->d, (s)->n + (amt)); \ 53 | if ((s)->d == NULL) oom(); \ 54 | (s)->n += (amt); \ 55 | } \ 56 | } while(0) 57 | 58 | #define utstring_init(s) \ 59 | do { \ 60 | (s)->n = 0; (s)->i = 0; (s)->d = NULL; \ 61 | utstring_reserve(s,100); \ 62 | (s)->d[0] = '\0'; \ 63 | } while(0) 64 | 65 | #define utstring_done(s) \ 66 | do { \ 67 | if ((s)->d != NULL) free((s)->d); \ 68 | (s)->n = 0; \ 69 | } while(0) 70 | 71 | #define utstring_free(s) \ 72 | do { \ 73 | utstring_done(s); \ 74 | free(s); \ 75 | } while(0) 76 | 77 | #define utstring_new(s) \ 78 | do { \ 79 | s = (UT_string*)calloc(sizeof(UT_string),1); \ 80 | if (!s) oom(); \ 81 | utstring_init(s); \ 82 | } while(0) 83 | 84 | #define utstring_renew(s) \ 85 | do { \ 86 | if (s) { \ 87 | utstring_clear(s); \ 88 | } else { \ 89 | utstring_new(s); \ 
90 | } \ 91 | } while(0) 92 | 93 | #define utstring_clear(s) \ 94 | do { \ 95 | (s)->i = 0; \ 96 | (s)->d[0] = '\0'; \ 97 | } while(0) 98 | 99 | #define utstring_bincpy(s,b,l) \ 100 | do { \ 101 | utstring_reserve((s),(l)+1); \ 102 | if (l) memcpy(&(s)->d[(s)->i], b, l); \ 103 | (s)->i += (l); \ 104 | (s)->d[(s)->i]='\0'; \ 105 | } while(0) 106 | 107 | #define utstring_concat(dst,src) \ 108 | do { \ 109 | utstring_reserve((dst),((src)->i)+1); \ 110 | if ((src)->i) memcpy(&(dst)->d[(dst)->i], (src)->d, (src)->i); \ 111 | (dst)->i += (src)->i; \ 112 | (dst)->d[(dst)->i]='\0'; \ 113 | } while(0) 114 | 115 | #define utstring_len(s) ((unsigned)((s)->i)) 116 | 117 | #define utstring_body(s) ((s)->d) 118 | 119 | _UNUSED_ static void utstring_printf_va(UT_string *s, const char *fmt, va_list ap) { 120 | int n; 121 | va_list cp; 122 | while (1) { 123 | #ifdef _WIN32 124 | cp = ap; 125 | #else 126 | va_copy(cp, ap); 127 | #endif 128 | n = vsnprintf (&s->d[s->i], s->n-s->i, fmt, cp); 129 | va_end(cp); 130 | 131 | if ((n > -1) && ((size_t) n < (s->n-s->i))) { 132 | s->i += n; 133 | return; 134 | } 135 | 136 | /* Else try again with more space. */ 137 | if (n > -1) utstring_reserve(s,n+1); /* exact */ 138 | else utstring_reserve(s,(s->n)*2); /* 2x */ 139 | } 140 | } 141 | #ifdef __GNUC__ 142 | /* support printf format checking (2=the format string, 3=start of varargs) */ 143 | static void utstring_printf(UT_string *s, const char *fmt, ...) 144 | __attribute__ (( format( printf, 2, 3) )); 145 | #endif 146 | _UNUSED_ static void utstring_printf(UT_string *s, const char *fmt, ...) { 147 | va_list ap; 148 | va_start(ap,fmt); 149 | utstring_printf_va(s,fmt,ap); 150 | va_end(ap); 151 | } 152 | 153 | /******************************************************************************* 154 | * begin substring search functions * 155 | ******************************************************************************/ 156 | /* Build KMP table from left to right. 
*/ 157 | _UNUSED_ static void _utstring_BuildTable( 158 | const char *P_Needle, 159 | size_t P_NeedleLen, 160 | long *P_KMP_Table) 161 | { 162 | long i, j; 163 | 164 | i = 0; 165 | j = i - 1; 166 | P_KMP_Table[i] = j; 167 | while (i < (long) P_NeedleLen) 168 | { 169 | while ( (j > -1) && (P_Needle[i] != P_Needle[j]) ) 170 | { 171 | j = P_KMP_Table[j]; 172 | } 173 | i++; 174 | j++; 175 | if (i < (long) P_NeedleLen) 176 | { 177 | if (P_Needle[i] == P_Needle[j]) 178 | { 179 | P_KMP_Table[i] = P_KMP_Table[j]; 180 | } 181 | else 182 | { 183 | P_KMP_Table[i] = j; 184 | } 185 | } 186 | else 187 | { 188 | P_KMP_Table[i] = j; 189 | } 190 | } 191 | 192 | return; 193 | } 194 | 195 | 196 | /* Build KMP table from right to left. */ 197 | _UNUSED_ static void _utstring_BuildTableR( 198 | const char *P_Needle, 199 | size_t P_NeedleLen, 200 | long *P_KMP_Table) 201 | { 202 | long i, j; 203 | 204 | i = P_NeedleLen - 1; 205 | j = i + 1; 206 | P_KMP_Table[i + 1] = j; 207 | while (i >= 0) 208 | { 209 | while ( (j < (long) P_NeedleLen) && (P_Needle[i] != P_Needle[j]) ) 210 | { 211 | j = P_KMP_Table[j + 1]; 212 | } 213 | i--; 214 | j--; 215 | if (i >= 0) 216 | { 217 | if (P_Needle[i] == P_Needle[j]) 218 | { 219 | P_KMP_Table[i + 1] = P_KMP_Table[j + 1]; 220 | } 221 | else 222 | { 223 | P_KMP_Table[i + 1] = j; 224 | } 225 | } 226 | else 227 | { 228 | P_KMP_Table[i + 1] = j; 229 | } 230 | } 231 | 232 | return; 233 | } 234 | 235 | 236 | /* Search data from left to right. ( Multiple search mode. ) */ 237 | _UNUSED_ static long _utstring_find( 238 | const char *P_Haystack, 239 | size_t P_HaystackLen, 240 | const char *P_Needle, 241 | size_t P_NeedleLen, 242 | long *P_KMP_Table) 243 | { 244 | long i, j; 245 | long V_FindPosition = -1; 246 | 247 | /* Search from left to right. 
*/ 248 | i = j = 0; 249 | while ( (j < (int)P_HaystackLen) && (((P_HaystackLen - j) + i) >= P_NeedleLen) ) 250 | { 251 | while ( (i > -1) && (P_Needle[i] != P_Haystack[j]) ) 252 | { 253 | i = P_KMP_Table[i]; 254 | } 255 | i++; 256 | j++; 257 | if (i >= (int)P_NeedleLen) 258 | { 259 | /* Found. */ 260 | V_FindPosition = j - i; 261 | break; 262 | } 263 | } 264 | 265 | return V_FindPosition; 266 | } 267 | 268 | 269 | /* Search data from right to left. ( Multiple search mode. ) */ 270 | _UNUSED_ static long _utstring_findR( 271 | const char *P_Haystack, 272 | size_t P_HaystackLen, 273 | const char *P_Needle, 274 | size_t P_NeedleLen, 275 | long *P_KMP_Table) 276 | { 277 | long i, j; 278 | long V_FindPosition = -1; 279 | 280 | /* Search from right to left. */ 281 | j = (P_HaystackLen - 1); 282 | i = (P_NeedleLen - 1); 283 | while ( (j >= 0) && (j >= i) ) 284 | { 285 | while ( (i < (int)P_NeedleLen) && (P_Needle[i] != P_Haystack[j]) ) 286 | { 287 | i = P_KMP_Table[i + 1]; 288 | } 289 | i--; 290 | j--; 291 | if (i < 0) 292 | { 293 | /* Found. */ 294 | V_FindPosition = j + 1; 295 | break; 296 | } 297 | } 298 | 299 | return V_FindPosition; 300 | } 301 | 302 | 303 | /* Search data from left to right. ( One time search mode. ) */ 304 | _UNUSED_ static long utstring_find( 305 | UT_string *s, 306 | long P_StartPosition, /* Start from 0. -1 means last position. 
*/ 307 | const char *P_Needle, 308 | size_t P_NeedleLen) 309 | { 310 | long V_StartPosition; 311 | long V_HaystackLen; 312 | long *V_KMP_Table; 313 | long V_FindPosition = -1; 314 | 315 | if (P_StartPosition < 0) 316 | { 317 | V_StartPosition = s->i + P_StartPosition; 318 | } 319 | else 320 | { 321 | V_StartPosition = P_StartPosition; 322 | } 323 | V_HaystackLen = s->i - V_StartPosition; 324 | if ( (V_HaystackLen >= (long) P_NeedleLen) && (P_NeedleLen > 0) ) 325 | { 326 | V_KMP_Table = (long *)malloc(sizeof(long) * (P_NeedleLen + 1)); 327 | if (V_KMP_Table != NULL) 328 | { 329 | _utstring_BuildTable(P_Needle, P_NeedleLen, V_KMP_Table); 330 | 331 | V_FindPosition = _utstring_find(s->d + V_StartPosition, 332 | V_HaystackLen, 333 | P_Needle, 334 | P_NeedleLen, 335 | V_KMP_Table); 336 | if (V_FindPosition >= 0) 337 | { 338 | V_FindPosition += V_StartPosition; 339 | } 340 | 341 | free(V_KMP_Table); 342 | } 343 | } 344 | 345 | return V_FindPosition; 346 | } 347 | 348 | 349 | /* Search data from right to left. ( One time search mode. ) */ 350 | _UNUSED_ static long utstring_findR( 351 | UT_string *s, 352 | long P_StartPosition, /* Start from 0. -1 means last position. 
*/ 353 | const char *P_Needle, 354 | size_t P_NeedleLen) 355 | { 356 | long V_StartPosition; 357 | long V_HaystackLen; 358 | long *V_KMP_Table; 359 | long V_FindPosition = -1; 360 | 361 | if (P_StartPosition < 0) 362 | { 363 | V_StartPosition = s->i + P_StartPosition; 364 | } 365 | else 366 | { 367 | V_StartPosition = P_StartPosition; 368 | } 369 | V_HaystackLen = V_StartPosition + 1; 370 | if ( (V_HaystackLen >= (long) P_NeedleLen) && (P_NeedleLen > 0) ) 371 | { 372 | V_KMP_Table = (long *)malloc(sizeof(long) * (P_NeedleLen + 1)); 373 | if (V_KMP_Table != NULL) 374 | { 375 | _utstring_BuildTableR(P_Needle, P_NeedleLen, V_KMP_Table); 376 | 377 | V_FindPosition = _utstring_findR(s->d, 378 | V_HaystackLen, 379 | P_Needle, 380 | P_NeedleLen, 381 | V_KMP_Table); 382 | 383 | free(V_KMP_Table); 384 | } 385 | } 386 | 387 | return V_FindPosition; 388 | } 389 | /******************************************************************************* 390 | * end substring search functions * 391 | ******************************************************************************/ 392 | 393 | #endif /* UTSTRING_H */ 394 | -------------------------------------------------------------------------------- /src/uthash/utarray.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2008-2014, Troy D. Hanson http://troydhanson.github.com/uthash/ 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 12 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 13 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 14 | PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER 15 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 16 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 17 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 18 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 19 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 20 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 21 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 22 | */ 23 | 24 | /* a dynamic array implementation using macros 25 | */ 26 | #ifndef UTARRAY_H 27 | #define UTARRAY_H 28 | 29 | #define UTARRAY_VERSION 1.9.9 30 | 31 | #ifdef __GNUC__ 32 | #define _UNUSED_ __attribute__ ((__unused__)) 33 | #else 34 | #define _UNUSED_ 35 | #endif 36 | 37 | #include /* size_t */ 38 | #include /* memset, etc */ 39 | #include /* exit */ 40 | 41 | #define oom() exit(-1) 42 | 43 | typedef void (ctor_f)(void *dst, const void *src); 44 | typedef void (dtor_f)(void *elt); 45 | typedef void (init_f)(void *elt); 46 | typedef struct { 47 | size_t sz; 48 | init_f *init; 49 | ctor_f *copy; 50 | dtor_f *dtor; 51 | } UT_icd; 52 | 53 | typedef struct { 54 | unsigned i,n;/* i: index of next available slot, n: num slots */ 55 | UT_icd icd; /* initializer, copy and destructor functions */ 56 | char *d; /* n slots of size icd->sz*/ 57 | } UT_array; 58 | 59 | #define utarray_init(a,_icd) do { \ 60 | memset(a,0,sizeof(UT_array)); \ 61 | (a)->icd=*_icd; \ 62 | } while(0) 63 | 64 | #define utarray_done(a) do { \ 65 | if ((a)->n) { \ 66 | if ((a)->icd.dtor) { \ 67 | size_t _ut_i; \ 68 | for(_ut_i=0; _ut_i < (a)->i; _ut_i++) { \ 69 | (a)->icd.dtor(utarray_eltptr(a,_ut_i)); \ 70 | } \ 71 | } \ 72 | free((a)->d); \ 73 | } \ 74 | (a)->n=0; \ 75 | } while(0) 76 | 77 | #define utarray_new(a,_icd) do { \ 78 | a=(UT_array*)malloc(sizeof(UT_array)); \ 79 | utarray_init(a,_icd); \ 80 | } while(0) 81 | 82 | #define 
utarray_free(a) do { \ 83 | utarray_done(a); \ 84 | free(a); \ 85 | } while(0) 86 | 87 | #define utarray_reserve(a,by) do { \ 88 | if (((a)->i+(by)) > ((a)->n)) { \ 89 | while(((a)->i+(by)) > ((a)->n)) { (a)->n = ((a)->n ? (2*(a)->n) : 8); } \ 90 | if ( ((a)->d=(char*)realloc((a)->d, (a)->n*(a)->icd.sz)) == NULL) oom(); \ 91 | } \ 92 | } while(0) 93 | 94 | #define utarray_push_back(a,p) do { \ 95 | utarray_reserve(a,1); \ 96 | if ((a)->icd.copy) { (a)->icd.copy( _utarray_eltptr(a,(a)->i++), p); } \ 97 | else { memcpy(_utarray_eltptr(a,(a)->i++), p, (a)->icd.sz); }; \ 98 | } while(0) 99 | 100 | #define utarray_pop_back(a) do { \ 101 | if ((a)->icd.dtor) { (a)->icd.dtor( _utarray_eltptr(a,--((a)->i))); } \ 102 | else { (a)->i--; } \ 103 | } while(0) 104 | 105 | #define utarray_extend_back(a) do { \ 106 | utarray_reserve(a,1); \ 107 | if ((a)->icd.init) { (a)->icd.init(_utarray_eltptr(a,(a)->i)); } \ 108 | else { memset(_utarray_eltptr(a,(a)->i),0,(a)->icd.sz); } \ 109 | (a)->i++; \ 110 | } while(0) 111 | 112 | #define utarray_len(a) ((a)->i) 113 | 114 | #define utarray_eltptr(a,j) (((j) < (a)->i) ? 
_utarray_eltptr(a,j) : NULL) 115 | #define _utarray_eltptr(a,j) ((char*)((a)->d + ((a)->icd.sz*(j) ))) 116 | 117 | #define utarray_insert(a,p,j) do { \ 118 | if (j > (a)->i) utarray_resize(a,j); \ 119 | utarray_reserve(a,1); \ 120 | if ((j) < (a)->i) { \ 121 | memmove( _utarray_eltptr(a,(j)+1), _utarray_eltptr(a,j), \ 122 | ((a)->i - (j))*((a)->icd.sz)); \ 123 | } \ 124 | if ((a)->icd.copy) { (a)->icd.copy( _utarray_eltptr(a,j), p); } \ 125 | else { memcpy(_utarray_eltptr(a,j), p, (a)->icd.sz); }; \ 126 | (a)->i++; \ 127 | } while(0) 128 | 129 | #define utarray_inserta(a,w,j) do { \ 130 | if (utarray_len(w) == 0) break; \ 131 | if (j > (a)->i) utarray_resize(a,j); \ 132 | utarray_reserve(a,utarray_len(w)); \ 133 | if ((j) < (a)->i) { \ 134 | memmove(_utarray_eltptr(a,(j)+utarray_len(w)), \ 135 | _utarray_eltptr(a,j), \ 136 | ((a)->i - (j))*((a)->icd.sz)); \ 137 | } \ 138 | if ((a)->icd.copy) { \ 139 | size_t _ut_i; \ 140 | for(_ut_i=0;_ut_i<(w)->i;_ut_i++) { \ 141 | (a)->icd.copy(_utarray_eltptr(a,j+_ut_i), _utarray_eltptr(w,_ut_i)); \ 142 | } \ 143 | } else { \ 144 | memcpy(_utarray_eltptr(a,j), _utarray_eltptr(w,0), \ 145 | utarray_len(w)*((a)->icd.sz)); \ 146 | } \ 147 | (a)->i += utarray_len(w); \ 148 | } while(0) 149 | 150 | #define utarray_resize(dst,num) do { \ 151 | size_t _ut_i; \ 152 | if (dst->i > (size_t)(num)) { \ 153 | if ((dst)->icd.dtor) { \ 154 | for(_ut_i=num; _ut_i < dst->i; _ut_i++) { \ 155 | (dst)->icd.dtor(utarray_eltptr(dst,_ut_i)); \ 156 | } \ 157 | } \ 158 | } else if (dst->i < (size_t)(num)) { \ 159 | utarray_reserve(dst,num-dst->i); \ 160 | if ((dst)->icd.init) { \ 161 | for(_ut_i=dst->i; _ut_i < num; _ut_i++) { \ 162 | (dst)->icd.init(utarray_eltptr(dst,_ut_i)); \ 163 | } \ 164 | } else { \ 165 | memset(_utarray_eltptr(dst,dst->i),0,(dst)->icd.sz*(num-dst->i)); \ 166 | } \ 167 | } \ 168 | dst->i = num; \ 169 | } while(0) 170 | 171 | #define utarray_concat(dst,src) do { \ 172 | utarray_inserta((dst),(src),utarray_len(dst)); \ 173 | } 
while(0) 174 | 175 | #define utarray_erase(a,pos,len) do { \ 176 | if ((a)->icd.dtor) { \ 177 | size_t _ut_i; \ 178 | for(_ut_i=0; _ut_i < len; _ut_i++) { \ 179 | (a)->icd.dtor(utarray_eltptr((a),pos+_ut_i)); \ 180 | } \ 181 | } \ 182 | if ((a)->i > (pos+len)) { \ 183 | memmove( _utarray_eltptr((a),pos), _utarray_eltptr((a),pos+len), \ 184 | (((a)->i)-(pos+len))*((a)->icd.sz)); \ 185 | } \ 186 | (a)->i -= (len); \ 187 | } while(0) 188 | 189 | #define utarray_renew(a,u) do { \ 190 | if (a) utarray_clear(a); \ 191 | else utarray_new((a),(u)); \ 192 | } while(0) 193 | 194 | #define utarray_clear(a) do { \ 195 | if ((a)->i > 0) { \ 196 | if ((a)->icd.dtor) { \ 197 | size_t _ut_i; \ 198 | for(_ut_i=0; _ut_i < (a)->i; _ut_i++) { \ 199 | (a)->icd.dtor(utarray_eltptr(a,_ut_i)); \ 200 | } \ 201 | } \ 202 | (a)->i = 0; \ 203 | } \ 204 | } while(0) 205 | 206 | #define utarray_sort(a,cmp) do { \ 207 | qsort((a)->d, (a)->i, (a)->icd.sz, cmp); \ 208 | } while(0) 209 | 210 | #define utarray_find(a,v,cmp) bsearch((v),(a)->d,(a)->i,(a)->icd.sz,cmp) 211 | 212 | #define utarray_front(a) (((a)->i) ? (_utarray_eltptr(a,0)) : NULL) 213 | #define utarray_next(a,e) (((e)==NULL) ? utarray_front(a) : ((((a)->i) > (utarray_eltidx(a,e)+1)) ? _utarray_eltptr(a,utarray_eltidx(a,e)+1) : NULL)) 214 | #define utarray_prev(a,e) (((e)==NULL) ? utarray_back(a) : ((utarray_eltidx(a,e) > 0) ? _utarray_eltptr(a,utarray_eltidx(a,e)-1) : NULL)) 215 | #define utarray_back(a) (((a)->i) ? (_utarray_eltptr(a,(a)->i-1)) : NULL) 216 | #define utarray_eltidx(a,e) (((char*)(e) >= (char*)((a)->d)) ? (((char*)(e) - (char*)((a)->d))/(size_t)(a)->icd.sz) : -1) 217 | 218 | /* last we pre-define a few icd for common utarrays of ints and strings */ 219 | static void utarray_str_cpy(void *dst, const void *src) { 220 | char **_src = (char**)src, **_dst = (char**)dst; 221 | *_dst = (*_src == NULL) ? 
NULL : strdup(*_src); 222 | } 223 | static void utarray_str_dtor(void *elt) { 224 | char **eltc = (char**)elt; 225 | if (*eltc) free(*eltc); 226 | } 227 | static const UT_icd ut_str_icd _UNUSED_ = {sizeof(char*),NULL,utarray_str_cpy,utarray_str_dtor}; 228 | static const UT_icd ut_int_icd _UNUSED_ = {sizeof(int),NULL,NULL,NULL}; 229 | static const UT_icd ut_ptr_icd _UNUSED_ = {sizeof(void*),NULL,NULL,NULL}; 230 | 231 | 232 | #endif /* UTARRAY_H */ 233 | -------------------------------------------------------------------------------- /tests/remove_prefixes_tests.c: -------------------------------------------------------------------------------- 1 | #ifdef __linux 2 | #define _GNU_SOURCE 3 | #endif 4 | #include "minunit.h" 5 | #include 6 | #include 7 | #include 8 | #include "libsastrawi.h" 9 | #include "sastrawi/remove_prefixes.h" 10 | #include "dbg.h" 11 | 12 | char *test_remove_complex_prefix(char *stemable_word, char *expected_stemmed_word, char *expected_removed_part, PREFIX_REMOVER fn) 13 | { 14 | char *stemmed_word = NULL; 15 | char *removed_part = NULL; 16 | 17 | int rc = fn(stemable_word, &stemmed_word, &removed_part); 18 | debug("word: %s, expected stemmed word: %s, actual stemmed word: %s, expected removed part: %s, actual removed part: %s", stemable_word, expected_stemmed_word, stemmed_word, expected_removed_part, removed_part); 19 | mu_assert(rc == 1, "failed to stem"); 20 | mu_assert(strcmp(expected_stemmed_word, stemmed_word) == 0, "failed while asserting stemmed word"); 21 | mu_assert(strcmp(expected_removed_part, removed_part) == 0, "failed while asserting removed part"); 22 | free(stemmed_word); 23 | free(removed_part); 24 | return NULL; 25 | } 26 | 27 | char *test_remove_plain_prefix_returns_0_if_word_notin_dictionary() 28 | { 29 | char *stemmed_word = NULL; 30 | char *removed_part = NULL; 31 | 32 | int rc = remove_plain_prefix("dipertikai", &stemmed_word, &removed_part); 33 | 34 | mu_assert(rc == 0, "successfully stems but not in dictionary"); 35 | 
mu_assert(strcmp("pertikai", stemmed_word) == 0, "we expect 'pertikai' as the stemmed word"); 36 | mu_assert(strcmp("di", removed_part) == 0, "we expect 'di' as the removed part"); 37 | 38 | return NULL; 39 | } 40 | 41 | char *test_remove_plain_prefix_di() 42 | { 43 | char *stemmed_word = NULL; 44 | char *removed_part = NULL; 45 | 46 | int rc = remove_plain_prefix("dicinta", &stemmed_word, &removed_part); 47 | 48 | mu_assert(rc == 1, "successfully stems"); 49 | mu_assert(strcmp("cinta", stemmed_word) == 0, "we expect 'sana' as the stemmed word"); 50 | mu_assert(strcmp("di", removed_part) == 0, "we expect 'di' as the removed part"); 51 | 52 | return NULL; 53 | } 54 | 55 | char *test_remove_plain_prefix_ke() 56 | { 57 | char *stemmed_word = NULL; 58 | char *removed_part = NULL; 59 | 60 | int rc = remove_plain_prefix("kesana", &stemmed_word, &removed_part); 61 | 62 | mu_assert(rc == 1, "successfully stems"); 63 | mu_assert(strcmp("sana", stemmed_word) == 0, "we expect 'sana' as the stemmed word"); 64 | mu_assert(strcmp("ke", removed_part) == 0, "we expect 'ke' as the removed part"); 65 | 66 | return NULL; 67 | } 68 | 69 | char *test_remove_plain_prefix_se() 70 | { 71 | char *stemmed_word = NULL; 72 | char *removed_part = NULL; 73 | 74 | int rc = remove_plain_prefix("sejenis", &stemmed_word, &removed_part); 75 | 76 | mu_assert(rc == 1, "successfully stems"); 77 | mu_assert(strcmp("jenis", stemmed_word) == 0, "we expect 'jenis' as the stemmed word"); 78 | mu_assert(strcmp("se", removed_part) == 0, "we expect 'se' as the removed part"); 79 | 80 | return NULL; 81 | } 82 | 83 | 84 | 85 | char *test_remove_complex_prefix_rule1_a() 86 | { 87 | char *stemmed_word = NULL; 88 | char *removed_part = NULL; 89 | 90 | int rc = remove_complex_prefix_rule1("beria", &stemmed_word, &removed_part); 91 | debug("stem word: beria, expected: ia, actual: %s", stemmed_word); 92 | mu_assert(rc == 1, "sucessfully stemmed"); 93 | mu_assert(strcmp("ia", stemmed_word) == 0, "it stems to ia"); 94 | 
mu_assert(strcmp("ber", removed_part) == 0, "remove part should be ber"); 95 | free(stemmed_word); 96 | free(removed_part); 97 | 98 | return NULL; 99 | } 100 | 101 | char *test_remove_complex_prefix_rule1_b() 102 | { 103 | char *stemmed_word = NULL; 104 | char *removed_part = NULL; 105 | 106 | int rc = remove_complex_prefix_rule1("berakit", &stemmed_word, &removed_part); 107 | debug("stem word: berakit, expected: rakit, actual: %s", stemmed_word); 108 | mu_assert(rc == 1, "sucessfully stemmed"); 109 | mu_assert(strcmp("rakit", stemmed_word) == 0, "it stems to rakit"); 110 | mu_assert(strcmp("be", removed_part) == 0, "remove part should be be"); 111 | free(stemmed_word); 112 | free(removed_part); 113 | 114 | return NULL; 115 | } 116 | 117 | char *test_remove_complex_prefix_rule2() 118 | { 119 | return test_remove_complex_prefix("berkop", "kop", "ber", remove_complex_prefix_rule2); 120 | } 121 | 122 | char *test_remove_complex_prefix_rule2_excludes_er() 123 | { 124 | char *word = "berdaerah"; 125 | char *stemmed_word = NULL; 126 | char *removed_part = NULL; 127 | 128 | int rc = remove_complex_prefix_rule2(word, &stemmed_word, &removed_part); 129 | debug("stem word: %s, expected: berdaerah, actual: %s", word, stemmed_word); 130 | mu_assert(rc == 0, "does not stem"); 131 | mu_assert(strcmp("berdaerah", stemmed_word) == 0, "it does not stem it"); 132 | free(stemmed_word); 133 | free(removed_part); 134 | 135 | return NULL; 136 | } 137 | 138 | char *test_remove_complex_prefix_rule3_only_includes_er() 139 | { 140 | char *stemable_word = "berdaerah"; 141 | char *nonstemable_word = "bertabur"; 142 | char *stemmed_word = NULL; 143 | char *removed_part = NULL; 144 | 145 | int rc = remove_complex_prefix_rule3(stemable_word, &stemmed_word, &removed_part); 146 | debug("stem word: %s, expected: daerah, actual: %s", stemable_word, stemmed_word); 147 | mu_assert(rc == 1, "sucessfully stemmed"); 148 | mu_assert(strcmp("daerah", stemmed_word) == 0, "it stems to daerah"); 149 | 
mu_assert(strcmp("ber", removed_part) == 0, "remove part should be ber"); 150 | free(stemmed_word); 151 | free(removed_part); 152 | 153 | rc = remove_complex_prefix_rule3(nonstemable_word, &stemmed_word, &removed_part); 154 | mu_assert(rc == 0, "cannot stem"); 155 | free(stemmed_word); 156 | free(removed_part); 157 | 158 | return NULL; 159 | } 160 | 161 | char *test_remove_complex_prefix_rule4() 162 | { 163 | char *stemable_word = "belajar"; 164 | char *nonstemable_word = "bertabur"; 165 | char *stemmed_word = NULL; 166 | char *removed_part = NULL; 167 | 168 | int rc = remove_complex_prefix_rule4(stemable_word, &stemmed_word, &removed_part); 169 | debug("stem word: %s, expected: ajar, actual: %s", stemable_word, stemmed_word); 170 | mu_assert(rc == 1, "sucessfully stemmed"); 171 | mu_assert(strcmp("ajar", stemmed_word) == 0, "it stems to ajar"); 172 | mu_assert(strcmp("bel", removed_part) == 0, "remove part should be bel"); 173 | free(stemmed_word); 174 | free(removed_part); 175 | 176 | rc = remove_complex_prefix_rule4(nonstemable_word, &stemmed_word, &removed_part); 177 | mu_assert(rc == 0, "cannot stem"); 178 | free(stemmed_word); 179 | free(removed_part); 180 | 181 | return NULL; 182 | } 183 | 184 | char *test_remove_complex_prefix_rule5() 185 | { 186 | char *stemable_word = "bekerja"; 187 | char *nonstemable_word = "berlari"; 188 | char *stemmed_word = NULL; 189 | char *removed_part = NULL; 190 | 191 | int rc = remove_complex_prefix_rule5(stemable_word, &stemmed_word, &removed_part); 192 | debug("stem word: %s, expected: kerja, actual: %s", stemable_word, stemmed_word); 193 | mu_assert(rc == 1, "sucessfully stemmed"); 194 | mu_assert(strcmp("kerja", stemmed_word) == 0, "it stems to kerja"); 195 | mu_assert(strcmp("be", removed_part) == 0, "remove part should be be"); 196 | free(stemmed_word); 197 | free(removed_part); 198 | 199 | rc = remove_complex_prefix_rule5(nonstemable_word, &stemmed_word, &removed_part); 200 | mu_assert(rc == 0, "cannot stem"); 201 | 
free(stemmed_word); 202 | free(removed_part); 203 | 204 | return NULL; 205 | } 206 | /* Rule 6 on 'terancam': expects stem 'ancam' and removed part 'ter'; 'terbalik' must be rejected (rc == 0). */ 207 | char *test_remove_complex_prefix_rule6a() 208 | { 209 | char *stemable_word = "terancam"; 210 | char *nonstemable_word = "terbalik"; 211 | char *stemmed_word = NULL; 212 | char *removed_part = NULL; 213 | 214 | int rc = remove_complex_prefix_rule6(stemable_word, &stemmed_word, &removed_part); 215 | debug("stem word: %s, expected: ancam, actual: %s", stemable_word, stemmed_word); 216 | mu_assert(rc == 1, "sucessfully stemmed"); 217 | mu_assert(strcmp("ancam", stemmed_word) == 0, "it stems to ancam"); 218 | mu_assert(strcmp("ter", removed_part) == 0, "remove part should be ter"); 219 | free(stemmed_word); 220 | free(removed_part); 221 | 222 | rc = remove_complex_prefix_rule6(nonstemable_word, &stemmed_word, &removed_part); 223 | mu_assert(rc == 0, "cannot stem"); 224 | free(stemmed_word); 225 | free(removed_part); 226 | 227 | return NULL; 228 | } 229 | 230 | char *test_remove_complex_prefix_rule6b() 231 | { 232 | char *stemable_word = "teracun"; 233 | char *nonstemable_word = "terbalik"; 234 | char *stemmed_word = NULL; 235 | char *removed_part = NULL; 236 | 237 | int rc = remove_complex_prefix_rule6(stemable_word, &stemmed_word, &removed_part); 238 | debug("stem word: %s, expected: racun, actual: %s", stemable_word, stemmed_word); 239 | mu_assert(rc == 1, "sucessfully stemmed"); 240 | mu_assert(strcmp("racun", stemmed_word) == 0, "it stems to racun"); 241 | mu_assert(strcmp("te", removed_part) == 0, "remove part should be te"); 242 | free(stemmed_word); 243 | free(removed_part); 244 | 245 | rc = remove_complex_prefix_rule6(nonstemable_word, &stemmed_word, &removed_part); 246 | mu_assert(rc == 0, "cannot stem"); 247 | free(stemmed_word); 248 | free(removed_part); 249 | 250 | return NULL; 251 | } 252 | 253 | char *test_remove_complex_prefix_rule7() 254 | { 255 | char *stemable_word = "terperuk"; 256 | char *stemmed_word = NULL; 257 | char *removed_part = NULL; 258 | 259 | int rc 
= remove_complex_prefix_rule7(stemable_word, &stemmed_word, &removed_part); 260 | debug("stem word: %s, expected: peruk, actual: %s", stemable_word, stemmed_word); 261 | mu_assert(rc == 1, "sucessfully stemmed"); 262 | mu_assert(strcmp("peruk", stemmed_word) == 0, "it stems to peruk"); 263 | mu_assert(strcmp("ter", removed_part) == 0, "remove part should be ter"); 264 | free(stemmed_word); 265 | free(removed_part); 266 | return NULL; 267 | } 268 | 269 | char *test_remove_complex_prefix_rule8() 270 | { 271 | char *stemable_word = "tertangkap"; 272 | char *stemmed_word = NULL; 273 | char *removed_part = NULL; 274 | 275 | int rc = remove_complex_prefix_rule8(stemable_word, &stemmed_word, &removed_part); 276 | debug("stem word: %s, expected: tangkap, actual: %s", stemable_word, stemmed_word); 277 | mu_assert(rc == 1, "sucessfully stemmed"); 278 | mu_assert(strcmp("tangkap", stemmed_word) == 0, "it stems to tangkap"); 279 | mu_assert(strcmp("ter", removed_part) == 0, "remove part should be ter"); 280 | free(stemmed_word); 281 | free(removed_part); 282 | return NULL; 283 | } 284 | 285 | char *test_remove_complex_prefix_rule8_excludes_er() 286 | { 287 | char *word = "terterbang"; 288 | char *stemmed_word = NULL; 289 | char *removed_part = NULL; 290 | 291 | int rc = remove_complex_prefix_rule8(word, &stemmed_word, &removed_part); 292 | debug("stem word: %s, expected: terterbang, actual: %s", word, stemmed_word); 293 | mu_assert(rc == 0, "does not stem"); 294 | mu_assert(strcmp("terterbang", stemmed_word) == 0, "it does not stem it"); 295 | free(stemmed_word); 296 | free(removed_part); 297 | 298 | return NULL; 299 | } 300 | 301 | char *test_remove_complex_prefix_rule9() 302 | { 303 | return test_remove_complex_prefix("teterbang", "terbang", "te", remove_complex_prefix_rule9); 304 | } 305 | 306 | char *test_remove_complex_prefix_rule10_l() 307 | { 308 | return test_remove_complex_prefix("melalu", "lalu", "me", remove_complex_prefix_rule10); 309 | } 310 | 311 | char 
*test_remove_complex_prefix_rule10_r() 312 | { 313 | return test_remove_complex_prefix("meracun", "racun", "me", remove_complex_prefix_rule10); 314 | } 315 | 316 | char *test_remove_complex_prefix_rule10_w() 317 | { 318 | return test_remove_complex_prefix("mewarna", "warna", "me", remove_complex_prefix_rule10); 319 | } 320 | 321 | char *test_remove_complex_prefix_rule10_y() 322 | { 323 | return test_remove_complex_prefix("meyakin", "yakin", "me", remove_complex_prefix_rule10); 324 | } 325 | 326 | char *test_remove_complex_prefix_rule11_f() 327 | { 328 | return test_remove_complex_prefix("memfasilitas", "fasilitas", "mem", remove_complex_prefix_rule11); 329 | } 330 | 331 | char *test_remove_complex_prefix_rule11_b() 332 | { 333 | return test_remove_complex_prefix("membantu", "bantu", "mem", remove_complex_prefix_rule11); 334 | } 335 | 336 | char *test_remove_complex_prefix_rule11_v() 337 | { 338 | return test_remove_complex_prefix("memvonis", "vonis", "mem", remove_complex_prefix_rule11); 339 | } 340 | 341 | char *test_remove_complex_prefix_rule11_unstemmable() 342 | { 343 | char *word = "terbalik"; 344 | char *stemmed_word = NULL; 345 | char *removed_part = NULL; 346 | 347 | int rc = remove_complex_prefix_rule11(word, &stemmed_word, &removed_part); 348 | debug("word: %s, expected: %s, actual: %s, expected removed: %s, actual removed : %s", 349 | word, word, 350 | stemmed_word, "", 351 | removed_part); 352 | 353 | mu_assert(rc == 0, "should not stem"); 354 | mu_assert(strcmp(word, stemmed_word) == 0, "it returns the original word"); 355 | mu_assert(strcmp("", removed_part) == 0, "it returns an empty string as the removed part"); 356 | free(stemmed_word); 357 | free(removed_part); 358 | 359 | return NULL; 360 | } 361 | 362 | char *test_remove_complex_prefix_rule12() 363 | { 364 | return test_remove_complex_prefix("mempengaruh", "pengaruh", "mem", remove_complex_prefix_rule12); 365 | } 366 | 367 | char *test_remove_complex_prefix_rule13a() 368 | { 369 | return 
test_remove_complex_prefix("memasuk", "masuk", "me", remove_complex_prefix_rule13); 370 | } 371 | 372 | char *test_remove_complex_prefix_rule13b() 373 | { 374 | return test_remove_complex_prefix("memakai", "pakai", "me", remove_complex_prefix_rule13); 375 | } 376 | 377 | char *test_remove_complex_prefix_rule14_c() 378 | { 379 | return test_remove_complex_prefix("mencantum", "cantum", "men", remove_complex_prefix_rule14); 380 | } 381 | 382 | char *test_remove_complex_prefix_rule14_d() 383 | { 384 | return test_remove_complex_prefix("menduduk", "duduk", "men", remove_complex_prefix_rule14); 385 | } 386 | 387 | char *test_remove_complex_prefix_rule14_j() 388 | { 389 | return test_remove_complex_prefix("menjemput", "jemput", "men", remove_complex_prefix_rule14); 390 | } 391 | 392 | char *test_remove_complex_prefix_rule14_s() 393 | { 394 | return test_remove_complex_prefix("mensyukur", "syukur", "men", remove_complex_prefix_rule14); 395 | } 396 | 397 | char *test_remove_complex_prefix_rule14_t() 398 | { 399 | return test_remove_complex_prefix("mentaat", "taat", "men", remove_complex_prefix_rule14); 400 | } 401 | 402 | char *test_remove_complex_prefix_rule14_z() 403 | { 404 | return test_remove_complex_prefix("menziarah", "ziarah", "men", remove_complex_prefix_rule14); 405 | } 406 | 407 | char *test_remove_complex_prefix_rule15a() 408 | { 409 | return test_remove_complex_prefix("menikmat", "nikmat", "me", remove_complex_prefix_rule15); 410 | } 411 | 412 | char *test_remove_complex_prefix_rule15b() 413 | { 414 | return test_remove_complex_prefix("menulis", "tulis", "me", remove_complex_prefix_rule15); 415 | } 416 | 417 | char *test_remove_complex_prefix_rule16_g() 418 | { 419 | return test_remove_complex_prefix("mengguna", "guna", "meng", remove_complex_prefix_rule16); 420 | } 421 | 422 | char *test_remove_complex_prefix_rule16_h() 423 | { 424 | return test_remove_complex_prefix("menghambat", "hambat", "meng", remove_complex_prefix_rule16); 425 | } 426 | 427 | char 
*test_remove_complex_prefix_rule16_q() 428 | { 429 | return test_remove_complex_prefix("mengqasar", "qasar", "meng", remove_complex_prefix_rule16); 430 | } 431 | 432 | char *test_remove_complex_prefix_rule16_k() 433 | { 434 | return test_remove_complex_prefix("mengkritik", "kritik", "meng", remove_complex_prefix_rule16); 435 | } 436 | 437 | char *test_remove_complex_prefix_rule17a() 438 | { 439 | return test_remove_complex_prefix("mengerat", "erat", "meng", remove_complex_prefix_rule17); 440 | } 441 | 442 | char *test_remove_complex_prefix_rule17b() 443 | { 444 | return test_remove_complex_prefix("mengecil", "kecil", "meng", remove_complex_prefix_rule17); 445 | } 446 | 447 | char *test_remove_complex_prefix_rule17c() 448 | { 449 | return test_remove_complex_prefix("mengecat", "cat", "menge", remove_complex_prefix_rule17); 450 | } 451 | 452 | char *test_remove_complex_prefix_rule17d() 453 | { 454 | return test_remove_complex_prefix("mengiang", "ngiang", "me", remove_complex_prefix_rule17); 455 | } 456 | 457 | char *test_remove_complex_prefix_rule18a() 458 | { 459 | return test_remove_complex_prefix("menyala", "nyala", "me", remove_complex_prefix_rule18); 460 | } 461 | 462 | char *test_remove_complex_prefix_rule18b() 463 | { 464 | return test_remove_complex_prefix("menyapu", "sapu", "meny", remove_complex_prefix_rule18); 465 | } 466 | 467 | char *test_remove_complex_prefix_rule19_1() 468 | { 469 | return test_remove_complex_prefix("memproteksi", "proteksi", "mem", remove_complex_prefix_rule19); 470 | } 471 | 472 | char *test_remove_complex_prefix_rule19_2() 473 | { 474 | return test_remove_complex_prefix("mempatroli", "patroli", "mem", remove_complex_prefix_rule19); 475 | } 476 | 477 | char *test_remove_complex_prefix_rule20_1() 478 | { 479 | return test_remove_complex_prefix("pewarna", "warna", "pe", remove_complex_prefix_rule20); 480 | } 481 | 482 | char *test_remove_complex_prefix_rule20_2() 483 | { 484 | return test_remove_complex_prefix("peyoga", "yoga", "pe", 
remove_complex_prefix_rule20); 485 | } 486 | 487 | char *test_remove_prefixes_when_cannot_stem_to_word_in_dict() 488 | { 489 | char *stemmed_word; 490 | 491 | int rc = remove_prefixes("mewarnai", &stemmed_word); 492 | debug("word: mewarnai, expected stemmed word: warnai, actual stemmed word: %s", stemmed_word); 493 | mu_assert(rc == 0, "it changes the word, but its not done"); 494 | mu_assert(strcmp("warnai", stemmed_word) == 0, "failed while asserting stemmed word"); 495 | return NULL; 496 | } 497 | 498 | char *all_tests() 499 | { 500 | mu_suite_start(); 501 | 502 | dictionary_load(dictionary_fullpath("data/kata-dasar.txt")); 503 | 504 | mu_run_test(test_remove_plain_prefix_returns_0_if_word_notin_dictionary) 505 | mu_run_test(test_remove_plain_prefix_di); 506 | mu_run_test(test_remove_plain_prefix_ke); 507 | mu_run_test(test_remove_plain_prefix_se); 508 | mu_run_test(test_remove_complex_prefix_rule1_a); 509 | mu_run_test(test_remove_complex_prefix_rule1_b); 510 | mu_run_test(test_remove_complex_prefix_rule2); 511 | mu_run_test(test_remove_complex_prefix_rule2_excludes_er); 512 | mu_run_test(test_remove_complex_prefix_rule3_only_includes_er); 513 | mu_run_test(test_remove_complex_prefix_rule4); 514 | mu_run_test(test_remove_complex_prefix_rule5); 515 | mu_run_test(test_remove_complex_prefix_rule6a); 516 | mu_run_test(test_remove_complex_prefix_rule6b); 517 | mu_run_test(test_remove_complex_prefix_rule7); 518 | mu_run_test(test_remove_complex_prefix_rule8); 519 | mu_run_test(test_remove_complex_prefix_rule8_excludes_er); 520 | mu_run_test(test_remove_complex_prefix_rule9); 521 | mu_run_test(test_remove_complex_prefix_rule10_l); 522 | mu_run_test(test_remove_complex_prefix_rule10_r); 523 | mu_run_test(test_remove_complex_prefix_rule10_w); 524 | mu_run_test(test_remove_complex_prefix_rule10_y); 525 | mu_run_test(test_remove_complex_prefix_rule11_f); 526 | mu_run_test(test_remove_complex_prefix_rule11_b); 527 | mu_run_test(test_remove_complex_prefix_rule11_v); 528 | 
mu_run_test(test_remove_complex_prefix_rule11_unstemmable); 529 | mu_run_test(test_remove_complex_prefix_rule12); 530 | mu_run_test(test_remove_complex_prefix_rule13a); 531 | mu_run_test(test_remove_complex_prefix_rule13b); 532 | mu_run_test(test_remove_complex_prefix_rule14_c); 533 | mu_run_test(test_remove_complex_prefix_rule14_d); 534 | mu_run_test(test_remove_complex_prefix_rule14_j); 535 | mu_run_test(test_remove_complex_prefix_rule14_s); 536 | mu_run_test(test_remove_complex_prefix_rule14_t); 537 | mu_run_test(test_remove_complex_prefix_rule14_z); 538 | mu_run_test(test_remove_complex_prefix_rule15a); 539 | mu_run_test(test_remove_complex_prefix_rule15b); 540 | mu_run_test(test_remove_complex_prefix_rule16_g); 541 | mu_run_test(test_remove_complex_prefix_rule16_h); 542 | mu_run_test(test_remove_complex_prefix_rule16_q); 543 | mu_run_test(test_remove_complex_prefix_rule16_k); 544 | mu_run_test(test_remove_complex_prefix_rule17a); 545 | mu_run_test(test_remove_complex_prefix_rule17b); 546 | mu_run_test(test_remove_complex_prefix_rule17c); 547 | mu_run_test(test_remove_complex_prefix_rule17d); 548 | mu_run_test(test_remove_complex_prefix_rule18a); 549 | mu_run_test(test_remove_complex_prefix_rule18b); 550 | mu_run_test(test_remove_complex_prefix_rule19_1); 551 | mu_run_test(test_remove_complex_prefix_rule19_2); 552 | mu_run_test(test_remove_complex_prefix_rule20_1); 553 | mu_run_test(test_remove_complex_prefix_rule20_2); 554 | mu_run_test(test_remove_prefixes_when_cannot_stem_to_word_in_dict); 555 | return NULL; 556 | } 557 | 558 | RUN_TESTS(all_tests); 559 | -------------------------------------------------------------------------------- /src/uthash/utlist.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2007-2014, Troy D. Hanson http://troydhanson.github.com/uthash/ 3 | All rights reserved. 
4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 12 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 13 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 14 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 15 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 16 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 17 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 18 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 19 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 20 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 21 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 22 | */ 23 | 24 | #ifndef UTLIST_H 25 | #define UTLIST_H 26 | 27 | #define UTLIST_VERSION 1.9.9 28 | 29 | #include <assert.h> 30 | 31 | /* 32 | * This file contains macros to manipulate singly and doubly-linked lists. 33 | * 34 | * 1. LL_ macros: singly-linked lists. 35 | * 2. DL_ macros: doubly-linked lists. 36 | * 3. CDL_ macros: circular doubly-linked lists. 37 | * 38 | * To use singly-linked lists, your structure must have a "next" pointer. 39 | * To use doubly-linked lists, your structure must have "prev" and "next" pointers. 40 | * Either way, the pointer to the head of the list must be initialized to NULL. 41 | * 42 | * ----------------.EXAMPLE ------------------------- 43 | * struct item { 44 | * int id; 45 | * struct item *prev, *next; 46 | * }; 47 | * 48 | * struct item *list = NULL; 49 | * 50 | * int main() { 51 | * struct item *item; 52 | * ... allocate and populate item ... 
53 | * DL_APPEND(list, item); 54 | * } 55 | * -------------------------------------------------- 56 | * 57 | * For doubly-linked lists, the append and delete macros are O(1) 58 | * For singly-linked lists, append and delete are O(n) but prepend is O(1) 59 | * The sort macro is O(n log(n)) for all types of single/double/circular lists. 60 | */ 61 | 62 | /* These macros use decltype or the earlier __typeof GNU extension. 63 | As decltype is only available in newer compilers (VS2010 or gcc 4.3+ 64 | when compiling c++ code), this code uses whatever method is needed 65 | or, for VS2008 where neither is available, uses casting workarounds. */ 66 | #ifdef _MSC_VER /* MS compiler */ 67 | #if _MSC_VER >= 1600 && defined(__cplusplus) /* VS2010 or newer in C++ mode */ 68 | #define LDECLTYPE(x) decltype(x) 69 | #else /* VS2008 or older (or VS2010 in C mode) */ 70 | #define NO_DECLTYPE 71 | #define LDECLTYPE(x) char* 72 | #endif 73 | #elif defined(__ICCARM__) 74 | #define NO_DECLTYPE 75 | #define LDECLTYPE(x) char* 76 | #else /* GNU, Sun and other compilers */ 77 | #define LDECLTYPE(x) __typeof(x) 78 | #endif 79 | 80 | /* for VS2008 we use some workarounds to get around the lack of decltype, 81 | * namely, we always reassign our tmp variable to the list head if we need 82 | * to dereference its prev/next pointers, and save/restore the real head.*/ 83 | #ifdef NO_DECLTYPE 84 | #define _SV(elt,list) _tmp = (char*)(list); {char **_alias = (char**)&(list); *_alias = (elt); } 85 | #define _NEXT(elt,list,next) ((char*)((list)->next)) 86 | #define _NEXTASGN(elt,list,to,next) { char **_alias = (char**)&((list)->next); *_alias=(char*)(to); } 87 | /* #define _PREV(elt,list,prev) ((char*)((list)->prev)) */ 88 | #define _PREVASGN(elt,list,to,prev) { char **_alias = (char**)&((list)->prev); *_alias=(char*)(to); } 89 | #define _RS(list) { char **_alias = (char**)&(list); *_alias=_tmp; } 90 | #define _CASTASGN(a,b) { char **_alias = (char**)&(a); *_alias=(char*)(b); } 91 | #else 92 | #define 
_SV(elt,list) 93 | #define _NEXT(elt,list,next) ((elt)->next) 94 | #define _NEXTASGN(elt,list,to,next) ((elt)->next)=(to) 95 | /* #define _PREV(elt,list,prev) ((elt)->prev) */ 96 | #define _PREVASGN(elt,list,to,prev) ((elt)->prev)=(to) 97 | #define _RS(list) 98 | #define _CASTASGN(a,b) (a)=(b) 99 | #endif 100 | 101 | /****************************************************************************** 102 | * The sort macro is an adaptation of Simon Tatham's O(n log(n)) mergesort * 103 | * Unwieldy variable names used here to avoid shadowing passed-in variables. * 104 | *****************************************************************************/ 105 | #define LL_SORT(list, cmp) \ 106 | LL_SORT2(list, cmp, next) 107 | 108 | #define LL_SORT2(list, cmp, next) \ 109 | do { \ 110 | LDECLTYPE(list) _ls_p; \ 111 | LDECLTYPE(list) _ls_q; \ 112 | LDECLTYPE(list) _ls_e; \ 113 | LDECLTYPE(list) _ls_tail; \ 114 | int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping; \ 115 | if (list) { \ 116 | _ls_insize = 1; \ 117 | _ls_looping = 1; \ 118 | while (_ls_looping) { \ 119 | _CASTASGN(_ls_p,list); \ 120 | list = NULL; \ 121 | _ls_tail = NULL; \ 122 | _ls_nmerges = 0; \ 123 | while (_ls_p) { \ 124 | _ls_nmerges++; \ 125 | _ls_q = _ls_p; \ 126 | _ls_psize = 0; \ 127 | for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) { \ 128 | _ls_psize++; \ 129 | _SV(_ls_q,list); _ls_q = _NEXT(_ls_q,list,next); _RS(list); \ 130 | if (!_ls_q) break; \ 131 | } \ 132 | _ls_qsize = _ls_insize; \ 133 | while (_ls_psize > 0 || (_ls_qsize > 0 && _ls_q)) { \ 134 | if (_ls_psize == 0) { \ 135 | _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \ 136 | _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \ 137 | } else if (_ls_qsize == 0 || !_ls_q) { \ 138 | _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \ 139 | _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \ 140 | } else if (cmp(_ls_p,_ls_q) <= 0) { \ 141 | _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \ 142 | _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \ 143 | } else { 
\ 144 | _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \ 145 | _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \ 146 | } \ 147 | if (_ls_tail) { \ 148 | _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list); \ 149 | } else { \ 150 | _CASTASGN(list,_ls_e); \ 151 | } \ 152 | _ls_tail = _ls_e; \ 153 | } \ 154 | _ls_p = _ls_q; \ 155 | } \ 156 | if (_ls_tail) { \ 157 | _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,NULL,next); _RS(list); \ 158 | } \ 159 | if (_ls_nmerges <= 1) { \ 160 | _ls_looping=0; \ 161 | } \ 162 | _ls_insize *= 2; \ 163 | } \ 164 | } \ 165 | } while (0) 166 | 167 | 168 | #define DL_SORT(list, cmp) \ 169 | DL_SORT2(list, cmp, prev, next) 170 | 171 | #define DL_SORT2(list, cmp, prev, next) \ 172 | do { \ 173 | LDECLTYPE(list) _ls_p; \ 174 | LDECLTYPE(list) _ls_q; \ 175 | LDECLTYPE(list) _ls_e; \ 176 | LDECLTYPE(list) _ls_tail; \ 177 | int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping; \ 178 | if (list) { \ 179 | _ls_insize = 1; \ 180 | _ls_looping = 1; \ 181 | while (_ls_looping) { \ 182 | _CASTASGN(_ls_p,list); \ 183 | list = NULL; \ 184 | _ls_tail = NULL; \ 185 | _ls_nmerges = 0; \ 186 | while (_ls_p) { \ 187 | _ls_nmerges++; \ 188 | _ls_q = _ls_p; \ 189 | _ls_psize = 0; \ 190 | for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) { \ 191 | _ls_psize++; \ 192 | _SV(_ls_q,list); _ls_q = _NEXT(_ls_q,list,next); _RS(list); \ 193 | if (!_ls_q) break; \ 194 | } \ 195 | _ls_qsize = _ls_insize; \ 196 | while ((_ls_psize > 0) || ((_ls_qsize > 0) && _ls_q)) { \ 197 | if (_ls_psize == 0) { \ 198 | _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \ 199 | _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \ 200 | } else if ((_ls_qsize == 0) || (!_ls_q)) { \ 201 | _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \ 202 | _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \ 203 | } else if (cmp(_ls_p,_ls_q) <= 0) { \ 204 | _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \ 205 | _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \ 206 | } else { \ 207 | _ls_e = _ls_q; 
_SV(_ls_q,list); _ls_q = \ 208 | _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \ 209 | } \ 210 | if (_ls_tail) { \ 211 | _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list); \ 212 | } else { \ 213 | _CASTASGN(list,_ls_e); \ 214 | } \ 215 | _SV(_ls_e,list); _PREVASGN(_ls_e,list,_ls_tail,prev); _RS(list); \ 216 | _ls_tail = _ls_e; \ 217 | } \ 218 | _ls_p = _ls_q; \ 219 | } \ 220 | _CASTASGN(list->prev, _ls_tail); \ 221 | _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,NULL,next); _RS(list); \ 222 | if (_ls_nmerges <= 1) { \ 223 | _ls_looping=0; \ 224 | } \ 225 | _ls_insize *= 2; \ 226 | } \ 227 | } \ 228 | } while (0) 229 | 230 | #define CDL_SORT(list, cmp) \ 231 | CDL_SORT2(list, cmp, prev, next) 232 | 233 | #define CDL_SORT2(list, cmp, prev, next) \ 234 | do { \ 235 | LDECLTYPE(list) _ls_p; \ 236 | LDECLTYPE(list) _ls_q; \ 237 | LDECLTYPE(list) _ls_e; \ 238 | LDECLTYPE(list) _ls_tail; \ 239 | LDECLTYPE(list) _ls_oldhead; \ 240 | LDECLTYPE(list) _tmp; \ 241 | int _ls_insize, _ls_nmerges, _ls_psize, _ls_qsize, _ls_i, _ls_looping; \ 242 | if (list) { \ 243 | _ls_insize = 1; \ 244 | _ls_looping = 1; \ 245 | while (_ls_looping) { \ 246 | _CASTASGN(_ls_p,list); \ 247 | _CASTASGN(_ls_oldhead,list); \ 248 | list = NULL; \ 249 | _ls_tail = NULL; \ 250 | _ls_nmerges = 0; \ 251 | while (_ls_p) { \ 252 | _ls_nmerges++; \ 253 | _ls_q = _ls_p; \ 254 | _ls_psize = 0; \ 255 | for (_ls_i = 0; _ls_i < _ls_insize; _ls_i++) { \ 256 | _ls_psize++; \ 257 | _SV(_ls_q,list); \ 258 | if (_NEXT(_ls_q,list,next) == _ls_oldhead) { \ 259 | _ls_q = NULL; \ 260 | } else { \ 261 | _ls_q = _NEXT(_ls_q,list,next); \ 262 | } \ 263 | _RS(list); \ 264 | if (!_ls_q) break; \ 265 | } \ 266 | _ls_qsize = _ls_insize; \ 267 | while (_ls_psize > 0 || (_ls_qsize > 0 && _ls_q)) { \ 268 | if (_ls_psize == 0) { \ 269 | _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \ 270 | _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \ 271 | if (_ls_q == _ls_oldhead) { _ls_q = NULL; } \ 272 | } else if (_ls_qsize == 0 
|| !_ls_q) { \ 273 | _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \ 274 | _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \ 275 | if (_ls_p == _ls_oldhead) { _ls_p = NULL; } \ 276 | } else if (cmp(_ls_p,_ls_q) <= 0) { \ 277 | _ls_e = _ls_p; _SV(_ls_p,list); _ls_p = \ 278 | _NEXT(_ls_p,list,next); _RS(list); _ls_psize--; \ 279 | if (_ls_p == _ls_oldhead) { _ls_p = NULL; } \ 280 | } else { \ 281 | _ls_e = _ls_q; _SV(_ls_q,list); _ls_q = \ 282 | _NEXT(_ls_q,list,next); _RS(list); _ls_qsize--; \ 283 | if (_ls_q == _ls_oldhead) { _ls_q = NULL; } \ 284 | } \ 285 | if (_ls_tail) { \ 286 | _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_ls_e,next); _RS(list); \ 287 | } else { \ 288 | _CASTASGN(list,_ls_e); \ 289 | } \ 290 | _SV(_ls_e,list); _PREVASGN(_ls_e,list,_ls_tail,prev); _RS(list); \ 291 | _ls_tail = _ls_e; \ 292 | } \ 293 | _ls_p = _ls_q; \ 294 | } \ 295 | _CASTASGN(list->prev,_ls_tail); \ 296 | _CASTASGN(_tmp,list); \ 297 | _SV(_ls_tail,list); _NEXTASGN(_ls_tail,list,_tmp,next); _RS(list); \ 298 | if (_ls_nmerges <= 1) { \ 299 | _ls_looping=0; \ 300 | } \ 301 | _ls_insize *= 2; \ 302 | } \ 303 | } \ 304 | } while (0) 305 | 306 | /****************************************************************************** 307 | * singly linked list macros (non-circular) * 308 | *****************************************************************************/ 309 | #define LL_PREPEND(head,add) \ 310 | LL_PREPEND2(head,add,next) 311 | 312 | #define LL_PREPEND2(head,add,next) \ 313 | do { \ 314 | (add)->next = head; \ 315 | head = add; \ 316 | } while (0) 317 | 318 | #define LL_CONCAT(head1,head2) \ 319 | LL_CONCAT2(head1,head2,next) 320 | 321 | #define LL_CONCAT2(head1,head2,next) \ 322 | do { \ 323 | LDECLTYPE(head1) _tmp; \ 324 | if (head1) { \ 325 | _tmp = head1; \ 326 | while (_tmp->next) { _tmp = _tmp->next; } \ 327 | _tmp->next=(head2); \ 328 | } else { \ 329 | (head1)=(head2); \ 330 | } \ 331 | } while (0) 332 | 333 | #define LL_APPEND(head,add) \ 334 | LL_APPEND2(head,add,next) 335 | 
336 | #define LL_APPEND2(head,add,next) \ 337 | do { \ 338 | LDECLTYPE(head) _tmp; \ 339 | (add)->next=NULL; \ 340 | if (head) { \ 341 | _tmp = head; \ 342 | while (_tmp->next) { _tmp = _tmp->next; } \ 343 | _tmp->next=(add); \ 344 | } else { \ 345 | (head)=(add); \ 346 | } \ 347 | } while (0) 348 | 349 | #define LL_DELETE(head,del) \ 350 | LL_DELETE2(head,del,next) 351 | 352 | #define LL_DELETE2(head,del,next) \ 353 | do { \ 354 | LDECLTYPE(head) _tmp; \ 355 | if ((head) == (del)) { \ 356 | (head)=(head)->next; \ 357 | } else { \ 358 | _tmp = head; \ 359 | while (_tmp->next && (_tmp->next != (del))) { \ 360 | _tmp = _tmp->next; \ 361 | } \ 362 | if (_tmp->next) { \ 363 | _tmp->next = ((del)->next); \ 364 | } \ 365 | } \ 366 | } while (0) 367 | 368 | /* Here are VS2008 replacements for LL_APPEND and LL_DELETE */ 369 | #define LL_APPEND_VS2008(head,add) \ 370 | LL_APPEND2_VS2008(head,add,next) 371 | 372 | #define LL_APPEND2_VS2008(head,add,next) \ 373 | do { \ 374 | if (head) { \ 375 | (add)->next = head; /* use add->next as a temp variable */ \ 376 | while ((add)->next->next) { (add)->next = (add)->next->next; } \ 377 | (add)->next->next=(add); \ 378 | } else { \ 379 | (head)=(add); \ 380 | } \ 381 | (add)->next=NULL; \ 382 | } while (0) 383 | 384 | #define LL_DELETE_VS2008(head,del) \ 385 | LL_DELETE2_VS2008(head,del,next) 386 | 387 | #define LL_DELETE2_VS2008(head,del,next) \ 388 | do { \ 389 | if ((head) == (del)) { \ 390 | (head)=(head)->next; \ 391 | } else { \ 392 | char *_tmp = (char*)(head); \ 393 | while ((head)->next && ((head)->next != (del))) { \ 394 | head = (head)->next; \ 395 | } \ 396 | if ((head)->next) { \ 397 | (head)->next = ((del)->next); \ 398 | } \ 399 | { \ 400 | char **_head_alias = (char**)&(head); \ 401 | *_head_alias = _tmp; \ 402 | } \ 403 | } \ 404 | } while (0) 405 | #ifdef NO_DECLTYPE 406 | #undef LL_APPEND 407 | #define LL_APPEND LL_APPEND_VS2008 408 | #undef LL_DELETE 409 | #define LL_DELETE LL_DELETE_VS2008 410 | #undef LL_DELETE2 
411 | #define LL_DELETE2 LL_DELETE2_VS2008 412 | #undef LL_APPEND2 413 | #define LL_APPEND2 LL_APPEND2_VS2008 414 | #undef LL_CONCAT /* no LL_CONCAT_VS2008 */ 415 | #undef DL_CONCAT /* no DL_CONCAT_VS2008 */ 416 | #endif 417 | /* end VS2008 replacements */ 418 | 419 | #define LL_COUNT(head,el,counter) \ 420 | LL_COUNT2(head,el,counter,next) \ 421 | 422 | #define LL_COUNT2(head,el,counter,next) \ 423 | { \ 424 | counter = 0; \ 425 | LL_FOREACH2(head,el,next){ ++counter; } \ 426 | } 427 | 428 | #define LL_FOREACH(head,el) \ 429 | LL_FOREACH2(head,el,next) 430 | 431 | #define LL_FOREACH2(head,el,next) \ 432 | for(el=head;el;el=(el)->next) 433 | 434 | #define LL_FOREACH_SAFE(head,el,tmp) \ 435 | LL_FOREACH_SAFE2(head,el,tmp,next) 436 | 437 | #define LL_FOREACH_SAFE2(head,el,tmp,next) \ 438 | for((el)=(head);(el) && (tmp = (el)->next, 1); (el) = tmp) 439 | 440 | #define LL_SEARCH_SCALAR(head,out,field,val) \ 441 | LL_SEARCH_SCALAR2(head,out,field,val,next) 442 | 443 | #define LL_SEARCH_SCALAR2(head,out,field,val,next) \ 444 | do { \ 445 | LL_FOREACH2(head,out,next) { \ 446 | if ((out)->field == (val)) break; \ 447 | } \ 448 | } while(0) 449 | 450 | #define LL_SEARCH(head,out,elt,cmp) \ 451 | LL_SEARCH2(head,out,elt,cmp,next) 452 | 453 | #define LL_SEARCH2(head,out,elt,cmp,next) \ 454 | do { \ 455 | LL_FOREACH2(head,out,next) { \ 456 | if ((cmp(out,elt))==0) break; \ 457 | } \ 458 | } while(0) 459 | 460 | #define LL_REPLACE_ELEM(head, el, add) \ 461 | do { \ 462 | LDECLTYPE(head) _tmp; \ 463 | assert(head != NULL); \ 464 | assert(el != NULL); \ 465 | assert(add != NULL); \ 466 | (add)->next = (el)->next; \ 467 | if ((head) == (el)) { \ 468 | (head) = (add); \ 469 | } else { \ 470 | _tmp = head; \ 471 | while (_tmp->next && (_tmp->next != (el))) { \ 472 | _tmp = _tmp->next; \ 473 | } \ 474 | if (_tmp->next) { \ 475 | _tmp->next = (add); \ 476 | } \ 477 | } \ 478 | } while (0) 479 | 480 | #define LL_PREPEND_ELEM(head, el, add) \ 481 | do { \ 482 | LDECLTYPE(head) _tmp; \ 
483 | assert(head != NULL); \ 484 | assert(el != NULL); \ 485 | assert(add != NULL); \ 486 | (add)->next = (el); \ 487 | if ((head) == (el)) { \ 488 | (head) = (add); \ 489 | } else { \ 490 | _tmp = head; \ 491 | while (_tmp->next && (_tmp->next != (el))) { \ 492 | _tmp = _tmp->next; \ 493 | } \ 494 | if (_tmp->next) { \ 495 | _tmp->next = (add); \ 496 | } \ 497 | } \ 498 | } while (0) \ 499 | 500 | 501 | /****************************************************************************** 502 | * doubly linked list macros (non-circular) * 503 | *****************************************************************************/ 504 | #define DL_PREPEND(head,add) \ 505 | DL_PREPEND2(head,add,prev,next) 506 | 507 | #define DL_PREPEND2(head,add,prev,next) \ 508 | do { \ 509 | (add)->next = head; \ 510 | if (head) { \ 511 | (add)->prev = (head)->prev; \ 512 | (head)->prev = (add); \ 513 | } else { \ 514 | (add)->prev = (add); \ 515 | } \ 516 | (head) = (add); \ 517 | } while (0) 518 | 519 | #define DL_APPEND(head,add) \ 520 | DL_APPEND2(head,add,prev,next) 521 | 522 | #define DL_APPEND2(head,add,prev,next) \ 523 | do { \ 524 | if (head) { \ 525 | (add)->prev = (head)->prev; \ 526 | (head)->prev->next = (add); \ 527 | (head)->prev = (add); \ 528 | (add)->next = NULL; \ 529 | } else { \ 530 | (head)=(add); \ 531 | (head)->prev = (head); \ 532 | (head)->next = NULL; \ 533 | } \ 534 | } while (0) 535 | 536 | #define DL_CONCAT(head1,head2) \ 537 | DL_CONCAT2(head1,head2,prev,next) 538 | 539 | #define DL_CONCAT2(head1,head2,prev,next) \ 540 | do { \ 541 | LDECLTYPE(head1) _tmp; \ 542 | if (head2) { \ 543 | if (head1) { \ 544 | _tmp = (head2)->prev; \ 545 | (head2)->prev = (head1)->prev; \ 546 | (head1)->prev->next = (head2); \ 547 | (head1)->prev = _tmp; \ 548 | } else { \ 549 | (head1)=(head2); \ 550 | } \ 551 | } \ 552 | } while (0) 553 | 554 | #define DL_DELETE(head,del) \ 555 | DL_DELETE2(head,del,prev,next) 556 | 557 | #define DL_DELETE2(head,del,prev,next) \ 558 | do { \ 559 | 
assert((del)->prev != NULL); \ 560 | if ((del)->prev == (del)) { \ 561 | (head)=NULL; \ 562 | } else if ((del)==(head)) { \ 563 | (del)->next->prev = (del)->prev; \ 564 | (head) = (del)->next; \ 565 | } else { \ 566 | (del)->prev->next = (del)->next; \ 567 | if ((del)->next) { \ 568 | (del)->next->prev = (del)->prev; \ 569 | } else { \ 570 | (head)->prev = (del)->prev; \ 571 | } \ 572 | } \ 573 | } while (0) 574 | 575 | #define DL_COUNT(head,el,counter) \ 576 | DL_COUNT2(head,el,counter,next) \ 577 | 578 | #define DL_COUNT2(head,el,counter,next) \ 579 | { \ 580 | counter = 0; \ 581 | DL_FOREACH2(head,el,next){ ++counter; } \ 582 | } 583 | 584 | #define DL_FOREACH(head,el) \ 585 | DL_FOREACH2(head,el,next) 586 | 587 | #define DL_FOREACH2(head,el,next) \ 588 | for(el=head;el;el=(el)->next) 589 | 590 | /* this version is safe for deleting the elements during iteration */ 591 | #define DL_FOREACH_SAFE(head,el,tmp) \ 592 | DL_FOREACH_SAFE2(head,el,tmp,next) 593 | 594 | #define DL_FOREACH_SAFE2(head,el,tmp,next) \ 595 | for((el)=(head);(el) && (tmp = (el)->next, 1); (el) = tmp) 596 | 597 | /* these are identical to their singly-linked list counterparts */ 598 | #define DL_SEARCH_SCALAR LL_SEARCH_SCALAR 599 | #define DL_SEARCH LL_SEARCH 600 | #define DL_SEARCH_SCALAR2 LL_SEARCH_SCALAR2 601 | #define DL_SEARCH2 LL_SEARCH2 602 | 603 | #define DL_REPLACE_ELEM(head, el, add) \ 604 | do { \ 605 | assert(head != NULL); \ 606 | assert(el != NULL); \ 607 | assert(add != NULL); \ 608 | if ((head) == (el)) { \ 609 | (head) = (add); \ 610 | (add)->next = (el)->next; \ 611 | if ((el)->next == NULL) { \ 612 | (add)->prev = (add); \ 613 | } else { \ 614 | (add)->prev = (el)->prev; \ 615 | (add)->next->prev = (add); \ 616 | } \ 617 | } else { \ 618 | (add)->next = (el)->next; \ 619 | (add)->prev = (el)->prev; \ 620 | (add)->prev->next = (add); \ 621 | if ((el)->next == NULL) { \ 622 | (head)->prev = (add); \ 623 | } else { \ 624 | (add)->next->prev = (add); \ 625 | } \ 626 | } \ 627 | } 
while (0) 628 | 629 | #define DL_PREPEND_ELEM(head, el, add) \ 630 | do { \ 631 | assert(head != NULL); \ 632 | assert(el != NULL); \ 633 | assert(add != NULL); \ 634 | (add)->next = (el); \ 635 | (add)->prev = (el)->prev; \ 636 | (el)->prev = (add); \ 637 | if ((head) == (el)) { \ 638 | (head) = (add); \ 639 | } else { \ 640 | (add)->prev->next = (add); \ 641 | } \ 642 | } while (0) \ 643 | 644 | 645 | /****************************************************************************** 646 | * circular doubly linked list macros * 647 | *****************************************************************************/ 648 | #define CDL_PREPEND(head,add) \ 649 | CDL_PREPEND2(head,add,prev,next) 650 | 651 | #define CDL_PREPEND2(head,add,prev,next) \ 652 | do { \ 653 | if (head) { \ 654 | (add)->prev = (head)->prev; \ 655 | (add)->next = (head); \ 656 | (head)->prev = (add); \ 657 | (add)->prev->next = (add); \ 658 | } else { \ 659 | (add)->prev = (add); \ 660 | (add)->next = (add); \ 661 | } \ 662 | (head)=(add); \ 663 | } while (0) 664 | 665 | #define CDL_DELETE(head,del) \ 666 | CDL_DELETE2(head,del,prev,next) 667 | 668 | #define CDL_DELETE2(head,del,prev,next) \ 669 | do { \ 670 | if ( ((head)==(del)) && ((head)->next == (head))) { \ 671 | (head) = NULL; \ 672 | } else { \ 673 | (del)->next->prev = (del)->prev; \ 674 | (del)->prev->next = (del)->next; \ 675 | if ((del) == (head)) (head)=(del)->next; \ 676 | } \ 677 | } while (0) 678 | 679 | #define CDL_COUNT(head,el,counter) \ 680 | CDL_COUNT2(head,el,counter,next) \ 681 | 682 | #define CDL_COUNT2(head, el, counter,next) \ 683 | { \ 684 | counter = 0; \ 685 | CDL_FOREACH2(head,el,next){ ++counter; } \ 686 | } 687 | 688 | #define CDL_FOREACH(head,el) \ 689 | CDL_FOREACH2(head,el,next) 690 | 691 | #define CDL_FOREACH2(head,el,next) \ 692 | for(el=head;el;el=(((el)->next==head) ? 
0L : (el)->next)) 693 | 694 | #define CDL_FOREACH_SAFE(head,el,tmp1,tmp2) \ 695 | CDL_FOREACH_SAFE2(head,el,tmp1,tmp2,prev,next) 696 | 697 | #define CDL_FOREACH_SAFE2(head,el,tmp1,tmp2,prev,next) \ 698 | for((el)=(head), ((tmp1)=(head)?((head)->prev):NULL); \ 699 | (el) && ((tmp2)=(el)->next, 1); \ 700 | ((el) = (((el)==(tmp1)) ? 0L : (tmp2)))) 701 | 702 | #define CDL_SEARCH_SCALAR(head,out,field,val) \ 703 | CDL_SEARCH_SCALAR2(head,out,field,val,next) 704 | 705 | #define CDL_SEARCH_SCALAR2(head,out,field,val,next) \ 706 | do { \ 707 | CDL_FOREACH2(head,out,next) { \ 708 | if ((out)->field == (val)) break; \ 709 | } \ 710 | } while(0) 711 | 712 | #define CDL_SEARCH(head,out,elt,cmp) \ 713 | CDL_SEARCH2(head,out,elt,cmp,next) 714 | 715 | #define CDL_SEARCH2(head,out,elt,cmp,next) \ 716 | do { \ 717 | CDL_FOREACH2(head,out,next) { \ 718 | if ((cmp(out,elt))==0) break; \ 719 | } \ 720 | } while(0) 721 | 722 | #define CDL_REPLACE_ELEM(head, el, add) \ 723 | do { \ 724 | assert(head != NULL); \ 725 | assert(el != NULL); \ 726 | assert(add != NULL); \ 727 | if ((el)->next == (el)) { \ 728 | (add)->next = (add); \ 729 | (add)->prev = (add); \ 730 | (head) = (add); \ 731 | } else { \ 732 | (add)->next = (el)->next; \ 733 | (add)->prev = (el)->prev; \ 734 | (add)->next->prev = (add); \ 735 | (add)->prev->next = (add); \ 736 | if ((head) == (el)) { \ 737 | (head) = (add); \ 738 | } \ 739 | } \ 740 | } while (0) 741 | 742 | #define CDL_PREPEND_ELEM(head, el, add) \ 743 | do { \ 744 | assert(head != NULL); \ 745 | assert(el != NULL); \ 746 | assert(add != NULL); \ 747 | (add)->next = (el); \ 748 | (add)->prev = (el)->prev; \ 749 | (el)->prev = (add); \ 750 | (add)->prev->next = (add); \ 751 | if ((head) == (el)) { \ 752 | (head) = (add); \ 753 | } \ 754 | } while (0) \ 755 | 756 | #endif /* UTLIST_H */ 757 | 758 | --------------------------------------------------------------------------------