├── data ├── .gitkeep └── .gitignore ├── include ├── .gitkeep ├── disambiguation.h ├── cluster.h ├── macros.h ├── typedefs.h ├── txt2sqlite3.h ├── clusterhead.h ├── postprocess.h ├── strcmp95.h ├── disambiguate.h ├── threading.h ├── worker.h ├── utilities.h ├── comparators.h ├── exceptions.h └── newcluster.h ├── Makefile.am ├── doc ├── manual │ ├── images │ │ ├── .gitignore │ │ ├── disambiguation_workflow.png │ │ ├── disambiguation_data_structure.png │ │ └── Makefile │ ├── manual.pdf │ ├── DisambigManual.pdf │ └── Makefile └── .gitignore ├── src ├── breakpoints.gdb ├── .gitignore ├── txt2sqlite3.sh ├── testdb.sh ├── record_private.h ├── main.cpp ├── txt2sqlite3-test.txt ├── txt2sqlite3-test.final ├── makefile.citris ├── Makefile.am ├── string_manipulator.cpp └── threading.cpp ├── COPYING ├── INSTALL ├── test ├── testapp.cpp ├── reporting.cpp ├── data │ ├── final_jones.txt │ ├── berkeley │ │ ├── rare_names_berkeley.txt │ │ ├── prior_saved_6.txt │ │ ├── prior_saved_5.txt │ │ ├── prior_saved_4.txt │ │ ├── prior_saved_1.txt │ │ ├── prior_saved_3.txt │ │ ├── prior_saved_2.txt │ │ ├── final_berkeley.txt │ │ ├── tset05_1.txt │ │ ├── tset05_2.txt │ │ ├── tset05_3.txt │ │ ├── tset05_4.txt │ │ ├── tset05_5.txt │ │ └── tset05_6.txt │ ├── invpat_onerecord.txt │ └── final_eighteen.txt ├── reporting.h ├── runtime.log ├── testdata │ ├── invpat_onerecord.txt │ └── assignee_comparison.csv ├── testfake.cpp ├── makefile.orig ├── testdata.h ├── jarowinkler.data ├── testall.sh ├── unittest.cpp ├── .gitignore ├── test_readtext.cpp ├── fixtures │ ├── EngineConfig.txt │ └── BlockingConfig.txt ├── testutils.cpp ├── bashcolors.sh ├── test_abbreviation.cpp ├── fake.h ├── test_clusterhead.cpp ├── test_postprocess.cpp ├── testutils.h ├── integration.sh ├── integration.rb ├── test_fetch_records.cpp ├── test_compare.cpp ├── test_typedefs.cpp ├── test_misspell.cpp ├── test_training.cpp ├── test_rarenames.cpp ├── test_engine.cpp ├── test_ratios.cpp ├── test_jwcmp.cpp ├── test_coauthor.cpp ├── 
test_qp.cpp ├── Makefile.am ├── test_engineconfig.cpp ├── test_string_manipulator.cpp ├── fake.cpp ├── test_cluster.cpp ├── testdata.cpp ├── test_comparators.cpp ├── test_similarity.cpp ├── test_namecompare.cpp ├── test_blocking.cpp ├── test_strcmp95.cpp ├── test_assignee_comparison.cpp ├── test_clusterinfo.cpp └── test_record.cpp ├── .gitignore ├── config ├── engineresults.txt ├── engine.txt ├── engine2.txt ├── EngineConfig.txt ├── engine3.txt ├── engine18.txt ├── engine2011.txt ├── engine_jones.txt ├── engine2009.txt ├── engine2010.txt ├── blockingresults.txt ├── block.txt ├── blocking6.txt └── BlockingConfig.txt ├── readme.markdown ├── configure.ac └── makefile.orig /data/.gitkeep: -------------------------------------------------------------------------------- 1 | keep 2 | -------------------------------------------------------------------------------- /include/.gitkeep: -------------------------------------------------------------------------------- 1 | keep 2 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = src test 2 | -------------------------------------------------------------------------------- /doc/manual/images/.gitignore: -------------------------------------------------------------------------------- 1 | *.pdf 2 | -------------------------------------------------------------------------------- /src/breakpoints.gdb: -------------------------------------------------------------------------------- 1 | b main.cpp:9 2 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | /opt/local/share/automake-1.12/COPYING -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | 
/opt/local/share/automake-1.12/INSTALL -------------------------------------------------------------------------------- /test/testapp.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "unittest.h" 3 | 4 | 5 | -------------------------------------------------------------------------------- /test/reporting.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "reporting.h" 3 | 4 | 5 | -------------------------------------------------------------------------------- /doc/.gitignore: -------------------------------------------------------------------------------- 1 | html/ 2 | latex/ 3 | *.aux 4 | *.log 5 | *.dvi 6 | *~ 7 | -------------------------------------------------------------------------------- /test/data/final_jones.txt: -------------------------------------------------------------------------------- 1 | 09123095-2###0.999527###09123095-2,09241635-2, 2 | -------------------------------------------------------------------------------- /doc/manual/manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funginstitute/disambiguator/HEAD/doc/manual/manual.pdf -------------------------------------------------------------------------------- /test/reporting.h: -------------------------------------------------------------------------------- 1 | #ifndef PATENT_REPORTING_H 2 | #define PATENT_REPORTING_H 3 | 4 | #endif //PATENT_REPORTING_H 5 | -------------------------------------------------------------------------------- /doc/manual/DisambigManual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funginstitute/disambiguator/HEAD/doc/manual/DisambigManual.pdf -------------------------------------------------------------------------------- /test/runtime.log: 
-------------------------------------------------------------------------------- 1 | Reading Engine Configuration from engine.txt ... ... 2 | Engine configuration file engine.txt does not exist. 3 | -------------------------------------------------------------------------------- /doc/manual/images/disambiguation_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funginstitute/disambiguator/HEAD/doc/manual/images/disambiguation_workflow.png -------------------------------------------------------------------------------- /doc/manual/images/disambiguation_data_structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funginstitute/disambiguator/HEAD/doc/manual/images/disambiguation_data_structure.png -------------------------------------------------------------------------------- /doc/manual/Makefile: -------------------------------------------------------------------------------- 1 | # Checking to ensure Makefile is not ignored by global git options 2 | 3 | manual: manual.tex 4 | pdflatex manual.tex 5 | 6 | clean: 7 | rm -rf *.dvi *.log *.aux *~ 8 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | runtime.txt 2 | runtime.log 3 | txt2sqlite3 4 | runtime 5 | zardoz 6 | *.sqlite3 7 | *.log 8 | gmon* 9 | tmp 10 | lib*.a 11 | disambiguate 12 | .deps/ 13 | *.o 14 | Makefile 15 | Makefile.in 16 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | prior*.txt 2 | ratio*.txt 3 | stat_*.txt 4 | tset*.txt 5 | xset*.txt 6 | network*.txt 7 | newmatch*.txt 8 | postprocess*.txt 9 | final.txt.pplog 10 | final.txt 11 | match_cons.txt 12 | match_cons2.txt 13 | 
-------------------------------------------------------------------------------- /include/disambiguation.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef PATENT_DISAMBIGUATION_H 3 | #define PATENT_DISAMBIGUATION_H 4 | 5 | #include "typedefs.h" 6 | 7 | #define STRINGIZE_DETAIL(x) #x 8 | #define STRINGIZE(x) STRINGIZE_DETAIL(x) 9 | 10 | 11 | #endif /* PATENT_DISAMBIGUATION_H */ 12 | -------------------------------------------------------------------------------- /test/data/berkeley/rare_names_berkeley.txt: -------------------------------------------------------------------------------- 1 | Firstname: 2 | ADAM , BRIAN , DAVID E , DAVID M , DAVID , IKHLAQ S , IKHLAQ , JOHN WILLIAM , JOHN , JOHN , LEE O , MATTHEW , PATRICK O , PETER , 3 | Lastname: 4 | BEGUELIN , TOTTY , GOURLEY , SIDHU , SIDHU , PLEVYAK , HAINES , MATTIS , 5 | -------------------------------------------------------------------------------- /src/txt2sqlite3.sh: -------------------------------------------------------------------------------- 1 | 2 | # This will be a driver file for the txt2sqlite convertor 3 | 4 | # ./txt2sqlite3 /var/share/patentdata/disambiguation/test/18/txt2sqlite3.sqlite3 final /var/share/patentdata/disambiguation/test/18/final.txt UNIQUE_RECORD_COLUMN_NAME UNIQUE_INVENTOR_COLUMN_NAME 5 | 6 | -------------------------------------------------------------------------------- /test/data/invpat_onerecord.txt: -------------------------------------------------------------------------------- 1 | Firstname,Middlename,Lastname,Street,City,State,Country,Zipcode,Latitude,Longitude,Patent,ApplyYear,Assignee,AsgNum,Class,Coauthor,Unique_Record_ID 2 | PHILIP E,PHILIP E,DURAND,,HUDSON,MA,US,1749,42.388756,-71.557437,03858241,1974,UNITED STATES OF AMERICA ARMY,H000000000072,2,L.NORRIS-0,03858241-1 3 | -------------------------------------------------------------------------------- /test/testdata/invpat_onerecord.txt: 
-------------------------------------------------------------------------------- 1 | Firstname,Middlename,Lastname,Street,City,State,Country,Zipcode,Latitude,Longitude,Patent,ApplyYear,Assignee,AsgNum,Class,Coauthor,Unique_Record_ID 2 | PHILIP E,PHILIP E,DURAND,,HUDSON,MA,US,1749,42.388756,-71.557437,03858241,1974,UNITED STATES OF AMERICA ARMY,H000000000072,2,L.NORRIS-0,03858241-1 3 | -------------------------------------------------------------------------------- /doc/manual/images/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile added to git, overriding global ignore 2 | 3 | INKSCAPE = /Applications/Inkscape.app/Contents/Resources/bin/inkscape 4 | 5 | PDFS = algorithm.pdf 6 | 7 | pdfs: $(PDFS) 8 | 9 | %.pdf : %.svg 10 | $(INKSCAPE) --export-area-drawing -f $< --export-pdf $@ 11 | 12 | 13 | clean: 14 | rm -rf *~ *.pdf 15 | -------------------------------------------------------------------------------- /src/testdb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ./txt2sqlite3 test.sqlite3 finaltest txt2sqlite3-test.txt urcn uicn 4 | echo "testdb: Dumping test data from sqlite..." 5 | sqlite3 -separator ',' test.sqlite3 "select * from finaltest" > tmp 6 | echo "testdb: Diffing dumped with final test..." 
7 | diff tmp txt2sqlite3-test.final 8 | #rm tmp 9 | rm test.sqlite3 10 | -------------------------------------------------------------------------------- /test/testfake.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "fake.h" 3 | 4 | void 5 | testfake() { 6 | 7 | string filename = "./testdata/assignee_comparison.csv"; 8 | string title = "Fake testing..."; 9 | FakeTest * ft = new FakeTest(title, filename); 10 | ft->load_fake_data(filename); 11 | delete ft; 12 | } 13 | 14 | int 15 | main(int, char **) { 16 | 17 | testfake(); 18 | return 0; 19 | } 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tmp 2 | tmp/* 3 | ./*/tmp/* 4 | gmon.out 5 | *~ 6 | *.swp 7 | experiments/* 8 | stamp-h1 9 | install-sh 10 | autoscan.log 11 | autom4te.cache/ 12 | aclocal.m4 13 | missing 14 | depcomp 15 | config.h 16 | config.log 17 | config.status 18 | config.h.in 19 | Makefile 20 | Makefile.in 21 | .deps/ 22 | Rare*.txt 23 | prior*.txt 24 | *.o 25 | *.gch 26 | exedisambig 27 | txt2sqlite3 28 | .fuse* 29 | *.gdb 30 | configure 31 | -------------------------------------------------------------------------------- /src/record_private.h: -------------------------------------------------------------------------------- 1 | #ifndef PATENT_RECORD_PRIVATE_H 2 | #define PATENT_RECORD_PRIVATE_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | std::vector parse_column_names (std::string line); 10 | 11 | std::vector create_column_indices( 12 | std::vector requested_columns, 13 | std::vector total_col_names); 14 | 15 | #endif // PATENT_RECORD_PRIVATE_H 16 | -------------------------------------------------------------------------------- /test/makefile.orig: -------------------------------------------------------------------------------- 1 | 2 | INCLUDES="/opt/local/include" 3 | LIBS="/opt/local/lib" 4 | 5 | 
programs = test_readtext 6 | 7 | test: test_readtext.cpp 8 | g++ -I$(INCLUDES) -c test_readtext.cpp 9 | #g++ -o test_readtext -I$(INCLUDES) test_readtext.o ../DisambigEngine.o -L$(LIBS) -lboost_unit_test_framework 10 | g++ -o test_readtext -I$(INCLUDES) test_readtext.o -L$(LIBS) -lboost_unit_test_framework 11 | 12 | clean: 13 | rm -rf *~ *.o $(programs) 14 | -------------------------------------------------------------------------------- /test/testdata.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef PATENT_TEST_TESTDATA_H 3 | #define PATENT_TEST_TESTDATA_H 4 | 5 | #include // for RecordList 6 | #include 7 | #include 8 | 9 | Record * make_foobar_record(void); 10 | 11 | Record * make_quuxalot_record(void); 12 | 13 | RecordPList get_record_list(void); 14 | 15 | cBlocking_Operation_By_Coauthors get_blocker_coathor(void); 16 | 17 | vector get_column_names(void); 18 | 19 | #endif // PATENT_TEST_TESTDATA_H 20 | -------------------------------------------------------------------------------- /test/jarowinkler.data: -------------------------------------------------------------------------------- 1 | shackleford shackelford .9848 2 | cunningham cunnigham .9833 3 | campell campbell .9792 4 | nichleson nichulson .9630 5 | massey massie .9444 6 | abroms abrams .9333 7 | galloway calloway .9167 8 | lampley campley .9048 9 | dixon dickson .8533 10 | frederick fredric .9815 11 | michele michelle .9792 12 | jesse jessie .9722 13 | marhta martha .9667 14 | jonathon jonathan .9583 15 | julies juluis .9333 16 | jeraldine geraldine .9246 17 | yvette yevett .9111 18 | tanya tonya .8933 19 | dwayne duane .8578 20 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | int disambiguate_main(std::string & engineconf, std::string & blockconf); 8 | 9 | int 
10 | main(int argc, char ** argv) { 11 | 12 | if (argc < 3) { 13 | std::cout << "usage: disambiguate " << std::endl; 14 | exit(0); 15 | } 16 | 17 | std::string engineconf(argv[1]); 18 | std::string blockconf(argv[2]); 19 | 20 | return disambiguate_main(engineconf, blockconf); 21 | } 22 | -------------------------------------------------------------------------------- /include/cluster.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef PATENT_CLUSTER_H 3 | #define PATENT_CLUSTER_H 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | // TODO: Document why 3000 was chosen. 11 | #define LARGE_BLOCK_SIZE 3000 12 | 13 | // TODO: Why 40 rounds instead of 30 or 50? 14 | #define MAX_ROUNDS 40 15 | 16 | #include "attribute.h" 17 | #include "newcluster.h" 18 | 19 | 20 | //forward declaration 21 | class Record; 22 | class cRatios; 23 | #include "clusterinfo.h" 24 | 25 | 26 | 27 | #endif /* PATENT_CLUSTER_H */ 28 | -------------------------------------------------------------------------------- /include/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef PATENT_MACROS_H 2 | #define PATENT_MACROS_H 3 | 4 | // See https://github.com/ndim/freemcan/blob/f6348add31479b17ad7c7eec0acaf1f100843e84/include/compiler.h#L44 5 | /** Rename Unused Parameter */ 6 | #define _UP(x) unused_p__ ## x 7 | 8 | /** Mark Unused Parameter */ 9 | #define UP(x) _UP(x) __attribute__((unused)) 10 | 11 | /** Rename Unused Variable */ 12 | #define _UV(x) unused_v__ ## x 13 | 14 | /** Mark Unused Variable */ 15 | #define UV(x) _UV(x) __attribute__((unused)) 16 | 17 | 18 | #endif /* PATENT_MACROS_H */ 19 | -------------------------------------------------------------------------------- /test/testall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | source ./bashcolors.sh 4 | initializeANSI 5 | 6 | ./attribute 7 | ./engine 8 | ./jwcmp 9 | 
./ratios 10 | ./ratiocomponent 11 | ./blocking 12 | ./compare 13 | ./similarity 14 | ./assigneecomparison 15 | ./coauthor 16 | ./strcmp95 17 | ./stringmanipulator 18 | ./training 19 | ./clusterinfo 20 | ./cluster 21 | ./clusterhead 22 | ./fetchrecords 23 | ./misspell 24 | ./rarenames 25 | ./namecompare 26 | ./engineconfig 27 | ./record 28 | ./postprocess 29 | 30 | 31 | echo "Typedefs test" 32 | `./typedefs` 33 | echo "Abbreviations test" 34 | `./abbreviation` 35 | 36 | source ./integration.sh 37 | 38 | -------------------------------------------------------------------------------- /test/unittest.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "test_typedefs.h" 3 | #include "test_comparators.h" 4 | #include "test_attribute.h" 5 | #include "test_blocking.h" 6 | 7 | /** 8 | * Invoking program specific defines for adding main into 9 | * standalone files. This is handy when module or class can 10 | * be used as a standalone code or as a library. 
11 | */ 12 | /* 13 | #-D$*_STANDALONE 14 | 15 | $(objects): %.o: %.c 16 | $(CC) -c $(CFLAGS) $(INCLUDES) -D$*_STANDALONE $< 17 | #ifdef rose_STANDALONE 18 | */ 19 | 20 | void 21 | test_all() { 22 | 23 | } 24 | 25 | int 26 | main(int argc, char ** argv) { 27 | 28 | return 0; 29 | } 30 | -------------------------------------------------------------------------------- /test/.gitignore: -------------------------------------------------------------------------------- 1 | ## Unignoring for test purposes 2 | !prior*.txt 3 | 4 | ## Executables using cppunit 5 | postprocess 6 | testfake 7 | compare 8 | qp 9 | coauthor 10 | ratiocomponent 11 | fake 12 | clusterinfo 13 | assigneecomparison 14 | fetchrecords 15 | ratios 16 | cluster 17 | training 18 | engine 19 | clusterhead 20 | similarity 21 | jwcmp 22 | namecompare 23 | misspell 24 | abbreviation 25 | tmp 26 | engineconfig 27 | rarenames 28 | strcmp95 29 | comparesimilarities 30 | comparators 31 | stringmanipulator 32 | attribute 33 | blocking 34 | record 35 | typedefs 36 | idtest 37 | distest 38 | 39 | ## The usual suspects... 
40 | *.o 41 | a.out 42 | -------------------------------------------------------------------------------- /test/test_readtext.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Switch to ccpunit: 3 | * http://cppunit.sourceforge.net/doc/1.11.6/cppunit_cookbook.html 4 | */ 5 | #include 6 | using namespace boost::unit_test; 7 | 8 | void 9 | my_test_function() { 10 | BOOST_CHECK(1==1); 11 | } 12 | 13 | test_suite* 14 | init_unit_test_suite( int argc, char* argv[] ) { 15 | 16 | test_suite * test = BOOST_TEST_SUITE( "Master test suite" ); 17 | test->add( BOOST_TEST_CASE( &my_test_function ) ); 18 | return test; 19 | } 20 | 21 | 22 | int 23 | main(int argc, char ** argv) { 24 | 25 | init_unit_test_suite(argc, argv); 26 | return 0; 27 | } 28 | -------------------------------------------------------------------------------- /include/typedefs.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef PATENT_TYPEDEFS_H 3 | #define PATENT_TYPEDEFS_H 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | typedef std::vector Label; 11 | 12 | // SimilarityProfile will probably get moved back 13 | // to the ratios and smoothing code once that us 14 | // cleaned up and refactored more. 15 | typedef std::vector SimilarityProfile; 16 | 17 | //asgdetail consists of assignee number and its patent counts. 18 | typedef std::pair asgdetail; 19 | 20 | // Use these for dealing with SimilarityProfile counts. 
21 | typedef uint32_t sp_count_t; 22 | typedef sp_count_t m_count_t; 23 | typedef sp_count_t n_count_t; 24 | 25 | 26 | #endif /* PATENT_TYPEDEFS_H */ 27 | -------------------------------------------------------------------------------- /config/engineresults.txt: -------------------------------------------------------------------------------- 1 | //Configuration file for the Disambiguation Engine 2 | WORKING DIRECTORY = /var/share/patentdata/disambiguation/test/full 3 | ORIGINAL CSV FILE = /var/share/patentdata/patents/full/full.csv 4 | NUMBER OF THREADS = 2 5 | GENERATE STABLE TRAINING SETS = true 6 | USE AVAILABLE RATIOS DATABASE = true 7 | THRESHOLDS = 0.99, 0.95 8 | NECESSARY ATTRIBUTES = Firstname, Lastname, Street, City, State, Country, Zipcode, Lat, Lon, InvSeq, Patent, AppYearStr, AppDateStr, Assignee, AsgNum, Class, Invnum 9 | ADJUST PRIOR BY FREQUENCY = true 10 | DEBUG MODE = true 11 | NUMBER OF TRAINING PAIRS = 1000000 12 | STARTING ROUND = 1 13 | STARTING FILE = /media/HITACHI/backup_server/1tb/home/doolin/src/CPP_Disambiguation/data/match_cons2.txt 14 | POSTPROCESS AFTER EACH ROUND = true 15 | -------------------------------------------------------------------------------- /config/engine.txt: -------------------------------------------------------------------------------- 1 | //Configuration file for the Disambiguation Engine 2 | WORKING DIRECTORY = ../data 3 | ORIGINAL CSV FILE = /media/HITACHI/backup_server/1tb/home/doolin/src/CPP_Disambiguation/test/invpat2.txt 4 | NUMBER OF THREADS = 2 5 | GENERATE STABLE TRAINING SETS = true 6 | USE AVAILABLE RATIOS DATABASE = true 7 | THRESHOLDS = 0.99, 0.95 8 | NECESSARY ATTRIBUTES = Firstname, Lastname, Street, City, State, Country, Zipcode, Latitude, Longitude, InvSeq, Patent, AppDateStr, Assignee, AsgNum, Class, Coauthor, invnum_N 9 | ADJUST PRIOR BY FREQUENCY = true 10 | DEBUG MODE = false 11 | NUMBER OF TRAINING PAIRS = 1000000 12 | STARTING ROUND = 1 13 | STARTING FILE = 
/media/HITACHI/backup_server/1tb/home/doolin/src/CPP_Disambiguation/data/match_cons.txt 14 | POSTPROCESS AFTER EACH ROUND = true 15 | -------------------------------------------------------------------------------- /config/engine2.txt: -------------------------------------------------------------------------------- 1 | //Configuration file for the Disambiguation Engine 2 | WORKING DIRECTORY = ../data 3 | ORIGINAL CSV FILE = /media/HITACHI/backup_server/1tb/home/doolin/src/CPP_Disambiguation/test/invpat2.txt 4 | NUMBER OF THREADS = 2 5 | GENERATE STABLE TRAINING SETS = true 6 | USE AVAILABLE RATIOS DATABASE = true 7 | THRESHOLDS = 0.99, 0.95 8 | NECESSARY ATTRIBUTES = Firstname, Middlename, Lastname, Street, City, State, Country, Zipcode, Latitude, Longitude, Patent, ApplyYear, Assignee, AsgNum, Class, Coauthor, Unique_Record_ID 9 | ADJUST PRIOR BY FREQUENCY = true 10 | DEBUG MODE = false 11 | NUMBER OF TRAINING PAIRS = 1000000 12 | STARTING ROUND = 1 13 | STARTING FILE = /media/HITACHI/backup_server/1tb/home/doolin/src/CPP_Disambiguation/data/match_cons.txt 14 | POSTPROCESS AFTER EACH ROUND = true 15 | -------------------------------------------------------------------------------- /config/EngineConfig.txt: -------------------------------------------------------------------------------- 1 | //Configuration file for the Disambiguation Engine 2 | WORKING DIRECTORY = /media/data/edwardspace/workplace/testcpp/Disambiguation 3 | ORIGINAL CSV FILE = /media/data/edwardspace/workplace/testcpp/Disambiguation/invpat.txt 4 | NUMBER OF THREADS = 2 5 | GENERATE STABLE TRAINING SETS = true 6 | USE AVAILABLE RATIOS DATABASE = false 7 | THRESHOLDS = 0.99, 0.95 8 | NECESSARY ATTRIBUTES = Firstname, Lastname, Unique_Record_ID, Middlename, Longitude, Class, Latitude, Coauthor, Assignee, AsgNum, Country, Street, ApplyYear, City, Patent 9 | ADJUST PRIOR BY FREQUENCY = true 10 | DEBUG MODE = false 11 | NUMBER OF TRAINING PAIRS = 1000000 12 | STARTING ROUND = 1 13 | STARTING FILE = 
/media/data/edwardspace/workplace/testcpp/Disambiguation/match_cons.txt 14 | POSTPROCESS AFTER EACH ROUND = true 15 | -------------------------------------------------------------------------------- /config/engine3.txt: -------------------------------------------------------------------------------- 1 | //Configuration file for the Disambiguation Engine 2 | WORKING DIRECTORY = ../data 3 | ORIGINAL CSV FILE = /media/HITACHI/backup_server/1tb/home/doolin/src/CPP_Disambiguation/test/invpat_onerecord.txt 4 | NUMBER OF THREADS = 1 5 | GENERATE STABLE TRAINING SETS = true 6 | USE AVAILABLE RATIOS DATABASE = false 7 | THRESHOLDS = 0.99, 0.95 8 | NECESSARY ATTRIBUTES = Firstname, Middlename, Lastname, Street, City, State, Country, Zipcode, Latitude, Longitude, Patent, ApplyYear, Assignee, AsgNum, Class, Coauthor, Unique_Record_ID 9 | ADJUST PRIOR BY FREQUENCY = true 10 | DEBUG MODE = true 11 | NUMBER OF TRAINING PAIRS = 100 12 | STARTING ROUND = 1 13 | STARTING FILE = /media/HITACHI/backup_server/1tb/home/doolin/src/CPP_Disambiguation/data/match_cons2.txt 14 | POSTPROCESS AFTER EACH ROUND = true 15 | -------------------------------------------------------------------------------- /test/fixtures/EngineConfig.txt: -------------------------------------------------------------------------------- 1 | //Configuration file for the Disambiguation Engine 2 | WORKING DIRECTORY = /media/data/edwardspace/workplace/testcpp/Disambiguation 3 | ORIGINAL CSV FILE = /media/data/edwardspace/workplace/testcpp/Disambiguation/invpat.txt 4 | NUMBER OF THREADS = 2 5 | GENERATE STABLE TRAINING SETS = true 6 | USE AVAILABLE RATIOS DATABASE = false 7 | THRESHOLDS = 0.99, 0.95 8 | NECESSARY ATTRIBUTES = Firstname, Lastname, Unique_Record_ID, Middlename, Longitude, Class, Latitude, Coauthor, Assignee, AsgNum, Country, Street, ApplyYear, City, Patent 9 | ADJUST PRIOR BY FREQUENCY = true 10 | DEBUG MODE = false 11 | NUMBER OF TRAINING PAIRS = 1000000 12 | STARTING ROUND = 1 13 | STARTING FILE = 
/media/data/edwardspace/workplace/testcpp/Disambiguation/match_cons.txt 14 | POSTPROCESS AFTER EACH ROUND = true 15 | -------------------------------------------------------------------------------- /config/engine18.txt: -------------------------------------------------------------------------------- 1 | //Configuration file for the Disambiguation Engine 2 | WORKING DIRECTORY = /var/share/patentdata/disambiguation/test/18 3 | ORIGINAL CSV FILE = /var/share/patentdata/patents/test/18/invpat.csv 4 | NUMBER OF THREADS = 2 5 | GENERATE STABLE TRAINING SETS = true 6 | USE AVAILABLE RATIOS DATABASE = false 7 | THRESHOLDS = 0.99, 0.95 8 | NECESSARY ATTRIBUTES = Firstname, Middlename, Lastname, Street, City, State, Country, Zipcode, Latitude, Longitude, InvSeq, Patent, AppYear, ApplyYear, GYear, AppDate, Assignee, AsgNum, Class, Coauthor, Invnum, Invnum_N, Unique_Record_ID 9 | ADJUST PRIOR BY FREQUENCY = true 10 | DEBUG MODE = true 11 | NUMBER OF TRAINING PAIRS = 1000000 12 | STARTING ROUND = 1 13 | STARTING FILE = /home/doolin/src/disambiguator/data/match_cons2.txt 14 | POSTPROCESS AFTER EACH ROUND = true 15 | -------------------------------------------------------------------------------- /config/engine2011.txt: -------------------------------------------------------------------------------- 1 | //Configuration file for the Disambiguation Engine 2 | WORKING DIRECTORY = ../data 3 | ORIGINAL CSV FILE = /media/HITACHI/backup_server/1tb/home/doolin/src/Python-Scripts/invpat2011.csv 4 | NUMBER OF THREADS = 2 5 | GENERATE STABLE TRAINING SETS = true 6 | USE AVAILABLE RATIOS DATABASE = true 7 | THRESHOLDS = 0.99, 0.95 8 | NECESSARY ATTRIBUTES = Firstname, Middlename, Lastname, Street, City, State, Country, Zipcode, Latitude, Longitude, InvSeq, Patent, AppYear, ApplyYear, GYear, AppDate, Assignee, AsgNum, Class, Coauthor, Invnum, Invnum_N, Unique_Record_ID 9 | ADJUST PRIOR BY FREQUENCY = true 10 | DEBUG MODE = true 11 | NUMBER OF TRAINING PAIRS = 1000000 12 | STARTING ROUND = 1 
13 | STARTING FILE = /media/HITACHI/backup_server/1tb/home/doolin/src/CPP_Disambiguation/data/match_cons2.txt 14 | POSTPROCESS AFTER EACH ROUND = false 15 | -------------------------------------------------------------------------------- /config/engine_jones.txt: -------------------------------------------------------------------------------- 1 | //Configuration file for the Disambiguation Engine 2 | WORKING DIRECTORY = /var/share/patentdata/disambiguation/test/synthetic/jones 3 | ORIGINAL CSV FILE = /var/share/patentdata/patents/test/synthetic/jones/jones3.csv 4 | NUMBER OF THREADS = 4 5 | GENERATE STABLE TRAINING SETS = true 6 | USE AVAILABLE RATIOS DATABASE = true 7 | THRESHOLDS = 0.99, 0.95 8 | NECESSARY ATTRIBUTES = Firstname, Middlename, Lastname, Street, City, State, Country, Zipcode, Latitude, Longitude, InvSeq, Patent, AppYear, ApplyYear, GYear, AppDate, Assignee, AsgNum, Class, Coauthor, Invnum, Invnum_N, Unique_Record_ID 9 | ADJUST PRIOR BY FREQUENCY = true 10 | DEBUG MODE = true 11 | NUMBER OF TRAINING PAIRS = 10000000 12 | STARTING ROUND = 1 13 | STARTING FILE = /home/doolin/src/disambiguator/data/match_cons2.txt 14 | POSTPROCESS AFTER EACH ROUND = true 15 | -------------------------------------------------------------------------------- /config/engine2009.txt: -------------------------------------------------------------------------------- 1 | //Configuration file for the Disambiguation Engine 2 | WORKING DIRECTORY = /var/share/patentdata/disambiguation/data2009 3 | ORIGINAL CSV FILE = /var/share/patentdata/patents/2009/invpat.csv 4 | NUMBER OF THREADS = 2 5 | GENERATE STABLE TRAINING SETS = true 6 | USE AVAILABLE RATIOS DATABASE = false 7 | THRESHOLDS = 0.99, 0.95 8 | NECESSARY ATTRIBUTES = Firstname, Middlename, Lastname, Street, City, State, Country, Zipcode, Latitude, Longitude, InvSeq, Patent, AppYear, ApplyYear, GYear, AppDate, Assignee, AsgNum, Class, Coauthor, Invnum, Invnum_N, Unique_Record_ID 9 | ADJUST PRIOR BY FREQUENCY = true 10 | DEBUG 
MODE = false 11 | NUMBER OF TRAINING PAIRS = 1000000 12 | STARTING ROUND = 1 13 | STARTING FILE = /media/HITACHI/backup_server/1tb/home/doolin/src/CPP_Disambiguation/data/match_cons2.txt 14 | POSTPROCESS AFTER EACH ROUND = true 15 | -------------------------------------------------------------------------------- /config/engine2010.txt: -------------------------------------------------------------------------------- 1 | //Configuration file for the Disambiguation Engine 2 | WORKING DIRECTORY = /var/share/patentdata/disambiguation/data2010 3 | ORIGINAL CSV FILE = /var/share/patentdata/patents/2010/invpat.csv 4 | NUMBER OF THREADS = 2 5 | GENERATE STABLE TRAINING SETS = true 6 | USE AVAILABLE RATIOS DATABASE = true 7 | THRESHOLDS = 0.99, 0.95 8 | NECESSARY ATTRIBUTES = Firstname, Middlename, Lastname, Street, City, State, Country, Zipcode, Latitude, Longitude, InvSeq, Patent, AppYear, ApplyYear, GYear, AppDate, Assignee, AsgNum, Class, Coauthor, Invnum, Invnum_N, Unique_Record_ID 9 | ADJUST PRIOR BY FREQUENCY = true 10 | DEBUG MODE = true 11 | NUMBER OF TRAINING PAIRS = 1000000 12 | STARTING ROUND = 1 13 | STARTING FILE = /media/HITACHI/backup_server/1tb/home/doolin/src/CPP_Disambiguation/data/match_cons2.txt 14 | POSTPROCESS AFTER EACH ROUND = false 15 | -------------------------------------------------------------------------------- /src/txt2sqlite3-test.txt: -------------------------------------------------------------------------------- 1 | 07949850-2###1###07949850-2, 2 | D0643978-0###1###D0643978-0, 3 | 07883819-0###1###07883819-0, 4 | 07955754-0###1###07955754-0,07972449-0,08007958-0,08017280-0,08029870-2,08084164-0, 5 | 07883730-1###1###07883730-1, 6 | 07983719-0###1###07983719-0,08050241-0, 7 | 07991595-1###1###07991595-1, 8 | 07877123-0###1###07877123-0,08060162-0, 9 | 08057588-0###1###08057588-0, 10 | 07938557-3###1###07938557-3, 11 | 07996768-0###1###07996768-0,07925988-0, 12 | 07881234-0###1###07881234-0,07978937-0,07991128-0, 13 | 
07913704-0###1###07913704-0, 14 | 07990994-1###1###07990994-1, 15 | 08071644-0###1###08071644-0,08080537-0, 16 | 07881028-0###1###07881028-0,07956671-0,07968908-0, 17 | 08029491-0###1###08029491-0, 18 | 07980419-1###1###07980419-1,08016095-2, 19 | 07866506-6###1###07866506-6,D0639322-6,08056760-2, 20 | 07867975-2###1###07867975-2, 21 | -------------------------------------------------------------------------------- /include/txt2sqlite3.h: -------------------------------------------------------------------------------- 1 | /* 2 | * txt2sqlite3.h 3 | * 4 | * Created on: Mar 25, 2011 5 | * Author: ysun 6 | */ 7 | 8 | #ifndef TXT2SQLITE3_H_ 9 | #define TXT2SQLITE3_H_ 10 | 11 | #ifdef __cplusplus 12 | extern "C" { 13 | #include 14 | } 15 | #endif 16 | 17 | #include 18 | using std::string; 19 | 20 | bool stepwise_dump ( const char * sqlite3_target, const char * tablename, const char * txt_source, const string & unique_record_name, const string & unique_inventor_name); 21 | bool dump_match ( const char * sqlite3_target, const char * tablename, const char * txt_source, const string & unique_record_name, const string & unique_inventor_name); 22 | bool stepwise_add_column ( const char * sqlite3_target, const char * tablename, const char * txt_source, const string & unique_record_name, const string & unique_inventor_name); 23 | 24 | 25 | 26 | 27 | 28 | 29 | #endif /* TXT2SQLITE3_H_ */ 30 | -------------------------------------------------------------------------------- /test/data/final_eighteen.txt: -------------------------------------------------------------------------------- 1 | D0656309-1###1###D0656309-1, 2 | D0656299-0###1###D0656299-0, 3 | D0656303-1###1###D0656303-1, 4 | D0656313-3###1###D0656313-3, 5 | D0656296-1###1###D0656296-1, 6 | D0656300-0###1###D0656300-0, 7 | D0656311-0###1###D0656311-0, 8 | D0656304-0###1###D0656304-0, 9 | D0656298-0###1###D0656298-0, 10 | D0656306-0###1###D0656306-0, 11 | D0656313-0###1###D0656313-0, 12 | D0656307-0###1###D0656307-0, 13 | 
D0656310-1###1###D0656310-1, 14 | D0656305-0###1###D0656305-0, 15 | D0656303-0###1###D0656303-0, 16 | D0656308-0###1###D0656308-0, 17 | D0656297-0###1###D0656297-0, 18 | D0656301-0###1###D0656301-0, 19 | D0656296-2###1###D0656296-2, 20 | D0656296-0###1###D0656296-0, 21 | D0656309-0###1###D0656309-0, 22 | D0656313-1###1###D0656313-1, 23 | D0656307-2###1###D0656307-2, 24 | D0656307-1###1###D0656307-1, 25 | D0656313-2###1###D0656313-2, 26 | D0656310-0###1###D0656310-0, 27 | D0656312-0###1###D0656312-0, 28 | D0656302-0###1###D0656302-0, 29 | -------------------------------------------------------------------------------- /src/txt2sqlite3-test.final: -------------------------------------------------------------------------------- 1 | 07866506-6,07866506-6 2 | 07867975-2,07867975-2 3 | 07877123-0,07877123-0 4 | 07881028-0,07881028-0 5 | 07881234-0,07881234-0 6 | 07883730-1,07883730-1 7 | 07883819-0,07883819-0 8 | 07913704-0,07913704-0 9 | 07925988-0,07996768-0 10 | 07938557-3,07938557-3 11 | 07949850-2,07949850-2 12 | 07955754-0,07955754-0 13 | 07956671-0,07881028-0 14 | 07968908-0,07881028-0 15 | 07972449-0,07955754-0 16 | 07978937-0,07881234-0 17 | 07980419-1,07980419-1 18 | 07983719-0,07983719-0 19 | 07990994-1,07990994-1 20 | 07991128-0,07881234-0 21 | 07991595-1,07991595-1 22 | 07996768-0,07996768-0 23 | 08007958-0,07955754-0 24 | 08016095-2,07980419-1 25 | 08017280-0,07955754-0 26 | 08029491-0,08029491-0 27 | 08029870-2,07955754-0 28 | 08050241-0,07983719-0 29 | 08056760-2,07866506-6 30 | 08057588-0,08057588-0 31 | 08060162-0,07877123-0 32 | 08071644-0,08071644-0 33 | 08080537-0,08071644-0 34 | 08084164-0,07955754-0 35 | D0639322-6,07866506-6 36 | D0643978-0,D0643978-0 37 | -------------------------------------------------------------------------------- /test/testutils.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "colortest.h" 6 | #include "testutils.h" 7 | 8 | using 
std::string; 9 | /** Writes indent, COLOR, description and COLOR_RESET to std::cout, followed by std::endl (newline plus flush). */ 10 | void 11 | describe(const char * indent, const char * description, const char * COLOR) { 12 | 13 | std::cout << indent << COLOR << description << COLOR_RESET 14 | << std::endl; 15 | } 16 | /** Prints description through describe() using DESCCOLOR. */ 17 | void 18 | describe_test(const char * indent, const char * description) { 19 | describe(indent, description, DESCCOLOR); 20 | } 21 | /** Prints description prefixed with "(PENDING) " through describe() using PENDINGCOLOR. */ 22 | void 23 | describe_pending(const char * indent, const char * description) { 24 | string pending("(PENDING) "); 25 | string d(description); 26 | pending = pending + d; 27 | describe(indent, pending.c_str() , PENDINGCOLOR); 28 | } 29 | 30 | /** Prints description through describe() using PASSCOLOR. */ 31 | void 32 | describe_pass(const char * indent, const char * description) { 33 | describe(indent, description, PASSCOLOR); 34 | } 35 | 36 | /** Prints description through describe() using FAILCOLOR. */ 37 | void 38 | describe_fail(const char * indent, const char * description) { 39 | describe(indent, description, FAILCOLOR); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /test/bashcolors.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # http://code.google.com/p/shunit2/source/browse/trunk/source/2.1/src/shunit2 4 | # return if shunit already loaded 5 | #[ -n "${SHUNIT_VERSION:-}" ] && exit 0 6 | #SHUNIT_VERSION='2.1.6' 7 | 8 | 9 | # These colors define how the output to the testing shell 10 | # scripts work.
11 | initializeANSI() 12 | { # Defines global variables holding ANSI SGR sequences: *f = foreground colours, *b = background colours, *on/*off = text attributes, reset = clear all. 13 | esc=$'\033' 14 | # ESC (0x1B) must start every sequence below; with an empty esc the codes would print as literal text such as "[31m". 15 | blackf="${esc}[30m"; redf="${esc}[31m"; greenf="${esc}[32m" 16 | yellowf="${esc}[33m" bluef="${esc}[34m"; purplef="${esc}[35m" 17 | cyanf="${esc}[36m"; whitef="${esc}[37m" 18 | # 256-colour palette entries (38;05;N). 19 | orangeyf="${esc}[38;05;214m"; 20 | yellowyf="${esc}[38;05;150m"; 21 | 22 | blackb="${esc}[40m"; redb="${esc}[41m"; greenb="${esc}[42m" 23 | yellowb="${esc}[43m" blueb="${esc}[44m"; purpleb="${esc}[45m" 24 | cyanb="${esc}[46m"; whiteb="${esc}[47m"; 25 | 26 | 27 | boldon="${esc}[1m"; boldoff="${esc}[22m" 28 | italicson="${esc}[3m"; italicsoff="${esc}[23m" 29 | ulon="${esc}[4m"; uloff="${esc}[24m" 30 | invon="${esc}[7m"; invoff="${esc}[27m" 31 | 32 | reset="${esc}[0m" 33 | } 34 | 35 | -------------------------------------------------------------------------------- /test/data/berkeley/prior_saved_6.txt: -------------------------------------------------------------------------------- 1 | D####CUL## : 0.223144, 2 | J####BEA## : 0, 3 | K####ARN## : 5.10826e-07, 4 | K####UDE## : 5.10826e-07, 5 | D####CUT## : 0.223144, 6 | A####BEG## : 0.916291, 7 | J####AAR## : 0, 8 | G####KAN## : 0.916291, 9 | B####TOT## : 0.95, 10 | L####MUL## : 2.23144e-07, 11 | L####STE## : 2.23144e-07, 12 | J####PLE## : 0, 13 | L####FLE## : 2.23144e-07, 14 | L####FAB## : 2.23144e-07, 15 | K####PIS## : 0.306495, 16 | W####KAI## : 1.60944e-06, 17 | N####SIT## : 9.16291e-07, 18 | D####GOU## : 0.223144, 19 | R####LAF## : 2.23144e-07, 20 | J####PIN## : 0, 21 | O####STA## : 1.60944e-06, 22 | S####WAT## : 0.95, 23 | T####CAM## : 9.16291e-07, 24 | A####ARV## : 9.16291e-07, 25 | G####PAP## : 9.16291e-07, 26 | D####DOO## : 0.223144, 27 | P####NEL## : 5.10826e-07, 28 | M####HAI## : 0.95, 29 | T####SHA## : 0.916291, 30 | Y####ZAT## : 9.16291e-07, 31 | Y####FAY## : 0.916291, 32 | R####SCH## : 2.23144e-07, 33 | R####SHE## : 2.23144e-07, 34 | J####HUN## : 0, 35 | N####TRE## : 9.16291e-07, 36 | I####SID## : 0.95, 37 | P####BRO## : 0.510826, 38 | R####CON## : 2.23144e-07, 39 | P####MAT## :
0.510826, 40 | -------------------------------------------------------------------------------- /config/blockingresults.txt: -------------------------------------------------------------------------------- 1 | //Configuration file for the Disambiguation Blocking Mechanism 2 | //ATTENTION: LOOK AT get_prior_value function BEFORE YOU WANT TO CHANGE THE FORMAT. 3 | //format: 4 | //attribute name: data index : parameters of string manipulation. 5 | //parameters of string manipulation for current use: starting position, number of characters to extract, direction. 6 | [ Round 1 ] 7 | Firstname: 1 : 0,0 , true 8 | Middlename: 1 : 0 , 0 , false 9 | Lastname: 0: 0, 0, true 10 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Longitude 11 | 12 | [ Round 2 ] 13 | Firstname: 1 : 0 , 3 , true 14 | Middlename: 1 : 0 , 0 , false 15 | Lastname: 0 : 0, 5, true 16 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 17 | 18 | [ Round 3 ] 19 | Firstname: 1 : 0, 1, true 20 | Middlename: 1 : 0 , 0 , false 21 | Lastname: 0: 0, 5, true 22 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 23 | 24 | [ Round 4 ] 25 | Firstname: 1: 0, 1, true 26 | Middlename: 1 : 0 , 0 , false 27 | Lastname: 0: 0, 3, true 28 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 29 | -------------------------------------------------------------------------------- /include/clusterhead.h: -------------------------------------------------------------------------------- 1 | #ifndef PATENT_CLUSTERHEAD_H 2 | #define PATENT_CLUSTERHEAD_H 3 | 4 | /** 5 | * ClusterHead: 6 | * This class contains two pieces of information about 7 | * a cluster: its delegate and its cohesion. 8 | */ 9 | class ClusterHead { 10 | 11 | public: 12 | 13 | /** const Record * m_delegate: the delegate (representative) of a cluster. 14 | * Usually this pointer contains the most frequently occurring information. 
15 | */ 16 | const Record * m_delegate; 17 | 18 | /** 19 | * double m_cohesion: the cohesion of a cluster, meaning the probability 20 | * for the members of the cluster to be of the same inventor. 21 | */ 22 | double m_cohesion; 23 | 24 | /** 25 | * ClusterHead(const Record * const p, const double c): constructor 26 | */ 27 | ClusterHead(const Record * const representative, const double cohesion) 28 | : m_delegate(representative), m_cohesion(cohesion) {}; 29 | 30 | /** 31 | * ClusterHead ( const ClusterHead & rhs): copy constructor 32 | */ 33 | ClusterHead (const ClusterHead & rhs) 34 | : m_delegate(rhs.m_delegate), m_cohesion(rhs.m_cohesion) {} 35 | }; 36 | 37 | #endif /* PATENT_CLUSTERHEAD_H */ 38 | -------------------------------------------------------------------------------- /test/data/berkeley/prior_saved_5.txt: -------------------------------------------------------------------------------- 1 | J####HUNT## : 0, 2 | G####KAN## : 0.916291, 3 | D####CULLE## : 0.223144, 4 | J####PINKE## : 0, 5 | J####BEATT## : 0, 6 | A####BEGUE## : 0.916291, 7 | G####PAPAD## : 9.16291e-07, 8 | D####DOOLI## : 0.223144, 9 | D####CUTTI## : 0.223144, 10 | K####UDELL## : 5.10826e-07, 11 | J####AARON## : 0, 12 | D####GOURL## : 0.223144, 13 | B####TOTTY## : 0.95, 14 | J####PLEVY## : 0, 15 | L####FABIN## : 2.23144e-07, 16 | L####FLEMI## : 2.23144e-07, 17 | K####PISTE## : 0.306495, 18 | R####SHEAR## : 2.23144e-07, 19 | R####LAFET## : 2.23144e-07, 20 | T####CAMAR## : 9.16291e-07, 21 | L####MULLE## : 2.23144e-07, 22 | N####SITAR## : 9.16291e-07, 23 | K####ARNET## : 5.10826e-07, 24 | M####HAINE## : 0.95, 25 | R####SCHNE## : 2.23144e-07, 26 | Y####FAYBI## : 0.916291, 27 | W####KAISE## : 1.60944e-06, 28 | S####WATER## : 0.95, 29 | A####ARVIN## : 9.16291e-07, 30 | P####NELSO## : 5.10826e-07, 31 | N####TREUH## : 9.16291e-07, 32 | O####STAFS## : 1.60944e-06, 33 | P####MATTI## : 0.510826, 34 | Y####ZATS## : 9.16291e-07, 35 | T####SHALO## : 0.916291, 36 | I####SIDHU## : 0.95, 37 | L####STEWA## : 
2.23144e-07, 38 | R####CONAN## : 2.23144e-07, 39 | P####BROWN## : 0.510826, 40 | -------------------------------------------------------------------------------- /test/test_abbreviation.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | 10 | #include "comparators.h" 11 | 12 | /** 13 | * Run this code with /usr/bin/valgrind ./abbreviation --leak-check=full 14 | */ 15 | 16 | void 17 | test_is_abbrev() { 18 | 19 | const char * s1 = "abbreviation"; 20 | const char * s2 = "abbr"; 21 | CPPUNIT_ASSERT (4 == is_abbreviation(s1,s2)); 22 | } 23 | 24 | /** 25 | * This tests the implementation, which passes, with 26 | * the caveat that it doesn't implement an algorithm 27 | * conforming to any standard, accepted definition of 28 | * an abbreviation. This might could be called something 29 | * else, like "custom_stemming" or something. 30 | */ 31 | void 32 | test_is_not_abbrev() { 33 | 34 | const char * s1 = "abbreviation"; 35 | const char * s2 = "abbr."; 36 | CPPUNIT_ASSERT (0 == is_abbreviation(s1,s2)); 37 | } 38 | 39 | void 40 | test_abbreviations() { 41 | test_is_abbrev(); 42 | test_is_not_abbrev(); 43 | } 44 | 45 | int 46 | main(int, char **) { 47 | 48 | test_abbreviations(); 49 | return 0; 50 | } 51 | 52 | -------------------------------------------------------------------------------- /test/fake.h: -------------------------------------------------------------------------------- 1 | #ifndef PATENT_FAKE_H 2 | #define PATENT_FAKE_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | 15 | class FakeTest : public CppUnit::TestCase { 16 | 17 | private: 18 | 19 | list source; 20 | vector requested_columns; 21 | RecordPList record_pointers; 22 | // Accessory container for unit testing, not 23 | // present in the disambiguation code. 
24 | vector rpv; 25 | string csvfilename; 26 | map uid_dict; 27 | const cBlocking_Operation_By_Coauthors * coauthor_blocking; 28 | 29 | public: 30 | 31 | FakeTest(string name, string filename); 32 | 33 | ~FakeTest() {} 34 | 35 | void load_fake_data(string csvfilename); 36 | 37 | RecordIndex * get_uid_dict(); 38 | 39 | RecordPList get_recpointers(); 40 | 41 | vector get_recvecs(); 42 | 43 | list get_all_records(); 44 | 45 | const cBlocking_Operation_By_Coauthors * get_coauthor_blocking(); 46 | 47 | void runTest() { 48 | load_fake_data(csvfilename); 49 | } 50 | }; 51 | 52 | 53 | #endif //PATENT_FAKE_H 54 | -------------------------------------------------------------------------------- /test/data/berkeley/prior_saved_4.txt: -------------------------------------------------------------------------------- 1 | KEN####ARNET## : 4.05465e-07, 2 | JOH####AARON## : 0, 3 | JAM####PINKE## : 4.05465e-07, 4 | DOU####CUTTI## : 0.95, 5 | JAM####HUNT## : 4.05465e-07, 6 | ADA####BEGUE## : 0.95, 7 | JOH####BEATT## : 0, 8 | BRI####TOTTY## : 0.95, 9 | GEN####KAN## : 0.95, 10 | GRE####PAPAD## : 1.09861e-06, 11 | ARV####ARVIN## : 1.09861e-06, 12 | DAV####CULLE## : 0, 13 | KRI####PISTE## : 0.659167, 14 | JOH####PLEVY## : 0, 15 | LAR####FABIN## : 1.09861e-06, 16 | ROB####CONAN## : 4.05465e-07, 17 | PET####MATTI## : 0.95, 18 | DAV####DOOLI## : 0, 19 | NOA####TREUH## : 1.09861e-06, 20 | MAT####HAINE## : 0.95, 21 | THO####CAMAR## : 1.09861e-06, 22 | YUR####ZATS## : 1.09861e-06, 23 | STE####WATER## : 0.95, 24 | WIL####KAISE## : 1.09861e-06, 25 | TID####SHALO## : 0.95, 26 | YAR####FAYBI## : 0.95, 27 | DAV####GOURL## : 0, 28 | LLO####STEWA## : 1.09861e-06, 29 | LIL####MULLE## : 1.09861e-06, 30 | NIC####SITAR## : 1.09861e-06, 31 | KEN####UDELL## : 4.05465e-07, 32 | PHY####NELSO## : 1.09861e-06, 33 | PAT####BROWN## : 0.95, 34 | ROB####SHEAR## : 4.05465e-07, 35 | ROS####LAFET## : 1.09861e-06, 36 | RIC####SCHNE## : 1.09861e-06, 37 | IKH####SIDHU## : 0.95, 38 | LEE####FLEMI## : 1.09861e-06, 39 | 
OSC####STAFS## : 1.09861e-06, 40 | -------------------------------------------------------------------------------- /include/postprocess.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef PATENT_POSTPROCESS_H 3 | #define PATENT_POSTPROCESS_H 4 | 5 | #include "newcluster.h" 6 | 7 | 8 | typedef list < Cluster > Cluster_Container; 9 | /** ClusterSet: owns a Cluster_Container (a list of Cluster) and exposes it through a const and a mutable accessor. */ 10 | class ClusterSet { 11 | /** The copy constructor is declared private with no definition in this header, so outside code cannot copy-construct a ClusterSet. NOTE(review): copy assignment is not disabled the same way - confirm whether that is intended. */ 12 | private: 13 | Cluster_Container consolidated; 14 | ClusterSet (const ClusterSet &); 15 | 16 | public: 17 | 18 | //ClusterSet & convert_from_ClusterInfo(const Cluster_Info *); 19 | /** Read-only access to the stored clusters. */ 20 | const Cluster_Container & get_set() const { 21 | return consolidated; 22 | } 23 | /** Mutable access to the stored clusters; callers may add, remove or edit entries through the returned reference. */ 24 | Cluster_Container & get_modifiable_set() { 25 | return consolidated; 26 | } 27 | /** Creates a set whose container is default-constructed (empty). */ 28 | ClusterSet () {} 29 | 30 | ~ClusterSet() {} 31 | /** Declared here, defined elsewhere; the argument is presumably the output file name - confirm against the definition. */ 32 | void output_results (const char *) const; 33 | /** Declared here, defined elsewhere; takes a file name and a uid lookup map. */ 34 | void read_from_file (const char * filename, 35 | const map & uid_tree); 36 | }; 37 | 38 | /** post_polish: declared here, defined elsewhere. Receives the cluster set and the uid-to-uinv tree by mutable reference, the patent tree and a log file name by const reference. */ 39 | void post_polish (ClusterSet & m, 40 | Uid2UinvTree & uid2uinv, 41 | //map < const Record *, const Record *> & uid2uinv, 42 | const PatentTree & patent_tree, 43 | //const map < const Record *, RecordPList, cSort_by_attrib > & patent_tree, 44 | const string & logfile); 45 | 46 | 47 | #endif /* PATENT_POSTPROCESS_H */ 48 | -------------------------------------------------------------------------------- /include/strcmp95.h: -------------------------------------------------------------------------------- 1 | /* The strcmp95 function returns a double precision value from 0.0 (total 2 | disagreement) to 1.0 (character-by-character agreement). The returned 3 | value is a measure of the similarity of the two strings. */ 4 | 5 | /* Date of Release: Jan. 26, 1994 */ 6 | /* Modified: April 24, 1994 Corrected the processing of the single length 7 | character strings.
8 | Authors: This function was written using the logic from code written by 9 | Bill Winkler, George McLaughlin and Matt Jaro with modifications 10 | by Maureen Lynch. 11 | Comment: This is the official string comparator to be used for matching 12 | during the 1995 Test Census. */ 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | #define NOTNUM(c) ((c>57) || (c<48)) 19 | #define INRANGE(c) ((c>0) && (c<91)) 20 | #define MAX_VAR_SIZE 1024 21 | #define NULL60 " " 22 | 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #endif 26 | 27 | //double strcmp95(char *, char *, long, int **); 28 | double strcmp95_modified (const char *ying, const char *yang); 29 | 30 | #ifdef __cplusplus 31 | } 32 | #endif 33 | -------------------------------------------------------------------------------- /test/data/berkeley/prior_saved_1.txt: -------------------------------------------------------------------------------- 1 | ARVIND####ARVIND## : 1.09861e-06, 2 | DAVID####GOURLEY## : 0, 3 | ADAM####BEGUELIN## : 0.95, 4 | DAVID####CULLER## : 0, 5 | DAVID####DOOLIN## : 0, 6 | DOUGLASS####CUTTING## : 0.95, 7 | GREGORY####PAPADOPOULOS## : 1.09861e-06, 8 | BRIAN####TOTTY## : 0.95, 9 | JOHN####AARONIII## : 0, 10 | JAMES####PINKERTON## : 4.05465e-07, 11 | IKHLAQ####SIDHU## : 0.417233, 12 | JAMES####HUNT## : 4.05465e-07, 13 | GENE####KAN## : 0.95, 14 | JOHN####BEATTY## : 0, 15 | KENT####UDELL## : 1.09861e-06, 16 | KENNETH####ARNETT## : 1.09861e-06, 17 | JOHN####PLEVYAK## : 0, 18 | KRISTOFER####PISTER## : 0.109861, 19 | LARRY####FABINY## : 1.09861e-06, 20 | LEE####FLEMING## : 1.09861e-06, 21 | LILAC####MULLER## : 1.09861e-06, 22 | LLOYD####STEWARTJR## : 1.09861e-06, 23 | NICHOLAS####SITAR## : 1.09861e-06, 24 | NOAH####TREUHAFT## : 1.09861e-06, 25 | MATTHEW####HAINES## : 0.784723, 26 | OSCAR####STAFSUDD## : 1.09861e-06, 27 | RICHARD####SCHNEIDER## : 1.09861e-06, 28 | ROB####CONANT## : 1.09861e-06, 29 | PATRICK####BROWN## : 0.95, 30 | PHYLLIS####NELSON## : 1.09861e-06, 31 | PETER####MATTIS## : 
0.95, 32 | ROBERT####SHEAR## : 1.09861e-06, 33 | ROSS####LAFETRA## : 1.09861e-06, 34 | THOMAS####CAMARDA## : 1.09861e-06, 35 | STEVE####WATERHOUSE## : 0.95, 36 | WILLIAM####KAISER## : 1.09861e-06, 37 | TIDHAR####SHALON## : 0.439445, 38 | YURI####ZATS## : 1.09861e-06, 39 | YAROSLAV####FAYBISHENKO## : 0.95, 40 | -------------------------------------------------------------------------------- /test/test_clusterhead.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | #include "testdata.h" 8 | #include "testutils.h" 9 | 10 | class ClusterHeadTest : public CppUnit::TestCase { 11 | 12 | public: 13 | ClusterHeadTest(std::string name) : CppUnit::TestCase(name) { 14 | 15 | describe_test(INDENT0, name.c_str()); 16 | } 17 | 18 | void create_clusterhead() { 19 | 20 | Spec spec; 21 | 22 | Record * representative = make_foobar_record(); 23 | spec.it("Creates a ClusterHead", [representative](Description desc)->bool { 24 | double cohesion = 0.9953; 25 | ClusterHead ch(representative, cohesion); 26 | return (cohesion == ch.m_cohesion); 27 | }); 28 | // Segfaults... 29 | //r->print(); 30 | // segfaults, which shouldn't be... 
31 | // Leaks, bad 32 | //r->clean_member_attrib_pool(); 33 | //std::cout << "sizeof(r): " << sizeof(*r) << std::endl; 34 | delete representative; 35 | } 36 | 37 | void runTest() { 38 | create_clusterhead(); 39 | } 40 | }; 41 | 42 | /** Builds a ClusterHeadTest and runs it. */ 43 | void 44 | test_clusterhead() { 45 | /* Automatic storage: the fixture is destroyed even if runTest() throws, which the former new/delete pair did not guarantee. */ 46 | ClusterHeadTest cht(std::string("ClusterHead unit testing")); 47 | cht.runTest(); 48 | 49 | } 50 | 51 | 52 | #ifdef test_clusterhead_STANDALONE 53 | int 54 | main(int UP(argc), char ** UP(argv)) { 55 | 56 | test_clusterhead(); 57 | return 0; 58 | } 59 | #endif 60 | -------------------------------------------------------------------------------- /test/test_postprocess.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "testdata.h" 16 | #include "testutils.h" 17 | #include "fake.h" 18 | 19 | 20 | /** PostProcessTest: fixture for postprocessing specs; its only spec is currently registered as pending via Spec::xit. */ 21 | class PostProcessTest : public CppUnit::TestCase { 22 | 23 | private: 24 | 25 | FakeTest * ft; 26 | RecordPList recpointers; 27 | vector rpv; 28 | list all_records; 29 | 30 | 31 | public: 32 | /** Prints the test name as a heading; ft starts out null because nothing in this class assigns it yet. */ 33 | PostProcessTest(std::string name) : CppUnit::TestCase(name), ft(nullptr) { 34 | 35 | describe_test(INDENT0, name.c_str()); 36 | } 37 | 38 | /** Reports the associated-nodes spec as pending; the lambda body is not executed by xit. */ 39 | void test_find_associated_nodes() { 40 | 41 | describe_test(INDENT2, "Testing associated nodes..."); 42 | 43 | Spec spec; 44 | spec.xit("create associated_delegates set", DO_SPEC { 45 | return false; 46 | }); 47 | } 48 | 49 | /** Runs every spec in this fixture. */ 50 | void runTests() { 51 | test_find_associated_nodes(); 52 | } 53 | 54 | }; 55 | 56 | /** Builds a PostProcessTest and runs its specs. */ 57 | void 58 | test_postprocess() { 59 | /* Automatic storage: no leak if runTests() throws. */ 60 | PostProcessTest ppt(std::string("Postprocess test")); 61 | ppt.runTests(); 62 | 63 | } 64 | 65 | 66 | #ifdef test_postprocess_STANDALONE 67 | int 68 | main(int UP(argc), char ** UP(argv)) { 69 | 70 | test_postprocess(); 71 | return 0; 72 | } 73 |
#endif 74 | -------------------------------------------------------------------------------- /test/data/berkeley/prior_saved_3.txt: -------------------------------------------------------------------------------- 1 | JOHN####BEATTY## : 6.93147e-07, 2 | DAVID##M##DOOLIN## : 0.693147, 3 | JOHN##W##AARONIII## : 6.93147e-07, 4 | JAMES##T##PINKERTO## : 6.93147e-07, 5 | ARVIN####ARVIND## : 1.09861e-06, 6 | GREGO##M##PAPADOPO## : 1.09861e-06, 7 | BRIAN####TOTTY## : 0.95, 8 | DOUGL##R##CUTTING## : 0.95, 9 | JAMES##R##HUNT## : 6.93147e-07, 10 | GENE##H##KAN## : 0.95, 11 | DAVID##E##CULLER## : 0.693147, 12 | LARRY####FABINY## : 1.09861e-06, 13 | LEE####FLEMING## : 4.05465e-07, 14 | JOHN####PLEVYAK## : 0.693147, 15 | KENNE##E##ARNETT## : 1.09861e-06, 16 | NOAH####TREUHAFT## : 1.09861e-06, 17 | KRIST##S##PISTER## : 0.659167, 18 | LLOYD##D##STEWARTJ## : 1.09861e-06, 19 | ADAM####BEGUELIN## : 0.95, 20 | OSCAR##M##STAFSUDD## : 1.09861e-06, 21 | TIDHA##D##SHALON## : 0.95, 22 | YAROS####FAYBISHE## : 0.95, 23 | IKHLA##S##SIDHU## : 0.95, 24 | STEVE####WATERHOU## : 0.95, 25 | YURI##S##ZATS## : 1.09861e-06, 26 | WILLI##J##KAISER## : 1.09861e-06, 27 | DAVID####GOURLEY## : 0.693147, 28 | KENT##S##UDELL## : 1.09861e-06, 29 | NICHO####SITAR## : 1.09861e-06, 30 | MATTH####HAINES## : 0.95, 31 | PATRI##O##BROWN## : 0.95, 32 | PETER####MATTIS## : 0.95, 33 | ROB####CONANT## : 1.09861e-06, 34 | ROBER##M##SHEAR## : 1.09861e-06, 35 | ROSS##V##LAFETRA## : 1.09861e-06, 36 | THOMA##J##CAMARDA## : 1.09861e-06, 37 | LEE##O##FLEMING## : 4.05465e-07, 38 | LILAC####MULLER## : 1.09861e-06, 39 | PHYLL##R##NELSON## : 1.09861e-06, 40 | RICHA##P##SCHNEIDE## : 1.09861e-06, 41 | -------------------------------------------------------------------------------- /include/disambiguate.h: -------------------------------------------------------------------------------- 1 | /** @file */ 2 | 3 | #ifndef PATENT_DISAMBIGUATE_H 4 | #define PATENT_DISAMBIGUATE_H 5 | 6 | namespace EngineConfiguration { 7 | bool 
config_engine(const char * filename, std::ostream & os); 8 | } 9 | 10 | namespace BlockingConfiguration { 11 | 12 | using std::string; 13 | /** cBlockingDetail: settings for one blocking column together with a StringManipulator it owns. Every object holds its own manipulator (obtained via clone() when copying, which is assumed to return a new heap object) and deletes it on destruction. */ 14 | class cBlockingDetail { 15 | public: 16 | StringManipulator * m_psm; /* owned: deleted in the destructor */ 17 | std::string m_columnname; 18 | unsigned int m_dataindex; 19 | int m_begin; 20 | unsigned int m_nchar; 21 | bool m_isforward; 22 | /** Default constructor: allocates a StringNoSpaceTruncate and zero-initialises the scalar members, so copying a freshly built object never reads indeterminate values. */ cBlockingDetail() : m_psm(new StringNoSpaceTruncate), m_dataindex(0), m_begin(0), m_nchar(0), m_isforward(false) { 23 | 24 | } 25 | /** Copy constructor: clones the source's manipulator rather than sharing its pointer. */ cBlockingDetail ( const cBlockingDetail & rhs ) : m_psm(rhs.m_psm->clone()), m_columnname(rhs.m_columnname), m_dataindex(rhs.m_dataindex), m_begin(rhs.m_begin), m_nchar(rhs.m_nchar), m_isforward(rhs.m_isforward) { 26 | } 27 | /** Copy assignment: without it the implicit operator would copy m_psm itself, leaking the old manipulator and deleting the shared one twice. The clone is taken before the old object is released, so a throwing clone() leaves *this unchanged. */ cBlockingDetail & operator= ( const cBlockingDetail & rhs ) { 28 | if (this != &rhs) { 29 | StringManipulator * fresh = rhs.m_psm->clone(); delete m_psm; m_psm = fresh; 30 | m_columnname = rhs.m_columnname; m_dataindex = rhs.m_dataindex; m_begin = rhs.m_begin; m_nchar = rhs.m_nchar; m_isforward = rhs.m_isforward; 31 | } 32 | return *this; } 33 | ~cBlockingDetail() { delete m_psm; } 34 | }; 35 | 36 | int config_blocking (const char * filename, const string & module_id); 37 | int config_blocking (const char * filename, const string & module_id, std::ostream &); 38 | } 39 | 40 | int Full_Disambiguation(const char * EngineConfigFile, const char * BlockingConfigFile); 41 | 42 | // So ugly.
43 | #include 44 | class cBlocking_Operation; 45 | std::auto_ptr get_blocking_pointer(); 46 | 47 | #endif /* PATENT_DISAMBIGUATE_H */ 48 | -------------------------------------------------------------------------------- /test/data/berkeley/prior_saved_2.txt: -------------------------------------------------------------------------------- 1 | JAMES##R##HUNT## : 6.93147e-07, 2 | JAMES##T##PINKERTON## : 6.93147e-07, 3 | JOHN##W##AARONIII## : 6.93147e-07, 4 | GREGORY##M##PAPADOPOULOS## : 1.09861e-06, 5 | DAVID####GOURLEY## : 0.693147, 6 | DAVID##E##CULLER## : 0.693147, 7 | ARVIND####ARVIND## : 1.09861e-06, 8 | ADAM####BEGUELIN## : 0.95, 9 | GENE##H##KAN## : 0.95, 10 | DOUGLASS##R##CUTTING## : 0.95, 11 | BRIAN####TOTTY## : 0.95, 12 | LARRY####FABINY## : 1.09861e-06, 13 | KENT##S##UDELL## : 1.09861e-06, 14 | KENNETH##E##ARNETT## : 1.09861e-06, 15 | JOHN####BEATTY## : 6.93147e-07, 16 | LILAC####MULLER## : 1.09861e-06, 17 | LLOYD##D##STEWARTJR## : 1.09861e-06, 18 | JOHN####PLEVYAK## : 0.693147, 19 | LEE####FLEMING## : 4.05465e-07, 20 | DAVID##M##DOOLIN## : 0.693147, 21 | YAROSLAV####FAYBISHENKO## : 0.95, 22 | PETER####MATTIS## : 0.95, 23 | ROB####CONANT## : 1.09861e-06, 24 | ROBERT##M##SHEAR## : 1.09861e-06, 25 | ROSS##V##LAFETRA## : 1.09861e-06, 26 | THOMAS##J##CAMARDA## : 1.09861e-06, 27 | IKHLAQ##S##SIDHU## : 0.95, 28 | STEVE####WATERHOUSE## : 0.95, 29 | YURI##S##ZATS## : 1.09861e-06, 30 | WILLIAM##J##KAISER## : 1.09861e-06, 31 | TIDHAR##D##SHALON## : 0.95, 32 | OSCAR##M##STAFSUDD## : 1.09861e-06, 33 | PHYLLIS##R##NELSON## : 1.09861e-06, 34 | LEE##O##FLEMING## : 4.05465e-07, 35 | NICHOLAS####SITAR## : 1.09861e-06, 36 | NOAH####TREUHAFT## : 1.09861e-06, 37 | RICHARD##P##SCHNEIDER## : 1.09861e-06, 38 | PATRICK##O##BROWN## : 0.95, 39 | KRISTOFER##SJ##PISTER## : 0.659167, 40 | MATTHEW####HAINES## : 0.95, 41 | -------------------------------------------------------------------------------- /test/testutils.h: 
-------------------------------------------------------------------------------- 1 | #ifndef PATENT_TESTUTILS_H 2 | #define PATENT_TESTUTILS_H 3 | 4 | #include 5 | 6 | #include 7 | 8 | #define DESCCOLOR COLOR45 9 | #define PASSCOLOR COLOR119 10 | #define FAILCOLOR COLOR124 11 | #define PENDINGCOLOR COLOR166 12 | /** Indentation prefixes: INDENTn expands to n spaces. */ 13 | #define INDENT0 "" 14 | #define INDENT2 "  " 15 | #define INDENT4 "    " 16 | #define INDENT6 "      " 17 | 18 | 19 | typedef const char * Description; 20 | typedef void (*Describer)(const char *, const char *); 21 | 22 | void describe_test (const char * indent, const char * description); 23 | void describe_pass (const char * indent, const char * description); 24 | void describe_fail (const char * indent, const char * description); 25 | void describe_pending (const char * indent, const char * description); 26 | 27 | #define DO_SPEC [](Description d)->bool 28 | #define DO_SPEC_THIS [this](Description d)->bool 29 | #define DO_SPEC_HANDLE [&](Description d)->bool 30 | /** Spec: minimal spec-style runner. it() executes a check and reports pass or fail; xit() reports a spec as pending without running it. */ 31 | class Spec { 32 | 33 | public: 34 | /** Description of the most recently run spec, truncated to fit the buffer. */ 35 | char buf[512]; 36 | /** Runs test(desc) under CPPUNIT_ASSERT and prints the description at INDENT4 through describe_pass, or through describe_fail when the assertion throws CppUnit::Exception. Other exception types propagate to the caller. */ 37 | void it (Description desc, std::function<bool(Description)> test) { 38 | 39 | Describer d; 40 | /* desc is data, not a format string: a bounded "%s" copy keeps '%' characters literal and cannot overrun buf. */ snprintf(buf, sizeof(buf), "%s", desc); 41 | try { 42 | CPPUNIT_ASSERT(test(desc)); 43 | d = describe_pass; 44 | } catch (const CppUnit::Exception &) { 45 | d = describe_fail; 46 | } 47 | d(INDENT4, buf); 48 | } 49 | /** Reports desc as pending at INDENT4; test is accepted for signature parity with it() and is intentionally not invoked. */ 50 | void xit (Description desc, std::function<bool(Description)> test) { 51 | 52 | describe_pending(INDENT4, desc); 53 | } 54 | 55 | }; 56 | 57 | class TestUtils { 58 | }; 59 | 60 | 61 | #endif // PATENT_TESTUTILS_H 62 | -------------------------------------------------------------------------------- /config/block.txt: -------------------------------------------------------------------------------- 1 | //Configuration file for the Disambiguation Blocking Mechanism 2 | //ATTENTION: LOOK AT get_prior_value function BEFORE YOU WANT TO CHANGE THE FORMAT. 3 | //format: 4 | //attribute name: data index : parameters of string manipulation.
5 | //parameters of string manipulation for current use: starting position, number of characters to extract, direction. 6 | [ Round 1 ] 7 | Firstname: 1 : 0,0 , true 8 | Middlename: 1 : 0 , 0 , false 9 | Lastname: 0: 0, 0, true 10 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Longitude 11 | 12 | [ Round 2 ] 13 | Firstname: 1 : 0 , 0 , true 14 | Middlename: 1 : 0 , 0 , true 15 | Lastname: 0: 0, 0, true 16 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 17 | 18 | [ Round 3 ] 19 | Firstname: 1 : 0 , 5 , true 20 | Middlename: 1 : 0 , 1 , true 21 | Lastname: 0: 0 , 8 , true 22 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 23 | 24 | [ Round 4 ] 25 | Firstname: 1 : 0 , 3 , true 26 | Middlename: 1 : 0 , 0 , false 27 | Lastname: 0 : 0, 5, true 28 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 29 | 30 | [ Round 5 ] 31 | Firstname: 1 : 0, 1, true 32 | Middlename: 1 : 0 , 0 , false 33 | Lastname: 0: 0, 5, true 34 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 35 | 36 | [ Round 6 ] 37 | Firstname: 1: 0, 1, true 38 | Middlename: 1 : 0 , 0 , false 39 | Lastname: 0: 0, 3, true 40 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 41 | -------------------------------------------------------------------------------- /config/blocking6.txt: -------------------------------------------------------------------------------- 1 | //Configuration file for the Disambiguation Blocking Mechanism 2 | //ATTENTION: LOOK AT get_prior_value function BEFORE YOU WANT TO CHANGE THE FORMAT. 3 | //format: 4 | //attribute name: data index : parameters of string manipulation. 5 | //parameters of string manipulation for current use: starting position, number of characters to extract, direction. 
6 | [ Round 1 ] 7 | Firstname: 1 : 0,0 , true 8 | Middlename: 1 : 0 , 0 , false 9 | Lastname: 0: 0, 0, true 10 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Longitude 11 | 12 | [ Round 2 ] 13 | Firstname: 1 : 0 , 0 , true 14 | Middlename: 1 : 0 , 0 , true 15 | Lastname: 0: 0, 0, true 16 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 17 | 18 | [ Round 3 ] 19 | Firstname: 1 : 0 , 5 , true 20 | Middlename: 1 : 0 , 1 , true 21 | Lastname: 0: 0 , 8 , true 22 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 23 | 24 | [ Round 4 ] 25 | Firstname: 1 : 0 , 3 , true 26 | Middlename: 1 : 0 , 0 , false 27 | Lastname: 0 : 0, 5, true 28 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 29 | 30 | [ Round 5 ] 31 | Firstname: 1 : 0, 1, true 32 | Middlename: 1 : 0 , 0 , false 33 | Lastname: 0: 0, 5, true 34 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 35 | 36 | [ Round 6 ] 37 | Firstname: 1: 0, 1, true 38 | Middlename: 1 : 0 , 0 , false 39 | Lastname: 0: 0, 3, true 40 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 41 | -------------------------------------------------------------------------------- /config/BlockingConfig.txt: -------------------------------------------------------------------------------- 1 | //Configuration file for the Disambiguation Blocking Mechanism 2 | //ATTENTION: LOOK AT get_prior_value function BEFORE YOU WANT TO CHANGE THE FORMAT. 3 | //format: 4 | //attribute name: data index : parameters of string manipulation. 5 | //parameters of string manipulation for current use: starting position, number of characters to extract, direction. 
6 | [ Round 1 ] 7 | Firstname: 1 : 0,0 , true 8 | Middlename: 1 : 0 , 0 , false 9 | Lastname: 0: 0, 0, true 10 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Longitude 11 | 12 | [ Round 2 ] 13 | Firstname: 1 : 0 , 0 , true 14 | Middlename: 1 : 0 , 0 , true 15 | Lastname: 0: 0, 0, true 16 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 17 | 18 | [ Round 3 ] 19 | Firstname: 1 : 0 , 5 , true 20 | Middlename: 1 : 0 , 1 , true 21 | Lastname: 0: 0 , 8 , true 22 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 23 | 24 | [ Round 4 ] 25 | Firstname: 1 : 0 , 3 , true 26 | Middlename: 1 : 0 , 0 , false 27 | Lastname: 0 : 0, 5, true 28 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 29 | 30 | [ Round 5 ] 31 | Firstname: 1 : 0, 1, true 32 | Middlename: 1 : 0 , 0 , false 33 | Lastname: 0: 0, 5, true 34 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 35 | 36 | [ Round 6 ] 37 | Firstname: 1: 0, 1, true 38 | Middlename: 1 : 0 , 0 , false 39 | Lastname: 0: 0, 3, true 40 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 41 | -------------------------------------------------------------------------------- /test/fixtures/BlockingConfig.txt: -------------------------------------------------------------------------------- 1 | //Configuration file for the Disambiguation Blocking Mechanism 2 | //ATTENTION: LOOK AT get_prior_value function BEFORE YOU WANT TO CHANGE THE FORMAT. 3 | //format: 4 | //attribute name: data index : parameters of string manipulation. 5 | //parameters of string manipulation for current use: starting position, number of characters to extract, direction. 
6 | [ Round 1 ] 7 | Firstname: 1 : 0,0 , true 8 | Middlename: 1 : 0 , 0 , false 9 | Lastname: 0: 0, 0, true 10 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Longitude 11 | 12 | [ Round 2 ] 13 | Firstname: 1 : 0 , 0 , true 14 | Middlename: 1 : 0 , 0 , true 15 | Lastname: 0: 0, 0, true 16 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 17 | 18 | [ Round 3 ] 19 | Firstname: 1 : 0 , 5 , true 20 | Middlename: 1 : 0 , 1 , true 21 | Lastname: 0: 0 , 8 , true 22 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 23 | 24 | [ Round 4 ] 25 | Firstname: 1 : 0 , 3 , true 26 | Middlename: 1 : 0 , 0 , false 27 | Lastname: 0 : 0, 5, true 28 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 29 | 30 | [ Round 5 ] 31 | Firstname: 1 : 0, 1, true 32 | Middlename: 1 : 0 , 0 , false 33 | Lastname: 0: 0, 5, true 34 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 35 | 36 | [ Round 6 ] 37 | Firstname: 1: 0, 1, true 38 | Middlename: 1 : 0 , 0 , false 39 | Lastname: 0: 0, 3, true 40 | ACTIVE SIMILARITY ATTRIBUTES: Firstname, Middlename, Lastname, Coauthor, Class, Assignee 41 | -------------------------------------------------------------------------------- /readme.markdown: -------------------------------------------------------------------------------- 1 | # Patent Inventor Disambiguation 2 | 3 | **NOTE: this is a legacy code base which I've assumed development on. 4 | I'm working to bring the code base under full test coverage, and 5 | adding extensibility via a callback architecture.** 6 | 7 | This project is part of an ongoing effort to match inventors 8 | with patents, information which is useful for public policy 9 | considerations. 10 | 11 | The United States Patent and Trademark Office issues unique ID 12 | numbers for patents, but does not uniquely identify inventors. 
13 | 14 | ## Contributing 15 | 16 | Easy, fork it and go. You will need the following to compile: 17 | 18 | * gcc4.7 or 4.8 toolchain, or a recent icc or clang release 19 | implementing most of c++11. 20 | * IBM's cplex package, free for academic use. You can compile 21 | and link without cplex, but you won't be able to generate the 22 | lookup tables. 23 | 24 | Don't forget to set the upstream repo for 25 | rebasing correctly: 26 | 27 | * `git remote add upstream 28 | git://github.com/funginstitute/disambiguator.git` 29 | 30 | Before you pull from upstream or issue a pull request, rebase: 31 | 32 | * git pull --rebase upstream master 33 | 34 | This will keep a linear commit history, which helps 35 | debugging tremendously. 36 | 37 | ## Documentation 38 | 39 | The main manual is located here: [`doc/manual/manual.pdf`](doc/manual/manual.pdf). 40 | 41 | 42 | ## Handy links 43 | 44 | * [Harvard Dataverse Network(DVN)](http://dvn.iq.harvard.edu/dvn/dv/patent) 45 | * [USPTO](http://www.uspto.gov/) 46 | * [Google Patent information distribution](http://www.google.com/googlebooks/uspto-patents.html) 47 | -------------------------------------------------------------------------------- /src/makefile.citris: -------------------------------------------------------------------------------- 1 | #CXX=icpc 2 | #CC=icc 3 | #CFLAGS=-O3 -g -xsse4.1 -ipo -static-intel -Wall 4 | 5 | #CXX=g++ 6 | #CC=gcc 7 | CXX=icc 8 | CC=icc 9 | 10 | #CFLAGS=-O3 -g -Wall -I/usr/local/include -L/usr/local/lib 11 | # Profiling 12 | #CFLAGS=-O3 -pg -Wall -I/usr/local/include -L/usr/local/lib 13 | # Debugging 14 | 15 | OOQPINCLUDEDIR=/usr/local/include/ooqp 16 | 17 | #ILOGINSTALLDIR=/home/ysun/ILOG/CPLEX_Studio_AcademicResearch122 18 | ILOGINSTALLDIR=/global/home/users/ddoolin/ILOG 19 | 20 | 21 | CPLEXINCLUDE=$(ILOGINSTALLDIR)/cplex/include 22 | CONCERTINCLUDE=$(ILOGINSTALLDIR)/concert/include 23 | 24 | INCLUDES = -I../include -I/usr/local/include -I$(CPLEXINCLUDE) -I$(CONCERTINCLUDE) 25 | 26 | 
#CPLEXLIB=$(ILOGINSTALLDIR)/cplex/lib/x86_sles10_4.1/static_pic 27 | CPLEXLIB=$(ILOGINSTALLDIR)/cplex/lib/x86-64_sles10_4.1/static_pic 28 | 29 | #CONCERTLIB=$(ILOGINSTALLDIR)/concert/lib/x86_sles10_4.1/static_pic 30 | CONCERTLIB=$(ILOGINSTALLDIR)/concert/lib/x86-64_sles10_4.1/static_pic 31 | 32 | objects = attribute.o engine.o comparators.o blocking_operation.o cluster.o \ 33 | ratios.o fileoper.o main.o disambiguate.o newcluster.o record.o \ 34 | record_reconfigurator.o string_manipulator.o threading.o training.o \ 35 | utilities.o strcmp95.o ratio_smoothing.o postprocess.o 36 | 37 | CFLAGS=-O3 -Wall -fno-inline -DIL_STD 38 | 39 | %.o: %.cpp 40 | $(CXX) $(CFLAGS) $(INCLUDES) -c $< 41 | 42 | %.o: %.c 43 | $(CC) $(CFLAGS) -I../include -c $< 44 | 45 | all: exedisambig 46 | # Link with $^ (all prerequisites); $? lists only those newer than the target, which breaks incremental relinks. 47 | exedisambig: $(objects) 48 | $(CXX) -o $@ $^ $(CFLAGS) -L$(CPLEXLIB) -lilocplex -lcplex -L$(CONCERTLIB) -lconcert -lm -lpthread 49 | 50 | clean: 51 | rm -rf *.o *.gch 52 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | # -*- Autoconf -*- 2 | # Process this file with autoconf to produce a configure script. 3 | 4 | AC_PREREQ([2.68]) 5 | AC_INIT([disambiguation], [1.0], [david.doolin@gmail.com]) 6 | AM_INIT_AUTOMAKE([foreign]) 7 | AC_PROG_RANLIB 8 | AM_PATH_CPPUNIT(1.12.1) 9 | AC_CONFIG_SRCDIR([src/cluster.cpp]) 10 | AC_CONFIG_HEADERS([config.h]) 11 | 12 | # Checks for programs. 13 | AC_PROG_CXX 14 | AC_PROG_CC 15 | 16 | 17 | # Checks for libraries. 
18 | # FIXME: Replace `main' with a function in `-lboost_unit_test_framework': 19 | AC_CHECK_LIB([boost_unit_test_framework], [main]) 20 | 21 | # FIXME: Replace `main' with a function in `-lconcert': 22 | AC_CHECK_LIB([concert], [__ZN22IloHashSavedIntDomainID2Ev]) 23 | 24 | # FIXME: Replace `main' with a function in `-lcplex': 25 | AC_CHECK_LIB([cplex], [main]) 26 | 27 | # FIXME: Replace `main' with a function in `-lilocplex': 28 | AC_CHECK_LIB([ilocplex], [main]) 29 | 30 | # FIXME: Replace `main' with a function in `-lm': 31 | AC_CHECK_LIB([m], [main]) 32 | 33 | # FIXME: Replace `main' with a function in `-lpthread': 34 | AC_CHECK_LIB([pthread], [main]) 35 | 36 | # FIXME: Replace `main' with a function in `-lsqlite3': 37 | AC_CHECK_LIB([sqlite3], [main]) 38 | 39 | # FIXME: Replace `main' with a function in `-lcppunit': 40 | AC_CHECK_LIB([cppunit], [main]) 41 | 42 | # Checks for header files. 43 | AC_CHECK_HEADERS([string.h unistd.h]) 44 | 45 | AC_CHECK_HEADERS([ilocplex.h]) 46 | 47 | # Checks for typedefs, structures, and compiler characteristics. 48 | AC_HEADER_STDBOOL 49 | AC_C_INLINE 50 | AC_TYPE_SIZE_T 51 | 52 | # Checks for library functions. 
53 | AC_FUNC_ERROR_AT_LINE 54 | AC_FUNC_MALLOC 55 | AC_CHECK_FUNCS([memset sqrt]) 56 | 57 | AC_CONFIG_FILES([Makefile 58 | test/Makefile 59 | src/Makefile]) 60 | AC_OUTPUT 61 | -------------------------------------------------------------------------------- /test/integration.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source ./bashcolors.sh 4 | initializeANSI 5 | 6 | echo 7 | echo $cyanf"# Integration testing for disambiguator" 8 | echo 9 | 10 | function test_disambiguation() { 11 | 12 | current_dir=`pwd` 13 | dir=$1 14 | cd $dir 15 | 16 | ~/src/disambiguator/src/disambiguate engine.txt blocking6.txt > runtime.log 17 | 18 | cd $current_dir 19 | output=$dir"/final.txt" 20 | testfile=$2 21 | results=`diff ${output} ${testfile}` 22 | if [[ $results ]] ; then 23 | echo $redf"failed final: diff results in $results" 24 | else 25 | echo $greenf"passed final" 26 | fi 27 | 28 | 29 | 30 | if [[ $3 ]] ; then 31 | echo "\n" 32 | rare=$dir"/Rare_Names.txt" 33 | results=`diff ${rare} $3` 34 | if [[ $results ]] ; then 35 | echo $redf"failed rare names: diff results in $results" 36 | else 37 | echo $greenf"passed rare names" 38 | fi 39 | fi 40 | } 41 | 42 | 43 | # TODO: Create an array for the test directories and files, 44 | # then loop it to run the tests. 
45 | testdir=/data/patentdata/disambiguation/test/synthetic/jones 46 | testfile="./data/final_jones.txt" 47 | testval=`test_disambiguation $testdir $testfile` 48 | echo $cyanf"## Jones" 49 | echo $testval 50 | 51 | echo 52 | 53 | testdir=/data/patentdata/disambiguation/test/18 54 | testfile="./data/final_eighteen.txt" 55 | testval=`test_disambiguation $testdir $testfile` 56 | echo $cyanf"## Eighteen" 57 | echo $testval 58 | 59 | echo 60 | 61 | testdir=/data/patentdata/disambiguation/experiments/earth/berkeley 62 | finalfile="./data/berkeley/final_berkeley.txt" 63 | rarenamefile="./data/berkeley/rare_names_berkeley.txt" 64 | testval=`test_disambiguation $testdir $finalfile $rarenamefile` 65 | echo $cyanf"## Berkeley" 66 | echo -e $testval 67 | 68 | 69 | #./integration.rb 70 | 71 | 72 | echo $reset 73 | -------------------------------------------------------------------------------- /test/integration.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require './termcolors' 4 | 5 | class IntegrationTest 6 | 7 | include TermColors 8 | 9 | #@@finalfile = "./data/berkeley/final_berkeley.txt" 10 | #@@rarenamefile = "./data/berkeley/rare_names_berkeley.txt" 11 | 12 | @@testdir = "/data/patentdata/disambiguation/experiments/earth/berkeley" 13 | 14 | def print_headline(testname) 15 | color = @@colors[45] 16 | print "#{color}## Integration testing for #{testname}", RESET, "\n" 17 | end 18 | 19 | def print_passed(testname) 20 | color = @@colors[119] 21 | print " #{color} Passed: #{testname}", RESET, "\n" 22 | end 23 | 24 | def print_failed(testname) 25 | color = @@colors[124] 26 | print " #{color} Failed: #{testname}", RESET, "\n" 27 | end 28 | end 29 | 30 | 31 | class Prior < IntegrationTest 32 | 33 | def initialize 34 | print_headline("priors") 35 | end 36 | 37 | def test_priors 38 | (1..6).each_with_index do |num| 39 | testfile = "#{@@testdir}/prior_saved_#{num}.txt" 40 | priorfile = 
"./data/berkeley/prior_saved_#{num}.txt" 41 | results = `diff -y --suppress-common-lines #{testfile} #{priorfile}` 42 | puts results unless results == "" 43 | end 44 | end 45 | end 46 | 47 | 48 | class Tset < IntegrationTest 49 | 50 | def initialize 51 | print_headline("tsets") 52 | end 53 | 54 | def test_tsets 55 | (1..6).each_with_index do |num| 56 | tset = "tset05_#{num}" 57 | testfile = "#{@@testdir}/#{tset}.txt" 58 | priorfile = "./data/berkeley/#{tset}.txt" 59 | results = `diff -y --suppress-common-lines #{testfile} #{priorfile}` 60 | if results 61 | print_passed(tset) 62 | else 63 | print_failed(tset) 64 | end 65 | end 66 | end 67 | 68 | end 69 | 70 | 71 | prior = Prior.new 72 | #prior.test_priors 73 | tset = Tset.new 74 | tset.test_tsets 75 | -------------------------------------------------------------------------------- /test/test_fetch_records.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | // Really good web pages: 6 | // http://stackoverflow.com/questions/318064/how-do-you-declare-an-interface-in-c 7 | // http://stackoverflow.com/questions/7182359/template-instantiation-details-of-gcc-and-ms-compilers 8 | //#include 9 | #include 10 | //#include 11 | 12 | #include "testdata.h" 13 | #include "colortest.h" 14 | #include "testutils.h" 15 | 16 | 17 | using std::string; 18 | using std::cout; 19 | using std::endl; 20 | 21 | 22 | class FetchRecordTest : public CppUnit::TestCase { 23 | 24 | public: 25 | 26 | FetchRecordTest(std::string name) : CppUnit::TestCase(name) { 27 | describe_test(INDENT0, name.c_str()); 28 | } 29 | 30 | void test_get_records() { 31 | 32 | describe_test(INDENT2, "Testing fetch_records_from_txt..."); 33 | 34 | list source; 35 | // filename is 20 bytes, check how it leaks. 
36 | const char * filename = "testdata/invpat2.txt"; 37 | vector requested_columns; 38 | requested_columns.push_back(string("Firstname")); 39 | requested_columns.push_back(string("Lastname")); 40 | requested_columns.push_back(string("Middlename")); 41 | bool successful = fetch_records_from_txt(source, filename, requested_columns); 42 | 43 | if (not successful) 44 | exit(-1); 45 | } 46 | 47 | /** 48 | * Now, with a load of records, I should be able to test specific 49 | * records for attributes and values. 50 | */ 51 | 52 | }; 53 | 54 | 55 | void 56 | test_fetch_records() { 57 | 58 | FetchRecordTest * frt = new FetchRecordTest("Testing fetch_records"); 59 | frt->test_get_records(); 60 | delete frt; 61 | } 62 | 63 | 64 | #ifdef test_fetch_records_STANDALONE 65 | int 66 | main(int, char **) { 67 | 68 | test_fetch_records(); 69 | 70 | return 0; 71 | } 72 | #endif 73 | -------------------------------------------------------------------------------- /test/test_compare.cpp: -------------------------------------------------------------------------------- 1 | 2 | //#include 3 | //#include 4 | //#include 5 | 6 | #include 7 | #include 8 | 9 | 10 | #include 11 | 12 | #include "testutils.h" 13 | 14 | 15 | class CompareTest : public CppUnit::TestCase { 16 | 17 | public: 18 | CompareTest( std::string name ) : CppUnit::TestCase( name ) {} 19 | 20 | void check_less_than() { 21 | std::vector s1; 22 | s1.push_back(1); 23 | std::vector s2; 24 | s2.push_back(2); 25 | SimilarityCompare sc; 26 | bool checkval = sc(s1, s2); 27 | //std::cout << "checkval: " << checkval << std::endl; 28 | CPPUNIT_ASSERT(checkval == true); 29 | } 30 | 31 | void check_greater_than() { 32 | std::vector s1; 33 | s1.push_back(2); 34 | std::vector s2; 35 | s2.push_back(1); 36 | SimilarityCompare sc; 37 | bool checkval = sc(s1, s2); 38 | //std::cout << "checkval: " << checkval << std::endl; 39 | CPPUNIT_ASSERT(checkval == false); 40 | } 41 | 42 | void check_equals_to() { 43 | std::vector s1; 44 | s1.push_back(1); 
45 | std::vector s2; 46 | s2.push_back(1); 47 | SimilarityCompare sc; 48 | bool checkval = sc(s1, s2); 49 | //std::cout << "checkval: " << checkval << std::endl; 50 | CPPUNIT_ASSERT(checkval == false); 51 | } 52 | 53 | void runTest() { 54 | check_less_than(); 55 | check_greater_than(); 56 | check_equals_to(); 57 | } 58 | }; 59 | 60 | 61 | void 62 | test_compare() { 63 | 64 | CompareTest * ct = new CompareTest(std::string("SimilarityCompare test")); 65 | ct->runTest(); 66 | delete ct; 67 | } 68 | 69 | #ifdef test_compare_STANDALONE 70 | int 71 | main(int UP(argc), char ** UP(argv)) { 72 | 73 | test_compare(); 74 | return 0; 75 | } 76 | #endif 77 | -------------------------------------------------------------------------------- /test/test_typedefs.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | #include 16 | 17 | /** 18 | * Run this code with /usr/bin/valgrind --leak-check=full ./typedefs 19 | */ 20 | 21 | typedef map Dict; 22 | 23 | // From: https://github.com/doolin/CPP_Disambiguation/blob/master/src/DisambigComp.cpp#L571 24 | //const map > * const asg_table_pointer 25 | typedef std::pair TableVal; 26 | typedef std::map Table; 27 | const Table * const asg_table_pointer = NULL; 28 | 29 | //typedef vector Label; 30 | 31 | class Initial { 32 | public: 33 | Initial(void) : value(1) {} 34 | int value; 35 | }; 36 | 37 | class InitialTest : public CppUnit::TestCase { 38 | public: 39 | InitialTest( std::string name ) : CppUnit::TestCase( name ) {} 40 | 41 | void runTest() { 42 | Initial * i = new Initial(); 43 | CPPUNIT_ASSERT( i->value == 1 ); 44 | const string uid_identifier = cUnique_Record_ID::static_get_class_name(); 45 | //uid = string("Unique_Record_ID"); 46 | CPPUNIT_ASSERT( uid_identifier == string("Unique_Record_ID")); 47 | delete i; 48 | } 49 | 50 | void LabelTest(void) { 51 | 
Label * l = new Label(); 52 | delete l; 53 | } 54 | 55 | void LabelTest(char const * label) { 56 | const string s(label); 57 | Label l(1, s); 58 | Label m(1, string("baz")); 59 | } 60 | }; 61 | 62 | 63 | int 64 | main(int UP(argc), char ** UP(argv)) { 65 | 66 | InitialTest * it = new InitialTest(std::string("initial test")); 67 | it->runTest(); 68 | it->LabelTest(); 69 | it->LabelTest("foobar"); 70 | 71 | delete it; 72 | 73 | return 0; 74 | } 75 | -------------------------------------------------------------------------------- /include/threading.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Threading.h 3 | * 4 | * Created on: Oct 6, 2010 5 | * Author: ysun 6 | */ 7 | 8 | #ifndef THREADING_H_ 9 | #define THREADING_H_ 10 | 11 | #include 12 | #include 13 | 14 | /* 15 | * Threading Base 16 | */ 17 | class Runnable 18 | { 19 | public: 20 | virtual void run() = 0; 21 | virtual ~Runnable() {}; 22 | }; 23 | 24 | /* 25 | * Thread class 26 | * Steps to do multithreading: 27 | * 1. Create a worker class that inherits from Thread class ( synchronization objects can also be in the class ). 28 | * 2. The worker class should override the "void run()" function. DO NOT GET CONFUSED! 29 | * 3. At the place where multi-threading is expected, create worker OBJECTS. 30 | * 4. Start multi-threading by calling the method "bool start()" from worker objects. 31 | * 5. Block the main thread until child threads finish by calling "void join()" or "void join(unsigned long)" from worker objects. 
32 | * 33 | */ 34 | class Thread : public Runnable 35 | { 36 | private: 37 | static int threadInitNumber; 38 | int curThreadInitNumber; 39 | Runnable *target; 40 | pthread_t tid; 41 | int threadStatus; 42 | pthread_attr_t attr; 43 | sched_param param; 44 | static void* run0(void* pVoid); 45 | void* run1(); 46 | static int getNextThreadNum(); 47 | 48 | public: 49 | static const int THREAD_STATUS_NEW = 0; 50 | static const int THREAD_STATUS_RUNNING = 1; 51 | static const int THREAD_STATUS_EXIT = -1; 52 | Thread(); 53 | Thread(Runnable *iTarget); 54 | ~Thread(); 55 | void run(); 56 | bool start(); 57 | int getState(); 58 | void join(); 59 | void join(unsigned long millisTime); 60 | bool operator ==(const Thread *otherThread); 61 | pthread_t getThreadID(); 62 | static pthread_t getCurrentThreadID(); 63 | static bool isEquals(Thread *iTarget); 64 | void setThreadScope(bool isSystem); 65 | bool getThreadScope(); 66 | void setThreadPriority(int priority); 67 | int getThreadPriority(); 68 | 69 | }; 70 | 71 | 72 | #endif /* THREADING_H_ */ 73 | 74 | 75 | -------------------------------------------------------------------------------- /test/test_misspell.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | 4 | #include 5 | 6 | #include "comparators.h" 7 | #include "testutils.h" 8 | 9 | 10 | /** 11 | * Run this code with valgrind --leak-check=full ./misspell 12 | */ 13 | 14 | class Misspell : public CppUnit::TestCase { 15 | 16 | public: 17 | Misspell(std::string name) : CppUnit::TestCase(name) { 18 | 19 | describe_test(INDENT0, name.c_str()); 20 | } 21 | 22 | 23 | void test_is_misspell() { 24 | 25 | Spec spec; 26 | spec.it("Testing 'misspell' against 'mispell' returns 1", DO_SPEC { 27 | const char * s1 = "misspell"; 28 | const char * s2 = "mispell"; 29 | return (1 == is_misspell(s1,s2)); 30 | }); 31 | } 32 | 33 | 34 | void test_is_not_misspell() { 35 | 36 | Spec spec; 37 | spec.it("Testing 'misspell' against 'misspell' 
returns 4", DO_SPEC { 38 | const char * s1 = "misspell"; 39 | const char * s2 = "misspell"; 40 | return (4 == is_misspell(s1,s2)); 41 | }); 42 | } 43 | 44 | 45 | void test_is_missepll() { 46 | 47 | Spec spec; 48 | spec.it("Testing 'misspell' against missepll' returns 2", DO_SPEC { 49 | const char * s1 = "misspell"; 50 | const char * s2 = "missepll"; 51 | return (2 == is_misspell(s1,s2)); 52 | }); 53 | } 54 | 55 | 56 | void test_is_misspall() { 57 | 58 | Spec spec; 59 | spec.it("Testing 'misspell' against 'misspall' returns 3", DO_SPEC { 60 | const char * s1 = "misspell"; 61 | const char * s2 = "misspall"; 62 | return (3 == is_misspell(s1,s2)); 63 | }); 64 | } 65 | 66 | 67 | 68 | void runTests() { 69 | test_is_misspell(); 70 | test_is_not_misspell(); 71 | test_is_missepll(); 72 | test_is_misspall(); 73 | } 74 | 75 | }; 76 | 77 | 78 | 79 | void 80 | test_misspell() { 81 | Misspell * mt = new Misspell("Misspell (is_misspell()) function unit test"); 82 | mt->runTests(); 83 | delete mt; 84 | } 85 | 86 | #ifdef test_misspell_STANDALONE 87 | int 88 | main(int, char **) { 89 | 90 | test_misspell(); 91 | return 0; 92 | } 93 | #endif 94 | -------------------------------------------------------------------------------- /test/test_training.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "colortest.h" 13 | #include "testutils.h" 14 | #include "fake.h" 15 | 16 | class TrainingTest : public CppUnit::TestCase { 17 | 18 | private: 19 | FakeTest * ft; 20 | RecordPList recpointers; 21 | vector rpv; 22 | 23 | public: 24 | 25 | TrainingTest(std::string name) : CppUnit::TestCase(name) { 26 | 27 | describe_test(INDENT0, name.c_str()); 28 | 29 | const string filename("testdata/assignee_comparison.csv"); 30 | ft = new FakeTest(string("Fake training test"), filename); 31 | ft->load_fake_data(filename); 32 | recpointers = 
ft->get_recpointers(); 33 | rpv = ft->get_recvecs(); 34 | } 35 | 36 | 37 | ~TrainingTest() { 38 | describe_test(INDENT2, "Destroying TrainingTest"); 39 | delete ft; 40 | } 41 | 42 | 43 | void test_get_blocking_indice() { 44 | 45 | describe_test(INDENT2, "Testing get_blocking_indice()"); 46 | 47 | Spec spec; 48 | 49 | spec.it("Blocking index for Firstname/Lastname/Class/Coauthor: (0,1,13,12)", DO_SPEC { 50 | vector column_names = { 51 | "Firstname", "Lastname", "Class", "Coauthor" 52 | }; 53 | vector bi = get_blocking_indices(column_names); 54 | // The order of the blocking indices depends on how the column 55 | // names are ordered when the data is read from the configuration file. 56 | // We're controlling the order in the FakeTest constructor here. 57 | vector target = { 0, 1, 13, 12 }; 58 | return (target == bi); 59 | }); 60 | 61 | } 62 | 63 | 64 | void runTest() { 65 | test_get_blocking_indice(); 66 | } 67 | }; 68 | 69 | 70 | void 71 | test_training() { 72 | 73 | TrainingTest * tt = new TrainingTest(string("Training initial test")); 74 | tt->runTest(); 75 | delete tt; 76 | } 77 | 78 | #ifdef test_training_STANDALONE 79 | int 80 | main(int UP(argc), char ** UP(argv)) { 81 | 82 | test_training(); 83 | return 0; 84 | } 85 | #endif 86 | -------------------------------------------------------------------------------- /test/test_rarenames.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #include 8 | 9 | #include "testutils.h" 10 | 11 | extern "C" { 12 | #include "strcmp95.h" 13 | } 14 | 15 | using std::string; 16 | using std::pair; 17 | 18 | 19 | class RarenamesTest : public CppUnit::TestCase { 20 | 21 | public: 22 | RarenamesTest(string name) : CppUnit::TestCase(name) { 23 | 24 | describe_test(INDENT0, name.c_str()); 25 | } 26 | 27 | /** 28 | * The name_compare function builds a similarity 29 | * weight. 
30 | */ 31 | void test_rarename() { 32 | //find_rare_names_v2(vec_pdest, source); 33 | } 34 | 35 | void test_choose_rare_words() { 36 | 37 | WordCounter wc; 38 | wc.insert (pair (string("foo"), WordCounts(1,2))); 39 | wc.insert (pair (string("bar"), WordCounts(1,88))); 40 | wc.insert (pair (string("barbar"),WordCounts(3,88))); 41 | wc.insert (pair (string("baz"), WordCounts(1,200))); 42 | wc.insert (pair (string("quux"), WordCounts(5,2))); 43 | wc.insert (pair (string("red"), WordCounts(5,88))); 44 | wc.insert (pair (string("black"), WordCounts(5,200))); 45 | 46 | std::set rarewords; 47 | choose_rare_words(wc, rarewords); 48 | 49 | CPPUNIT_ASSERT(0 == rarewords.count("foo")); 50 | CPPUNIT_ASSERT(1 == rarewords.count("bar")); 51 | CPPUNIT_ASSERT(1 == rarewords.count("barbar")); 52 | CPPUNIT_ASSERT(0 == rarewords.count("baz")); 53 | CPPUNIT_ASSERT(0 == rarewords.count("quux")); 54 | CPPUNIT_ASSERT(0 == rarewords.count("red")); 55 | CPPUNIT_ASSERT(0 == rarewords.count("black")); 56 | } 57 | 58 | 59 | void runTests() { 60 | test_rarename(); 61 | test_choose_rare_words(); 62 | } 63 | 64 | }; 65 | 66 | 67 | void test_rarenames() { 68 | 69 | RarenamesTest * rt = new RarenamesTest(std::string("Rare names unit tests")); 70 | rt->runTests(); 71 | delete rt; 72 | } 73 | 74 | 75 | #ifdef test_rarenames_STANDALONE 76 | int 77 | main(int, char **) { 78 | test_rarenames(); 79 | return 0; 80 | } 81 | #endif 82 | -------------------------------------------------------------------------------- /include/worker.h: -------------------------------------------------------------------------------- 1 | #ifndef PATENT_WORKER_H 2 | #define PATENT_WORKER_H 3 | 4 | 5 | 6 | /** 7 | * Worker is a threading subclass to achieve multithreading in Linux systems. 8 | * It is used in ClusterInfo::disambiguate function. 9 | * It is unnecessary to understanding the detail. The only thing necessary to know is the constructor. 
10 | */ 11 | class Worker : public Thread { 12 | 13 | /** 14 | * Private: 15 | * map < string, ClusterInfo::ClusterList >::iterator * ppdisambiged: 16 | * the pointer to an iterator that is a cursor of progress of disambiguation. 17 | * 18 | * const cRatios * pratios: the pointer to a cRatio object. 19 | * ClusterInfo & cluster_ref: the reference of a ClusterInfo object that is actually the source. 20 | * static pthread_mutex_t iter_lock: a mutex to synchronize the cursor. 21 | * static unsigned int count: a static member to count the number of disambiguated blocks. 22 | * void run(): the overriding function of base class, implementing details of disambiguation in each thread. 23 | */ 24 | private: 25 | map < string, ClusterInfo::ClusterList >::iterator * ppdisambiged; 26 | const cRatios * pratios; 27 | ClusterInfo & cluster_ref; 28 | 29 | static pthread_mutex_t iter_lock; 30 | static unsigned int count; 31 | void run(); 32 | 33 | /** 34 | * Public: 35 | * explicit Worker( map < string, ClusterInfo::ClusterList >::iterator & input_pdisambiged, 36 | const cRatios & ratiosmap, ClusterInfo & inputcluster): constructor 37 | * ~Worker(): destructor 38 | * static void zero_count(): clear the variable "count" to zero 39 | * static unsigned int get_count(): return the variable "count" 40 | * 41 | */ 42 | public: 43 | explicit Worker( map < string, ClusterInfo::ClusterList >::iterator & input_pdisambiged, 44 | const cRatios & ratiosmap, 45 | ClusterInfo & inputcluster 46 | ) : ppdisambiged(&input_pdisambiged), pratios(&ratiosmap), cluster_ref(inputcluster) {} 47 | 48 | ~Worker() {} 49 | static void zero_count() { count = 0; } 50 | static unsigned int get_count() { return count;} 51 | }; 52 | 53 | 54 | #endif /* PATENT_WORKER_H */ 55 | -------------------------------------------------------------------------------- /test/test_engine.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | 7 | 
#include "testdata.h" 8 | #include "testutils.h" 9 | 10 | using std::string; 11 | using std::vector; 12 | 13 | class EngineTest : public CppUnit::TestCase { 14 | 15 | private: 16 | static const std::string LINE; 17 | vector tcn; 18 | vector requested_columns; 19 | vector indices; 20 | 21 | public: 22 | EngineTest(std::string name) : CppUnit::TestCase(name) { 23 | 24 | describe_test(INDENT0, name.c_str()); 25 | } 26 | 27 | void set_up() { 28 | tcn = parse_column_names(LINE); 29 | requested_columns = get_column_names(); 30 | // TODO: Find out why this is failing with an exception 31 | indices = create_column_indices(requested_columns, tcn); 32 | Attribute::register_class_names(requested_columns); 33 | } 34 | 35 | void test_parse_column_names() { 36 | Spec spec; 37 | spec.it("Correctly parsed column names", DO_SPEC_THIS { 38 | return (10 == tcn.size()); 39 | }); 40 | } 41 | 42 | 43 | void test_create_column_indices() { 44 | Spec spec; 45 | spec.it("Indices have the correct length", DO_SPEC_THIS { 46 | //std::cout << "indices.size: " << indices.size() << std::endl; 47 | return (10 == indices.size()); 48 | }); 49 | } 50 | 51 | 52 | void test_instantiate_attributes() { 53 | 54 | Attribute ** pa = new Attribute*[tcn.size()]; 55 | pa = instantiate_attributes(tcn, tcn.size()); 56 | 57 | Spec spec; 58 | spec.it("First attribute has attrib_group 'Personal'", [this,pa](Description desc)->bool { 59 | return (string("Personal") == pa[0]->get_attrib_group()); 60 | }); 61 | 62 | delete pa[0]; 63 | for (unsigned int i = 1; i< tcn.size(); ++i) { 64 | delete pa[i]; 65 | } 66 | delete [] pa; 67 | } 68 | 69 | 70 | void runTest() { 71 | set_up(); 72 | test_parse_column_names(); 73 | test_create_column_indices(); 74 | test_instantiate_attributes(); 75 | } 76 | }; 77 | 78 | const std::string EngineTest::LINE = "Firstname,Middlename,Lastname,Latitude,Assignee,City,Country,Patent,ApplyYear,AsgNum"; 79 | 80 | void 81 | test_engine() { 82 | 83 | EngineTest * et = new 
EngineTest(std::string("Engine unit test")); 84 | et->runTest(); 85 | delete et; 86 | } 87 | 88 | 89 | #ifdef test_engine_STANDALONE 90 | int 91 | main(int UP(argc), char ** UP(argv)) { 92 | 93 | test_engine(); 94 | return 0; 95 | } 96 | #endif 97 | -------------------------------------------------------------------------------- /test/data/berkeley/final_berkeley.txt: -------------------------------------------------------------------------------- 1 | 05241635-3###1###05241635-3, 2 | 06915307-3###1###06915307-3,06453319-4,06292880-4,06289358-4,06209003-4,06128627-4,06128623-4, 3 | 06915307-4###1###06915307-4,06453319-5,06292880-5,06289358-5,06209003-5,06128627-5,06128623-5, 4 | 05241635-2###0.996933###05018062-1,05123095-2,05241635-2, 5 | 07171415-2###1###07171415-2,07099871-5, 6 | 07171415-4###1###07171415-4,07099871-3,07013303-3,06961723-2, 7 | 06915307-5###1###06915307-5,06453319-6,06292880-6,06289358-6,06209003-6,06128627-6,06128623-6, 8 | 07171415-0###1###07171415-0,07099871-1,07013303-1,06961723-1, 9 | 05241635-1###1###05241635-1,05123095-1,05018062-2, 10 | 07573873-2###0.992424###06914897-1,06577622-3,06487603-2,06175871-2,07573873-2,07453815-1,07016675-1,06937699-1,06870830-1,06857072-2,06857021-2,06856616-1,06822957-4,06804224-2,06795429-3,06785261-2,06771674-2,06744759-1,06741586-2,06732314-3,06731642-3,06731630-2,06681252-2,06678250-2,06674745-2,06650901-3,06625119-2,06587433-3,06584490-2,06542504-4,06512761-2,06446127-2,06434606-2,06381638-4,06366959-1,06363053-4,06360271-2,06351524-2,06182125-3,06055236-4,06954454-1,06650619-2,06170075-3,06570606-1,06487690-3,06269099-4,06243846-4,06226769-3,06151636-3,06145109-3,05870412-3,06169744-2,07032242-2,06937610-1,06697354-4,06675218-4,06567405-3,06567399-2,06442141-2,06353614-3,06006271-4, 11 | 06158781-1###1###06158781-1, 12 | 07013303-5###1###07013303-5, 13 | 05018576-3###1###05018576-3, 14 | 05123095-3###1###05123095-3, 15 | 
06915307-1###1###06915307-1,06453319-2,06292880-2,06289358-2,06209003-2,06128627-2,06128623-2, 16 | 06517734-2###1###06517734-2, 17 | 05726480-1###0.991348###07420980-0,07529217-0,05726480-1,05659195-2, 18 | 06517734-4###1###06517734-4, 19 | 05018576-1###1###05018576-1, 20 | 06517734-3###1###06517734-3, 21 | 05029133-2###1###05029133-2, 22 | 05136185-1###1###05136185-1, 23 | 06517734-1###1###06517734-1, 24 | 05018576-4###1###05018576-4, 25 | 06915307-2###0.997018###06915307-2,06292880-3,06289358-3,06209003-3,06128627-3,06128623-3,06453319-3, 26 | 05018576-2###1###05018576-2, 27 | 07529217-3###1###07529217-3, 28 | 05659195-3###1###05659195-3, 29 | 07625697-1###1###07625697-1,07442499-0,07378236-0,07323298-1,06110426-2,05807522-1, 30 | 06915307-0###1###06915307-0,06453319-1,06292880-1,06289358-1,06209003-1,06128627-1,06128623-1, 31 | 05659195-4###1###05659195-4, 32 | 07529217-2###1###07529217-2, 33 | 05029133-1###1###05029133-1, 34 | 05018062-3###1###05018062-3, 35 | 07420980-1###1###07420980-1, 36 | 07171415-5###1###07171415-5,07099871-4,07013303-4,06961723-3, 37 | 07013303-2###1###07013303-2, 38 | 07625697-0###0.998767###07625697-0,07378236-1,06110426-1,05807522-2,07442499-1,07323298-0, 39 | 05659195-1###1###05659195-1, 40 | 07171415-1###1###07171415-1,07099871-0,07013303-0,06961723-0, 41 | 07529217-1###1###07529217-1, 42 | -------------------------------------------------------------------------------- /test/test_ratios.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #include "ratios.h" 8 | #include "engine.h" 9 | #include "block.h" 10 | 11 | #include "testutils.h" 12 | #include "fake.h" 13 | 14 | using std::string; 15 | using std::pair; 16 | using std::unique_ptr; 17 | 18 | 19 | class RatioSmoothingTest : public CppUnit::TestCase { 20 | 21 | private: 22 | FakeTest * ft; 23 | static const short BUF_SIZE = 256; 24 | 25 | public: 26 | RatioSmoothingTest(string name) 
: CppUnit::TestCase(name) { 27 | 28 | const string filename("testdata/clustertest.csv"); 29 | ft = new FakeTest(string("Fake RatioComponentTest"), filename); 30 | ft->load_fake_data(filename); 31 | 32 | describe_test(INDENT0, name.c_str()); 33 | } 34 | 35 | ~RatioSmoothingTest() { 36 | delete ft; 37 | } 38 | 39 | void test_compute_total_nodes() { 40 | 41 | Spec spec; 42 | 43 | spec.it("Max (empty) and min (empty) should have %d node", [&spec](Description desc)->bool { 44 | SimilarityProfile max; 45 | SimilarityProfile min; 46 | auto totalnodes = compute_total_nodes(min, max); 47 | sprintf(spec.buf, desc, totalnodes); 48 | return (1 == totalnodes); 49 | }); 50 | 51 | 52 | spec.it("Max (1,1) and min(0,0) should have 4 nodes", DO_SPEC { 53 | SimilarityProfile max{1, 1}; 54 | SimilarityProfile min{0, 0}; 55 | return (4 == compute_total_nodes(min, max)); 56 | }); 57 | 58 | 59 | spec.it("Max (1,1,2) and min(0,0,0) should have 12 nodes", DO_SPEC { 60 | SimilarityProfile max{1, 1, 2}; 61 | SimilarityProfile min{0, 0, 0}; 62 | return (12 == compute_total_nodes(min, max)); 63 | }); 64 | 65 | } 66 | 67 | void test_get_max_similarity() { 68 | 69 | Spec spec; 70 | 71 | spec.it("Max similarity should be (4,3,5,6,4,6)", DO_SPEC { 72 | vector names{"Firstname", "Middlename", "Lastname", "Coauthor", "Class", "Assignee"}; 73 | Record::activate_comparators_by_name(names); 74 | SimilarityProfile sp = get_max_similarity(names); 75 | SimilarityProfile max = {4,3,5,6,4,6}; 76 | return (max == get_max_similarity(names)); 77 | }); 78 | 79 | } 80 | 81 | void test_ratios() { 82 | 83 | test_compute_total_nodes(); 84 | test_get_max_similarity(); 85 | } 86 | 87 | }; 88 | 89 | 90 | void test_ratio_smoothing() { 91 | 92 | RatioSmoothingTest * rt = new RatioSmoothingTest(std::string("Ratios test")); 93 | rt->test_ratios(); 94 | delete rt; 95 | } 96 | 97 | 98 | #ifdef test_ratios_STANDALONE 99 | int 100 | main(int, char **) { 101 | test_ratio_smoothing(); 102 | return 0; 103 | } 104 | #endif 105 | 
-------------------------------------------------------------------------------- /test/test_jwcmp.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #include "comparators.h" 9 | 10 | #include "testutils.h" 11 | 12 | using std::string; 13 | 14 | 15 | class JWcmpTest : public CppUnit::TestCase { 16 | 17 | private: 18 | string s1, s2; 19 | 20 | public: 21 | JWcmpTest(std::string name) : CppUnit::TestCase(name) { 22 | 23 | describe_test(INDENT0, name.c_str()); 24 | } 25 | 26 | float compute_jw(const char * s1, const char * s2) { 27 | return jwcmp(s1, s2); 28 | } 29 | 30 | float compute_jw(string s1, string s2) { 31 | return jwcmp(s1.c_str(), s2.c_str()); 32 | } 33 | 34 | void print_score(string s1, string s2, int score) { 35 | std::cout << s1 << " vs. " << s2 << ": " << score << std::endl; 36 | } 37 | 38 | 39 | void testem_all() { 40 | 41 | Spec spec; 42 | 43 | spec.it("Comparing two empty strings scores 0", [this](Description desc)->bool { 44 | s1 = ""; s2 = ""; 45 | int score = jwcmp(s1, s2); 46 | //print_score(s1, s2, score); 47 | return (0 == score); 48 | }); 49 | 50 | 51 | spec.it("Comparing %s with %s results in %d", [this, &spec](Description desc)->bool { 52 | s1 = "MATTHEW"; s2 = "XYZ"; 53 | int score = jwcmp(s1, s2); 54 | sprintf(spec.buf, desc, s1.c_str(), s2.c_str(), score); 55 | return (0 == score); 56 | }); 57 | 58 | spec.it("Comparing %s with %s results in %d", [this, &spec](Description desc)->bool { 59 | s1 = "MATTHEW"; s2 = "TALIN"; 60 | int score = jwcmp(s1, s2); 61 | sprintf(spec.buf, desc, s1.c_str(), s2.c_str(), score); 62 | return (0 == score); 63 | }); 64 | 65 | 66 | spec.it("Comparing %s with %s results in %d", [this, &spec](Description desc)->bool { 67 | s1 = "MATTHEW"; s2 = "MATHEW"; 68 | int score = jwcmp(s1, s2); 69 | sprintf(spec.buf, desc, s1.c_str(), s2.c_str(), score); 70 | return (4 == score); 71 | }); 72 | 73 | 74 | spec.it("Comparing 
%s with %s results in %d", [this, &spec](Description desc)->bool { 75 | s1 = "MATTHEW"; s2 = "MATTHEW"; 76 | int score = jwcmp(s1, s2); 77 | sprintf(spec.buf, desc, s1.c_str(), s2.c_str(), score); 78 | return (5 == score); 79 | }); 80 | 81 | } 82 | 83 | void runTests() { 84 | testem_all(); 85 | } 86 | 87 | }; 88 | 89 | 90 | void 91 | test_jwcmp() { 92 | 93 | JWcmpTest * st = new JWcmpTest(std::string("Jaro/Winkler similarity binning unit testing")); 94 | st->testem_all(); 95 | 96 | delete st; 97 | } 98 | 99 | 100 | #ifdef test_jwcmp_STANDALONE 101 | int 102 | main(int, char **) { 103 | test_jwcmp(); 104 | return 0; 105 | } 106 | #endif /* jwcmp_STANDALONE */ 107 | -------------------------------------------------------------------------------- /src/Makefile.am: -------------------------------------------------------------------------------- 1 | 2 | CXX=g++ 3 | CC=gcc 4 | #CFLAGS=-O3 -g -Wall -I/usr/local/include -L/usr/local/lib 5 | # Profiling 6 | #CFLAGS=-O3 -pg -Wall -I/usr/local/include -L/usr/local/lib 7 | # Debugging 8 | 9 | CPPFLAGS = -Wall -Wextra 10 | CXXFLAGS = -O0 -g 11 | 12 | OOQPINCLUDEDIR=/usr/local/include/ooqp 13 | 14 | #ILOGINSTALLDIR=/data/patentdata/ILOG/CPLEX_Studio124 15 | ILOGINSTALLDIR=/Users/daviddoolin/Applications/IBM/ILOG/CPLEX_Studio124 16 | CPLEXINCLUDE=$(ILOGINSTALLDIR)/cplex/include 17 | CONCERTINCLUDE=$(ILOGINSTALLDIR)/concert/include 18 | 19 | # Linux server 20 | #CPLEXLIB=$(ILOGINSTALLDIR)/cplex/lib/x86-64_sles10_4.1/static_pic 21 | #CONCERTLIB=$(ILOGINSTALLDIR)/concert/lib/x86-64_sles10_4.1/static_pic 22 | # Local macbook 23 | CPLEXLIB=$(ILOGINSTALLDIR)/cplex/lib/x86-64_darwin9_gcc4.0/static_pic 24 | CONCERTLIB=$(ILOGINSTALLDIR)/concert/lib/x86-64_darwin9_gcc4.0/static_pic 25 | 26 | INCLUDES = -I/usr/local/include -I$(CPLEXINCLUDE) -I$(CONCERTINCLUDE) -I../include 27 | 28 | noinst_LIBRARIES = libdisambiguation.a 29 | libdisambiguation_a_SOURCES = disambiguate.cpp cluster.cpp comparators.cpp attribute.cpp \ 30 | engine.cpp 
blocking_operation.cpp newcluster.cpp \ 31 | postprocess.cpp ratios.cpp ratio_smoothing.cpp \ 32 | training.cpp utilities.cpp threading.cpp strcmp95.c record.cpp \ 33 | string_manipulator.cpp record_reconfigurator.cpp 34 | 35 | #libdisambiguation_a_CXXFLAGS = -O0 -pg a 36 | libdisambiguation_a_CPPFLAGS = -Wall -Wextra -fno-inline $(INCLUDES) -DIL_STD -L/usr/local/lib -DNDEBUG -w #-Wno-ignored-qualifiers 37 | #libdisambiguation_a_LDFLAGS = -pg -O0 38 | 39 | #disambiguate_CPPFLAGS = -O0 -g -Wall -fno-inline $(INCLUDES) -DIL_STD -L/usr/local/lib -DNDEBUG -w 40 | disambiguate_LDFLAGS = -L$(CPLEXLIB) -lilocplex -lcplex -L$(CONCERTLIB) -lconcert 41 | #disambiguate_LDADD = -L$(CPLEXLIB) -lilocplex -lcplex -L$(CONCERTLIB) -lconcert 42 | #disambiguate_LDADD = ilocplex 43 | 44 | bin_PROGRAMS = disambiguate zardoz #txt2sqlite3 45 | disambiguate_SOURCES = main.cpp 46 | #disambiguate_CXXFLAGS = -O0 -pg 47 | disambiguate_CPPFLAGS = -Wall -Wextra -fno-inline $(INCLUDES) -DIL_STD -L/usr/local/lib 48 | #disambiguate_CPPFLAGS = -O0 -pg -Wall -Wextra -fno-inline $(INCLUDES) -DIL_STD -L/usr/local/lib #-DNDEBUG -w -finstrument-functions-exclude-file-list=iostream.h,string.h,vector.h 49 | #disambiguate_LDFLAGS = -L$(CPLEXLIB) -lilocplex -lcplex -L$(CONCERTLIB) -lconcert 50 | #disambiguate_LDFLAGS = -pg -O0 51 | # For Macos 52 | #disambiguate_LDADD = libdisambiguation.a # -L$(CPLEXLIB) -lilocplex -lcplex -L$(CONCERTLIB) -lconcert 53 | disambiguate_LDADD = libdisambiguation.a -L$(CPLEXLIB) -lilocplex -lcplex -L$(CONCERTLIB) -lconcert 54 | #disambiguate_LDADD = ilocplex 55 | 56 | 57 | zardoz_SOURCES = txt2sqlite3.cpp 58 | -------------------------------------------------------------------------------- /makefile.orig: -------------------------------------------------------------------------------- 1 | #CXX=icpc 2 | #CC=icc 3 | #CFLAGS=-O3 -g -xsse4.1 -ipo -static-intel -Wall 4 | 5 | CXX=g++ 6 | CC=gcc 7 | #CFLAGS=-O3 -g -Wall -I/usr/local/include -L/usr/local/lib 8 | # Profiling 9 | 
#CFLAGS=-O3 -pg -Wall -I/usr/local/include -L/usr/local/lib 10 | # Debugging 11 | CFLAGS=-O0 -g -Wall -fno-inline -I/usr/local/include -L/usr/local/lib 12 | OOQPINCLUDEDIR=/usr/local/include/ooqp 13 | ILOGINSTALLDIR=/home/ysun/ILOG/CPLEX_Studio_AcademicResearch122 14 | 15 | CPLEXINCLUDE=$(ILOGINSTALLDIR)/cplex/include 16 | CONCERTINCLUDE=$(ILOGINSTALLDIR)/concert/include 17 | #CPLEXLIB=$(ILOGINSTALLDIR)/cplex/lib/x86_sles10_4.1/static_pic 18 | CPLEXLIB=$(ILOGINSTALLDIR)/cplex/lib/x86-64_sles10_4.1/static_pic 19 | #CONCERTLIB=$(ILOGINSTALLDIR)/concert/lib/x86_sles10_4.1/static_pic 20 | CONCERTLIB=$(ILOGINSTALLDIR)/concert/lib/x86-64_sles10_4.1/static_pic 21 | 22 | all:exedisambig txt2sqlite3 23 | 24 | exedisambig: Disambigmain.o DisambigDefs.o DisambigRatios.o DisambigEngine.o DisambigFileOper.o strcmp95.o DisambigComp.o DisambigTraining.o Threading.o DisambigCluster.o DisambigRatioSmoothing.o DisambigNewCluster.o DisambigCustomizedDefs.o DisambigPostProcess.o DisambigUtilities.o 25 | $(CXX) -o $@ $? $(CFLAGS) -L$(CPLEXLIB) -lilocplex -lcplex -L$(CONCERTLIB) -lconcert -lm -lpthread 26 | 27 | Disambigmain.o: Disambigmain.cpp 28 | $(CXX) -c $? $(CFLAGS) 29 | 30 | DisambigDefs.o: DisambigDefs.cpp DisambigDefs.h 31 | $(CXX) -c $? $(CFLAGS) 32 | 33 | DisambigRatios.o: DisambigRatios.cpp DisambigRatios.h 34 | $(CXX) -c $? $(CFLAGS) 35 | 36 | DisambigEngine.o: DisambigEngine.cpp DisambigEngine.h 37 | $(CXX) -c $? $(CFLAGS) 38 | 39 | DisambigFileOper.o: DisambigFileOper.cpp DisambigFileOper.h 40 | $(CXX) -c $? $(CFLAGS) 41 | 42 | DisambigComp.o: DisambigComp.cpp DisambigComp.h 43 | $(CXX) -c $? $(CFLAGS) 44 | 45 | DisambigTraining.o: DisambigTraining.h DisambigTraining.cpp 46 | $(CXX) -c $? $(CFLAGS) 47 | 48 | strcmp95.o: strcmp95.c strcmp95.h 49 | $(CC) -c $? $(CFLAGS) 50 | 51 | Threading.o: Threading.cpp Threading.h 52 | $(CXX) -c $? $(CFLAGS) 53 | 54 | DisambigCluster.o: DisambigCluster.cpp DisambigCluster.h 55 | $(CXX) -c $? 
$(CFLAGS) 56 | 57 | DisambigRatioSmoothing.o: DisambigRatioSmoothing.cpp 58 | $(CXX) -c $? $(CFLAGS) -I$(OOQPINCLUDEDIR) -I$(CPLEXINCLUDE) -I$(CONCERTINCLUDE) -DIL_STD -DNDEBUG -w 59 | 60 | DisambigNewCluster.o: DisambigNewCluster.h DisambigNewCluster.cpp 61 | $(CXX) -c $? $(CFLAGS) 62 | 63 | DisambigCustomizedDefs.o: DisambigCustomizedDefs.h DisambigCustomizedDefs.cpp 64 | $(CXX) -c $? $(CFLAGS) 65 | 66 | DisambigPostProcess.o: DisambigPostProcess.h DisambigPostProcess.cpp 67 | $(CXX) -c $? $(CFLAGS) 68 | 69 | DisambigUtilities.o: DisambigUtilities.h DisambigUtilities.cpp 70 | $(CXX) -c $? $(CFLAGS) 71 | 72 | clean: 73 | rm *.o *.gch 74 | 75 | txt2sqlite3: txt2sqlite3.h txt2sqlite3.cpp 76 | $(CXX) -o $@ $? $(CFLAGS) -lsqlite3 77 | -------------------------------------------------------------------------------- /include/utilities.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef PATENT_UTILITIES_H 3 | #define PATENT_UTILITIES_H 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | using std::vector; 10 | using std::list; 11 | using std::string; 12 | using std::map; 13 | 14 | class Record; 15 | class StringManipulator; 16 | class ClusterSet; 17 | class cRatios; 18 | 19 | 20 | bool make_changable_training_sets_by_patent (const list & record_pointers, 21 | const vector& blocking_column_names, 22 | const vector < const StringManipulator *> & pstring_oper, 23 | const unsigned int limit, 24 | const vector & training_filenames); 25 | 26 | bool make_changable_training_sets_by_assignee (const list & record_pointers, 27 | const vector& blocking_column_names, 28 | const vector < const StringManipulator *> & pstring_oper, 29 | const unsigned int limit, 30 | const vector & training_filenames); 31 | 32 | int unique_inventors_per_period (unsigned int starting_year, 33 | unsigned int interval, 34 | const char * wholedatabase, 35 | const char * disambigresult, 36 | const char * outputfile); 37 | 38 | /** 39 | * @param 
all_records are all the records, excellent. 40 | * @param last_disambig_result match file (?) from the last round. 41 | * @param outputfile currently hard coded as "final.txt" in disambiguate.cpp:650 42 | */ 43 | void one_step_postprocess (const list < Record > & all_records, 44 | const char * last_disambig_result, 45 | const char * outputfile); 46 | 47 | //string remove_headtail_space (const string & s); 48 | 49 | void out_of_cluster_density (const ClusterSet & upper, 50 | const ClusterSet & lower, 51 | const cRatios & ratio, 52 | std::ofstream & ofile ); 53 | 54 | void exit_with_error (const char * s, 55 | const char * file, 56 | const char * line); 57 | 58 | #endif /* PATENT_UTILITIES_H */ 59 | -------------------------------------------------------------------------------- /test/test_coauthor.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | // Really good web pages: 6 | // http://stackoverflow.com/questions/318064/how-do-you-declare-an-interface-in-c 7 | // http://stackoverflow.com/questions/7182359/template-instantiation-details-of-gcc-and-ms-compilers 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "testdata.h" 20 | #include "colortest.h" 21 | #include "testutils.h" 22 | #include "fake.h" 23 | 24 | 25 | using std::string; 26 | using std::cout; 27 | using std::endl; 28 | 29 | 30 | class CoauthorTest : public CppUnit::TestCase { 31 | 32 | private: 33 | FakeTest * ft; 34 | RecordPList rp; 35 | static const short BUF_SIZE = 256; 36 | Describer describer; 37 | 38 | public: 39 | 40 | CoauthorTest(std::string name) : CppUnit::TestCase(name) { 41 | 42 | describe_test(INDENT0, name.c_str()); 43 | // TODO: Load up a file for coauthors. 
44 | ft = new FakeTest("FakeTest for coauthors", "./testdata/assignee_comparison.csv"); 45 | ft->load_fake_data("./testdata/assignee_comparison.csv"); 46 | rp = ft->get_recpointers(); 47 | 48 | } 49 | 50 | 51 | void test_r1r2() { 52 | 53 | char buffer[BUF_SIZE]; 54 | char teststr[] = "Comparing %s with %s, similarity %d"; 55 | 56 | RecordPList rp = ft->get_recpointers(); 57 | vector rpv = ft->get_recvecs(); 58 | const Record & r1 = *rpv[1]; 59 | const Record & r2 = *rpv[1]; 60 | 61 | vector active_similarity_attributes; 62 | active_similarity_attributes.push_back(string("Coauthor")); 63 | Record::activate_comparators_by_name(active_similarity_attributes); 64 | 65 | SimilarityProfile sp = r1.record_compare(r2); 66 | //print_similarity(sp); 67 | uint32_t similarity = sp[0]; 68 | sprintf(buffer, teststr, "r1", "r2", similarity); 69 | 70 | try { 71 | CPPUNIT_ASSERT(1 == similarity); 72 | describer = describe_pass; 73 | } catch (CppUnit::Exception e) { 74 | describer = describe_fail; 75 | } 76 | describer(INDENT4, buffer); 77 | } 78 | 79 | 80 | void test_coauthors() { 81 | RecordPList rp = ft->get_recpointers(); 82 | cBlocking_Operation_By_Coauthors bobcobj(rp, 1); 83 | Reconfigurator_Coauthor rcobj (bobcobj.get_patent_tree()); 84 | std::for_each(rp.begin(), rp.end(), rcobj); 85 | } 86 | 87 | 88 | }; 89 | 90 | 91 | void 92 | test_fetch_records() { 93 | 94 | CoauthorTest * ct = new CoauthorTest("Testing Coauthor comparison"); 95 | ct->test_r1r2(); 96 | ct->test_coauthors(); 97 | delete ct; 98 | } 99 | 100 | 101 | #ifdef test_coauthor_STANDALONE 102 | int 103 | main(int, char **) { 104 | 105 | test_fetch_records(); 106 | return 0; 107 | } 108 | #endif 109 | -------------------------------------------------------------------------------- /include/comparators.h: -------------------------------------------------------------------------------- 1 | #ifndef PATENT_COMPARATORS_H 2 | #define PATENT_COMPARATORS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | using 
/**
 * Returns a reference to the larger of two values, compared with operator<.
 * When neither is less than the other, arg1 is returned.
 *
 * The result refers to one of the arguments, so it must not outlive them.
 * The template parameter list is restored here; without it the declaration
 * does not compile.
 */
template <typename Tp>
inline const Tp& max_val(const Tp& arg1, const Tp &arg2) {
  return ( arg1 < arg2 )? arg2 : arg1;
}


/**
 * Returns a reference to the smaller of two values, compared with operator<.
 * When neither is less than the other, arg2 is returned.
 *
 * The result refers to one of the arguments, so it must not outlive them.
 */
template <typename Tp>
inline const Tp& min_val(const Tp& arg1, const Tp &arg2) {
  return ( arg1 < arg2 )? arg1 : arg2;
}
82 | // it has to be a sorted version container, like set, or sorted vector or list 83 | unsigned int cnt = 0; 84 | Iter1 p1b = p1begin; 85 | Iter2 p2b = p2begin; 86 | while ( p1b != p1e && p2b != p2e ) { 87 | if ( *p1b < *p2b ) { 88 | ++p1b; 89 | } 90 | else if ( *p2b < *p1b ) { 91 | ++p2b; 92 | } 93 | else { 94 | ++cnt; 95 | ++p1b; 96 | ++p2b; 97 | } 98 | 99 | if ( cnt == max && max != 0 ) 100 | break; 101 | } 102 | return cnt; 103 | } 104 | 105 | 106 | #endif /* PATENT_COMPARATORS_H */ 107 | -------------------------------------------------------------------------------- /src/string_manipulator.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "engine.h" 12 | #include "attribute.h" 13 | #include "cluster.h" 14 | #include "ratios.h" 15 | #include "newcluster.h" 16 | 17 | using std::map; 18 | using std::set; 19 | 20 | /** 21 | * Aim: to truncate string as desired. See the explanation in the 22 | * header file for more details 23 | * 24 | * Algorithm: simple string manipulation in C. 25 | */ 26 | 27 | string 28 | StringTruncate::manipulate( const string & inputstring ) const { 29 | 30 | if ( ! 
is_usable ) 31 | throw cException_Blocking_Disabled("String Truncation not activated yet."); 32 | 33 | if ( 0 == nchar ) { 34 | if ( is_forward ) 35 | return inputstring; 36 | else { 37 | return string(""); 38 | } 39 | } 40 | 41 | if ( inputstring.size() == 0 ) 42 | return inputstring; 43 | 44 | char * p = new char[ nchar + 1]; 45 | const char * res = p; 46 | const char * source; 47 | if ( begin >= 0 && static_cast(begin) < inputstring.size() ) 48 | source = &inputstring.at(begin); 49 | else if ( begin < 0 && ( begin + inputstring.size() >= 0 ) ) 50 | source = &inputstring.at( begin + inputstring.size() ); 51 | else { 52 | delete [] p; 53 | throw StringTruncate::cException_String_Truncation(inputstring.c_str()); 54 | } 55 | 56 | if ( is_forward) { 57 | for ( unsigned int i = 0; i < nchar && *source !='\0'; ++i ) 58 | *p++ = *source++; 59 | *p = '\0'; 60 | } 61 | else { 62 | for ( unsigned int i = 0; i < nchar && source != inputstring.c_str() ; ++i ) 63 | *p++ = *source--; 64 | *p = '\0'; 65 | } 66 | string result (res); 67 | delete [] res; 68 | return result; 69 | } 70 | 71 | 72 | 73 | const string cBlocking_Operation::delim = "##"; 74 | 75 | /* 76 | * Aim: to extract initials of each word in a string, maybe not starting from the first word. 77 | * See the explanation in the header file for more details 78 | * Algorithm: simple string manipulation in C. 
79 | */ 80 | string 81 | ExtractInitials::manipulate( const string & inputstring ) const { 82 | 83 | size_t pos, prev_pos; 84 | pos = prev_pos = 0; 85 | 86 | if ( inputstring.empty() ) 87 | return string(); 88 | list < char > tempres; 89 | do { 90 | tempres.push_back(inputstring.at(prev_pos) ); 91 | pos = inputstring.find(delimiter, prev_pos ); 92 | prev_pos = pos + 1; 93 | } while ( pos != string::npos ); 94 | 95 | 96 | const unsigned int word_count = tempres.size(); 97 | if ( word_count >= starting_word ) { 98 | for ( unsigned int i = 0; i < starting_word; ++i ) 99 | tempres.pop_front(); 100 | } 101 | return string(tempres.begin(), tempres.end()); 102 | } 103 | 104 | 105 | /* 106 | * Aim: to extract the first word of a string. 107 | * Algorithm: STL string operations. 108 | */ 109 | string 110 | StringExtractFirstWord::manipulate( const string & inputstring ) const { 111 | 112 | string res = inputstring.substr(0, inputstring.find(delimiter, 0)); 113 | return res; 114 | } 115 | 116 | 117 | -------------------------------------------------------------------------------- /src/threading.cpp: -------------------------------------------------------------------------------- 1 | #include "threading.h" 2 | 3 | int Thread::threadInitNumber = 1; 4 | 5 | int Thread::getNextThreadNum() 6 | { 7 | return threadInitNumber++; 8 | } 9 | 10 | void* Thread::run0(void* pVoid) 11 | { 12 | Thread* p = (Thread*) pVoid; 13 | p->run1(); 14 | return p; 15 | } 16 | 17 | void* Thread::run1() 18 | { 19 | 20 | threadStatus = THREAD_STATUS_RUNNING; 21 | tid = pthread_self(); 22 | run(); 23 | threadStatus = THREAD_STATUS_EXIT; 24 | tid = 0; 25 | pthread_exit(NULL); 26 | } 27 | 28 | void Thread::run() 29 | { 30 | if (target != NULL) 31 | { 32 | (*target).run(); 33 | } 34 | } 35 | 36 | Thread::Thread() 37 | { 38 | tid = 0; 39 | threadStatus = THREAD_STATUS_NEW; 40 | curThreadInitNumber = getNextThreadNum(); 41 | pthread_attr_init(&attr); 42 | } 43 | 44 | Thread::Thread(Runnable *iTarget) 45 | { 46 
| target = iTarget; 47 | tid = 0; 48 | threadStatus = THREAD_STATUS_NEW; 49 | curThreadInitNumber = getNextThreadNum(); 50 | pthread_attr_init(&attr); 51 | } 52 | 53 | Thread::~Thread() 54 | { 55 | pthread_attr_destroy(&attr); 56 | } 57 | 58 | bool Thread::start() 59 | { 60 | return pthread_create(&tid, &attr, run0, this) == 0; 61 | } 62 | 63 | pthread_t Thread::getCurrentThreadID() 64 | { 65 | return pthread_self(); 66 | } 67 | 68 | pthread_t Thread::getThreadID() 69 | { 70 | return tid; 71 | } 72 | 73 | int Thread::getState() 74 | { 75 | return threadStatus; 76 | } 77 | 78 | void Thread::join() 79 | { 80 | if (tid > 0) 81 | { 82 | pthread_join(tid, NULL); 83 | } 84 | } 85 | 86 | void Thread::join(unsigned long millisTime) 87 | { 88 | 89 | if (tid == 0) 90 | { 91 | return; 92 | } 93 | if (millisTime == 0) 94 | { 95 | join(); 96 | }else 97 | { 98 | unsigned long k = 0; 99 | while (threadStatus != THREAD_STATUS_EXIT && k <= millisTime) 100 | { 101 | usleep(100); 102 | k++; 103 | } 104 | } 105 | } 106 | 107 | bool Thread::operator ==(const Thread *otherThread) 108 | { 109 | if (otherThread == NULL) 110 | { 111 | return false; 112 | } 113 | if (curThreadInitNumber == (*otherThread).curThreadInitNumber) 114 | { 115 | return true; 116 | } 117 | return false; 118 | } 119 | 120 | bool Thread::isEquals(Thread *iTarget) 121 | { 122 | if (iTarget == NULL) 123 | { 124 | return false; 125 | } 126 | return pthread_self() == iTarget->tid; 127 | } 128 | 129 | void Thread::setThreadScope(bool isSystem) 130 | { 131 | if (isSystem) 132 | { 133 | pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM); 134 | }else 135 | { 136 | pthread_attr_setscope(&attr, PTHREAD_SCOPE_PROCESS); 137 | } 138 | } 139 | 140 | bool Thread::getThreadScope() 141 | { 142 | int scopeType = 0; 143 | pthread_attr_getscope(&attr, &scopeType); 144 | return scopeType == PTHREAD_SCOPE_SYSTEM; 145 | } 146 | 147 | void Thread::setThreadPriority(int priority) 148 | { 149 | pthread_attr_getschedparam(&attr, ¶m); 150 | 
//param.__sched_priority = priority; 151 | param.sched_priority = priority; 152 | pthread_attr_setschedparam(&attr, &param); 153 | } 154 | 155 | int Thread::getThreadPriority() 156 | { 157 | pthread_attr_getschedparam(&attr, &param); 158 | //return param.__sched_priority; 159 | return param.sched_priority; 160 | } 161 | -------------------------------------------------------------------------------- /test/test_qp.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | #include "testutils.h" 11 | 12 | typedef std::pair Ratio; 13 | typedef std::pair Count; 14 | typedef SimilarityProfile SP; 15 | 16 | // TODO: add a function which takes a uint32_t[]; 17 | SimilarityProfile 18 | csp(uint32_t s1, uint32_t s2) { 19 | SimilarityProfile sp; 20 | sp.push_back(s1); 21 | sp.push_back(s2); 22 | return sp; 23 | } 24 | 25 | 26 | class QPTest : public CppUnit::TestCase { 27 | 28 | private: 29 | SimilarityProfile sp, max, min; 30 | SPRatiosIndex ratio_map; 31 | SPCountsIndex mcount, ncount; 32 | SP sp1, sp2, sp3, sp4, sp5, sp6, sp7, sp8; 33 | 34 | 35 | void init_counts() { 36 | 37 | mcount.insert(Count(sp1, 5)); 38 | ncount.insert(Count(sp1, 5)); 39 | 40 | mcount.insert(Count(sp2, 2)); 41 | ncount.insert(Count(sp2, 2)); 42 | 43 | mcount.insert(Count(sp3, 20)); 44 | ncount.insert(Count(sp3, 20)); 45 | 46 | mcount.insert(Count(sp4, 35)); 47 | ncount.insert(Count(sp4, 35)); 48 | 49 | mcount.insert(Count(sp5, 101)); 50 | ncount.insert(Count(sp5, 101)); 51 | 52 | mcount.insert(Count(sp6, 8)); 53 | ncount.insert(Count(sp6, 8)); 54 | 55 | mcount.insert(Count(sp7, 7)); 56 | ncount.insert(Count(sp7, 7)); 57 | 58 | mcount.insert(Count(sp8, 201)); 59 | ncount.insert(Count(sp8, 21)); 60 | } 61 | 62 | 63 | public: 64 | QPTest(std::string name) : CppUnit::TestCase(name) { 65 | 66 | describe_test(INDENT0, name.c_str()); 67 | 68 | max = csp(3,3); 69 | min = csp(0,0); 70 | 71 | sp1
= csp(1,0); 72 | sp2 = csp(0,1); 73 | sp3 = csp(2,0); 74 | sp4 = csp(2,2); 75 | sp5 = csp(2,3); 76 | sp6 = csp(1,2); 77 | sp7 = csp(1,3); 78 | sp8 = csp(3,1); 79 | 80 | 81 | ratio_map.insert(Ratio(sp1, 5.0)); 82 | ratio_map.insert(Ratio(sp2, 2.0)); 83 | ratio_map.insert(Ratio(sp3, 20.0)); 84 | ratio_map.insert(Ratio(sp4, 35.0)); 85 | ratio_map.insert(Ratio(sp5, 101.0)); 86 | ratio_map.insert(Ratio(sp6, 8.0)); 87 | ratio_map.insert(Ratio(sp7, 7.0)); 88 | ratio_map.insert(Ratio(sp8, 201.0)); 89 | 90 | init_counts(); 91 | } 92 | 93 | void print_indexes() { 94 | 95 | SPRatiosIndex::const_iterator it = ratio_map.begin(); 96 | for (; it != ratio_map.end(); ++it) { 97 | uint32_t index = sp2index(it->first, min, max); 98 | std::cout << "index: " << index << ", "; 99 | print_similarity(it->first); 100 | std::cout << ", " << it->second; 101 | std::cout << std::endl; 102 | } 103 | 104 | } 105 | 106 | void test_smoothing() { 107 | 108 | vector attribute_names; 109 | attribute_names.push_back(string("foo")); 110 | 111 | smoothing_inter_extrapolation_cplex(ratio_map, min, max, mcount, ncount, attribute_names, false, false); 112 | } 113 | 114 | 115 | void runTest() { 116 | print_indexes(); 117 | test_smoothing(); 118 | print_indexes(); 119 | } 120 | 121 | }; 122 | 123 | 124 | void 125 | test_qp() { 126 | 127 | QPTest * qpt = new QPTest(std::string("Similarity test")); 128 | qpt->runTest(); 129 | delete qpt; 130 | } 131 | 132 | 133 | #ifdef test_qp_STANDALONE 134 | int 135 | main(int, char **) { 136 | 137 | test_qp(); 138 | return 0; 139 | } 140 | #endif 141 | -------------------------------------------------------------------------------- /test/Makefile.am: -------------------------------------------------------------------------------- 1 | 2 | #ILOGINSTALLDIR=/data/patentdata/ILOG/CPLEX_Studio124 3 | ILOGINSTALLDIR=/Users/daviddoolin/Applications/IBM/ILOG/CPLEX_Studio124 4 | CPLEXINCLUDE=$(ILOGINSTALLDIR)/cplex/include 5 | CONCERTINCLUDE=$(ILOGINSTALLDIR)/concert/include 6 | 7 | 
# Linux server 8 | #CPLEXLIB=$(ILOGINSTALLDIR)/cplex/lib/x86-64_sles10_4.1/static_pic 9 | #CONCERTLIB=$(ILOGINSTALLDIR)/concert/lib/x86-64_sles10_4.1/static_pic 10 | # Local macbook 11 | CPLEXLIB=$(ILOGINSTALLDIR)/cplex/lib/x86-64_darwin9_gcc4.0/static_pic 12 | CONCERTLIB=$(ILOGINSTALLDIR)/concert/lib/x86-64_darwin9_gcc4.0/static_pic 13 | 14 | INCLUDES = -I/usr/local/include -I$(CPLEXINCLUDE) -I$(CONCERTINCLUDE) -I../include -I/opt/local/include -I../src 15 | 16 | 17 | 18 | #CXXFLAGS = -Wall -Wextra -ansi -Wignored-qualifiers # -pedantic -Wno-unused-parameter -Wno-uninitialized # -Wno-ignored-qualifiers 19 | AM_CXXFLAGS = -std=c++11 -O0 -g -Wall -Wextra -Wformat-zero-length -Wno-unused-parameter -Wno-uninitialized # -pedantic -ansi 20 | #CXXFLAGS = $(AM_CXXFLAGS) 21 | 22 | LDADD = ../src/libdisambiguation.a 23 | AM_LDFLAGS = $(shell cppunit-config --libs) -L$(CPLEXLIB) -lilocplex -lcplex -L$(CONCERTLIB) -lconcert 24 | #AM_CXXFLAGS = $(shell cppunit-config --libs) -L$(CPLEXLIB) -lilocplex -lcplex -L$(CONCERTLIB) -lconcert 25 | 26 | 27 | TESTS = typedefs record blocking attribute stringmanipulator \ 28 | comparators comparesimilarities strcmp95 rarenames engineconfig \ 29 | abbreviation misspell namecompare jwcmp similarity clusterhead cluster engine \ 30 | training ratios fetchrecords assigneecomparison clusterinfo ratiocomponent \ 31 | coauthor qp compare testfake postprocess 32 | 33 | bin_PROGRAMS = $(TESTS) 34 | 35 | %.o: %.cpp 36 | g++ $(AM_CXXFLAGS) $(INCLUDES) -D$*_STANDALONE -c $< 37 | 38 | COMMON = testdata.cpp testutils.cpp 39 | 40 | #fake_SOURCES = fake.cpp $(COMMON) 41 | testfake_SOURCES = testfake.cpp fake.cpp $(COMMON) 42 | ratiocomponent_SOURCES = test_ratiocomponent.cpp fake.cpp $(COMMON) 43 | engine_SOURCES = test_engine.cpp $(COMMON) 44 | typedefs_SOURCES = test_typedefs.cpp 45 | record_SOURCES = test_record.cpp $(COMMON) 46 | cluster_SOURCES = test_cluster.cpp fake.cpp $(COMMON) 47 | clusterinfo_SOURCES = test_clusterinfo.cpp fake.cpp $(COMMON) 48 
| training_SOURCES = test_training.cpp fake.cpp $(COMMON) 49 | clusterhead_SOURCES = test_clusterhead.cpp $(COMMON) 50 | attribute_SOURCES = test_attribute.cpp fake.cpp $(COMMON) 51 | fetchrecords_SOURCES = test_fetch_records.cpp $(COMMON) 52 | assigneecomparison_SOURCES = test_assignee_comparison.cpp fake.cpp $(COMMON) 53 | coauthor_SOURCES = test_coauthor.cpp fake.cpp $(COMMON) 54 | comparators_SOURCES = test_comparators.cpp $(COMMON) 55 | rarenames_SOURCES = test_rarenames.cpp $(COMMON) 56 | jwcmp_SOURCES = test_jwcmp.cpp $(COMMON) 57 | ratios_SOURCES = test_ratios.cpp fake.cpp $(COMMON) 58 | similarity_SOURCES = test_similarity.cpp $(COMMON) 59 | abbreviation_SOURCES = test_abbreviation.cpp 60 | misspell_SOURCES = test_misspell.cpp $(COMMON) 61 | namecompare_SOURCES = test_namecompare.cpp $(COMMON) 62 | engineconfig_SOURCES = test_engineconfig.cpp $(COMMON) 63 | comparesimilarities_SOURCES = test_compare.cpp 64 | 65 | blocking_SOURCES = test_blocking.cpp $(COMMON) fake.cpp 66 | blocking_LDADD = ../src/libdisambiguation.a -L$(CPLEXLIB) -lilocplex -lcplex -L$(CONCERTLIB) -lconcert 67 | 68 | stringmanipulator_SOURCES = test_string_manipulator.cpp $(COMMON) 69 | strcmp95_SOURCES = test_strcmp95.cpp $(COMMON) 70 | qp_SOURCES = test_qp.cpp $(COMMON) 71 | compare_SOURCES = test_compare.cpp $(COMMON) 72 | postprocess_SOURCES = test_postprocess.cpp $(COMMON) 73 | 74 | relink: 75 | rm -rf $(TESTS) 76 | 77 | -------------------------------------------------------------------------------- /test/test_engineconfig.cpp: -------------------------------------------------------------------------------- 1 | 2 | // This needs to be in a library for test code. 
=( 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | typedef std::vector Labels; 17 | 18 | #include "testdata.h" 19 | #include "testutils.h" 20 | 21 | using std::string; 22 | using std::vector; 23 | 24 | std::vector 25 | setup_columns() { 26 | 27 | // Labels, as typedef'ed above. Keep the full type 28 | // definition here to help keep track of what things 29 | // are called in the main code. 30 | std::vector involved_columns; 31 | involved_columns.push_back (std::string("Firstname")); 32 | involved_columns.push_back (std::string("Middlename")); 33 | involved_columns.push_back (std::string("Lastname")); 34 | involved_columns.push_back (std::string("Street")); 35 | involved_columns.push_back (std::string("City")); 36 | involved_columns.push_back (std::string("State")); 37 | involved_columns.push_back (std::string("Country")); 38 | involved_columns.push_back (std::string("Zipcode")); 39 | involved_columns.push_back (std::string("Latitude")); 40 | involved_columns.push_back (std::string("Longitude")); 41 | involved_columns.push_back (std::string("Patent")); 42 | involved_columns.push_back (std::string("ApplyYear")); 43 | involved_columns.push_back (std::string("Assignee")); 44 | involved_columns.push_back (std::string("AsgNum")); 45 | involved_columns.push_back (std::string("Class")); 46 | involved_columns.push_back (std::string("Coauthor")); 47 | involved_columns.push_back (std::string("Unique_Record_ID")); 48 | return involved_columns; 49 | } 50 | 51 | 52 | void 53 | rare_names(std::list all_records) { 54 | 55 | RecordPList rare_firstname_set; 56 | RecordPList rare_lastname_set; 57 | std::vector rare_pointer_vec; 58 | rare_pointer_vec.push_back(&rare_firstname_set); 59 | rare_pointer_vec.push_back(&rare_lastname_set); 60 | list record_pointers; 61 | 62 | list::const_iterator riter = all_records.begin(); 63 | for (; riter != all_records.end(); ++riter) { 64 | 
record_pointers.push_back(&(*riter)); 65 | } 66 | 67 | find_rare_names_v2(rare_pointer_vec, record_pointers); 68 | std::cout << "End of rare_names in test" << std::endl; 69 | } 70 | 71 | 72 | class EngineConfigTest : public CppUnit::TestCase { 73 | 74 | private: 75 | static const std::string LINE; 76 | vector tcn; 77 | vector requested_columns; 78 | vector indices; 79 | 80 | public: 81 | EngineConfigTest(std::string name) : CppUnit::TestCase(name) { 82 | 83 | describe_test(INDENT0, name.c_str()); 84 | } 85 | 86 | ~EngineConfigTest() {} 87 | 88 | void runTest() { 89 | 90 | std::vector involved_columns = setup_columns(); 91 | //printer(involved_columns); 92 | char filename[] = "./fixtures/csv/rjones.csv"; 93 | std::list all_records; 94 | fetch_records_from_txt(all_records, filename, involved_columns); 95 | rare_names(all_records); 96 | } 97 | }; 98 | 99 | 100 | void 101 | test_engineconfig() { 102 | 103 | EngineConfigTest * ect = new EngineConfigTest("Engine configuration unit testing"); 104 | ect->runTest(); 105 | delete ect; 106 | } 107 | 108 | 109 | #ifdef test_engineconfig_STANDALONE 110 | int 111 | main(int UP(argc), char ** UP(argv)) { 112 | 113 | test_engineconfig(); 114 | return 0; 115 | } 116 | #endif 117 | -------------------------------------------------------------------------------- /test/test_string_manipulator.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | #include "testutils.h" 11 | 12 | class StringManipulatorTest : public CppUnit::TestCase { 13 | 14 | public: 15 | StringManipulatorTest(std::string name) : CppUnit::TestCase(name) { 16 | 17 | describe_test(INDENT0, name.c_str()); 18 | } 19 | 20 | // Postpone this until understanding what it does. 
21 | // Mark it as will not fix in Trac 22 | void remain_same() { 23 | StringRemainSame rs(); 24 | } 25 | 26 | void remove_space() { 27 | std::string input("THIS IS AN EXAMPLE "); 28 | std::string target("THISISANEXAMPLE"); 29 | StringRemoveSpace * rc = new StringRemoveSpace(); 30 | std::string result = rc->manipulate(input); 31 | CPPUNIT_ASSERT(result == target); 32 | delete rc; 33 | } 34 | 35 | 36 | // The code calls this operation "Truncate" but 37 | // it's really a substring operation. I thought 38 | // these kinds of operations were in the STL. 39 | void truncate() { 40 | 41 | StringTruncate stobj; 42 | 43 | { 44 | stobj.set_truncater(0, 5, true); 45 | std::string eric = stobj.manipulate ("ERIC"); 46 | CPPUNIT_ASSERT(eric == string("ERIC")); 47 | std::string johnson = stobj.manipulate ("JOHNSON"); 48 | CPPUNIT_ASSERT(johnson == string("JOHNS")); 49 | } 50 | 51 | { 52 | // starting position = 0 (head of the string), 53 | // extraction length = full length, direction = forward. 54 | stobj.set_truncater(0, 0, true); 55 | std::string johnson = stobj.manipulate ("JOHNSON"); 56 | CPPUNIT_ASSERT(johnson == string("JOHNSON")); 57 | } 58 | 59 | { 60 | stobj.set_truncater(-6, 2, true); 61 | std::string johnson = stobj.manipulate("JOHNSON"); 62 | CPPUNIT_ASSERT(johnson == string("OH")); 63 | } 64 | 65 | { 66 | stobj.set_truncater(-5, 2, false); 67 | std::string johnson = stobj.manipulate("JOHNSON"); 68 | CPPUNIT_ASSERT(johnson == string("HO")); 69 | } 70 | 71 | { 72 | stobj.set_truncater(4, 3, false); 73 | std::string johnson = stobj.manipulate("JOHNSON"); 74 | CPPUNIT_ASSERT(johnson == string("SNH")); 75 | } 76 | 77 | { 78 | stobj.set_truncater(4, 0, false); 79 | std::string johnson = stobj.manipulate("JOHNSON"); 80 | CPPUNIT_ASSERT(johnson == string("")); 81 | } 82 | 83 | } 84 | 85 | void collapse_and_truncate() { 86 | 87 | StringNoSpaceTruncate stobj; 88 | 89 | { 90 | stobj.set_truncater(-6, 2, true); 91 | std::string johnson = stobj.manipulate("JOHN SON"); 92 | 
CPPUNIT_ASSERT(johnson == string("OH")); 93 | } 94 | 95 | { 96 | stobj.set_truncater(-6, 2, true); 97 | std::string johnson = stobj.manipulate(" JOHN SON "); 98 | CPPUNIT_ASSERT(johnson == string("OH")); 99 | } 100 | 101 | } 102 | 103 | void extract_initials() { 104 | 105 | ExtractInitials eiobj(3); 106 | std::string result = eiobj.manipulate("THIS IS AN EXAMPLE, YOU KNOW."); 107 | CPPUNIT_ASSERT(result == string("EYK")); 108 | } 109 | 110 | 111 | void extract_first_word() { 112 | 113 | StringExtractFirstWord sefobj; 114 | std::string thomas = sefobj.manipulate("THOMAS DAVID ANDERSON"); 115 | CPPUNIT_ASSERT(thomas == std::string("THOMAS")); 116 | } 117 | 118 | 119 | void runTest() { 120 | remain_same(); 121 | remove_space(); 122 | truncate(); 123 | collapse_and_truncate(); 124 | extract_initials(); 125 | extract_first_word(); 126 | } 127 | }; 128 | 129 | int 130 | main(int, char **) { 131 | 132 | StringManipulatorTest * rt = new StringManipulatorTest(std::string("String manipulator test")); 133 | rt->runTest(); 134 | delete rt; 135 | return 0; 136 | } 137 | -------------------------------------------------------------------------------- /test/testdata/assignee_comparison.csv: -------------------------------------------------------------------------------- 1 | Firstname,Middlename,Lastname,Street,City,State,Country,Zipcode,Latitude,Longitude,Patent,ApplyYear,Assignee,AsgNum,Class,Coauthor,Unique_Record_ID 2 | PHILIP E,PHILIP E,DURAND,,HUDSON,MA,US,1749,42.388756,-71.557437,03858241,1974,UNITED STATES OF AMERICA ARMY,H000000000072,2,L.NORRIS-0,03858241-1 3 | LONNIE H,LONNIE H,NORRIS,,MILFORD,MA,US,1757,42.149786,-71.522396,03858241,1974,UNITED STATES OF AMERICA ARMY,H000000000072,2,P.DURAND-0,03858241-2 4 | ELWYN R,ELWYN R,GOODING,,PINCKNEY,MI,US,48169,42.45908,-83.934013,03858242,1973,,,2,,03858242-1 5 | CLAUDE RAYMOND,CLAUDE RAYMOND,PIERRON,,EPINAL,,FR,,48.183333,6.45,03858243,1973,PLYMATIC,A000010191467,2,J.JENNY-0/R.ZUCCARO-0,03858243-1 6 | JEAN PAUL,JEAN 
PAUL,JENNY,,DECINES,,FR,,45.75,4.966667,03858243,1973,PLYMATIC,A000010191467,2,C.PIERRON-0/R.ZUCCARO-0,03858243-2 7 | ROBERT,ROBERT,ZUCCARO,,EPINAL,,FR,,48.183333,6.45,03858243,1973,PLYMATIC,A000010191467,2,C.PIERRON-0/J.JENNY-0,03858243-3 8 | RICHARD L,RICHARD L,MANN,,WOODSTOCK,CT,US,6281,41.946708,-71.986604,03858244,1973,,,2/69,,03858244-1 9 | MICHAEL A,MICHAEL A,NATE II,,NEW YORK,NY,US,10292,40.708181,-74.003846,03858245,1972,HAIR AGAIN LTD,H000000108362,128/606/623,M.MANN-0,03858245-1 10 | MAURICE A,MAURICE A,MANN,,NEW YORK,NY,US,10292,40.708181,-74.003846,03858245,1972,HAIR AGAIN LTD,H000000108362,128/606/623,M.NATE II-0,03858245-2 11 | SIMCHA,SIMCHA,MILO,,AUSTIN,TX,US,78799,30.267104,-97.744615,03858246,1973,,,623/137,,03858246-1 12 | JACK,JACK,BAUMAN,,LOS ANGELES,CA,US,90230,34.001758,-118.394799,03858247,1973,,,623/606,,03858247-1 13 | SYLVESTER L,SYLVESTER L,CROWE,,FRANKLIN,IN,US,46131,39.488252,-86.054873,03858248,1973,,,623,,03858248-1 14 | DURRELL UNGER,DURRELL UNGER,HOWARD,,SAN ANTONIO,TX,US,78299,29.42543,-98.486926,03858249,1973,,,4,,03858249-1 15 | ANTHONY,ANTHONY,COGLITORE,,STATEN ISLAND,NY,US,10314,40.606393,-74.152402,03858250,1973,,,4,,03858250-1 16 | WALTER J,WALTER J,VOLLRATH,,SHEBOYGAN,WI,US,53083,43.80234,-87.7698,03858251,1973,POLAR WARE COMPANY,H000000064389,4,R.VOLLRATH-0/P.VIRNOCHE-0/J.TURK-0/P.GERDES-0/R.SINGER-0,03858251-1 17 | RICHARD J,RICHARD J,VOLLRATH,,SHEBOYGAN,WI,US,53083,43.80234,-87.7698,03858251,1973,POLAR WARE COMPANY,H000000064389,4,W.VOLLRATH-0/P.VIRNOCHE-0/J.TURK-0/P.GERDES-0/R.SINGER-0,03858251-2 18 | PAUL R,PAUL R,VIRNOCHE,,NEWTON,WI,US,53063,43.969789,-87.794434,03858251,1973,POLAR WARE COMPANY,H000000064389,4,W.VOLLRATH-0/R.VOLLRATH-0/J.TURK-0/P.GERDES-0/R.SINGER-0,03858251-3 19 | JOSEPH,JOSEPH,TURK,,SHEBOYGAN,WI,US,53083,43.80234,-87.7698,03858251,1973,POLAR WARE COMPANY,H000000064389,4,W.VOLLRATH-0/R.VOLLRATH-0/P.VIRNOCHE-0/P.GERDES-0/R.SINGER-0,03858251-4 20 | PAUL E,PAUL 
E,GERDES,,SHEBOYGAN,WI,US,53083,43.80234,-87.7698,03858251,1973,POLAR WARE COMPANY,H000000064389,4,W.VOLLRATH-0/R.VOLLRATH-0/P.VIRNOCHE-0/J.TURK-0/R.SINGER-0,03858251-5 21 | ROY W,ROY W,SINGER,,SHEBOYGAN,WI,US,53083,43.80234,-87.7698,03858251,1973,POLAR WARE COMPANY,H000000064389,4,W.VOLLRATH-0/R.VOLLRATH-0/P.VIRNOCHE-0/J.TURK-0/P.GERDES-0,03858251-6 22 | OLGIERD Z,OLGIERD Z,EJCHORSZT,,LONG BEACH,CA,US,90899,33.767496,-118.187712,03858252,1973,,,4/239,,03858252-1 23 | ADRIAN A,ADRIAN A,LAUZON,,MONTREAL,,CA,,55.4,-104.366667,03858253,1973,,,5/312,,03858253-1 24 | EDMUND S,EDMUND S,COOMES,,BIGGS,CA,US,95917,39.415882,-121.692678,03858254,1973,,,5,,03858254-1 25 | GEORGE T,GEORGE T,PURVES JR,,INDIANAPOLIS,IN,US,46298,39.89393,-86.229942,03858255,1973,,,5,,03858255-1 26 | ANDREW E,ANDREW E,BEER,,NEW YORK,NY,US,10292,40.708181,-74.003846,03858256,1972,,,5/297,,03858256-1 27 | SAMUEL,SAMUEL,YOUNG,,DANBURY,CT,US,6817,41.396437,-73.456032,03858257,1973,,,5,,03858257-1 28 | PETER S,PETER S,STEVENS,,MANHATTAN BEACH,CA,US,90267,33.874322,-118.39546,03858258,1974,,,7,,03858258-1 29 | EDGAR EARL,EDGAR EARL,RENFREW,,LOCK HAVEN,PA,US,17745,41.134686,-77.449701,03858259,1972,AMERICAN ANILINE PRODUCTS INCORPORATED,H000000019680,8,G.GENTA-0,03858259-1 30 | GUIDO RUGGIERO,GUIDO RUGGIERO,GENTA,,SNYDER,NY,US,14226,42.969504,-78.798576,03858259,1972,AMERICAN ANILINE PRODUCTS INCORPORATED,H000000019680,8,E.RENFREW-0,03858259-2 31 | PAUL EDMOND,PAUL EDMOND,HANSER,,MOLINE,IL,US,61266,41.505858,-90.51407,03858260,1973,,,114,,03858260-1 32 | -------------------------------------------------------------------------------- /test/fake.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "fake.h" 14 | #include "testdata.h" 15 | #include "testutils.h" 16 | 17 | 18 | FakeTest::FakeTest(string name, string filename) 19 | : 
CppUnit::TestCase(name), csvfilename(filename) { 20 | 21 | describe_test(INDENT2, name.c_str()); 22 | // Columns requested from the CSV file when load_fake_data() reads it. 23 | requested_columns.push_back(string("Firstname")); 24 | requested_columns.push_back(string("Lastname")); 25 | requested_columns.push_back(string("Middlename")); 26 | requested_columns.push_back(string("Patent")); 27 | requested_columns.push_back(string("Assignee")); 28 | requested_columns.push_back(string("AsgNum")); 29 | requested_columns.push_back(string("ApplyYear")); 30 | requested_columns.push_back(string("Latitude")); 31 | requested_columns.push_back(string("Longitude")); 32 | requested_columns.push_back(string("Street")); 33 | requested_columns.push_back(string("City")); 34 | requested_columns.push_back(string("Country")); 35 | requested_columns.push_back(string("Unique_Record_ID")); 36 | requested_columns.push_back(string("Coauthor")); 37 | requested_columns.push_back(string("Class")); 38 | } 39 | 40 | // Reads the records in csvfilename into source and builds the pointer list, uid index and coauthor blocker derived from them; exits the process when the file cannot be read. 41 | void 42 | FakeTest::load_fake_data(string csvfilename) { 43 | 44 | describe_test(INDENT4, "Loading fake data..."); 45 | 46 | //const char * filename = "testdata/clustertest.csv"; 47 | const char * filename = csvfilename.c_str(); 48 | bool successful = fetch_records_from_txt(source, filename, requested_columns); 49 | 50 | if (not successful) 51 | exit(-1); 52 | 53 | Record * r = &source.front(); 54 | Record::set_sample_record(r); 55 | 56 | create_record_plist(source, record_pointers); 57 | 58 | // IPDict 59 | //map uid_dict; 60 | const string uid_identifier = cUnique_Record_ID::static_get_class_name(); 61 | create_btree_uid2record_pointer(uid_dict, source, uid_identifier); 62 | 63 | bool matching_mode = true; 64 | bool frequency_adjust_mode = false; 65 | bool debug_mode = false; 66 | ClusterInfo match(uid_dict, matching_mode, frequency_adjust_mode, debug_mode); 67 | // The blocker must outlive this function because its address is kept in coauthor_blocking, so it is heap-allocated instead of being a stack local (never freed here - TODO give FakeTest ownership). 68 | const uint32_t num_coauthors_to_group = 2; 69 | cBlocking_Operation_By_Coauthors & blocker_coauthor = *new cBlocking_Operation_By_Coauthors(record_pointers, num_coauthors_to_group); 70 | coauthor_blocking = &blocker_coauthor;
71 | 72 | //cBlocking_Operation_By_Coauthors blocker_coauthor = get_blocker_coathor(); 73 | Cluster::set_reference_patent_tree_pointer(blocker_coauthor.get_patent_tree()); 74 | 75 | list::iterator i = source.begin(); 76 | for (; i != source.end(); ++i) { 77 | rpv.push_back(&(*i)); 78 | } 79 | 80 | vector comparators; 81 | comparators.push_back("Firstname"); 82 | comparators.push_back("Lastname"); 83 | Record::activate_comparators_by_name(comparators); 84 | } 85 | 86 | 87 | RecordIndex * 88 | FakeTest::get_uid_dict() { 89 | return &uid_dict; 90 | } 91 | 92 | 93 | const cBlocking_Operation_By_Coauthors * 94 | FakeTest::get_coauthor_blocking() { 95 | return coauthor_blocking; 96 | } 97 | 98 | 99 | RecordPList 100 | FakeTest::get_recpointers() { 101 | return record_pointers; 102 | } 103 | 104 | 105 | vector 106 | FakeTest::get_recvecs() { 107 | return rpv; 108 | } 109 | 110 | 111 | list 112 | FakeTest::get_all_records() { 113 | return source; 114 | } 115 | 116 | void 117 | test_fake() { 118 | 119 | const string filename("testdata/clustertest.csv"); 120 | FakeTest * ft = new FakeTest(std::string("initial test"), filename); 121 | ft->runTest(); 122 | delete ft; 123 | } 124 | 125 | 126 | #if 0 // not working anymore, need to build libtest.a 127 | #ifdef fake_STANDALONE 128 | int 129 | main(int UP(argc), char ** UP(argv)) { 130 | 131 | test_fake(); 132 | return 0; 133 | } 134 | #endif 135 | #endif 136 | -------------------------------------------------------------------------------- /test/test_cluster.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | //#include 5 | //#include 6 | //#include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "testdata.h" 15 | #include "testutils.h" 16 | #include "fake.h" 17 | 18 | class ClusterTest : public CppUnit::TestCase { 19 | 20 | private: 21 | 22 | FakeTest * ft; 23 | vector rpv; 24 | 25 | 26 | public: 27 | 
ClusterTest(std::string name) : CppUnit::TestCase(name) { 28 | 29 | describe_test(INDENT0, name.c_str()); 30 | 31 | const string filename("testdata/clustertest.csv"); 32 | //const string filename("testdata/assignee_comparison.csv"); 33 | ft = new FakeTest(string("Fake ClusterTest"), filename); 34 | ft->load_fake_data(filename); 35 | rpv = ft->get_recvecs(); 36 | } 37 | 38 | void create_cluster() { 39 | Attribute::register_class_names(get_column_names()); 40 | Record * r = make_quuxalot_record(); 41 | ClusterHead ch(r, 0.9953); 42 | RecordPList rl = get_record_list(); 43 | cBlocking_Operation_By_Coauthors blocker_coauthor = get_blocker_coathor(); 44 | Cluster::set_reference_patent_tree_pointer( blocker_coauthor.get_patent_tree()); 45 | Cluster * c = new Cluster(ch, rl); 46 | delete c; 47 | } 48 | 49 | 50 | void test_find_representatives() { 51 | 52 | static const string useful_columns[] = { 53 | cFirstname::static_get_class_name(), 54 | cMiddlename::static_get_class_name(), 55 | cLastname::static_get_class_name(), 56 | cLatitude::static_get_class_name(), 57 | cAssignee::static_get_class_name(), 58 | cCity::static_get_class_name(), 59 | cCountry::static_get_class_name() 60 | }; 61 | static const uint32_t numcols = sizeof(useful_columns)/sizeof(string); 62 | 63 | vector indice = make_indice(useful_columns,numcols); 64 | 65 | Spec spec; 66 | spec.it("Indexing for find_representatives", [&indice](Description desc)->bool { 67 | vector test = { 0, 2, 1, 7, 4, 10, 11 }; 68 | return (test == indice); 69 | }); 70 | 71 | 72 | const Record * r1 = rpv[1]; 73 | const Record * r2 = rpv[2]; 74 | const Record * r3 = rpv[3]; 75 | 76 | typedef std::pair AttCount; 77 | typedef map AttCounts; 78 | RecordPList m_fellows = { r1, r2, r3 }; 79 | vector trace = make_trace(indice, m_fellows, numcols); 80 | 81 | #if 0 /// Save all this it's really useful 82 | for_each(begin(trace), end(trace), [](AttCounts acs) { 83 | for_each(begin(acs), end(acs), [](AttCount ac) { 84 | const vector & data = 
ac.first->get_data(); 85 | for_each(begin(data), end(data), [](const string * s) { 86 | std::cout << "Att: " << s->c_str() << ", "; 87 | }); 88 | std::cout << "count: " << ac.second << std::endl; 89 | }); 90 | }); 91 | #endif 92 | 93 | vector most = get_most(trace, numcols); 94 | 95 | const Record * mp = get_record_with_most(most, m_fellows, indice, numcols); 96 | 97 | //mp->print(); 98 | 99 | spec.it("Unique record ID for cluster representative should be 06453319-4", [mp](Description desc)->bool { 100 | const string uid = mp->get_unique_record_id(); 101 | return (uid == string("06453319-4")); 102 | }); 103 | 104 | } 105 | 106 | 107 | void runTest() { 108 | test_find_representatives(); 109 | //create_cluster(); 110 | } 111 | }; 112 | 113 | 114 | void 115 | test_clusters() { 116 | 117 | ClusterTest * rt = new ClusterTest(std::string("Cluster unit testing")); 118 | rt->runTest(); 119 | delete rt; 120 | } 121 | 122 | 123 | #ifdef test_cluster_STANDALONE 124 | int 125 | main(int UP(argc), char ** UP(argv)) { 126 | 127 | test_clusters(); 128 | return 0; 129 | } 130 | #endif 131 | -------------------------------------------------------------------------------- /test/data/berkeley/tset05_1.txt: -------------------------------------------------------------------------------- 1 | 07171415-4,07099871-3 2 | 07171415-4,07013303-3 3 | 07171415-4,06961723-2 4 | 07099871-3,07013303-3 5 | 07099871-3,06961723-2 6 | 07013303-3,06961723-2 7 | 07171415-2,07099871-5 8 | 07171415-0,07099871-1 9 | 07171415-0,07013303-1 10 | 07171415-0,06961723-1 11 | 07099871-1,07013303-1 12 | 07099871-1,06961723-1 13 | 07013303-1,06961723-1 14 | 07420980-0,07529217-0 15 | 06915307-3,06453319-4 16 | 06915307-3,06292880-4 17 | 06915307-3,06289358-4 18 | 06915307-3,06209003-4 19 | 06915307-3,06128627-4 20 | 06915307-3,06128623-4 21 | 06453319-4,06292880-4 22 | 06453319-4,06289358-4 23 | 06453319-4,06209003-4 24 | 06453319-4,06128627-4 25 | 06453319-4,06128623-4 26 | 06292880-4,06289358-4 27 | 
06292880-4,06209003-4 28 | 06292880-4,06128627-4 29 | 06292880-4,06128623-4 30 | 06289358-4,06209003-4 31 | 06289358-4,06128627-4 32 | 06289358-4,06128623-4 33 | 06209003-4,06128627-4 34 | 06209003-4,06128623-4 35 | 06128627-4,06128623-4 36 | 06915307-4,06453319-5 37 | 06915307-4,06292880-5 38 | 06915307-4,06289358-5 39 | 06915307-4,06209003-5 40 | 06915307-4,06128627-5 41 | 06915307-4,06128623-5 42 | 06453319-5,06292880-5 43 | 06453319-5,06289358-5 44 | 06453319-5,06209003-5 45 | 06453319-5,06128627-5 46 | 06453319-5,06128623-5 47 | 06292880-5,06289358-5 48 | 06292880-5,06209003-5 49 | 06292880-5,06128627-5 50 | 06292880-5,06128623-5 51 | 06289358-5,06209003-5 52 | 06289358-5,06128627-5 53 | 06289358-5,06128623-5 54 | 06209003-5,06128627-5 55 | 06209003-5,06128623-5 56 | 06128627-5,06128623-5 57 | 06915307-5,06453319-6 58 | 06915307-5,06292880-6 59 | 06915307-5,06289358-6 60 | 06915307-5,06209003-6 61 | 06915307-5,06128627-6 62 | 06915307-5,06128623-6 63 | 06453319-6,06292880-6 64 | 06453319-6,06289358-6 65 | 06453319-6,06209003-6 66 | 06453319-6,06128627-6 67 | 06453319-6,06128623-6 68 | 06292880-6,06289358-6 69 | 06292880-6,06209003-6 70 | 06292880-6,06128627-6 71 | 06292880-6,06128623-6 72 | 06289358-6,06209003-6 73 | 06289358-6,06128627-6 74 | 06289358-6,06128623-6 75 | 06209003-6,06128627-6 76 | 06209003-6,06128623-6 77 | 06128627-6,06128623-6 78 | 06915307-1,06453319-2 79 | 06915307-1,06292880-2 80 | 06915307-1,06289358-2 81 | 06915307-1,06209003-2 82 | 06915307-1,06128627-2 83 | 06915307-1,06128623-2 84 | 06453319-2,06292880-2 85 | 06453319-2,06289358-2 86 | 06453319-2,06209003-2 87 | 06453319-2,06128627-2 88 | 06453319-2,06128623-2 89 | 06292880-2,06289358-2 90 | 06292880-2,06209003-2 91 | 06292880-2,06128627-2 92 | 06292880-2,06128623-2 93 | 06289358-2,06209003-2 94 | 06289358-2,06128627-2 95 | 06289358-2,06128623-2 96 | 06209003-2,06128627-2 97 | 06209003-2,06128623-2 98 | 06128627-2,06128623-2 99 | 06915307-2,06453319-3 100 | 06915307-2,06292880-3 101 | 
06915307-2,06289358-3 102 | 06915307-2,06209003-3 103 | 06915307-2,06128627-3 104 | 06915307-2,06128623-3 105 | 06453319-3,06292880-3 106 | 06453319-3,06289358-3 107 | 06453319-3,06209003-3 108 | 06453319-3,06128627-3 109 | 06453319-3,06128623-3 110 | 06292880-3,06289358-3 111 | 06292880-3,06209003-3 112 | 06292880-3,06128627-3 113 | 06292880-3,06128623-3 114 | 06289358-3,06209003-3 115 | 06289358-3,06128627-3 116 | 06289358-3,06128623-3 117 | 06209003-3,06128627-3 118 | 06209003-3,06128623-3 119 | 06128627-3,06128623-3 120 | 06915307-0,06453319-1 121 | 06915307-0,06292880-1 122 | 06915307-0,06289358-1 123 | 06915307-0,06209003-1 124 | 06915307-0,06128627-1 125 | 06915307-0,06128623-1 126 | 06453319-1,06292880-1 127 | 06453319-1,06289358-1 128 | 06453319-1,06209003-1 129 | 06453319-1,06128627-1 130 | 06453319-1,06128623-1 131 | 06292880-1,06289358-1 132 | 06292880-1,06209003-1 133 | 06292880-1,06128627-1 134 | 06292880-1,06128623-1 135 | 06289358-1,06209003-1 136 | 06289358-1,06128627-1 137 | 06289358-1,06128623-1 138 | 06209003-1,06128627-1 139 | 06209003-1,06128623-1 140 | 06128627-1,06128623-1 141 | 07171415-5,07099871-4 142 | 07171415-5,07013303-4 143 | 07171415-5,06961723-3 144 | 07099871-4,07013303-4 145 | 07099871-4,06961723-3 146 | 07013303-4,06961723-3 147 | 07171415-1,07099871-0 148 | 07171415-1,07013303-0 149 | 07171415-1,06961723-0 150 | 07099871-0,07013303-0 151 | 07099871-0,06961723-0 152 | 07013303-0,06961723-0 153 | -------------------------------------------------------------------------------- /test/testdata.cpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include // typedef RecordList 4 | #include 5 | 6 | vector 7 | get_column_names() { 8 | 9 | vector cn; 10 | cn.push_back("Firstname"); 11 | cn.push_back("Middlename"); 12 | cn.push_back("Lastname"); 13 | cn.push_back("Latitude"); 14 | cn.push_back("Assignee"); 15 | cn.push_back("City"); 16 | cn.push_back("Country"); 17 | cn.push_back("Patent"); 18 
| cn.push_back("ApplyYear"); 19 | cn.push_back("AsgNum"); 20 | return cn; 21 | } 22 | 23 | 24 | Record * make_foobar_record() { 25 | 26 | cFirstname * firstname = new cFirstname("Foo"); 27 | cMiddlename * middlename = new cMiddlename("X"); 28 | cLastname * lastname = new cLastname("Bar"); 29 | cLatitude * latitude = new cLatitude("42.00"); 30 | cAssignee * assignee = new cAssignee("Gonesilent"); 31 | cCity * city = new cCity("Burlingame"); 32 | cCountry * country = new cCountry("US"); 33 | cPatent * patent = new cPatent("07100123"); 34 | cApplyYear * applyyear = new cApplyYear("2008"); 35 | cAsgNum * asgnum = new cAsgNum("H000000064389"); 36 | 37 | vector temp_vec_attrib; 38 | 39 | temp_vec_attrib.push_back(firstname); 40 | temp_vec_attrib.push_back(middlename); 41 | temp_vec_attrib.push_back(lastname); 42 | temp_vec_attrib.push_back(latitude); 43 | temp_vec_attrib.push_back(assignee); 44 | temp_vec_attrib.push_back(city); 45 | temp_vec_attrib.push_back(country); 46 | temp_vec_attrib.push_back(patent); 47 | temp_vec_attrib.push_back(applyyear); 48 | temp_vec_attrib.push_back(asgnum); 49 | 50 | Record * r = new Record(temp_vec_attrib); 51 | r->set_column_names(get_column_names()); 52 | return r; 53 | } 54 | 55 | 56 | Record * make_quuxalot_record() { 57 | 58 | cFirstname * firstname = new cFirstname("Quux"); 59 | cMiddlename * middlename = new cMiddlename("A"); 60 | cLastname * lastname = new cLastname("Lot"); 61 | cLatitude * latitude = new cLatitude("42.00"); 62 | cAssignee * assignee = new cAssignee("Gonesilent"); 63 | cCity * city = new cCity("Burlingame"); 64 | cCountry * country = new cCountry("US"); 65 | cPatent * patent = new cPatent("07100124"); 66 | cApplyYear * applyyear = new cApplyYear("2008"); 67 | cAsgNum * asgnum = new cAsgNum("H000000064389"); 68 | 69 | vector temp_vec_attrib; 70 | 71 | temp_vec_attrib.push_back(firstname); 72 | temp_vec_attrib.push_back(middlename); 73 | temp_vec_attrib.push_back(lastname); 74 | temp_vec_attrib.push_back(latitude); 
75 | temp_vec_attrib.push_back(assignee); 76 | temp_vec_attrib.push_back(city); 77 | temp_vec_attrib.push_back(country); 78 | temp_vec_attrib.push_back(patent); 79 | temp_vec_attrib.push_back(applyyear); 80 | temp_vec_attrib.push_back(asgnum); 81 | 82 | Record * r = new Record(temp_vec_attrib); 83 | r->set_column_names(get_column_names()); 84 | return r; 85 | } 86 | 87 | 88 | RecordPList get_record_list() { 89 | Record * r1 = make_foobar_record(); 90 | Record * r2 = make_quuxalot_record(); 91 | RecordPList rl; 92 | rl.push_back(r1); 93 | rl.push_back(r2); 94 | #if 0 95 | vector colnames = get_column_names(); 96 | //std::cout << "column names size: " << colnames.size() << std::endl; 97 | Attribute::register_class_names(colnames); 98 | //Attribute ** pa = instantiate_attributes(Record::column_names, colnames.size()); 99 | Attribute ** pa = instantiate_attributes(colnames, colnames.size()); 100 | std::cout << "Here\n"; 101 | //check_interactive_consistency(pa, colnames.size(), colnames); 102 | r1->reconfigure_record_for_interactives(); 103 | r2->reconfigure_record_for_interactives(); 104 | #endif 105 | return rl; 106 | } 107 | 108 | 109 | cBlocking_Operation_By_Coauthors get_blocker_coathor() { 110 | 111 | int num_coauthors_to_group = 4; 112 | RecordPList all_rec_pointers; 113 | cBlocking_Operation_By_Coauthors blocker_coauthor(all_rec_pointers, num_coauthors_to_group); 114 | return blocker_coauthor; 115 | } 116 | 117 | 118 | -------------------------------------------------------------------------------- /test/data/berkeley/tset05_2.txt: -------------------------------------------------------------------------------- 1 | 07171415-4,07099871-3 2 | 07171415-4,07013303-3 3 | 07171415-4,06961723-2 4 | 07099871-3,07013303-3 5 | 07099871-3,06961723-2 6 | 07013303-3,06961723-2 7 | 07171415-2,07099871-5 8 | 07171415-0,07099871-1 9 | 07171415-0,07013303-1 10 | 07171415-0,06961723-1 11 | 07099871-1,07013303-1 12 | 07099871-1,06961723-1 13 | 07013303-1,06961723-1 14 | 
05726480-1,07420980-0 15 | 05726480-1,07529217-0 16 | 07420980-0,07529217-0 17 | 06915307-3,06453319-4 18 | 06915307-3,06292880-4 19 | 06915307-3,06289358-4 20 | 06915307-3,06209003-4 21 | 06915307-3,06128627-4 22 | 06915307-3,06128623-4 23 | 06453319-4,06292880-4 24 | 06453319-4,06289358-4 25 | 06453319-4,06209003-4 26 | 06453319-4,06128627-4 27 | 06453319-4,06128623-4 28 | 06292880-4,06289358-4 29 | 06292880-4,06209003-4 30 | 06292880-4,06128627-4 31 | 06292880-4,06128623-4 32 | 06289358-4,06209003-4 33 | 06289358-4,06128627-4 34 | 06289358-4,06128623-4 35 | 06209003-4,06128627-4 36 | 06209003-4,06128623-4 37 | 06128627-4,06128623-4 38 | 06915307-4,06453319-5 39 | 06915307-4,06292880-5 40 | 06915307-4,06289358-5 41 | 06915307-4,06209003-5 42 | 06915307-4,06128627-5 43 | 06915307-4,06128623-5 44 | 06453319-5,06292880-5 45 | 06453319-5,06289358-5 46 | 06453319-5,06209003-5 47 | 06453319-5,06128627-5 48 | 06453319-5,06128623-5 49 | 06292880-5,06289358-5 50 | 06292880-5,06209003-5 51 | 06292880-5,06128627-5 52 | 06292880-5,06128623-5 53 | 06289358-5,06209003-5 54 | 06289358-5,06128627-5 55 | 06289358-5,06128623-5 56 | 06209003-5,06128627-5 57 | 06209003-5,06128623-5 58 | 06128627-5,06128623-5 59 | 06915307-5,06453319-6 60 | 06915307-5,06292880-6 61 | 06915307-5,06289358-6 62 | 06915307-5,06209003-6 63 | 06915307-5,06128627-6 64 | 06915307-5,06128623-6 65 | 06453319-6,06292880-6 66 | 06453319-6,06289358-6 67 | 06453319-6,06209003-6 68 | 06453319-6,06128627-6 69 | 06453319-6,06128623-6 70 | 06292880-6,06289358-6 71 | 06292880-6,06209003-6 72 | 06292880-6,06128627-6 73 | 06292880-6,06128623-6 74 | 06289358-6,06209003-6 75 | 06289358-6,06128627-6 76 | 06289358-6,06128623-6 77 | 06209003-6,06128627-6 78 | 06209003-6,06128623-6 79 | 06128627-6,06128623-6 80 | 06915307-1,06453319-2 81 | 06915307-1,06292880-2 82 | 06915307-1,06289358-2 83 | 06915307-1,06209003-2 84 | 06915307-1,06128627-2 85 | 06915307-1,06128623-2 86 | 06453319-2,06292880-2 87 | 06453319-2,06289358-2 88 | 
06453319-2,06209003-2 89 | 06453319-2,06128627-2 90 | 06453319-2,06128623-2 91 | 06292880-2,06289358-2 92 | 06292880-2,06209003-2 93 | 06292880-2,06128627-2 94 | 06292880-2,06128623-2 95 | 06289358-2,06209003-2 96 | 06289358-2,06128627-2 97 | 06289358-2,06128623-2 98 | 06209003-2,06128627-2 99 | 06209003-2,06128623-2 100 | 06128627-2,06128623-2 101 | 06915307-2,06453319-3 102 | 06915307-2,06292880-3 103 | 06915307-2,06289358-3 104 | 06915307-2,06209003-3 105 | 06915307-2,06128627-3 106 | 06915307-2,06128623-3 107 | 06453319-3,06292880-3 108 | 06453319-3,06289358-3 109 | 06453319-3,06209003-3 110 | 06453319-3,06128627-3 111 | 06453319-3,06128623-3 112 | 06292880-3,06289358-3 113 | 06292880-3,06209003-3 114 | 06292880-3,06128627-3 115 | 06292880-3,06128623-3 116 | 06289358-3,06209003-3 117 | 06289358-3,06128627-3 118 | 06289358-3,06128623-3 119 | 06209003-3,06128627-3 120 | 06209003-3,06128623-3 121 | 06128627-3,06128623-3 122 | 06915307-0,06453319-1 123 | 06915307-0,06292880-1 124 | 06915307-0,06289358-1 125 | 06915307-0,06209003-1 126 | 06915307-0,06128627-1 127 | 06915307-0,06128623-1 128 | 06453319-1,06292880-1 129 | 06453319-1,06289358-1 130 | 06453319-1,06209003-1 131 | 06453319-1,06128627-1 132 | 06453319-1,06128623-1 133 | 06292880-1,06289358-1 134 | 06292880-1,06209003-1 135 | 06292880-1,06128627-1 136 | 06292880-1,06128623-1 137 | 06289358-1,06209003-1 138 | 06289358-1,06128627-1 139 | 06289358-1,06128623-1 140 | 06209003-1,06128627-1 141 | 06209003-1,06128623-1 142 | 06128627-1,06128623-1 143 | 07171415-5,07099871-4 144 | 07171415-5,07013303-4 145 | 07171415-5,06961723-3 146 | 07099871-4,07013303-4 147 | 07099871-4,06961723-3 148 | 07013303-4,06961723-3 149 | 07171415-1,07099871-0 150 | 07171415-1,07013303-0 151 | 07171415-1,06961723-0 152 | 07099871-0,07013303-0 153 | 07099871-0,06961723-0 154 | 07013303-0,06961723-0 155 | -------------------------------------------------------------------------------- /test/data/berkeley/tset05_3.txt: 
-------------------------------------------------------------------------------- 1 | 07171415-4,07099871-3 2 | 07171415-4,07013303-3 3 | 07171415-4,06961723-2 4 | 07099871-3,07013303-3 5 | 07099871-3,06961723-2 6 | 07013303-3,06961723-2 7 | 07171415-2,07099871-5 8 | 07171415-0,07099871-1 9 | 07171415-0,07013303-1 10 | 07171415-0,06961723-1 11 | 07099871-1,07013303-1 12 | 07099871-1,06961723-1 13 | 07013303-1,06961723-1 14 | 05726480-1,07420980-0 15 | 05726480-1,07529217-0 16 | 07420980-0,07529217-0 17 | 06915307-3,06453319-4 18 | 06915307-3,06292880-4 19 | 06915307-3,06289358-4 20 | 06915307-3,06209003-4 21 | 06915307-3,06128627-4 22 | 06915307-3,06128623-4 23 | 06453319-4,06292880-4 24 | 06453319-4,06289358-4 25 | 06453319-4,06209003-4 26 | 06453319-4,06128627-4 27 | 06453319-4,06128623-4 28 | 06292880-4,06289358-4 29 | 06292880-4,06209003-4 30 | 06292880-4,06128627-4 31 | 06292880-4,06128623-4 32 | 06289358-4,06209003-4 33 | 06289358-4,06128627-4 34 | 06289358-4,06128623-4 35 | 06209003-4,06128627-4 36 | 06209003-4,06128623-4 37 | 06128627-4,06128623-4 38 | 06915307-4,06453319-5 39 | 06915307-4,06292880-5 40 | 06915307-4,06289358-5 41 | 06915307-4,06209003-5 42 | 06915307-4,06128627-5 43 | 06915307-4,06128623-5 44 | 06453319-5,06292880-5 45 | 06453319-5,06289358-5 46 | 06453319-5,06209003-5 47 | 06453319-5,06128627-5 48 | 06453319-5,06128623-5 49 | 06292880-5,06289358-5 50 | 06292880-5,06209003-5 51 | 06292880-5,06128627-5 52 | 06292880-5,06128623-5 53 | 06289358-5,06209003-5 54 | 06289358-5,06128627-5 55 | 06289358-5,06128623-5 56 | 06209003-5,06128627-5 57 | 06209003-5,06128623-5 58 | 06128627-5,06128623-5 59 | 06915307-5,06453319-6 60 | 06915307-5,06292880-6 61 | 06915307-5,06289358-6 62 | 06915307-5,06209003-6 63 | 06915307-5,06128627-6 64 | 06915307-5,06128623-6 65 | 06453319-6,06292880-6 66 | 06453319-6,06289358-6 67 | 06453319-6,06209003-6 68 | 06453319-6,06128627-6 69 | 06453319-6,06128623-6 70 | 06292880-6,06289358-6 71 | 06292880-6,06209003-6 72 | 
06292880-6,06128627-6 73 | 06292880-6,06128623-6 74 | 06289358-6,06209003-6 75 | 06289358-6,06128627-6 76 | 06289358-6,06128623-6 77 | 06209003-6,06128627-6 78 | 06209003-6,06128623-6 79 | 06128627-6,06128623-6 80 | 06915307-1,06453319-2 81 | 06915307-1,06292880-2 82 | 06915307-1,06289358-2 83 | 06915307-1,06209003-2 84 | 06915307-1,06128627-2 85 | 06915307-1,06128623-2 86 | 06453319-2,06292880-2 87 | 06453319-2,06289358-2 88 | 06453319-2,06209003-2 89 | 06453319-2,06128627-2 90 | 06453319-2,06128623-2 91 | 06292880-2,06289358-2 92 | 06292880-2,06209003-2 93 | 06292880-2,06128627-2 94 | 06292880-2,06128623-2 95 | 06289358-2,06209003-2 96 | 06289358-2,06128627-2 97 | 06289358-2,06128623-2 98 | 06209003-2,06128627-2 99 | 06209003-2,06128623-2 100 | 06128627-2,06128623-2 101 | 06915307-2,06453319-3 102 | 06915307-2,06292880-3 103 | 06915307-2,06289358-3 104 | 06915307-2,06209003-3 105 | 06915307-2,06128627-3 106 | 06915307-2,06128623-3 107 | 06453319-3,06292880-3 108 | 06453319-3,06289358-3 109 | 06453319-3,06209003-3 110 | 06453319-3,06128627-3 111 | 06453319-3,06128623-3 112 | 06292880-3,06289358-3 113 | 06292880-3,06209003-3 114 | 06292880-3,06128627-3 115 | 06292880-3,06128623-3 116 | 06289358-3,06209003-3 117 | 06289358-3,06128627-3 118 | 06289358-3,06128623-3 119 | 06209003-3,06128627-3 120 | 06209003-3,06128623-3 121 | 06128627-3,06128623-3 122 | 06915307-0,06453319-1 123 | 06915307-0,06292880-1 124 | 06915307-0,06289358-1 125 | 06915307-0,06209003-1 126 | 06915307-0,06128627-1 127 | 06915307-0,06128623-1 128 | 06453319-1,06292880-1 129 | 06453319-1,06289358-1 130 | 06453319-1,06209003-1 131 | 06453319-1,06128627-1 132 | 06453319-1,06128623-1 133 | 06292880-1,06289358-1 134 | 06292880-1,06209003-1 135 | 06292880-1,06128627-1 136 | 06292880-1,06128623-1 137 | 06289358-1,06209003-1 138 | 06289358-1,06128627-1 139 | 06289358-1,06128623-1 140 | 06209003-1,06128627-1 141 | 06209003-1,06128623-1 142 | 06128627-1,06128623-1 143 | 07171415-5,07099871-4 144 | 
07171415-5,07013303-4 145 | 07171415-5,06961723-3 146 | 07099871-4,07013303-4 147 | 07099871-4,06961723-3 148 | 07013303-4,06961723-3 149 | 07171415-1,07099871-0 150 | 07171415-1,07013303-0 151 | 07171415-1,06961723-0 152 | 07099871-0,07013303-0 153 | 07099871-0,06961723-0 154 | 07013303-0,06961723-0 155 | -------------------------------------------------------------------------------- /test/data/berkeley/tset05_4.txt: -------------------------------------------------------------------------------- 1 | 07171415-4,07099871-3 2 | 07171415-4,07013303-3 3 | 07171415-4,06961723-2 4 | 07099871-3,07013303-3 5 | 07099871-3,06961723-2 6 | 07013303-3,06961723-2 7 | 07171415-2,07099871-5 8 | 07171415-0,07099871-1 9 | 07171415-0,07013303-1 10 | 07171415-0,06961723-1 11 | 07099871-1,07013303-1 12 | 07099871-1,06961723-1 13 | 07013303-1,06961723-1 14 | 05726480-1,07420980-0 15 | 05726480-1,07529217-0 16 | 07420980-0,07529217-0 17 | 06915307-3,06453319-4 18 | 06915307-3,06292880-4 19 | 06915307-3,06289358-4 20 | 06915307-3,06209003-4 21 | 06915307-3,06128627-4 22 | 06915307-3,06128623-4 23 | 06453319-4,06292880-4 24 | 06453319-4,06289358-4 25 | 06453319-4,06209003-4 26 | 06453319-4,06128627-4 27 | 06453319-4,06128623-4 28 | 06292880-4,06289358-4 29 | 06292880-4,06209003-4 30 | 06292880-4,06128627-4 31 | 06292880-4,06128623-4 32 | 06289358-4,06209003-4 33 | 06289358-4,06128627-4 34 | 06289358-4,06128623-4 35 | 06209003-4,06128627-4 36 | 06209003-4,06128623-4 37 | 06128627-4,06128623-4 38 | 06915307-4,06453319-5 39 | 06915307-4,06292880-5 40 | 06915307-4,06289358-5 41 | 06915307-4,06209003-5 42 | 06915307-4,06128627-5 43 | 06915307-4,06128623-5 44 | 06453319-5,06292880-5 45 | 06453319-5,06289358-5 46 | 06453319-5,06209003-5 47 | 06453319-5,06128627-5 48 | 06453319-5,06128623-5 49 | 06292880-5,06289358-5 50 | 06292880-5,06209003-5 51 | 06292880-5,06128627-5 52 | 06292880-5,06128623-5 53 | 06289358-5,06209003-5 54 | 06289358-5,06128627-5 55 | 06289358-5,06128623-5 56 | 
06209003-5,06128627-5 57 | 06209003-5,06128623-5 58 | 06128627-5,06128623-5 59 | 06915307-5,06453319-6 60 | 06915307-5,06292880-6 61 | 06915307-5,06289358-6 62 | 06915307-5,06209003-6 63 | 06915307-5,06128627-6 64 | 06915307-5,06128623-6 65 | 06453319-6,06292880-6 66 | 06453319-6,06289358-6 67 | 06453319-6,06209003-6 68 | 06453319-6,06128627-6 69 | 06453319-6,06128623-6 70 | 06292880-6,06289358-6 71 | 06292880-6,06209003-6 72 | 06292880-6,06128627-6 73 | 06292880-6,06128623-6 74 | 06289358-6,06209003-6 75 | 06289358-6,06128627-6 76 | 06289358-6,06128623-6 77 | 06209003-6,06128627-6 78 | 06209003-6,06128623-6 79 | 06128627-6,06128623-6 80 | 06915307-1,06453319-2 81 | 06915307-1,06292880-2 82 | 06915307-1,06289358-2 83 | 06915307-1,06209003-2 84 | 06915307-1,06128627-2 85 | 06915307-1,06128623-2 86 | 06453319-2,06292880-2 87 | 06453319-2,06289358-2 88 | 06453319-2,06209003-2 89 | 06453319-2,06128627-2 90 | 06453319-2,06128623-2 91 | 06292880-2,06289358-2 92 | 06292880-2,06209003-2 93 | 06292880-2,06128627-2 94 | 06292880-2,06128623-2 95 | 06289358-2,06209003-2 96 | 06289358-2,06128627-2 97 | 06289358-2,06128623-2 98 | 06209003-2,06128627-2 99 | 06209003-2,06128623-2 100 | 06128627-2,06128623-2 101 | 06915307-2,06453319-3 102 | 06915307-2,06292880-3 103 | 06915307-2,06289358-3 104 | 06915307-2,06209003-3 105 | 06915307-2,06128627-3 106 | 06915307-2,06128623-3 107 | 06453319-3,06292880-3 108 | 06453319-3,06289358-3 109 | 06453319-3,06209003-3 110 | 06453319-3,06128627-3 111 | 06453319-3,06128623-3 112 | 06292880-3,06289358-3 113 | 06292880-3,06209003-3 114 | 06292880-3,06128627-3 115 | 06292880-3,06128623-3 116 | 06289358-3,06209003-3 117 | 06289358-3,06128627-3 118 | 06289358-3,06128623-3 119 | 06209003-3,06128627-3 120 | 06209003-3,06128623-3 121 | 06128627-3,06128623-3 122 | 06915307-0,06453319-1 123 | 06915307-0,06292880-1 124 | 06915307-0,06289358-1 125 | 06915307-0,06209003-1 126 | 06915307-0,06128627-1 127 | 06915307-0,06128623-1 128 | 06453319-1,06292880-1 129 
| 06453319-1,06289358-1 130 | 06453319-1,06209003-1 131 | 06453319-1,06128627-1 132 | 06453319-1,06128623-1 133 | 06292880-1,06289358-1 134 | 06292880-1,06209003-1 135 | 06292880-1,06128627-1 136 | 06292880-1,06128623-1 137 | 06289358-1,06209003-1 138 | 06289358-1,06128627-1 139 | 06289358-1,06128623-1 140 | 06209003-1,06128627-1 141 | 06209003-1,06128623-1 142 | 06128627-1,06128623-1 143 | 07171415-5,07099871-4 144 | 07171415-5,07013303-4 145 | 07171415-5,06961723-3 146 | 07099871-4,07013303-4 147 | 07099871-4,06961723-3 148 | 07013303-4,06961723-3 149 | 07171415-1,07099871-0 150 | 07171415-1,07013303-0 151 | 07171415-1,06961723-0 152 | 07099871-0,07013303-0 153 | 07099871-0,06961723-0 154 | 07013303-0,06961723-0 155 | -------------------------------------------------------------------------------- /test/data/berkeley/tset05_5.txt: -------------------------------------------------------------------------------- 1 | 07171415-2,07099871-5 2 | 07171415-4,07099871-3 3 | 07171415-4,07013303-3 4 | 07171415-4,06961723-2 5 | 07099871-3,07013303-3 6 | 07099871-3,06961723-2 7 | 07013303-3,06961723-2 8 | 07171415-0,07099871-1 9 | 07171415-0,07013303-1 10 | 07171415-0,06961723-1 11 | 07099871-1,07013303-1 12 | 07099871-1,06961723-1 13 | 07013303-1,06961723-1 14 | 05726480-1,07420980-0 15 | 05726480-1,07529217-0 16 | 07420980-0,07529217-0 17 | 06915307-3,06453319-4 18 | 06915307-3,06292880-4 19 | 06915307-3,06289358-4 20 | 06915307-3,06209003-4 21 | 06915307-3,06128627-4 22 | 06915307-3,06128623-4 23 | 06453319-4,06292880-4 24 | 06453319-4,06289358-4 25 | 06453319-4,06209003-4 26 | 06453319-4,06128627-4 27 | 06453319-4,06128623-4 28 | 06292880-4,06289358-4 29 | 06292880-4,06209003-4 30 | 06292880-4,06128627-4 31 | 06292880-4,06128623-4 32 | 06289358-4,06209003-4 33 | 06289358-4,06128627-4 34 | 06289358-4,06128623-4 35 | 06209003-4,06128627-4 36 | 06209003-4,06128623-4 37 | 06128627-4,06128623-4 38 | 06915307-4,06453319-5 39 | 06915307-4,06292880-5 40 | 06915307-4,06289358-5 41 
| 06915307-4,06209003-5 42 | 06915307-4,06128627-5 43 | 06915307-4,06128623-5 44 | 06453319-5,06292880-5 45 | 06453319-5,06289358-5 46 | 06453319-5,06209003-5 47 | 06453319-5,06128627-5 48 | 06453319-5,06128623-5 49 | 06292880-5,06289358-5 50 | 06292880-5,06209003-5 51 | 06292880-5,06128627-5 52 | 06292880-5,06128623-5 53 | 06289358-5,06209003-5 54 | 06289358-5,06128627-5 55 | 06289358-5,06128623-5 56 | 06209003-5,06128627-5 57 | 06209003-5,06128623-5 58 | 06128627-5,06128623-5 59 | 06915307-5,06453319-6 60 | 06915307-5,06292880-6 61 | 06915307-5,06289358-6 62 | 06915307-5,06209003-6 63 | 06915307-5,06128627-6 64 | 06915307-5,06128623-6 65 | 06453319-6,06292880-6 66 | 06453319-6,06289358-6 67 | 06453319-6,06209003-6 68 | 06453319-6,06128627-6 69 | 06453319-6,06128623-6 70 | 06292880-6,06289358-6 71 | 06292880-6,06209003-6 72 | 06292880-6,06128627-6 73 | 06292880-6,06128623-6 74 | 06289358-6,06209003-6 75 | 06289358-6,06128627-6 76 | 06289358-6,06128623-6 77 | 06209003-6,06128627-6 78 | 06209003-6,06128623-6 79 | 06128627-6,06128623-6 80 | 06915307-1,06453319-2 81 | 06915307-1,06292880-2 82 | 06915307-1,06289358-2 83 | 06915307-1,06209003-2 84 | 06915307-1,06128627-2 85 | 06915307-1,06128623-2 86 | 06453319-2,06292880-2 87 | 06453319-2,06289358-2 88 | 06453319-2,06209003-2 89 | 06453319-2,06128627-2 90 | 06453319-2,06128623-2 91 | 06292880-2,06289358-2 92 | 06292880-2,06209003-2 93 | 06292880-2,06128627-2 94 | 06292880-2,06128623-2 95 | 06289358-2,06209003-2 96 | 06289358-2,06128627-2 97 | 06289358-2,06128623-2 98 | 06209003-2,06128627-2 99 | 06209003-2,06128623-2 100 | 06128627-2,06128623-2 101 | 06915307-2,06453319-3 102 | 06915307-2,06292880-3 103 | 06915307-2,06289358-3 104 | 06915307-2,06209003-3 105 | 06915307-2,06128627-3 106 | 06915307-2,06128623-3 107 | 06453319-3,06292880-3 108 | 06453319-3,06289358-3 109 | 06453319-3,06209003-3 110 | 06453319-3,06128627-3 111 | 06453319-3,06128623-3 112 | 06292880-3,06289358-3 113 | 06292880-3,06209003-3 114 | 
06292880-3,06128627-3 115 | 06292880-3,06128623-3 116 | 06289358-3,06209003-3 117 | 06289358-3,06128627-3 118 | 06289358-3,06128623-3 119 | 06209003-3,06128627-3 120 | 06209003-3,06128623-3 121 | 06128627-3,06128623-3 122 | 06915307-0,06453319-1 123 | 06915307-0,06292880-1 124 | 06915307-0,06289358-1 125 | 06915307-0,06209003-1 126 | 06915307-0,06128627-1 127 | 06915307-0,06128623-1 128 | 06453319-1,06292880-1 129 | 06453319-1,06289358-1 130 | 06453319-1,06209003-1 131 | 06453319-1,06128627-1 132 | 06453319-1,06128623-1 133 | 06292880-1,06289358-1 134 | 06292880-1,06209003-1 135 | 06292880-1,06128627-1 136 | 06292880-1,06128623-1 137 | 06289358-1,06209003-1 138 | 06289358-1,06128627-1 139 | 06289358-1,06128623-1 140 | 06209003-1,06128627-1 141 | 06209003-1,06128623-1 142 | 06128627-1,06128623-1 143 | 07171415-5,07099871-4 144 | 07171415-5,07013303-4 145 | 07171415-5,06961723-3 146 | 07099871-4,07013303-4 147 | 07099871-4,06961723-3 148 | 07013303-4,06961723-3 149 | 07171415-1,07099871-0 150 | 07171415-1,07013303-0 151 | 07171415-1,06961723-0 152 | 07099871-0,07013303-0 153 | 07099871-0,06961723-0 154 | 07013303-0,06961723-0 155 | -------------------------------------------------------------------------------- /test/data/berkeley/tset05_6.txt: -------------------------------------------------------------------------------- 1 | 07171415-2,07099871-5 2 | 07171415-4,07099871-3 3 | 07171415-4,07013303-3 4 | 07171415-4,06961723-2 5 | 07099871-3,07013303-3 6 | 07099871-3,06961723-2 7 | 07013303-3,06961723-2 8 | 07171415-0,07099871-1 9 | 07171415-0,07013303-1 10 | 07171415-0,06961723-1 11 | 07099871-1,07013303-1 12 | 07099871-1,06961723-1 13 | 07013303-1,06961723-1 14 | 05726480-1,07420980-0 15 | 05726480-1,07529217-0 16 | 07420980-0,07529217-0 17 | 06915307-3,06453319-4 18 | 06915307-3,06292880-4 19 | 06915307-3,06289358-4 20 | 06915307-3,06209003-4 21 | 06915307-3,06128627-4 22 | 06915307-3,06128623-4 23 | 06453319-4,06292880-4 24 | 06453319-4,06289358-4 25 | 
06453319-4,06209003-4 26 | 06453319-4,06128627-4 27 | 06453319-4,06128623-4 28 | 06292880-4,06289358-4 29 | 06292880-4,06209003-4 30 | 06292880-4,06128627-4 31 | 06292880-4,06128623-4 32 | 06289358-4,06209003-4 33 | 06289358-4,06128627-4 34 | 06289358-4,06128623-4 35 | 06209003-4,06128627-4 36 | 06209003-4,06128623-4 37 | 06128627-4,06128623-4 38 | 06915307-4,06453319-5 39 | 06915307-4,06292880-5 40 | 06915307-4,06289358-5 41 | 06915307-4,06209003-5 42 | 06915307-4,06128627-5 43 | 06915307-4,06128623-5 44 | 06453319-5,06292880-5 45 | 06453319-5,06289358-5 46 | 06453319-5,06209003-5 47 | 06453319-5,06128627-5 48 | 06453319-5,06128623-5 49 | 06292880-5,06289358-5 50 | 06292880-5,06209003-5 51 | 06292880-5,06128627-5 52 | 06292880-5,06128623-5 53 | 06289358-5,06209003-5 54 | 06289358-5,06128627-5 55 | 06289358-5,06128623-5 56 | 06209003-5,06128627-5 57 | 06209003-5,06128623-5 58 | 06128627-5,06128623-5 59 | 06915307-5,06453319-6 60 | 06915307-5,06292880-6 61 | 06915307-5,06289358-6 62 | 06915307-5,06209003-6 63 | 06915307-5,06128627-6 64 | 06915307-5,06128623-6 65 | 06453319-6,06292880-6 66 | 06453319-6,06289358-6 67 | 06453319-6,06209003-6 68 | 06453319-6,06128627-6 69 | 06453319-6,06128623-6 70 | 06292880-6,06289358-6 71 | 06292880-6,06209003-6 72 | 06292880-6,06128627-6 73 | 06292880-6,06128623-6 74 | 06289358-6,06209003-6 75 | 06289358-6,06128627-6 76 | 06289358-6,06128623-6 77 | 06209003-6,06128627-6 78 | 06209003-6,06128623-6 79 | 06128627-6,06128623-6 80 | 06915307-1,06453319-2 81 | 06915307-1,06292880-2 82 | 06915307-1,06289358-2 83 | 06915307-1,06209003-2 84 | 06915307-1,06128627-2 85 | 06915307-1,06128623-2 86 | 06453319-2,06292880-2 87 | 06453319-2,06289358-2 88 | 06453319-2,06209003-2 89 | 06453319-2,06128627-2 90 | 06453319-2,06128623-2 91 | 06292880-2,06289358-2 92 | 06292880-2,06209003-2 93 | 06292880-2,06128627-2 94 | 06292880-2,06128623-2 95 | 06289358-2,06209003-2 96 | 06289358-2,06128627-2 97 | 06289358-2,06128623-2 98 | 06209003-2,06128627-2 99 | 
06209003-2,06128623-2 100 | 06128627-2,06128623-2 101 | 06915307-2,06453319-3 102 | 06915307-2,06292880-3 103 | 06915307-2,06289358-3 104 | 06915307-2,06209003-3 105 | 06915307-2,06128627-3 106 | 06915307-2,06128623-3 107 | 06453319-3,06292880-3 108 | 06453319-3,06289358-3 109 | 06453319-3,06209003-3 110 | 06453319-3,06128627-3 111 | 06453319-3,06128623-3 112 | 06292880-3,06289358-3 113 | 06292880-3,06209003-3 114 | 06292880-3,06128627-3 115 | 06292880-3,06128623-3 116 | 06289358-3,06209003-3 117 | 06289358-3,06128627-3 118 | 06289358-3,06128623-3 119 | 06209003-3,06128627-3 120 | 06209003-3,06128623-3 121 | 06128627-3,06128623-3 122 | 06915307-0,06453319-1 123 | 06915307-0,06292880-1 124 | 06915307-0,06289358-1 125 | 06915307-0,06209003-1 126 | 06915307-0,06128627-1 127 | 06915307-0,06128623-1 128 | 06453319-1,06292880-1 129 | 06453319-1,06289358-1 130 | 06453319-1,06209003-1 131 | 06453319-1,06128627-1 132 | 06453319-1,06128623-1 133 | 06292880-1,06289358-1 134 | 06292880-1,06209003-1 135 | 06292880-1,06128627-1 136 | 06292880-1,06128623-1 137 | 06289358-1,06209003-1 138 | 06289358-1,06128627-1 139 | 06289358-1,06128623-1 140 | 06209003-1,06128627-1 141 | 06209003-1,06128623-1 142 | 06128627-1,06128623-1 143 | 07171415-5,07099871-4 144 | 07171415-5,07013303-4 145 | 07171415-5,06961723-3 146 | 07099871-4,07013303-4 147 | 07099871-4,06961723-3 148 | 07013303-4,06961723-3 149 | 07171415-1,07099871-0 150 | 07171415-1,07013303-0 151 | 07171415-1,06961723-0 152 | 07099871-0,07013303-0 153 | 07099871-0,06961723-0 154 | 07013303-0,06961723-0 155 | -------------------------------------------------------------------------------- /test/test_comparators.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | #include "testutils.h" 11 | 12 | using std::string; 13 | 14 | class ComparatorsTest : public CppUnit::TestCase { 15 | 16 | public: 17 | 18 | 
ComparatorsTest(std::string name) : CppUnit::TestCase(name) { 19 | 20 | describe_test(INDENT0, name.c_str()); 21 | } 22 | 23 | 24 | /** 25 | * It is insane to have a distance measure 26 | * which returns anything other than zero for 27 | * zero distance. See comments in source code. 28 | */ 29 | void test_zero() { 30 | 31 | string lat1("0.0"); 32 | string lon1("0.0"); 33 | string lat2("0.0"); 34 | string lon2("0.0"); 35 | int result = latloncmp(lat1, lon1, lat2, lon2); 36 | CPPUNIT_ASSERT(1 == result); 37 | describe_pass(INDENT2, "All 0.0 lats and lons return 1"); 38 | } 39 | 40 | 41 | void test_latloncmp() { 42 | 43 | string lat1("38.38"); 44 | string lon1("102.102"); 45 | string lat2("38.38"); 46 | string lon2("102.102"); 47 | int result = latloncmp(lat1, lon1, lat2, lon2); 48 | CPPUNIT_ASSERT(5 == result); 49 | describe_pass(INDENT2, "Identical lats and lons return 5"); 50 | } 51 | 52 | void test_latlon_nullstrings() { 53 | 54 | string lat1(""); 55 | string lon1(""); 56 | string lat2(""); 57 | string lon2(""); 58 | int result = latloncmp(lat1, lon1, lat2, lon2); 59 | CPPUNIT_ASSERT(1 == result); 60 | describe_pass(INDENT2, "Null string lats and lons return 1"); 61 | } 62 | 63 | 64 | void test_extract_initials() { 65 | string source("foo bar"); 66 | // This is how it's used in attribute.cpp:321 // dmd 2012/07/01 67 | char initials[64]; 68 | char * result = extract_initials(initials, source.c_str()); 69 | string dest(result); 70 | CPPUNIT_ASSERT(dest == string("fb")); 71 | } 72 | 73 | /** 74 | * Compare the extracted strings in data[0] to see whether they 75 | * started with the same letter and whether one contains the other. 76 | * e.g. 77 | * "DAVID WILLIAM" vs "DAVID" = 3 ( max score) 78 | * "DAVID WILLIAM" vs "WILLIAM" = 0 (min score, not starting with the same letters) 79 | * "DAVID WILLIAM" vs "DAVE" = 0 ( neither one contains the other.
) 80 | * "" vs "" = 2 ( both missing information ) 81 | * "DAVID" vs "" = 1 ( one missing information ) 82 | */ 83 | void test_midnamecmp() { 84 | string dw("DAVID WILLIAM"); 85 | string w("WILLIAM"); 86 | string dave("DAVE"); 87 | string david("DAVID"); 88 | string empty(""); 89 | CPPUNIT_ASSERT(3 == midnamecmp(dw, david)); 90 | CPPUNIT_ASSERT(2 == midnamecmp(empty, empty)); 91 | CPPUNIT_ASSERT(1 == midnamecmp(david, empty)); 92 | CPPUNIT_ASSERT(0 == midnamecmp(dw, w)); 93 | CPPUNIT_ASSERT(0 == midnamecmp(dw, dave)); 94 | } 95 | 96 | 97 | /** 98 | * The name_compare function builds a similarity 99 | * weight. 100 | */ 101 | void test_name_compare() { 102 | 103 | string s1("foo"); 104 | string s2("bar"); 105 | string s3("baz"); 106 | string s4("foo"); 107 | string s5(""); 108 | string s6(""); 109 | 110 | CPPUNIT_ASSERT(4 == name_compare(s1,s4,0,0)); 111 | CPPUNIT_ASSERT(3 == name_compare(s2,s3,0,0)); 112 | CPPUNIT_ASSERT(1 == name_compare(s5,s6,0,0)); 113 | // This is returning zero as part of an abbreviation 114 | // finding function or something. 115 | //std::cout << "name_compare: " << name_compare(s1,s3,0,0) << std::endl; 116 | // TODO: Figure out how to pass through everything such 117 | // that a result = 2 is returned. 
118 | //CPPUNIT_ASSERT(2 == name_compare(s1,s3,0,0)); 119 | } 120 | 121 | }; 122 | 123 | 124 | void test_comparators() { 125 | 126 | ComparatorsTest * ct = new ComparatorsTest(std::string("Comparators test")); 127 | ct->test_zero(); 128 | ct->test_latloncmp(); 129 | ct->test_latlon_nullstrings(); 130 | 131 | ct->test_extract_initials(); 132 | ct->test_midnamecmp(); 133 | ct->test_name_compare(); 134 | delete ct; 135 | } 136 | 137 | 138 | #ifdef test_comparators_STANDALONE 139 | int 140 | main(int, char **) { 141 | test_comparators(); 142 | return 0; 143 | } 144 | #endif 145 | -------------------------------------------------------------------------------- /test/test_similarity.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | //#include 8 | //#include 9 | 10 | #if 0 11 | 12 | Comparison function scoring: LEFT/RIGHT=LEFT VS RIGHT 13 | 14 | 1) Firstname: 0-6. Factors: # of token and similarity between tokens 15 | Totally different: THOMAS ERIC/RICHARD JACK EVAN 16 | ONE NAME MISSING: THOMAS ERIC/(NONE) 17 | THOMAS ERIC/ THOMAS JOHN ALEX 18 | LEE RON ERIC/LEE ALEX ERIC 19 | //No space match but raw names don't: JOHNERIC/JOHN ERIC. 
Short name vs long name: ERIC/ERIC THOMAS 20 | ALEX NICHOLAS/ALEX NICHOLAS TAKASHI 21 | ALEX NICHOLAS/ALEX NICHOLA (Might not be exactly the same, but identified as the same by Jaro-Winkler) 22 | 23 | 2) Lastname: 0-6 Factors: # of token and similarity between tokens 24 | Totally different: ANDERSON/DAVIDSON 25 | ONE NAME MISSING: ANDERSON/(NONE) 26 | First part non-match: DE AMOUR/DA AMOUR 27 | VAN DE WAALS/VAN DES WAALS 28 | DE AMOUR/DEAMOUR 29 | JOHNSTON/JOHNSON 30 | DE AMOUR/DE AMOURS 31 | 32 | 3) Midname: 0-4 (THE FOLLOWING EXAMPLES ARE FROM THE COLUMN FIRSTNAME, SO FIRSTNAME IS INCLUDED) 33 | THOMAS ERIC/JOHN THOMAS 34 | JOHN ERIC/JOHN (MISSING) 35 | THOMAS ERIC ALEX/JACK ERIC RONALD 36 | THOMAS ERIC RON ALEX EDWARD/JACK ERIC RON ALEX LEE 37 | THOMAS ERIC/THOMAS ERIC LEE 38 | 39 | 4) Assignee: 0-8 40 | DIFFERENT ASGNUM, TOTALLY DIFFERENT NAMES (NO single common word) 41 | DIFFERENT ASGNUM, One name missing 42 | DIFFERENT ASGNUM, Harvard University Longwood Medical School /Dartmouth Hitchcock Medical Center 43 | DIFFERENT ASGNUM, Harvard University President and Fellows / Presidents and Fellow of Harvard 44 | DIFFERENT ASGNUM, Harvard University / Harvard University Medical School 45 | DIFFERENT ASGNUM, Microsoft Corporation/Microsoft Corporated 46 | SAME ASGNUM, COMPANY SIZE>1000 47 | SAME ASGNUM, 1000>SIZE>100 48 | SAME ASGNUM, SIZE<100 49 | 50 | 5) CLASS: 0-4 51 | # OF COMMON CLASSES.
MISSING=1 52 | 53 | 6) COAUTHORS 0-10 54 | # OF COMMON COAUTHORS 55 | 56 | 7) DISTANCE: 0-7 FACTORS: LONGITUDE/LATITUDE, STREET ADDRESS 57 | TOTALLY DIFFERENT 58 | ONE IS MISSING 59 | 75 69 | #include 70 | 71 | #include "testutils.h" 72 | 73 | SimilarityProfile 74 | csp(uint32_t s1, uint32_t s2) { 75 | SimilarityProfile sp; 76 | sp.push_back(s1); 77 | sp.push_back(s2); 78 | return sp; 79 | } 80 | 81 | 82 | class SimilarityTest : public CppUnit::TestCase { 83 | 84 | private: 85 | SimilarityProfile sp, max, min; 86 | 87 | public: 88 | SimilarityTest(std::string name) : CppUnit::TestCase(name) { 89 | 90 | describe_test(INDENT0, name.c_str()); 91 | 92 | max.push_back(2); 93 | max.push_back(2); 94 | min.push_back(0); 95 | min.push_back(0); 96 | } 97 | 98 | void test_sp2index() { 99 | 100 | sp.push_back(1); 101 | sp.push_back(1); 102 | uint32_t index = sp2index(sp, min, max); 103 | CPPUNIT_ASSERT(4 == index); 104 | //std::cout << "sp2index: " << index << std::endl; 105 | 106 | sp.clear(); 107 | sp.push_back(1); 108 | sp.push_back(0); 109 | index = sp2index(sp, min, max); 110 | CPPUNIT_ASSERT(3 == index); 111 | //std::cout << "sp2index: " << index << std::endl; 112 | describe_pass(INDENT2, "Retrieved index given similarity profile"); 113 | } 114 | 115 | void test_index2sp() { 116 | 117 | SimilarityProfile sp = index2sp(4, min, max); 118 | CPPUNIT_ASSERT(sp == csp(1,1)); 119 | //print_similarity(sp); 120 | sp = index2sp(3, min, max); 121 | CPPUNIT_ASSERT(sp == csp(1,0)); 122 | //print_similarity(sp); 123 | describe_pass(INDENT2, "Retrieved similarity profile given index"); 124 | } 125 | 126 | 127 | 128 | void test_cpp11_syntax() { 129 | vector sp1 = {1, 2, 3}; 130 | vector sp2({1, 2, 3}); 131 | vector sp3{1, 2, 3}; 132 | } 133 | 134 | 135 | void runTest() { 136 | test_sp2index(); 137 | test_index2sp(); 138 | test_cpp11_syntax(); 139 | } 140 | }; 141 | 142 | 143 | void 144 | test_similarity() { 145 | 146 | SimilarityTest * st = new SimilarityTest(std::string("Similarity
test")); 147 | st->runTest(); 148 | delete st; 149 | } 150 | 151 | 152 | #ifdef test_similarity_STANDALONE 153 | int 154 | main(int, char **) { 155 | 156 | test_similarity(); 157 | return 0; 158 | } 159 | #endif 160 | -------------------------------------------------------------------------------- /test/test_namecompare.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Run this code with valgrind --leak-check=full ./namecompare 3 | */ 4 | 5 | #include 6 | 7 | #include 8 | 9 | #include 10 | 11 | #include "testutils.h" 12 | 13 | using std::string; 14 | 15 | class NameCompareTest : public CppUnit::TestCase { 16 | 17 | public: 18 | NameCompareTest(std::string name) : CppUnit::TestCase(name) { 19 | 20 | describe_test(INDENT0, name.c_str()); 21 | } 22 | 23 | /** 24 | * The `name_compare` function computes similarity values as follows: 25 | * 0: two strings are completely different 26 | * 1: both strings empty 27 | * 2: one string is an abbreviation of the other string 28 | * 3: strings are the same with one misspelled 29 | * 4: both strings identical 30 | */ 31 | 32 | void 33 | test_empty() { 34 | 35 | std::string s1(""); 36 | std::string s2(""); 37 | CPPUNIT_ASSERT (1 == name_compare(s1,s2,0,0)); 38 | } 39 | 40 | 41 | void 42 | test_is_misspell() { 43 | 44 | std::string s1("misspell"); 45 | std::string s2("misspel"); 46 | CPPUNIT_ASSERT (3 == name_compare(s1,s2,0,0)); 47 | } 48 | 49 | 50 | void 51 | test_is_not_misspell() { 52 | 53 | const char * s1 = "misspell"; 54 | const char * s2 = "misspell"; 55 | CPPUNIT_ASSERT (4 == name_compare(s1,s2,0,0)); 56 | } 57 | 58 | 59 | void 60 | test_is_missepll() { 61 | 62 | std::string s1("misspell"); 63 | std::string s2("missepll"); 64 | CPPUNIT_ASSERT (3 == name_compare(s1,s2,0,0)); 65 | } 66 | 67 | 68 | void 69 | test_is_misspall() { 70 | 71 | std::string s1("misspell"); 72 | std::string s2("misspall"); 73 | CPPUNIT_ASSERT (3 == name_compare(s1,s2,0,0)); 74 | } 75 | 76 | 77 | void
78 | test_not_same() { 79 | 80 | std::string s1("function"); 81 | std::string s2("egregious"); 82 | CPPUNIT_ASSERT (0 == name_compare(s1,s2,0,0)); 83 | } 84 | 85 | 86 | 87 | /** 88 | * The case where 2 is returned 89 | * indicates one string is an abbreviation 90 | * of the other string. 91 | */ 92 | void 93 | test_is_abbrev1() { 94 | 95 | std::string s1("miss"); 96 | std::string s2("misspell"); 97 | CPPUNIT_ASSERT (2 == name_compare(s1,s2,3,0)); 98 | } 99 | 100 | /** 101 | * Reverse the abbreviation and try again. 102 | */ 103 | void 104 | test_is_abbrev2() { 105 | 106 | std::string s1("misspell"); 107 | std::string s2("miss"); 108 | CPPUNIT_ASSERT (2 == name_compare(s1,s2,3,0)); 109 | } 110 | 111 | void print_namecompare(std::string s1, std::string s2, int prev, int cur, int value) { 112 | 113 | std::cout << "s1: " << s1 << ", " 114 | << "s2: " << s2 << ", " 115 | << "prev: " << prev << ", " 116 | << "cur: " << cur << ", " 117 | << "value: " << value 118 | << std::endl; 119 | } 120 | 121 | void traverse_cursors(std::string s1, std::string s2) { 122 | 123 | for (unsigned int prev=0; prevrunTests(); 167 | delete nct; 168 | } 169 | 170 | 171 | #ifdef test_namecompare_STANDALONE 172 | int 173 | main(int, char **) { 174 | 175 | // http://stackoverflow.com/questions/138383/colored-grep/138528#138528 176 | // http://stackoverflow.com/questions/9158150/colored-output-in-c 177 | // std::cout << "\033[32mTesting \033[36mname_\033[33mcompare...!\033[0m" << std::endl; 178 | test_name_comparison(); 179 | return 0; 180 | } 181 | #endif 182 | -------------------------------------------------------------------------------- /test/test_blocking.cpp: -------------------------------------------------------------------------------- 1 | /** @file */ 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "testdata.h" 16 | #include "testutils.h" 17 | #include "fake.h" 18 | 19 | 20 | 
#include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include "fake.h" 28 | 29 | #include "testutils.h" 30 | 31 | 32 | class BlockingTest : public CppUnit::TestCase { 33 | 34 | private: 35 | FakeTest * ft; 36 | RecordPList recpointers; 37 | vector rpv; 38 | const cBlocking_Operation_By_Coauthors * cob; 39 | 40 | public: 41 | BlockingTest(std::string name) : CppUnit::TestCase(name) { 42 | 43 | describe_test(INDENT0, name.c_str()); 44 | // No middle names in clustertest.csv 45 | //const string filename("testdata/clustertest.csv"); 46 | // assignee_comparison has middle names 47 | const string filename("testdata/assignee_comparison.csv"); 48 | ft = new FakeTest(string("Fake RatioComponentTest"), filename); 49 | ft->load_fake_data(filename); 50 | recpointers = ft->get_recpointers(); 51 | rpv = ft->get_recvecs(); 52 | cob = ft->get_coauthor_blocking(); 53 | } 54 | 55 | 56 | void test_read_blocking_config() { 57 | 58 | Spec spec; 59 | 60 | spec.it("Reads a valid blocking file correctly", DO_SPEC { 61 | std::ofstream devnull("/dev/null"); 62 | int result; 63 | result = BlockingConfiguration::config_blocking("./fixtures/BlockingConfig.txt", string("Round 1"), devnull); 64 | return (0 == result); 65 | }); 66 | 67 | // TODO: handle result = 1 68 | // TODO: handle result = 2 69 | } 70 | 71 | 72 | void test_multi_column_blocking() { 73 | 74 | describe_test(INDENT2, "Testing multicolumn blocking"); 75 | 76 | Spec spec; 77 | 78 | std::auto_ptr bptr = get_blocking_pointer(); 79 | const BlockByColumns & blocker_ref = dynamic_cast (*bptr); 80 | vector names = blocker_ref.get_blocking_attribute_names(); 81 | 82 | StringRemoveSpace rsobj; 83 | StringNoSpaceTruncate nstobj; 84 | nstobj.set_truncater(2, 4, true); 85 | string man = nstobj.manipulate("EXAMPLES"); 86 | 87 | spec.it("Manipulate EXAMPLE to acquire AMPL", [&man](Description desc)->bool { 88 | return (man == string("AMPL")); 89 | }); 90 | 91 | vector vec_strman = { 92 | &rsobj, 
&rsobj, &nstobj 93 | }; 94 | 95 | vector vec_label = { "Firstname", "Middlename", "Lastname" }; 96 | vector vec_di = { 1, 0, 1 }; 97 | BlockByColumns mcmobj (vec_strman, vec_label, vec_di); 98 | 99 | const Record * r = rpv[0]; 100 | 101 | spec.it("Does some blocking on a first/middle/last: %s", [r,&mcmobj,&spec](Description desc)->bool { 102 | string binfo = mcmobj.extract_blocking_info(r); 103 | sprintf(spec.buf, desc, binfo.c_str()); 104 | return (string("PHILIP##E##RAND##") == binfo); 105 | }); 106 | 107 | spec.it("Does some blocking on a first/middle/last: %s", [r,&mcmobj,&spec](Description desc)->bool { 108 | string cinfo = mcmobj.extract_column_info(r,2); 109 | sprintf(spec.buf, desc, cinfo.c_str()); 110 | return (string("RAND") == cinfo); 111 | }); 112 | 113 | } 114 | 115 | 116 | void test_get_topN_coauthors() { 117 | const Record * prec = recpointers.front(); 118 | std::cout << "front pointer: " << recpointers.front() << std::endl; 119 | prec->print(); 120 | //cob->get_topN_coauthors(prec, 2); 121 | } 122 | 123 | 124 | void test_coauthor_blocking() { 125 | //const Record * prec = recpointers.front(); 126 | std::cout << "front pointer: " << recpointers.front() << std::endl; 127 | //std::string bi = cob->extract_blocking_info(recpointers.front()); 128 | //std::cout << "blocking info: " << bi << std::endl; 129 | } 130 | 131 | 132 | void runTest() { 133 | test_read_blocking_config(); 134 | test_multi_column_blocking(); 135 | //test_get_topN_coauthors(); 136 | //test_coauthor_blocking(); 137 | } 138 | }; 139 | 140 | 141 | void 142 | test_blocking() { 143 | 144 | BlockingTest * bt = new BlockingTest(std::string("Blocking test...")); 145 | bt->runTest(); 146 | delete bt; 147 | } 148 | 149 | 150 | 151 | #ifdef test_blocking_STANDALONE 152 | int 153 | main(int UP(argc), char ** UP(argv)) { 154 | 155 | test_blocking(); 156 | return 0; 157 | } 158 | #endif 159 | -------------------------------------------------------------------------------- /include/exceptions.h: 
// --------------------------------------------------------------------------------

#ifndef PATENT_EXCEPTIONS_H
#define PATENT_EXCEPTIONS_H

// Standard headers required by the declarations in this file.
#include <exception>
#include <string>


/*
 * Exception hierarchy used throughout the disambiguation engine:
 *
 * cAbstract_Exception
 *  -- cException_Diff_Attribute
 *  -- cException_No_Comparision_Function
 *  -- cException_ColumnName_Not_Found
 *  -- cException_Insufficient_Interactives
 *  -- cException_No_Interactives
 *  -- cException_Invalid_Function
 *  -- cException_Interactive_Misalignment
 *  -- cException_File_Not_Found
 *  -- cException_Assignee_Not_In_Tree
 *  -- cException_Invalid_Attribute_For_Sort
 *  -- cException_Invalid_Probability
 *  -- cException_Vector_Data
 *  -- cException_Attribute_Not_In_Tree
 *  -- cException_Duplicate_Attribute_In_Tree
 *  -- cException_Unknown_Similarity_Profile
 *  -- cException_Attribute_Disabled
 *  -- cException_Other
 *  -- cException_Blocking_Disabled
 *
 * To create a new exception class, inherit from cAbstract_Exception and
 * write a constructor that forwards the error message to the base class.
 */


/**
 * Common base of every exception in the hierarchy above.
 *
 * Stores a private copy of the message passed at construction and
 * returns it from what().
 */
class cAbstract_Exception: public std::exception {
private:
    // Owned copy of the message; what() returns a pointer into it.
    std::string m_errmsg;
public:
    /**
     * @param errmsg Error description, copied into the exception.
     *               A null pointer is stored as an empty message, because
     *               building a std::string from a null pointer is undefined.
     */
    cAbstract_Exception(const char * errmsg) : m_errmsg(errmsg ? errmsg : "") {}

    /** @return The stored message; valid for the lifetime of this object. */
    const char * what() const noexcept override { return m_errmsg.c_str(); }

    ~cAbstract_Exception() noexcept override {}
};


// This class previously declared no constructor. Because the base class has
// no default constructor, it could not be constructed at all; it now takes
// a message like every sibling below.
class cException_Diff_Attribute : public cAbstract_Exception {
public:
    cException_Diff_Attribute(const char * errmsg) : cAbstract_Exception(errmsg) {}
};


// The misspelling "Comparision" is part of the public name and is kept so
// that existing throw/catch sites continue to compile.
class cException_No_Comparision_Function : public cAbstract_Exception {
public:
    cException_No_Comparision_Function(const char * errmsg) : cAbstract_Exception(errmsg) {}
};


class cException_ColumnName_Not_Found : public cAbstract_Exception {
public:
    cException_ColumnName_Not_Found(const char * errmsg) : cAbstract_Exception(errmsg) {}
};


class cException_Insufficient_Interactives : public cAbstract_Exception {
public:
    cException_Insufficient_Interactives(const char * errmsg) : cAbstract_Exception(errmsg) {}
};


class cException_No_Interactives : public cAbstract_Exception {
public:
    cException_No_Interactives(const char * errmsg) : cAbstract_Exception(errmsg) {}
};


class cException_Invalid_Function : public cAbstract_Exception {
public:
    cException_Invalid_Function(const char * errmsg) : cAbstract_Exception(errmsg) {}
};


class cException_Interactive_Misalignment : public cAbstract_Exception {
public:
    cException_Interactive_Misalignment(const char * errmsg) : cAbstract_Exception(errmsg) {}
};


class cException_File_Not_Found : public cAbstract_Exception {
public:
    cException_File_Not_Found(const char * errmsg) : cAbstract_Exception(errmsg) {}
};


class cException_Assignee_Not_In_Tree : public cAbstract_Exception {
public:
    cException_Assignee_Not_In_Tree(const char * errmsg) : cAbstract_Exception(errmsg) {}
};


class cException_Invalid_Attribute_For_Sort : public cAbstract_Exception {
public:
    cException_Invalid_Attribute_For_Sort(const char * errmsg) : cAbstract_Exception(errmsg) {}
};


class cException_Invalid_Probability : public cAbstract_Exception {
public:
    cException_Invalid_Probability(const char * errmsg) : cAbstract_Exception(errmsg) {}
};


class cException_Vector_Data : public cAbstract_Exception {
public:
    cException_Vector_Data(const char * errmsg) : cAbstract_Exception(errmsg) {}
};


class cException_Attribute_Not_In_Tree : public cAbstract_Exception {
public:
    cException_Attribute_Not_In_Tree(const char * errmsg) : cAbstract_Exception(errmsg) {}
};


class cException_Duplicate_Attribute_In_Tree : public cAbstract_Exception {
public:
    cException_Duplicate_Attribute_In_Tree(const char * errmsg) : cAbstract_Exception(errmsg) {}
};


class cException_Unknown_Similarity_Profile : public cAbstract_Exception {
public:
    cException_Unknown_Similarity_Profile(const char * errmsg) : cAbstract_Exception(errmsg) {}
};


class cException_Attribute_Disabled : public cAbstract_Exception {
public:
    cException_Attribute_Disabled(const char * errmsg) : cAbstract_Exception(errmsg) {}
};


class cException_Other : public cAbstract_Exception {
public:
    cException_Other(const char * errmsg) : cAbstract_Exception(errmsg) {}
};


class cException_Blocking_Disabled : public cAbstract_Exception {
public:
    cException_Blocking_Disabled(const char * errmsg) : cAbstract_Exception(errmsg) {}
};

#endif /* PATENT_EXCEPTIONS_H */

// --------------------------------------------------------------------------------
// /test/test_strcmp95.cpp:
// --------------------------------------------------------------------------------

#include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "strcmp95.h" 13 | 14 | #include "testutils.h" 15 | 16 | 17 | using std::string; 18 | 19 | // Need a tempate for comparing floats 20 | bool 21 | is_equal(float f1, float f2, float tol) { 22 | return (fabs(f1-f2) < tol); 23 | } 24 | 25 | class Strcmp95Test : public CppUnit::TestCase { 26 | 27 | private: 28 | std::stringstream description; 29 | 30 | public: 31 | Strcmp95Test(std::string name) : CppUnit::TestCase(name) { 32 | 33 | describe_test(INDENT0, name.c_str()); 34 | } 35 | 36 | float compute_jw(const char * s1, const char * s2) { 37 | return strcmp95_modified(s1, s2); 38 | } 39 | 40 | float compute_jw(string s1, string s2) { 41 | return strcmp95_modified(s1.c_str(), s2.c_str()); 42 | } 43 | 44 | void print_comparison(string s1, string s2, float value) { 45 | std::cout << s1 << " vs. " << s2 << ": " << value << std::endl; 46 | } 47 | 48 | std::string describe_comparison(string s1, string s2, float value) { 49 | std::stringstream desc; 50 | desc << s1 << " vs. " << s2 << ": " << value; 51 | return desc.str(); 52 | } 53 | 54 | 55 | void test_identical() { 56 | 57 | string s1("foo"), s2("foo"); 58 | float val = compute_jw(s1, s2); 59 | //print_comparison(s1, s2, val); 60 | CPPUNIT_ASSERT (is_equal(val, 1.0, 0.000001)); 61 | describe_pass(INDENT2, "Jaro/Winkler returned 1 for identical strings"); 62 | } 63 | 64 | 65 | void test_different() { 66 | 67 | string s1("foo"), s2("bar"); 68 | float val = compute_jw(s1, s2); 69 | //print_comparison(s1, s2, val); 70 | CPPUNIT_ASSERT (is_equal(val, 0.0, 0.000001)); 71 | describe_pass(INDENT2, "Jaro/Winkler returned 0 for totally different strings"); 72 | } 73 | 74 | 75 | void test_permuted() { 76 | 77 | // The following were extracted from an appendix of an 78 | // early version of the disambiguation paper. 
79 | string s1("MATTHEW"); 80 | string s2("HEWMATT"); 81 | 82 | float val = compute_jw(s1, s2); 83 | string desc = string("Jaro/Winkler: ") + describe_comparison(s1, s2, val); 84 | CPPUNIT_ASSERT_DOUBLES_EQUAL (val, 0.4857, 0.0001); 85 | describe_pass(INDENT2, desc.c_str()); 86 | 87 | val = compute_jw(s2, s1); 88 | desc = string("Jaro/Winkler: ") + describe_comparison(s2, s1, val); 89 | CPPUNIT_ASSERT_DOUBLES_EQUAL (val, 0.4857, 0.0001); 90 | describe_pass(INDENT2, desc.c_str()); 91 | } 92 | 93 | 94 | void test_the_rest() { 95 | 96 | string s1(""); 97 | string s2(""); 98 | 99 | float val; 100 | 101 | s1 = "MATTHEW"; 102 | s2 = "MATEW"; 103 | val = compute_jw(s1,s2); 104 | string desc = string("Jaro/Winkler: ") + describe_comparison(s2, s1, val); 105 | describe_pass(INDENT2, desc.c_str()); 106 | CPPUNIT_ASSERT_DOUBLES_EQUAL (val, 0.94166, 0.0001); 107 | //st->print_comparison(s1, s2, val); 108 | //CPPUNIT_ASSERT (is_equal(val, 92.5, 0.000001)); 109 | 110 | s2 = "MATT"; 111 | val = compute_jw(s1, s2); 112 | //st->print_comparison(s1, s2, val); 113 | //CPPUNIT_ASSERT (is_equal(val, 92.5, 0.000001)); 114 | 115 | s2 = "MTATWEH"; 116 | val = compute_jw(s1, s2); 117 | //st->print_comparison(s1, s2, val); 118 | //CPPUNIT_ASSERT (is_equal(val, 92.5, 0.000001)); 119 | 120 | s2 = "M"; 121 | val = compute_jw("MATTHEW", "M"); 122 | //st->print_comparison(s1, s2, val); 123 | //CPPUNIT_ASSERT (is_equal(val, 92.5, 0.000001)); 124 | 125 | s2 = "TALIN"; 126 | val = compute_jw(s1, s2); 127 | //st->print_comparison(s1, s2, val); 128 | //CPPUNIT_ASSERT (is_equal(val, 92.5, 0.000001)); 129 | 130 | s2 = "XYZ"; 131 | val = compute_jw(s1, s2); 132 | //st->print_comparison(s1, s2, val); 133 | //CPPUNIT_ASSERT (is_equal(val, 92.5, 0.000001)); 134 | 135 | // The next three come from Wikipedia 136 | s1 = "dixon"; 137 | s2 = "dicksonx"; 138 | val = compute_jw(s1, s2); 139 | //st->print_comparison(s1, s2, val); 140 | //CPPUNIT_ASSERT (is_equal(val, 92.5, 0.000001)); 141 | 142 | s1 = "duane"; 143 
| s2 = "dwayne"; 144 | val = compute_jw(s1, s2); 145 | //st->print_comparison(s1, s2, val); 146 | //CPPUNIT_ASSERT (is_equal(val, 92.5, 0.000001)); 147 | 148 | s1 = "MARTHA"; 149 | s2 = "MARHTA"; 150 | val = compute_jw(s1, s2); 151 | //st->print_comparison(s1, s2, val); 152 | //CPPUNIT_ASSERT (is_equal(val, 92.5, 0.000001)); 153 | } 154 | 155 | }; 156 | 157 | 158 | void 159 | test_strcmp95() { 160 | 161 | Strcmp95Test * st = new Strcmp95Test(std::string("Jaro/Winkler comparison test")); 162 | st->test_identical(); 163 | st->test_different(); 164 | st->test_permuted(); 165 | st->test_the_rest(); 166 | 167 | delete st; 168 | } 169 | 170 | 171 | #ifdef test_strcmp95_STANDALONE 172 | int 173 | main(int, char **) { 174 | test_strcmp95(); 175 | return 0; 176 | } 177 | #endif 178 | -------------------------------------------------------------------------------- /test/test_assignee_comparison.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | // Really good web pages: 6 | // http://stackoverflow.com/questions/318064/how-do-you-declare-an-interface-in-c 7 | // http://stackoverflow.com/questions/7182359/template-instantiation-details-of-gcc-and-ms-compilers 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "testdata.h" 16 | #include "colortest.h" 17 | #include "testutils.h" 18 | #include "fake.h" 19 | 20 | using std::string; 21 | using std::cout; 22 | using std::endl; 23 | 24 | 25 | class AssigneeComparisonTest : public CppUnit::TestCase { 26 | 27 | private: 28 | FakeTest * ft; 29 | RecordPList rp; 30 | string H0008; 31 | 32 | public: 33 | 34 | AssigneeComparisonTest(std::string name) : CppUnit::TestCase(name) { 35 | describe_test(INDENT0, name.c_str()); 36 | ft = new FakeTest("FakeTest for assignees", "./testdata/assignee_comparison.csv"); 37 | ft->load_fake_data("./testdata/assignee_comparison.csv"); 38 | rp = ft->get_recpointers(); 39 | 40 | 
H0008 = string("H0008"); 41 | } 42 | 43 | void print_assignee_comparison() { 44 | } 45 | 46 | 47 | /** 48 | * These need to be unscrewed, the assignee comparison doesn't 49 | * work the same as the lat/long interactive. 50 | */ 51 | void compare_assignee() { 52 | 53 | describe_test(INDENT2, "Testing Assignee comparison"); 54 | 55 | uint32_t similarity; 56 | 57 | 58 | string asgnum("H0008"); 59 | 60 | cAssignee d1("IBM"); 61 | d1.split_string("IBM"); 62 | cAsgNum asg1(H0008.c_str()); 63 | asg1.split_string(H0008.c_str()); 64 | vector atpv1; 65 | atpv1.push_back(&asg1); 66 | d1.config_interactive(atpv1); 67 | 68 | cAssignee d2("IBM"); 69 | d2.split_string("IBM"); 70 | cAsgNum asg2(asgnum.c_str()); 71 | asg2.split_string(asgnum.c_str()); 72 | vector atpv2; 73 | atpv2.push_back(&asg2); 74 | d2.config_interactive(atpv2); 75 | 76 | d1.activate_comparator(); 77 | 78 | RecordPList rp = ft->get_recpointers(); 79 | cAssignee::configure_assignee(rp); 80 | 81 | similarity = d1.compare(d2); 82 | //std::cout << "Assignee similarity d1, d2: " << similarity << std::endl; 83 | // 4 is correct, but likely for the wrong reason: asgnums get some 84 | // sort of wierdo pointer counting thing. 
85 | CPPUNIT_ASSERT(4 == similarity); 86 | describe_pass(INDENT4, "Comparing similarity for IBM"); 87 | 88 | } 89 | 90 | 91 | void test_different_assignees() { 92 | 93 | cAssignee::configure_assignee(rp); 94 | cAssignee d2("IBM"); 95 | d2.split_string("IBM"); 96 | cAsgNum asg2(H0008.c_str()); 97 | asg2.split_string(H0008.c_str()); 98 | vector atpv2; 99 | atpv2.push_back(&asg2); 100 | d2.config_interactive(atpv2); 101 | 102 | 103 | cAssignee a3("International Harvester"); 104 | a3.split_string("International Harvester"); 105 | cAsgNum asg3("I77777"); 106 | asg3.split_string("I77777"); 107 | vector atpv3; 108 | atpv3.push_back(&asg3); 109 | a3.config_interactive(atpv3); 110 | 111 | a3.activate_comparator(); 112 | 113 | 114 | uint32_t similarity = d2.compare(a3); 115 | //std::cout << "Assignee similarity d1, d2: " << similarity << std::endl; 116 | CPPUNIT_ASSERT(4 == similarity); 117 | describe_fail(INDENT4, "Comparing similarity for IBM & International Harvester"); 118 | 119 | } 120 | 121 | 122 | void test_r1r2() { 123 | 124 | RecordPList rp = ft->get_recpointers(); 125 | cAssignee::configure_assignee(rp); 126 | vector rpv = ft->get_recvecs(); 127 | const Record & r1 = *rpv[1]; 128 | const Record & r2 = *rpv[1]; 129 | 130 | vector active_similarity_attributes; 131 | active_similarity_attributes.push_back(string("Assignee")); 132 | //Record::activate_comparators_by_name(BlockingConfiguration::active_similarity_attributes); 133 | Record::activate_comparators_by_name(active_similarity_attributes); 134 | 135 | SimilarityProfile sp = r1.record_compare(r2); 136 | uint32_t similarity = sp[0]; 137 | CPPUNIT_ASSERT(6 == similarity); 138 | describe_pass(INDENT4, "Comparing similarity for r1 and r2"); 139 | //print_similarity(sp); 140 | } 141 | 142 | 143 | void test_pending() { 144 | 145 | Spec spec; 146 | spec.xit ("Test how pending works", DO_SPEC { 147 | return false; 148 | }); 149 | } 150 | 151 | 152 | void runTest() { 153 | compare_assignee(); 154 | 
test_different_assignees(); 155 | test_r1r2(); 156 | test_pending(); 157 | } 158 | 159 | }; 160 | 161 | 162 | void 163 | test_fetch_records() { 164 | 165 | AssigneeComparisonTest * act = new AssigneeComparisonTest("Testing assignee_comparison"); 166 | act->runTest(); 167 | delete act; 168 | } 169 | 170 | 171 | #ifdef test_assignee_comparison_STANDALONE 172 | int 173 | main(int, char **) { 174 | 175 | test_fetch_records(); 176 | 177 | return 0; 178 | } 179 | #endif 180 | -------------------------------------------------------------------------------- /test/test_clusterinfo.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "testdata.h" 15 | #include "testutils.h" 16 | #include "fake.h" 17 | 18 | 19 | 20 | class ClusterInfoTest : public CppUnit::TestCase { 21 | 22 | private: 23 | 24 | FakeTest * ft; 25 | RecordPList recpointers; 26 | vector rpv; 27 | list all_records; 28 | 29 | 30 | public: 31 | 32 | ClusterInfoTest(std::string name) : CppUnit::TestCase(name) { 33 | 34 | describe_test(INDENT0, name.c_str()); 35 | const string filename("testdata/assignee_comparison.csv"); 36 | ft = new FakeTest(string("Fake ClusterInfo test"), filename); 37 | ft->load_fake_data(filename); 38 | recpointers = ft->get_recpointers(); 39 | all_records = ft->get_all_records(); 40 | rpv = ft->get_recvecs(); 41 | } 42 | 43 | 44 | void test_get_initial_prior() { 45 | 46 | describe_test(INDENT2, "Testing get_initial_prior"); 47 | 48 | const Record * r2 = rpv[2]; 49 | const Record * r3 = rpv[3]; 50 | const Record * r4 = rpv[4]; 51 | // Dummy value for cohesion 52 | double cohesion = 0.2343; 53 | ClusterHead ch(r2, cohesion); 54 | 55 | RecordPList rpl1 = { r2 }; 56 | Cluster c1 = Cluster(ch, rpl1); 57 | 58 | RecordPList rpl2 = { r3, r4 }; 59 | Cluster c2 = Cluster(ch, rpl2); 60 | 61 | ClusterInfo::ClusterList rg = 
{ c1, c2 }; 62 | 63 | double prior = get_initial_prior(rg); 64 | 65 | Spec spec; 66 | spec.it("get_initial_prior() returns ~0.333333", [&](Description desc)->bool { 67 | return (fabs(0.333333-prior) < 0.00001); 68 | }); 69 | 70 | rg.clear(); 71 | rg.push_back(c2); 72 | rg.push_back(c1); 73 | 74 | spec.it("Reorder and get_initial_prior() returns ~0.333333", [&](Description desc)->bool { 75 | return (fabs(0.333333-prior) < 0.00001); 76 | }); 77 | 78 | } 79 | 80 | 81 | void test_get_initial_prior2() { 82 | 83 | const Record * r2 = rpv[2]; 84 | const Record * r3 = rpv[3]; 85 | const Record * r4 = rpv[4]; 86 | const Record * r5 = rpv[5]; 87 | const Record * r6 = rpv[6]; 88 | const Record * r7 = rpv[7]; 89 | const Record * r8 = rpv[8]; 90 | // Dummy value for cohesion 91 | double cohesion = 0.2343; 92 | ClusterHead ch(r2, cohesion); 93 | 94 | RecordPList rpl1 = { r2, r3, r8 }; 95 | Cluster c1 = Cluster(ch, rpl1); 96 | RecordPList rpl2 = { r4, r5, r6, r7 }; 97 | Cluster c2 = Cluster(ch, rpl2); 98 | ClusterInfo::ClusterList rg = { c1, c2 }; 99 | 100 | double prior = get_initial_prior(rg); 101 | 102 | Spec spec; 103 | spec.it("get_initial_prior() returns ~0.428571", [&](Description desc)->bool { 104 | //return ((0.333333-prior) < 0.00001); 105 | return (fabs(0.428571-prior) < 0.00001); 106 | }); 107 | } 108 | 109 | void test_constructor() { 110 | 111 | describe_test(INDENT2, "Test the ClusterInfo constructor"); 112 | 113 | map uid_dict; 114 | const string uid_identifier = cUnique_Record_ID::static_get_class_name(); 115 | create_btree_uid2record_pointer(uid_dict, all_records, uid_identifier); 116 | ClusterInfo match(uid_dict, true, true, false); 117 | Spec spec; 118 | spec.xit("ClusterInfo constructor works", DO_SPEC { 119 | return true; 120 | }); 121 | } 122 | 123 | 124 | void test_adjust_prior() { 125 | 126 | const Record * r2 = rpv[2]; 127 | const Record * r3 = rpv[3]; 128 | const Record * r4 = rpv[4]; 129 | const Record * r5 = rpv[5]; 130 | const Record * r6 = rpv[6]; 
131 | const Record * r7 = rpv[7]; 132 | const Record * r8 = rpv[8]; 133 | // Dummy value for cohesion 134 | double cohesion = 0.2343; 135 | ClusterHead ch(r2, cohesion); 136 | 137 | RecordPList rpl1 = { r2, r3, r8 }; 138 | Cluster c1 = Cluster(ch, rpl1); 139 | RecordPList rpl2 = { r4, r5, r6, r7 }; 140 | Cluster c2 = Cluster(ch, rpl2); 141 | ClusterInfo::ClusterList rg = { c1, c2 }; 142 | 143 | map uid_dict; 144 | const string uid_identifier = cUnique_Record_ID::static_get_class_name(); 145 | create_btree_uid2record_pointer(uid_dict, all_records, uid_identifier); 146 | ClusterInfo match(uid_dict, true, true, false); 147 | 148 | double adjusted = match.adjust_prior(rg, string("FOOBAR###BAZ"), 0.333, false); 149 | 150 | Spec spec; 151 | spec.xit("adjust_prior() should do nothing", DO_SPEC { 152 | return true; 153 | }); 154 | } 155 | 156 | void runTests() { 157 | test_get_initial_prior(); 158 | test_get_initial_prior2(); 159 | test_adjust_prior(); 160 | test_constructor(); 161 | } 162 | 163 | }; 164 | 165 | 166 | void 167 | test_clusterinfo() { 168 | 169 | ClusterInfoTest * cit = new ClusterInfoTest(std::string("ClusterInfo test")); 170 | cit->runTests(); 171 | delete cit; 172 | } 173 | 174 | 175 | #ifdef test_clusterinfo_STANDALONE 176 | int 177 | main(int UP(argc), char ** UP(argv)) { 178 | 179 | test_clusterinfo(); 180 | return 0; 181 | } 182 | #endif 183 | -------------------------------------------------------------------------------- /include/newcluster.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef PATENT_NEWCLUSTER_H 3 | #define PATENT_NEWCLUSTER_H 4 | 5 | #include "engine.h" 6 | #include "clusterhead.h" 7 | 8 | /** 9 | * Cluster objects are the molecules of disambiguation, 10 | * while Record objects are atoms of disambiguation. 11 | * Each cluster contains a cluster_head, a list of members, and some other 12 | * information. 
The aim of disambiguation is reorganize clusters 13 | * so that some can probably compound to bigger ones. Disambiguation 14 | * starts from smallest clusters that contain only one record, and 15 | * ends with clusters that contain some amount of records. 16 | */ 17 | class Cluster { 18 | 19 | private: 20 | 21 | static const unsigned int invalid_year = 0; 22 | 23 | //ClusterHead m_info: cluster head of the cluster, 24 | //including the delegate and the cohesion of the cluster. 25 | ClusterHead m_info; 26 | 27 | //RecordPList m_fellows: the list of members of the cluster. 28 | RecordPList m_fellows; 29 | 30 | //bool m_mergeable: a boolean, indicating "*this" cluster 31 | //has been merged into others or not. 32 | bool m_mergeable; 33 | 34 | //bool m_usable: a boolean preventing misuse earlier than 35 | //fully prepared. 36 | bool m_usable; 37 | 38 | //static const cRatios * pratio: a pointer that points to a 39 | //cRatio object which contains a map of similarity profile to ratio 40 | static const cRatios * pratio; 41 | 42 | /** 43 | * static const map * reference_pointer: 44 | * a pointer that points to a patent tree, which can be obtained in 45 | * a cBlocking_Operation_By_Coauthor object. 46 | */ 47 | static const PatentTree * reference_pointer; 48 | 49 | //Cluster & operator = ( const Cluster &): forbid the assignment operation. 50 | Cluster & operator = ( const Cluster &); 51 | 52 | //void find_representative(): to sets a component of cluster 53 | //head to be the record whose columns appear most frequently 54 | //among the all the members. 
55 | void find_representative(); 56 | 57 | 58 | unsigned int first_patent_year; 59 | 60 | unsigned int last_patent_year; 61 | 62 | set < const cLatitude * > locs; 63 | 64 | void update_locations(); 65 | 66 | void update_year_range(); 67 | 68 | unsigned int patents_gap( const Cluster & rhs) const; 69 | 70 | bool is_valid_year() const; 71 | 72 | public: 73 | 74 | // Cluster(const ClusterHead & info, const RecordPList & fellows): constructor 75 | Cluster(const ClusterHead & info, const RecordPList & fellows); 76 | 77 | // ~Cluster() : destructor 78 | ~Cluster(); 79 | 80 | // Cluster ( const Cluster & rhs ): copy constructor 81 | Cluster ( const Cluster & rhs ); 82 | 83 | /** 84 | * void merge( Cluster & mergee, const ClusterHead & info): 85 | * merge the "mergee" cluster into "*this", and set 86 | * the cluster head of the new cluster to be info. 87 | */ 88 | void merge( Cluster & mergee, const ClusterHead & info); 89 | 90 | /** 91 | * ClusterHead disambiguate(const Cluster & rhs, const double prior, 92 | * const double mutual_threshold) const: 93 | * disambiguate "*this" cluster with rhs cluster, 94 | * with the prior and mutual_threshold information. 95 | * Returns a ClusterHead to tell whether the two clusters should 96 | * be merged or not, and if yes, the cohesion of the new one. 97 | */ 98 | ClusterHead disambiguate(const Cluster & rhs, const double prior, 99 | const double mutual_threshold) const; 100 | 101 | //static void set_ratiomap_pointer( const cRatios & r): 102 | //set the ratio map pointer to a good one. 103 | static void set_ratiomap_pointer( const cRatios & r) {pratio = &r;} 104 | 105 | //const RecordPList & get_fellows() const: 106 | //get the members (actually it is reference to const) of the cluster. 107 | const RecordPList & get_fellows() const { 108 | return m_fellows; 109 | } 110 | 111 | //const ClusterHead & get_cluster_head () const: 112 | //get the cluster head (const reference) of the cluster. 
113 | const ClusterHead & get_cluster_head () const {return m_info;}; 114 | 115 | //void insert_elem( const Record *): insert a new member into 116 | //the member list. This could potentially change the cluster head. 117 | void insert_elem(const Record *); 118 | 119 | //void self_repair(): call this if insertion of elements is done manually, 120 | //usually for a batch of record objects (not recommended). 121 | void self_repair(); 122 | 123 | //static void set_reference_patent_tree_pointer( 124 | //const map < const Record *, RecordPList, cSort_by_attrib > & reference_patent_tree): 125 | //set the patent tree pointer. 126 | static void set_reference_patent_tree_pointer( 127 | const PatentTree & reference_patent_tree) { 128 | reference_pointer = & reference_patent_tree; 129 | } 130 | 131 | /** 132 | * void change_mid_name(): to change pointers to abbreviated middle 133 | * names to full middle names. This step is controversial, as it 134 | * actually changed the raw data. Or more correctly, it changed 135 | * the pointers of the raw data. 136 | */ 137 | void change_mid_name(); 138 | 139 | void add_uid2uinv(Uid2UinvTree & uid2uinv) const; 140 | }; 141 | 142 | 143 | /** 144 | * cException_Empty_Cluster: an exception that may be used. 
145 | */ 146 | class cException_Empty_Cluster : public cAbstract_Exception { 147 | public: /* Forwards errmsg unchanged to the cAbstract_Exception constructor. */ 148 | cException_Empty_Cluster(const char * errmsg): cAbstract_Exception(errmsg) {}; 149 | }; 150 | 151 | 152 | #endif /* PATENT_NEWCLUSTER_H */ 153 | -------------------------------------------------------------------------------- /test/test_record.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "../src/record_private.h" 11 | 12 | #include "testutils.h" 13 | /* Test case derived from CppUnit::TestCase that exercises Record construction and two column helpers (parse_column_names, create_column_indices). runTest() invokes every check in turn. */ 14 | class RecordTest : public CppUnit::TestCase { 15 | 16 | public: /* Constructor: passes name to CppUnit::TestCase and announces the test through describe_test(INDENT0, ...). */ 17 | RecordTest(std::string name) : CppUnit::TestCase(name) { 18 | 19 | describe_test(INDENT0, name.c_str()); 20 | } 21 | /* Builds and returns a Record from seven heap-allocated attributes: Foo, X, Bar, latitude 42.00, assignee Gonesilent, city Burlingame, country US. NOTE(review): the attribute pointers are not deleted in this function; presumably Record or the attribute classes manage them - confirm. */ 22 | Record make_foobar_record() { 23 | 24 | cFirstname * firstname = new cFirstname("Foo"); 25 | cMiddlename * middlename = new cMiddlename("X"); 26 | cLastname * lastname = new cLastname("Bar"); 27 | cLatitude * latitude = new cLatitude("42.00"); 28 | cAssignee * assignee = new cAssignee("Gonesilent"); 29 | cCity * city = new cCity("Burlingame"); 30 | cCountry * country = new cCountry("US"); 31 | 32 | vector temp_vec_attrib; 33 | 34 | temp_vec_attrib.push_back(firstname); 35 | temp_vec_attrib.push_back(middlename); 36 | temp_vec_attrib.push_back(lastname); 37 | temp_vec_attrib.push_back(latitude); 38 | temp_vec_attrib.push_back(assignee); 39 | temp_vec_attrib.push_back(city); 40 | temp_vec_attrib.push_back(country); 41 | 42 | Record tmprec(temp_vec_attrib); 43 | //std::cout << "Record..." 
<< std::endl; 44 | return tmprec; 45 | } 46 | 47 | /* Same construction as make_foobar_record, but with the name parts Quux, A, Lot; latitude, assignee, city and country are identical. NOTE(review): the same open question about ownership of the new-ed attributes applies - confirm. */ 48 | Record make_quuxalot_record() { 49 | 50 | cFirstname * firstname = new cFirstname("Quux"); 51 | cMiddlename * middlename = new cMiddlename("A"); 52 | cLastname * lastname = new cLastname("Lot"); 53 | cLatitude * latitude = new cLatitude("42.00"); 54 | cAssignee * assignee = new cAssignee("Gonesilent"); 55 | cCity * city = new cCity("Burlingame"); 56 | cCountry * country = new cCountry("US"); 57 | 58 | vector temp_vec_attrib; 59 | 60 | temp_vec_attrib.push_back(firstname); 61 | temp_vec_attrib.push_back(middlename); 62 | temp_vec_attrib.push_back(lastname); 63 | temp_vec_attrib.push_back(latitude); 64 | temp_vec_attrib.push_back(assignee); 65 | temp_vec_attrib.push_back(city); 66 | temp_vec_attrib.push_back(country); 67 | 68 | Record tmprec(temp_vec_attrib); 69 | //std::cout << "Record..." << std::endl; 70 | return tmprec; 71 | } 72 | 73 | /* Creates a cFirstname, resets its data to bar, clones it, and builds a Record from the clone alone. Only the original firstname is deleted at the end. NOTE(review): who owns the pointer returned by clone() is not visible here - confirm it is not leaked. */ 74 | void attribute_record() { 75 | cFirstname * firstname = new cFirstname("Foobar"); 76 | std::string tmp("bar"); 77 | 78 | firstname->reset_data(tmp.c_str()); 79 | const Attribute * pAttrib; 80 | 81 | pAttrib = firstname->clone(); 82 | 83 | vector temp_vec_attrib; 84 | temp_vec_attrib.push_back(pAttrib); 85 | Record tmprec(temp_vec_attrib); 86 | 87 | //Record(const vector & input_vec); 88 | //const std::vector input_vec; 89 | //input_vec.push_back(firstname); 90 | //Record r(input_vec); 91 | delete firstname; 92 | } 93 | /* Placeholder check: asserts 1 == 1 and reads nothing. */ 94 | void read_records() { 95 | CPPUNIT_ASSERT(1 == 1); 96 | } 97 | /* Heap-allocates a default-constructed Record and deletes it again; there are no assertions. */ 98 | void delete_record() { 99 | Record * rc = new Record(); 100 | delete rc; 101 | } 102 | /* Builds both sample records, calls quux.get_data_by_index(1) and prints every element of the result. No assertions are made, and foobar is not used after construction. */ 103 | void test_get_data_by_index() { 104 | Record foobar = make_foobar_record(); 105 | Record quux = make_quuxalot_record(); 106 | 107 | const vector data = quux.get_data_by_index(1); 108 | vector::const_iterator it = data.begin(); 109 | 110 | // Ok, so there's no data to print. Some other things have to 111 | // happen first. 
112 | for (; it != data.end(); ++it) { 113 | std::cout << "data: " << *it << std::endl; 114 | } 115 | } 116 | /* Asserts that parse_column_names turns the comma-separated string Lastname,Firstname into two entries in input order. */ 117 | void test_parse_column_names() { 118 | 119 | string names("Lastname,Firstname"); 120 | vector columns = parse_column_names(names); 121 | 122 | CPPUNIT_ASSERT(string("Lastname") == columns[0]); 123 | CPPUNIT_ASSERT(string("Firstname") == columns[1]); 124 | } 125 | 126 | /* Asserts that create_column_indices maps each requested column to its position within total_col_names: Firstname to 1, Lastname to 0, Middlename to 2. */ 127 | void test_create_column_indices() { 128 | 129 | vector requested_columns; 130 | requested_columns.push_back("Firstname"); 131 | requested_columns.push_back("Lastname"); 132 | requested_columns.push_back("Middlename"); 133 | 134 | vector total_col_names; 135 | total_col_names.push_back("Lastname"); 136 | total_col_names.push_back("Firstname"); 137 | total_col_names.push_back("Middlename"); 138 | 139 | vector rci = create_column_indices(requested_columns, total_col_names); 140 | 141 | CPPUNIT_ASSERT(1 == rci[0]); 142 | CPPUNIT_ASSERT(0 == rci[1]); 143 | CPPUNIT_ASSERT(2 == rci[2]); 144 | 145 | } 146 | 147 | /* Spec: after foobar.set_sample_record(&foobar), Record::get_sample_record() must refer to that same object (compared by address). NOTE(review): foobar is local to the spec body, so the stored sample pointer presumably dangles once the body returns - confirm. */ 148 | void test_sample_record_pointer() { 149 | 150 | Spec spec; 151 | spec.it("Correctly sets sample record pointer", DO_SPEC_THIS { 152 | Record foobar = make_foobar_record(); 153 | foobar.set_sample_record(&foobar); 154 | const Record & sample(Record::get_sample_record()); 155 | return (&foobar == &sample); 156 | }); 157 | } 158 | 159 | /* Runs every check of this class in a fixed sequence; the Records returned by the two make_*_record calls are discarded. */ 160 | void runTest() { 161 | delete_record(); 162 | make_foobar_record(); 163 | make_quuxalot_record(); 164 | attribute_record(); 165 | read_records(); 166 | test_get_data_by_index(); 167 | test_create_column_indices(); 168 | test_parse_column_names(); 169 | test_sample_record_pointer(); 170 | } 171 | }; 172 | 173 | /* Entry point for the record tests: constructs a RecordTest on the heap, calls runTest() and deletes it. */ 174 | void 175 | test_records() { 176 | 177 | RecordTest * rt = new RecordTest(std::string("Record unit testing")); 178 | rt->runTest(); 179 | delete rt; 180 | } 181 | 182 | /* Standalone driver, compiled only when test_record_STANDALONE is defined; runs test_records() and returns 0. */ 183 | #ifdef test_record_STANDALONE 184 | int 185 | main(int UP(argc), char ** UP(argv)) { 186 | 187 | test_records(); 188 | return 0; 189 | } 190 | #endif 191
| --------------------------------------------------------------------------------