├── license.md ├── code ├── Makevars ├── cafe_mac ├── cafe_linux ├── CAFEGUI.pyc ├── cafe_win.exe ├── mtTkinter.pyc ├── image │ ├── clear.gif │ ├── load.gif │ ├── logo.ico │ ├── save.gif │ ├── addDir.gif │ ├── addFile.gif │ ├── openfile.gif │ ├── remove.gif │ ├── setting.gif │ ├── zoomin.gif │ └── zoomout.gif ├── Makefile ├── output.h ├── seq_model.h ├── output.cpp ├── seq_model.cpp ├── utils.h ├── utils.cpp ├── kmer.h ├── mtTkinter.py ├── dist_model.h ├── dist_model.cpp ├── main.cpp ├── SimpleMatrix.h └── kmer.cpp ├── image ├── snapshot1.jpg ├── snapshot2.jpg ├── snapshot3.jpg ├── snapshot4.jpg ├── snapshot5.jpg ├── snapshot6.jpg ├── snapshot7.jpg ├── snapshot8.jpg └── snapshot9.jpg ├── example └── hash │ ├── hash_AB243556_L_0_k_1 │ ├── hash_AB008550_L_0_k_1 │ ├── hash_AB008550_L_0_k_2 │ ├── hash_AB008550_L_0_k_3 │ ├── hash_AB008550_L_0_k_4 │ ├── hash_AB008550_L_0_k_5 │ ├── hash_AB008550_L_0_k_6 │ ├── hash_AB008550_L_0_k_7 │ ├── hash_AB008550_L_0_k_8 │ ├── hash_AB009866_L_0_k_1 │ ├── hash_AB009866_L_0_k_2 │ ├── hash_AB009866_L_0_k_3 │ ├── hash_AB009866_L_0_k_4 │ ├── hash_AB009866_L_0_k_5 │ ├── hash_AB009866_L_0_k_6 │ ├── hash_AB009866_L_0_k_7 │ ├── hash_AB009866_L_0_k_8 │ ├── hash_AB044554_L_0_k_1 │ ├── hash_AB044554_L_0_k_2 │ ├── hash_AB044554_L_0_k_3 │ ├── hash_AB044554_L_0_k_4 │ ├── hash_AB044554_L_0_k_5 │ ├── hash_AB044554_L_0_k_6 │ ├── hash_AB044554_L_0_k_7 │ ├── hash_AB044554_L_0_k_8 │ ├── hash_AB045978_L_0_k_1 │ ├── hash_AB045978_L_0_k_2 │ ├── hash_AB045978_L_0_k_3 │ ├── hash_AB045978_L_0_k_4 │ ├── hash_AB045978_L_0_k_5 │ ├── hash_AB045978_L_0_k_6 │ ├── hash_AB045978_L_0_k_7 │ ├── hash_AB045978_L_0_k_8 │ ├── hash_AB243556_L_0_k_2 │ ├── hash_AB243556_L_0_k_3 │ ├── hash_AB243556_L_0_k_4 │ ├── hash_AB243556_L_0_k_5 │ ├── hash_AB243556_L_0_k_6 │ ├── hash_AB243556_L_0_k_7 │ ├── hash_AB243556_L_0_k_8 │ ├── hash_AB276040_L_0_k_1 │ ├── hash_AB276040_L_0_k_2 │ ├── hash_AB276040_L_0_k_3 │ ├── hash_AB276040_L_0_k_4 │ ├── hash_AB276040_L_0_k_5 │ ├── 
hash_AB276040_L_0_k_6 │ ├── hash_AB276040_L_0_k_7 │ ├── hash_AB276040_L_0_k_8 │ ├── hash_AB366653_L_0_k_1 │ ├── hash_AB366653_L_0_k_2 │ ├── hash_AB366653_L_0_k_3 │ ├── hash_AB366653_L_0_k_4 │ ├── hash_AB366653_L_0_k_5 │ ├── hash_AB366653_L_0_k_6 │ ├── hash_AB366653_L_0_k_7 │ ├── hash_AB366653_L_0_k_8 │ ├── hash_AB370205_L_0_k_1 │ ├── hash_AB370205_L_0_k_2 │ ├── hash_AB370205_L_0_k_3 │ ├── hash_AB370205_L_0_k_4 │ ├── hash_AB370205_L_0_k_5 │ ├── hash_AB370205_L_0_k_6 │ ├── hash_AB370205_L_0_k_7 │ ├── hash_AB370205_L_0_k_8 │ ├── hash_AB370268_L_0_k_1 │ ├── hash_AB370268_L_0_k_2 │ ├── hash_AB370268_L_0_k_3 │ ├── hash_AB370268_L_0_k_4 │ ├── hash_AB370268_L_0_k_5 │ ├── hash_AB370268_L_0_k_6 │ ├── hash_AB370268_L_0_k_7 │ ├── hash_AB370268_L_0_k_8 │ ├── hash_AB451219_L_0_k_1 │ ├── hash_AB451219_L_0_k_2 │ ├── hash_AB451219_L_0_k_3 │ ├── hash_AB451219_L_0_k_4 │ ├── hash_AB451219_L_0_k_5 │ ├── hash_AB451219_L_0_k_6 │ ├── hash_AB451219_L_0_k_7 │ ├── hash_AB451219_L_0_k_8 │ ├── hash_AB472900_L_0_k_1 │ ├── hash_AB472900_L_0_k_2 │ ├── hash_AB472900_L_0_k_3 │ ├── hash_AB472900_L_0_k_4 │ ├── hash_AB472900_L_0_k_5 │ ├── hash_AB472900_L_0_k_6 │ ├── hash_AB472900_L_0_k_7 │ └── hash_AB472900_L_0_k_8 └── README.md /license.md: -------------------------------------------------------------------------------- 1 | USC-RL v1.0 2 | -------------------------------------------------------------------------------- /code/Makevars: -------------------------------------------------------------------------------- 1 | PKG_CXXFLAGS = -std=c++11 -------------------------------------------------------------------------------- /code/cafe_mac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/code/cafe_mac -------------------------------------------------------------------------------- /code/cafe_linux: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/younglululu/CAFE/HEAD/code/cafe_linux -------------------------------------------------------------------------------- /code/CAFEGUI.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/code/CAFEGUI.pyc -------------------------------------------------------------------------------- /code/cafe_win.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/code/cafe_win.exe -------------------------------------------------------------------------------- /code/mtTkinter.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/code/mtTkinter.pyc -------------------------------------------------------------------------------- /code/image/clear.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/code/image/clear.gif -------------------------------------------------------------------------------- /code/image/load.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/code/image/load.gif -------------------------------------------------------------------------------- /code/image/logo.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/code/image/logo.ico -------------------------------------------------------------------------------- /code/image/save.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/code/image/save.gif -------------------------------------------------------------------------------- /image/snapshot1.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/image/snapshot1.jpg -------------------------------------------------------------------------------- /image/snapshot2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/image/snapshot2.jpg -------------------------------------------------------------------------------- /image/snapshot3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/image/snapshot3.jpg -------------------------------------------------------------------------------- /image/snapshot4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/image/snapshot4.jpg -------------------------------------------------------------------------------- /image/snapshot5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/image/snapshot5.jpg -------------------------------------------------------------------------------- /image/snapshot6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/image/snapshot6.jpg -------------------------------------------------------------------------------- /image/snapshot7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/image/snapshot7.jpg -------------------------------------------------------------------------------- /image/snapshot8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/image/snapshot8.jpg 
-------------------------------------------------------------------------------- /image/snapshot9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/image/snapshot9.jpg -------------------------------------------------------------------------------- /code/image/addDir.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/code/image/addDir.gif -------------------------------------------------------------------------------- /code/image/addFile.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/code/image/addFile.gif -------------------------------------------------------------------------------- /code/image/openfile.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/code/image/openfile.gif -------------------------------------------------------------------------------- /code/image/remove.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/code/image/remove.gif -------------------------------------------------------------------------------- /code/image/setting.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/code/image/setting.gif -------------------------------------------------------------------------------- /code/image/zoomin.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/code/image/zoomin.gif -------------------------------------------------------------------------------- /code/image/zoomout.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/code/image/zoomout.gif -------------------------------------------------------------------------------- /example/hash/hash_AB243556_L_0_k_1: -------------------------------------------------------------------------------- 1 | VA\H!:3 -------------------------------------------------------------------------------- /example/hash/hash_AB008550_L_0_k_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB008550_L_0_k_1 -------------------------------------------------------------------------------- /example/hash/hash_AB008550_L_0_k_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB008550_L_0_k_2 -------------------------------------------------------------------------------- /example/hash/hash_AB008550_L_0_k_3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB008550_L_0_k_3 -------------------------------------------------------------------------------- /example/hash/hash_AB008550_L_0_k_4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB008550_L_0_k_4 -------------------------------------------------------------------------------- /example/hash/hash_AB008550_L_0_k_5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB008550_L_0_k_5 -------------------------------------------------------------------------------- /example/hash/hash_AB008550_L_0_k_6: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB008550_L_0_k_6 -------------------------------------------------------------------------------- /example/hash/hash_AB008550_L_0_k_7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB008550_L_0_k_7 -------------------------------------------------------------------------------- /example/hash/hash_AB008550_L_0_k_8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB008550_L_0_k_8 -------------------------------------------------------------------------------- /example/hash/hash_AB009866_L_0_k_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB009866_L_0_k_1 -------------------------------------------------------------------------------- /example/hash/hash_AB009866_L_0_k_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB009866_L_0_k_2 -------------------------------------------------------------------------------- /example/hash/hash_AB009866_L_0_k_3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB009866_L_0_k_3 -------------------------------------------------------------------------------- /example/hash/hash_AB009866_L_0_k_4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB009866_L_0_k_4 -------------------------------------------------------------------------------- 
/example/hash/hash_AB009866_L_0_k_5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB009866_L_0_k_5 -------------------------------------------------------------------------------- /example/hash/hash_AB009866_L_0_k_6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB009866_L_0_k_6 -------------------------------------------------------------------------------- /example/hash/hash_AB009866_L_0_k_7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB009866_L_0_k_7 -------------------------------------------------------------------------------- /example/hash/hash_AB009866_L_0_k_8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB009866_L_0_k_8 -------------------------------------------------------------------------------- /example/hash/hash_AB044554_L_0_k_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB044554_L_0_k_1 -------------------------------------------------------------------------------- /example/hash/hash_AB044554_L_0_k_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB044554_L_0_k_2 -------------------------------------------------------------------------------- /example/hash/hash_AB044554_L_0_k_3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB044554_L_0_k_3 
-------------------------------------------------------------------------------- /example/hash/hash_AB044554_L_0_k_4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB044554_L_0_k_4 -------------------------------------------------------------------------------- /example/hash/hash_AB044554_L_0_k_5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB044554_L_0_k_5 -------------------------------------------------------------------------------- /example/hash/hash_AB044554_L_0_k_6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB044554_L_0_k_6 -------------------------------------------------------------------------------- /example/hash/hash_AB044554_L_0_k_7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB044554_L_0_k_7 -------------------------------------------------------------------------------- /example/hash/hash_AB044554_L_0_k_8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB044554_L_0_k_8 -------------------------------------------------------------------------------- /example/hash/hash_AB045978_L_0_k_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB045978_L_0_k_1 -------------------------------------------------------------------------------- /example/hash/hash_AB045978_L_0_k_2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB045978_L_0_k_2 -------------------------------------------------------------------------------- /example/hash/hash_AB045978_L_0_k_3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB045978_L_0_k_3 -------------------------------------------------------------------------------- /example/hash/hash_AB045978_L_0_k_4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB045978_L_0_k_4 -------------------------------------------------------------------------------- /example/hash/hash_AB045978_L_0_k_5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB045978_L_0_k_5 -------------------------------------------------------------------------------- /example/hash/hash_AB045978_L_0_k_6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB045978_L_0_k_6 -------------------------------------------------------------------------------- /example/hash/hash_AB045978_L_0_k_7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB045978_L_0_k_7 -------------------------------------------------------------------------------- /example/hash/hash_AB045978_L_0_k_8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB045978_L_0_k_8 -------------------------------------------------------------------------------- /example/hash/hash_AB243556_L_0_k_2: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB243556_L_0_k_2 -------------------------------------------------------------------------------- /example/hash/hash_AB243556_L_0_k_3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB243556_L_0_k_3 -------------------------------------------------------------------------------- /example/hash/hash_AB243556_L_0_k_4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB243556_L_0_k_4 -------------------------------------------------------------------------------- /example/hash/hash_AB243556_L_0_k_5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB243556_L_0_k_5 -------------------------------------------------------------------------------- /example/hash/hash_AB243556_L_0_k_6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB243556_L_0_k_6 -------------------------------------------------------------------------------- /example/hash/hash_AB243556_L_0_k_7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB243556_L_0_k_7 -------------------------------------------------------------------------------- /example/hash/hash_AB243556_L_0_k_8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB243556_L_0_k_8 -------------------------------------------------------------------------------- 
/example/hash/hash_AB276040_L_0_k_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB276040_L_0_k_1 -------------------------------------------------------------------------------- /example/hash/hash_AB276040_L_0_k_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB276040_L_0_k_2 -------------------------------------------------------------------------------- /example/hash/hash_AB276040_L_0_k_3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB276040_L_0_k_3 -------------------------------------------------------------------------------- /example/hash/hash_AB276040_L_0_k_4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB276040_L_0_k_4 -------------------------------------------------------------------------------- /example/hash/hash_AB276040_L_0_k_5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB276040_L_0_k_5 -------------------------------------------------------------------------------- /example/hash/hash_AB276040_L_0_k_6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB276040_L_0_k_6 -------------------------------------------------------------------------------- /example/hash/hash_AB276040_L_0_k_7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB276040_L_0_k_7 
-------------------------------------------------------------------------------- /example/hash/hash_AB276040_L_0_k_8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB276040_L_0_k_8 -------------------------------------------------------------------------------- /example/hash/hash_AB366653_L_0_k_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB366653_L_0_k_1 -------------------------------------------------------------------------------- /example/hash/hash_AB366653_L_0_k_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB366653_L_0_k_2 -------------------------------------------------------------------------------- /example/hash/hash_AB366653_L_0_k_3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB366653_L_0_k_3 -------------------------------------------------------------------------------- /example/hash/hash_AB366653_L_0_k_4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB366653_L_0_k_4 -------------------------------------------------------------------------------- /example/hash/hash_AB366653_L_0_k_5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB366653_L_0_k_5 -------------------------------------------------------------------------------- /example/hash/hash_AB366653_L_0_k_6: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB366653_L_0_k_6 -------------------------------------------------------------------------------- /example/hash/hash_AB366653_L_0_k_7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB366653_L_0_k_7 -------------------------------------------------------------------------------- /example/hash/hash_AB366653_L_0_k_8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB366653_L_0_k_8 -------------------------------------------------------------------------------- /example/hash/hash_AB370205_L_0_k_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB370205_L_0_k_1 -------------------------------------------------------------------------------- /example/hash/hash_AB370205_L_0_k_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB370205_L_0_k_2 -------------------------------------------------------------------------------- /example/hash/hash_AB370205_L_0_k_3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB370205_L_0_k_3 -------------------------------------------------------------------------------- /example/hash/hash_AB370205_L_0_k_4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB370205_L_0_k_4 -------------------------------------------------------------------------------- /example/hash/hash_AB370205_L_0_k_5: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB370205_L_0_k_5 -------------------------------------------------------------------------------- /example/hash/hash_AB370205_L_0_k_6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB370205_L_0_k_6 -------------------------------------------------------------------------------- /example/hash/hash_AB370205_L_0_k_7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB370205_L_0_k_7 -------------------------------------------------------------------------------- /example/hash/hash_AB370205_L_0_k_8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB370205_L_0_k_8 -------------------------------------------------------------------------------- /example/hash/hash_AB370268_L_0_k_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB370268_L_0_k_1 -------------------------------------------------------------------------------- /example/hash/hash_AB370268_L_0_k_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB370268_L_0_k_2 -------------------------------------------------------------------------------- /example/hash/hash_AB370268_L_0_k_3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB370268_L_0_k_3 -------------------------------------------------------------------------------- 
/example/hash/hash_AB370268_L_0_k_4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB370268_L_0_k_4 -------------------------------------------------------------------------------- /example/hash/hash_AB370268_L_0_k_5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB370268_L_0_k_5 -------------------------------------------------------------------------------- /example/hash/hash_AB370268_L_0_k_6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB370268_L_0_k_6 -------------------------------------------------------------------------------- /example/hash/hash_AB370268_L_0_k_7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB370268_L_0_k_7 -------------------------------------------------------------------------------- /example/hash/hash_AB370268_L_0_k_8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB370268_L_0_k_8 -------------------------------------------------------------------------------- /example/hash/hash_AB451219_L_0_k_1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB451219_L_0_k_1 -------------------------------------------------------------------------------- /example/hash/hash_AB451219_L_0_k_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB451219_L_0_k_2 
-------------------------------------------------------------------------------- /example/hash/hash_AB451219_L_0_k_3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB451219_L_0_k_3 -------------------------------------------------------------------------------- /example/hash/hash_AB451219_L_0_k_4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB451219_L_0_k_4 -------------------------------------------------------------------------------- /example/hash/hash_AB451219_L_0_k_5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB451219_L_0_k_5 -------------------------------------------------------------------------------- /example/hash/hash_AB451219_L_0_k_6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB451219_L_0_k_6 -------------------------------------------------------------------------------- /example/hash/hash_AB451219_L_0_k_7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB451219_L_0_k_7 -------------------------------------------------------------------------------- /example/hash/hash_AB451219_L_0_k_8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB451219_L_0_k_8 -------------------------------------------------------------------------------- /example/hash/hash_AB472900_L_0_k_1: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB472900_L_0_k_1 -------------------------------------------------------------------------------- /example/hash/hash_AB472900_L_0_k_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB472900_L_0_k_2 -------------------------------------------------------------------------------- /example/hash/hash_AB472900_L_0_k_3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB472900_L_0_k_3 -------------------------------------------------------------------------------- /example/hash/hash_AB472900_L_0_k_4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB472900_L_0_k_4 -------------------------------------------------------------------------------- /example/hash/hash_AB472900_L_0_k_5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB472900_L_0_k_5 -------------------------------------------------------------------------------- /example/hash/hash_AB472900_L_0_k_6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB472900_L_0_k_6 -------------------------------------------------------------------------------- /example/hash/hash_AB472900_L_0_k_7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB472900_L_0_k_7 -------------------------------------------------------------------------------- /example/hash/hash_AB472900_L_0_k_8: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/younglululu/CAFE/HEAD/example/hash/hash_AB472900_L_0_k_8 -------------------------------------------------------------------------------- /code/Makefile: -------------------------------------------------------------------------------- 1 | CC = g++ -Wall -w -std=c++11 -Ofast -march=native 2 | 3 | LSOURCE = utils.cpp seq_model.cpp kmer.cpp dist_model.cpp output.cpp main.cpp 4 | LHEADER = utils.h SimpleMatrix.h seq_model.h kmer.h dist_model.h output.h 5 | 6 | cafe: $(LSOURCE) $(HEADER) 7 | $(CC) $(LSOURCE) -pthread -o cafe 8 | 9 | clean: 10 | rm -f *.o cafe 11 | -------------------------------------------------------------------------------- /code/output.h: -------------------------------------------------------------------------------- 1 | /*************************************************************** 2 | * Copyright (C) 2016 Yang Lu 3 | * Computational and Molecular Biology, Department of Biological Science 4 | * University of Southern California, LA, CA 90089, USA 5 | * 6 | * Related publication: 7 | * TBA 8 | ***************************************************************/ 9 | #ifndef _OUTPUT_H 10 | #define _OUTPUT_H 11 | 12 | #include "utils.h" 13 | #include "SimpleMatrix.h" 14 | 15 | enum OUTPUT_TYPE { PLAIN, PHYLIP, CYTOSCAPE, MDS }; 16 | 17 | class OutputWriter 18 | { 19 | public: 20 | static OutputWriter *getInstance(); 21 | void writeToFile(OUTPUT_TYPE arg_output_type, smat::Matrix* arg_distMat, std::vector* arg_nameVec, std::string str_arg_outputFileURL); 22 | void writeToConsole(OUTPUT_TYPE arg_output_type, smat::Matrix* arg_distMat, std::vector* arg_nameVec); 23 | 24 | private: 25 | OutputWriter(){} 26 | static OutputWriter* instance; 27 | }; 28 | 29 | #endif -------------------------------------------------------------------------------- /code/seq_model.h: -------------------------------------------------------------------------------- 1 | 
/*************************************************************** 2 | * Copyright (C) 2016 Yang Lu 3 | * Computational and Molecular Biology, Department of Biological Science 4 | * University of Southern California, LA, CA 90089, USA 5 | * 6 | * Related publication: 7 | * TBA 8 | ***************************************************************/ 9 | #ifndef _SEQ_MODEL_H 10 | #define _SEQ_MODEL_H 11 | 12 | #include "utils.h" 13 | 14 | class MarkovModel 15 | { 16 | public: 17 | MarkovModel(int i_arg_order); 18 | ~MarkovModel(); 19 | void normalize(); 20 | void print(); 21 | //void constructMarg(unsigned long long* vec_arg_orderDim); 22 | void constructMarg(unsigned long long* vec_arg_orderkmerIdx, unsigned long long* vec_arg_orderkmerCnt, unsigned long long l_arg_dim); 23 | void addMargProb(unsigned long long l_arg_currKmerIdx, double d_value); 24 | double getMargProb(unsigned long long l_arg_currKmerIdx); 25 | void addTransProb(unsigned long long l_arg_currKmerIdx, unsigned long long i_arg_route, double d_value); 26 | double getTransProb(unsigned long long l_arg_currKmerIdx, unsigned long long i_arg_route); 27 | int getOrder() { return i_order; } 28 | 29 | private: 30 | int i_order; 31 | unsigned long long i_rowDim; 32 | double** arr_obvTransProb; //non-empty rows by 4 matrix 33 | double* vec_obvMargProb; //non-empty rows by 1 matrix 34 | std::unordered_map* kmerIdxRowIdxTable; 35 | }; 36 | 37 | 38 | #endif -------------------------------------------------------------------------------- /code/output.cpp: -------------------------------------------------------------------------------- 1 | #include "output.h" 2 | 3 | OutputWriter* OutputWriter::instance = 0; 4 | 5 | OutputWriter* OutputWriter::getInstance() 6 | { 7 | if (!instance) instance = new OutputWriter(); 8 | return instance; 9 | } 10 | 11 | void OutputWriter::writeToFile(OUTPUT_TYPE arg_output_type, smat::Matrix* arg_distMat, std::vector* arg_nameVec, std::string str_arg_outputFileURL) 12 | { 13 | std::ofstream 
tmp_ofsPipe(str_arg_outputFileURL.c_str(), std::ofstream::out); 14 | if (PLAIN == arg_output_type || CYTOSCAPE == arg_output_type) 15 | { 16 | for (int i = 0; isize(); ++i) 17 | for (int j = i + 1; jsize(); ++j) 18 | tmp_ofsPipe << arg_nameVec->at(i) << "\t" << arg_nameVec->at(j) << "\t" << arg_distMat->get(i, j) << std::endl; 19 | } 20 | else if (PHYLIP == arg_output_type) 21 | { 22 | tmp_ofsPipe << arg_nameVec->size() << std::endl; 23 | for (int i = 0; isize(); ++i) 24 | { 25 | tmp_ofsPipe << arg_nameVec->at(i); 26 | for (int j = 0; jsize(); ++j) 27 | tmp_ofsPipe << "\t" << arg_distMat->get(i, j); 28 | tmp_ofsPipe << std::endl; 29 | } 30 | } 31 | else if (MDS == arg_output_type) 32 | { 33 | smat::Matrix * mat = arg_distMat->MDS_UCF(2, 30); 34 | for (int i = 0; isize(); ++i) 35 | { 36 | tmp_ofsPipe << arg_nameVec->at(i); 37 | for (int j = 0; jcolumns(); ++j) 38 | tmp_ofsPipe << "\t" << mat->get(i, j); 39 | tmp_ofsPipe << std::endl; 40 | } 41 | } 42 | 43 | tmp_ofsPipe.close(); 44 | } 45 | 46 | 47 | void OutputWriter::writeToConsole(OUTPUT_TYPE arg_output_type, smat::Matrix* arg_distMat, std::vector* arg_nameVec) 48 | { 49 | if (PLAIN == arg_output_type || CYTOSCAPE == arg_output_type) 50 | { 51 | for (int i = 0; isize(); ++i) 52 | for (int j = i + 1; jsize(); ++j) 53 | std::cout << arg_nameVec->at(i) << "\t" << arg_nameVec->at(j) << "\t" << arg_distMat->get(i, j) << std::endl; 54 | } 55 | else if (PHYLIP == arg_output_type) 56 | { 57 | std::cout << arg_nameVec->size() << std::endl; 58 | for (int i = 0; isize(); ++i) 59 | { 60 | std::cout << arg_nameVec->at(i); 61 | for (int j = 0; jsize(); ++j) 62 | std::cout << "\t" << arg_distMat->get(i, j); 63 | std::cout << std::endl; 64 | } 65 | } 66 | else if (MDS == arg_output_type) 67 | { 68 | smat::Matrix * mat = arg_distMat->MDS_UCF(2, 30); 69 | for (int i = 0; isize(); ++i) 70 | { 71 | std::cout << arg_nameVec->at(i); 72 | for (int j = 0; jcolumns(); ++j) 73 | std::cout << "\t" << mat->get(i, j); 74 | std::cout << 
std::endl; 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /code/seq_model.cpp: -------------------------------------------------------------------------------- 1 | #include "seq_model.h" 2 | #include 3 | #include 4 | 5 | MarkovModel::MarkovModel(int i_arg_order) { i_order = i_arg_order; i_rowDim = 0; } 6 | 7 | MarkovModel::~MarkovModel() 8 | { 9 | for (unsigned long long i = 0; i < i_rowDim; ++i) delete[] arr_obvTransProb[i]; 10 | delete[] arr_obvTransProb; delete[] vec_obvMargProb; delete kmerIdxRowIdxTable; 11 | } 12 | 13 | void MarkovModel::constructMarg(unsigned long long* vec_arg_orderkmerIdx, unsigned long long* vec_arg_orderkmerCnt, unsigned long long l_arg_dim) 14 | { 15 | kmerIdxRowIdxTable = new std::unordered_map(); 16 | 17 | i_rowDim = l_arg_dim; 18 | vec_obvMargProb = new double[i_rowDim]; memset(vec_obvMargProb, 0, sizeof(double) * i_rowDim); 19 | 20 | for (unsigned long long rowIdx = 0; rowIdx < l_arg_dim; ++rowIdx) 21 | { 22 | unsigned long long currKmerIdx = vec_arg_orderkmerIdx[rowIdx]; 23 | (*kmerIdxRowIdxTable)[currKmerIdx] = rowIdx; 24 | vec_obvMargProb[rowIdx] = vec_arg_orderkmerCnt[rowIdx]; 25 | } 26 | 27 | arr_obvTransProb = new double*[i_rowDim]; 28 | for (unsigned long long i = 0; i < i_rowDim; ++i) 29 | { 30 | arr_obvTransProb[i] = new double[BASE]; 31 | memset(arr_obvTransProb[i], 0, sizeof(double) * BASE); 32 | } 33 | } 34 | 35 | void MarkovModel::normalize() 36 | { 37 | double d_tmp_totalCnt = 0; 38 | for (std::unordered_map::iterator iter = kmerIdxRowIdxTable->begin(); iter != kmerIdxRowIdxTable->end(); iter++) 39 | { 40 | unsigned long long currKmerIdx = iter->first; 41 | unsigned long long newIdx = (*kmerIdxRowIdxTable)[currKmerIdx]; 42 | 43 | d_tmp_totalCnt += vec_obvMargProb[newIdx]; 44 | double d_tmp_localCnt = 0; 45 | for (unsigned long long j = 0; j < BASE; ++j) d_tmp_localCnt += arr_obvTransProb[newIdx][j]; 46 | 47 | if (d_tmp_localCnt > 0) 48 | { 49 | //for (unsigned 
long long j = 0; j < BASE; ++j) arr_obvTransProb[newIdx][j] /= d_tmp_localCnt; 50 | for (unsigned long long j = 0; j < BASE; ++j) 51 | { 52 | if (arr_obvTransProb[newIdx][j] > 0) arr_obvTransProb[newIdx][j] = log(arr_obvTransProb[newIdx][j]) - log(d_tmp_localCnt); 53 | } 54 | } 55 | } 56 | 57 | if (d_tmp_totalCnt > 0) 58 | { 59 | double d_tmp_totalCnt_log = log(d_tmp_totalCnt); 60 | for (std::unordered_map::iterator iter = kmerIdxRowIdxTable->begin(); iter != kmerIdxRowIdxTable->end(); iter++) 61 | { 62 | unsigned long long currKmerIdx = iter->first; 63 | unsigned long long newIdx = (*kmerIdxRowIdxTable)[currKmerIdx]; 64 | //vec_obvMargProb[newIdx] /= d_tmp_totalCnt; 65 | if (vec_obvMargProb[newIdx] > 0) vec_obvMargProb[newIdx] = log(vec_obvMargProb[newIdx]) - d_tmp_totalCnt_log; 66 | } 67 | } 68 | 69 | } 70 | 71 | void MarkovModel::print() 72 | { 73 | std::vector idxVec; 74 | for (std::unordered_map::iterator iter = kmerIdxRowIdxTable->begin(); iter != kmerIdxRowIdxTable->end(); iter++) idxVec.push_back(iter->first); 75 | std::sort(idxVec.begin(), idxVec.end()); 76 | 77 | for (std::vector::iterator iter = idxVec.begin(); iter != idxVec.end(); iter++) 78 | { 79 | unsigned long long currKmerIdx = *iter; 80 | unsigned long long newIdx = (*kmerIdxRowIdxTable)[currKmerIdx]; 81 | std::cout << currKmerIdx << "\t" << vec_obvMargProb[newIdx] << "\t"; 82 | std::cout << arr_obvTransProb[newIdx][0] << " " << arr_obvTransProb[newIdx][1] << " " << arr_obvTransProb[newIdx][2] << " " << arr_obvTransProb[newIdx][3] << std::endl; 83 | } 84 | 85 | } 86 | 87 | void MarkovModel::addMargProb(unsigned long long l_arg_currKmerIdx, double d_value) 88 | { 89 | if (kmerIdxRowIdxTable->find(l_arg_currKmerIdx) == kmerIdxRowIdxTable->end()) return; 90 | vec_obvMargProb[(*kmerIdxRowIdxTable)[l_arg_currKmerIdx]] += d_value; 91 | } 92 | 93 | double MarkovModel::getMargProb(unsigned long long l_arg_currKmerIdx) 94 | { 95 | if (kmerIdxRowIdxTable->find(l_arg_currKmerIdx) == 
kmerIdxRowIdxTable->end()) return 0; 96 | return vec_obvMargProb[(*kmerIdxRowIdxTable)[l_arg_currKmerIdx]]; 97 | } 98 | 99 | void MarkovModel::addTransProb(unsigned long long l_arg_currKmerIdx, unsigned long long i_arg_route, double d_value) 100 | { 101 | if (kmerIdxRowIdxTable->find(l_arg_currKmerIdx) == kmerIdxRowIdxTable->end()) return; 102 | arr_obvTransProb[(*kmerIdxRowIdxTable)[l_arg_currKmerIdx]][i_arg_route] += d_value; 103 | } 104 | 105 | double MarkovModel::getTransProb(unsigned long long l_arg_currKmerIdx, unsigned long long i_arg_route) 106 | { 107 | if (kmerIdxRowIdxTable->find(l_arg_currKmerIdx) == kmerIdxRowIdxTable->end()) return 0; 108 | return arr_obvTransProb[(*kmerIdxRowIdxTable)[l_arg_currKmerIdx]][i_arg_route]; 109 | } 110 | -------------------------------------------------------------------------------- /code/utils.h: -------------------------------------------------------------------------------- 1 | /*************************************************************** 2 | * Copyright (C) 2016 Yang Lu 3 | * Computational and Molecular Biology, Department of Biological Science 4 | * University of Southern California, LA, CA 90089, USA 5 | * 6 | * Related publication: 7 | * TBA 8 | ***************************************************************/ 9 | #ifndef _Utils_H 10 | #define _Utils_H 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include 26 | #include 27 | #include 28 | 29 | #include 30 | #include 31 | #include 32 | 33 | 34 | #define BASE 4 35 | #define MAX_ORDER 10 36 | #define LOG2 log(2) 37 | 38 | // if the char char_arg_nt violates ACGTOnly condition, return -1; Otherwise, 39 | // map A/a, C/c, G/g, T/t to 0, 1, 2, 3 respectively. 
40 | int nt2int(char char_arg_nt); 41 | 42 | // if the char char_arg_nt violates ACGTOnly condition, return N; Otherwise, 43 | // map A/a, C/c, G/g, T/t to T, G, C, A respectively. 44 | char nt2ComplementNt(char char_arg_nt); 45 | 46 | // if the seq violates ACGTOnly condition, return -1; Otherwise, map the seq 47 | // into an integer ranging from 0 to 4^{length(seq)}-1, i.e. the corresponding 48 | // index in the Markov transition matrix or k-mer vector. For example, 49 | // nt2index("AAAA")=0, nt2index("TAAA")=64, nt2index("TTTT")=255, etc. 50 | // 51 | // str_arg_seq: the m-mer in m-order Markov model or k-mer seq 52 | unsigned long long nt2index(const std::string & str_arg_seq); 53 | 54 | unsigned long long index2revCompleIdx(unsigned long long i_arg_index, int i_arg_seqLength); 55 | 56 | // the inverse operation of nt2index 57 | // 58 | // i_arg_index: the index in the Markov transition matrix or k-mer vector 59 | // i_arg_seqLength: the length of seq 60 | std::string index2nt(unsigned long long i_arg_index, int i_arg_seqLength); 61 | 62 | // reverse complement of the input str, expect to satisfy ACGTOnly condition 63 | std::string revComplementStr(const std::string & currStr); 64 | 65 | //complement of the input str, expect to satisfy ACGTOnly condition 66 | std::string complementStr(const std::string & currStr); 67 | 68 | // reverse of the input str 69 | std::string revStr(const std::string & currStr); 70 | 71 | std::string trim(std::string currStr); 72 | 73 | void split(const std::string& currStr, std::string delim, std::vector & ret); 74 | 75 | std::string toLowerCase(std::string currStr); 76 | 77 | bool endsWith(std::string const & currStr, std::string const & ending); 78 | 79 | /*unsigned long long updateIdx(unsigned long long i_arg_currIdx, const std::string & str_arg_appendStr) 80 | { 81 | unsigned long long i_tmp_newIdx = i_arg_currIdx; 82 | 83 | for (unsigned int i = 0; i < str_arg_appendStr.size(); ++i) 84 | { 85 | i_tmp_newIdx = (i_tmp_newIdx << 2) 
+ nt2int(str_arg_appendStr.at(i)); 86 | 87 | if (i_tmp_newIdx < i_arg_currIdx) 88 | throw std::runtime_error(" left shifting cause overflow! "); 89 | } 90 | return i_tmp_newIdx; 91 | }*/ 92 | 93 | bool file_exists(const std::string & str_arg_filename); 94 | bool dir_exists(const std::string & str_arg_directory); 95 | std::string getFileName(const std::string & str_arg_path); 96 | 97 | template void free_vec_ptr(std::vector & v) 98 | { 99 | int size = v.size(); 100 | T* p = NULL; 101 | for (int i = 0; i < size; i++) 102 | { 103 | p = v[i]; 104 | delete[] p; 105 | } 106 | v.clear(); 107 | } 108 | 109 | // find the max and argmax in an array 110 | template T max(const T * x, int n, int* argmax) 111 | { 112 | *argmax = 0; 113 | T max_val = x[0]; 114 | for (int i = 1; i < n; i++) 115 | if (x[i] > max_val) 116 | { 117 | max_val = x[i]; 118 | *argmax = i; 119 | } 120 | return max_val; 121 | } 122 | 123 | // find the max and argmax in an vector 124 | template T max_vec(std::vector & v, int n, int* argmax) 125 | { 126 | *argmax = 0; 127 | T max_val = v[0]; 128 | for (int i = 1; i < n; i++) 129 | if (v[i] > max_val) 130 | { 131 | max_val = v[i]; 132 | *argmax = i; 133 | } 134 | return max_val; 135 | } 136 | 137 | 138 | // find the min and argmin in an vector 139 | template T min_vec(std::vector & v, int n, int* argmin) 140 | { 141 | *argmin = 0; 142 | T min_val = v[0]; 143 | for (int i = 1; i < n; i++) 144 | if (v[i] < min_val) 145 | { 146 | min_val = v[i]; 147 | *argmin = i; 148 | } 149 | return min_val; 150 | } 151 | 152 | //given log(a) and log(b), return log(a + b) 153 | double log_sum(double log_a, double log_b); 154 | 155 | // give a_1, ..., a_n, 156 | // return log(exp(a_1)+...+exp(a_n)) 157 | double log_normalize(double * array, int nlen); 158 | 159 | // the vector version 160 | double log_normalize(std::vector & vec, int nlen); 161 | 162 | //given log(a) and log(b), return log(a - b) a>b 163 | double log_subtract(double log_a, double log_b); 164 | 165 | bool 
almostEquals(double a, double b); 166 | 167 | namespace patch 168 | { 169 | template < typename T > std::string to_string( const T& n ) 170 | { 171 | std::ostringstream stm ; 172 | stm << n ; 173 | return stm.str() ; 174 | } 175 | } 176 | 177 | #endif -------------------------------------------------------------------------------- /code/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | int nt2int(char char_arg_nt) 10 | { 11 | int i_returnVal = -1; 12 | if (char_arg_nt == 'A' || char_arg_nt == 'a') i_returnVal = 0; 13 | else if (char_arg_nt == 'C' || char_arg_nt == 'c') i_returnVal = 1; 14 | else if (char_arg_nt == 'G' || char_arg_nt == 'g') i_returnVal = 2; 15 | else if (char_arg_nt == 'T' || char_arg_nt == 't') i_returnVal = 3; 16 | return i_returnVal; 17 | } 18 | 19 | char nt2ComplementNt(char char_arg_nt) 20 | { 21 | int c_returnChar = 'N'; 22 | if (char_arg_nt == 'A' || char_arg_nt == 'a') c_returnChar = 'T'; 23 | else if (char_arg_nt == 'C' || char_arg_nt == 'c') c_returnChar = 'G'; 24 | else if (char_arg_nt == 'G' || char_arg_nt == 'g') c_returnChar = 'C'; 25 | else if (char_arg_nt == 'T' || char_arg_nt == 't') c_returnChar = 'A'; 26 | return c_returnChar; 27 | } 28 | 29 | unsigned long long nt2index(const std::string & str_arg_seq) 30 | { 31 | //return updateIdx((unsigned long long)0, str_arg_seq); 32 | unsigned long long i_tmp_newIdx = 0; 33 | 34 | for (unsigned int i = 0; i < str_arg_seq.size(); ++i) 35 | { 36 | i_tmp_newIdx = (i_tmp_newIdx << 2) + nt2int(str_arg_seq.at(i)); 37 | 38 | if (i_tmp_newIdx < 0) 39 | throw std::runtime_error(" left shifting cause overflow! 
"); 40 | } 41 | return i_tmp_newIdx; 42 | } 43 | 44 | std::string index2nt(unsigned long long i_arg_index, int i_arg_seqLength) 45 | { 46 | if (i_arg_index < 0) return ""; 47 | 48 | std::string str_tmp_dict = "ACGT"; 49 | unsigned long long i_tmp_currIndex = i_arg_index; 50 | std::stringstream tmp_revStrStream; 51 | for (int idx = 0; idx < i_arg_seqLength; ++idx) 52 | { 53 | tmp_revStrStream << str_tmp_dict.at(i_tmp_currIndex % BASE); 54 | i_tmp_currIndex = (i_tmp_currIndex >> 2); 55 | } 56 | 57 | std::string str_revStr = tmp_revStrStream.str(); 58 | std::stringstream tmp_strStream; 59 | for (std::string::reverse_iterator rit = str_revStr.rbegin(); rit != str_revStr.rend(); ++rit) tmp_strStream << *rit; 60 | return tmp_strStream.str(); 61 | } 62 | 63 | unsigned long long index2revCompleIdx(unsigned long long i_arg_index, int i_arg_seqLength) 64 | { 65 | if (i_arg_index < 0) return 0; 66 | 67 | std::string str_tmp_dict = "ACGT"; 68 | unsigned long long i_tmp_currIndex = i_arg_index, revCompleIdx = 0; 69 | 70 | for (int idx = 0; idx < i_arg_seqLength; ++idx) 71 | { 72 | revCompleIdx = (revCompleIdx << 2) + (BASE - 1 - (i_tmp_currIndex % BASE)); 73 | i_tmp_currIndex = (i_tmp_currIndex >> 2); 74 | } 75 | return revCompleIdx; 76 | //return nt2index(revComplementStr(index2nt(i_arg_index, i_arg_seqLength))); 77 | } 78 | 79 | std::string revComplementStr(const std::string & currStr) 80 | { 81 | std::stringstream ss; 82 | for (int i = currStr.length() - 1; i >= 0; --i) ss << nt2ComplementNt(currStr[i]); 83 | return ss.str(); 84 | } 85 | 86 | std::string complementStr(const std::string & currStr) 87 | { 88 | std::stringstream ss; 89 | for (int i = 0; i < currStr.length(); ++i) ss << nt2ComplementNt(currStr[i]); 90 | return ss.str(); 91 | } 92 | 93 | std::string revStr(const std::string & currStr) 94 | { 95 | std::stringstream ss; 96 | for (int i = currStr.length() - 1; i >= 0; --i) ss << currStr[i]; 97 | return ss.str(); 98 | } 99 | 100 | std::string trim(std::string 
currStr) 101 | { 102 | if (currStr.empty()) return currStr; 103 | 104 | currStr.erase(0, currStr.find_first_not_of(" ")); 105 | currStr.erase(currStr.find_last_not_of(" ") + 1); 106 | return currStr; 107 | } 108 | 109 | void split(const std::string& currStr, std::string delim, std::vector & ret) 110 | { 111 | size_t last = 0; 112 | size_t index = currStr.find_first_of(delim, last); 113 | 114 | while (index != std::string::npos) 115 | { 116 | ret.push_back(trim(currStr.substr(last, index - last))); 117 | last = index + 1; 118 | index = currStr.find_first_of(delim, last); 119 | } 120 | if (index - last>0) ret.push_back(trim(currStr.substr(last, index - last))); 121 | } 122 | 123 | std::string toLowerCase(std::string currStr) 124 | { 125 | std::string str_tmp_out; 126 | std::transform(currStr.begin(), currStr.end(), std::back_inserter(str_tmp_out), ::tolower); 127 | return str_tmp_out; 128 | } 129 | 130 | bool endsWith(std::string const & currStr, std::string const & ending) 131 | { 132 | if (ending.size() > currStr.size()) return false; 133 | return std::equal(ending.rbegin(), ending.rend(), currStr.rbegin()); 134 | } 135 | 136 | bool file_exists(const std::string & str_arg_filename) 137 | { 138 | std::ifstream f(str_arg_filename.c_str()); 139 | if (f.good()) { f.close(); return true; } 140 | else { f.close(); return false; } 141 | } 142 | 143 | bool dir_exists(const std::string & str_arg_directory) 144 | { 145 | struct stat buffer; 146 | return (stat(str_arg_directory.c_str(), &buffer) == 0); 147 | } 148 | 149 | std::string getFileName(const std::string & str_arg_path) 150 | { 151 | std::string base = str_arg_path.substr(str_arg_path.find_last_of("/\\") + 1); 152 | const size_t period_idx = base.rfind('.'); 153 | if (std::string::npos != period_idx) base.erase(period_idx); 154 | return base; 155 | } 156 | 157 | 158 | double log_sum(double log_a, double log_b) 159 | { 160 | double v; 161 | if (log_a < log_b) v = log_b + log(1 + exp(log_a - log_b)); 162 | else v = 
log_a + log(1 + exp(log_b - log_a)); 163 | return v; 164 | } 165 | 166 | 167 | double log_normalize(double * array, int nlen) 168 | { 169 | const double log_max = 100.0; // the log(maximum in double precision), make sure it is large enough. 170 | int argmax; 171 | double max_val = max(array, nlen, &argmax); //get the maximum value in the array to avoid overflow 172 | double log_shift = log_max - log(nlen + 1.0) - max_val; 173 | double sum = 0.0; 174 | for (int i = 0; i < nlen; i++) sum += exp(array[i] + log_shift); //shift it 175 | 176 | double log_norm = log(sum) - log_shift; 177 | for (int i = 0; i < nlen; i++) array[i] -= log_norm; //shift it back 178 | return log_norm; 179 | } 180 | 181 | 182 | double log_normalize(std::vector & vec, int nlen) 183 | { 184 | const double log_max = 100.0; // the log(maximum in double precision), make sure it is large enough. 185 | int argmax; 186 | double max_val = max_vec(vec, nlen, &argmax); //get the maximum value in the array to avoid overflow 187 | double log_shift = log_max - log(nlen + 1.0) - max_val; 188 | double sum = 0.0; 189 | for (int i = 0; i < nlen; i++) sum += exp(vec[i] + log_shift); //shift it 190 | 191 | double log_norm = log(sum) - log_shift; 192 | for (int i = 0; i < nlen; i++) vec[i] -= log_norm; //shift it back 193 | return log_norm; 194 | } 195 | 196 | 197 | double log_subtract(double log_a, double log_b) 198 | { 199 | if (log_a < log_b) return -1000.0; 200 | double v; 201 | v = log_a + log(1 - exp(log_b - log_a)); 202 | return v; 203 | } 204 | 205 | bool almostEquals(double a, double b) 206 | { 207 | return std::fabs(a - b) <= std::numeric_limits::epsilon(); 208 | } -------------------------------------------------------------------------------- /code/kmer.h: -------------------------------------------------------------------------------- 1 | /*************************************************************** 2 | * Copyright (C) 2016 Yang Lu 3 | * Computational and Molecular Biology, Department of Biological 
Science 4 | * University of Southern California, LA, CA 90089, USA 5 | * 6 | * Related publication: 7 | * TBA 8 | ***************************************************************/ 9 | #ifndef _KMER_H 10 | #define _KMER_H 11 | 12 | #include "seq_model.h" 13 | 14 | class AbsIter 15 | { 16 | public: 17 | AbsIter(int i_arg_k, std::unordered_map* arg_kmerCntUnorderMap) { i_k = i_arg_k; currKmer = 0; kmerCntUnorderMap = arg_kmerCntUnorderMap; }; 18 | virtual ~AbsIter() {} 19 | 20 | virtual void operator++() = 0; 21 | virtual bool hasNext() = 0; 22 | virtual double operator*() = 0; 23 | 24 | void operator++(int) { ++(*this); }; 25 | unsigned long long getCurrKmer() { return currKmer; } 26 | 27 | public: 28 | int i_k; 29 | unsigned long long currKmer; 30 | std::unordered_map* kmerCntUnorderMap; 31 | }; 32 | 33 | class KmerCntTraverseIter : public AbsIter 34 | { 35 | public: 36 | KmerCntTraverseIter(int i_arg_k, std::unordered_map* arg_kmerCntUnorderMap) : AbsIter(i_arg_k, arg_kmerCntUnorderMap) { maxAllowedIdx = (unsigned long long)pow(BASE, i_arg_k) - 1; } 37 | void operator++() { currKmer++; }; 38 | bool hasNext() { return (currKmer <= maxAllowedIdx); }; 39 | double operator*() { return (*kmerCntUnorderMap)[currKmer]; }; 40 | 41 | public: 42 | unsigned long long maxAllowedIdx; 43 | }; 44 | 45 | class AbsHashIter : public AbsIter 46 | { 47 | public: 48 | AbsHashIter(int i_arg_k, std::unordered_map* arg_kmerCntUnorderMap, std::vector* arg_kmerVec) : AbsIter(i_arg_k, arg_kmerCntUnorderMap) { currKmerVecIdx = 0; kmerVec = arg_kmerVec; currKmer = kmerVec->at(currKmerVecIdx); } 49 | void operator++() { currKmerVecIdx++; if (currKmerVecIdx < kmerVec->size()) currKmer = kmerVec->at(currKmerVecIdx); }; 50 | bool hasNext() { return (currKmerVecIdx < kmerVec->size()); }; 51 | virtual double operator*() = 0; 52 | 53 | public: 54 | unsigned long long currKmerVecIdx; 55 | std::vector* kmerVec; 56 | }; 57 | 58 | class KmerCntHashIter : public AbsHashIter 59 | { 60 | public: 61 | 
KmerCntHashIter(int i_arg_k, std::unordered_map* arg_kmerCntUnorderMap, std::vector* arg_kmerVec) : AbsHashIter(i_arg_k, arg_kmerCntUnorderMap, arg_kmerVec) {} 62 | double operator*() { return (*kmerCntUnorderMap)[currKmer]; } 63 | }; 64 | 65 | class KmerFreqHashIter : public AbsHashIter 66 | { 67 | public: 68 | KmerFreqHashIter(int i_arg_k, std::unordered_map* arg_kmerCntUnorderMap, std::vector* arg_kmerVec, unsigned long l_arg_totalKmer) : AbsHashIter(i_arg_k, arg_kmerCntUnorderMap, arg_kmerVec) { totalKmerInv = 1.0 / l_arg_totalKmer; } 69 | double operator*() { return totalKmerInv * (*kmerCntUnorderMap)[currKmer]; } 70 | 71 | private: 72 | double totalKmerInv; 73 | }; 74 | 75 | class KmerProbDelegate 76 | { 77 | public: 78 | KmerProbDelegate(int i_arg_k, MarkovModel* arg_mrkvModel, bool b_arg_isRevCompl = false); 79 | 80 | void init(); 81 | double getKmerlogProb(unsigned long long queryNextKmerIdx); 82 | 83 | private: 84 | bool push(unsigned long long idx); 85 | bool push(std::stack & indices); 86 | bool increment(); 87 | unsigned long long pop(); 88 | 89 | private: 90 | MarkovModel* markovModel; 91 | bool isEnd, isRevCompl; 92 | int i_k; 93 | unsigned long long nextPosKmerIdx, maxAllowedIdx; 94 | 95 | std::stack kmer_traceStack; 96 | std::stack orderIdx_traceStack; 97 | std::stack logProbProd_traceStack; 98 | std::stack lowerIdx_traceStack, upperIdx_traceStack; 99 | }; 100 | 101 | class KmerProbEnsembDelegate 102 | { 103 | public: 104 | KmerProbEnsembDelegate(int i_arg_k, MarkovModel* arg_mrkvModel, bool b_arg_singleStrain); 105 | ~KmerProbEnsembDelegate(); 106 | 107 | void init(); 108 | double getKmerlogProb(unsigned long long queryNextKmerIdx); 109 | 110 | private: 111 | bool b_singleStrain; 112 | int i_k; 113 | MarkovModel* markovModel; 114 | KmerProbDelegate *kmerProbDelegate, *revComplKmerProbDelegate; 115 | }; 116 | 117 | class AbsDistStrategy 118 | { 119 | public: 120 | AbsDistStrategy(int i_arg_k, bool b_arg_singleStrain) { i_k = i_arg_k; b_singleStrain 
= b_arg_singleStrain; } 121 | virtual double getDist() = 0; 122 | 123 | public: 124 | int i_k; 125 | bool b_singleStrain; 126 | }; 127 | 128 | 129 | class AbsTupleDistStrategy : public AbsDistStrategy 130 | { 131 | public: 132 | AbsTupleDistStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsDistStrategy(i_arg_k, b_arg_singleStrain){} 133 | virtual void dealWithTuple(double src_X_w, double trgt_X_w) = 0; 134 | }; 135 | 136 | 137 | class AbsQuadStrategy : public AbsDistStrategy 138 | { 139 | public: 140 | AbsQuadStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsDistStrategy(i_arg_k, b_arg_singleStrain){} 141 | virtual void dealWithQuad(double src_X_w, double src_EX_w, double trgt_X_w, double trgt_EX_w) = 0; 142 | }; 143 | 144 | class AbsMrkvStrategy : public AbsDistStrategy 145 | { 146 | public: 147 | AbsMrkvStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsDistStrategy(i_arg_k, b_arg_singleStrain){} 148 | virtual void dealWithMrkv(MarkovModel* src_mrkvModel, MarkovModel* trgt_mrkvModel) = 0; 149 | }; 150 | 151 | 152 | class IterFactory 153 | { 154 | public: 155 | static IterFactory *getInstance(); 156 | 157 | double getFreqDist(AbsTupleDistStrategy* distStrategy, 158 | std::unordered_map* src_kmerCntUnorderMap, std::vector* src_kmerVec, unsigned long src_totalKmer, 159 | std::unordered_map* trgt_kmerCntUnorderMap, std::vector* trgt_kmerVec, unsigned long trgt_totalKmer); 160 | 161 | double getCntDist(AbsTupleDistStrategy* distStrategy, int i_arg_lowerCnt, 162 | std::unordered_map* src_kmerCntUnorderMap, std::vector* src_kmerVec, 163 | std::unordered_map* trgt_kmerCntUnorderMap, std::vector* trgt_kmerVec); 164 | 165 | //specific to ChiSq Distance 166 | double getCntDist(AbsQuadStrategy* distStrategy, 167 | std::unordered_map* src_kmerCntUnorderMap, std::vector* src_kmerVec, 168 | std::unordered_map* trgt_kmerCntUnorderMap, std::vector* trgt_kmerVec); 169 | 170 | double getCntExpDist(AbsQuadStrategy* distStrategy, int i_arg_lowerCnt, 171 | std::unordered_map* 
src_kmerCntUnorderMap, std::vector* src_kmerVec, MarkovModel* src_mrkvModel, unsigned long src_totalKmer, 172 | std::unordered_map* trgt_kmerCntUnorderMap, std::vector* trgt_kmerVec, MarkovModel* trgt_mrkvModel, unsigned long trgt_totalKmer); 173 | 174 | double getMrkvDist(AbsMrkvStrategy* distStrategy, MarkovModel* src_mrkvModel, MarkovModel* trgt_mrkvModel); 175 | 176 | double getCoPhylogDist(int i_arg_k, 177 | std::unordered_map* src_kmerCntUnorderMap, std::vector* src_kmerVec, 178 | std::unordered_map* trgt_kmerCntUnorderMap, std::vector* trgt_kmerVec); 179 | 180 | AbsIter* getKmerCntIterator(int i_arg_k, std::unordered_map* arg_kmerCntUnorderMap, std::vector* arg_kmerVec, int i_arg_lowerCnt); 181 | AbsIter* getKmerFreqIterator(int i_arg_k, std::unordered_map* arg_kmerCntUnorderMap, std::vector* arg_kmerVec, unsigned long l_arg_totalKmer); 182 | KmerProbEnsembDelegate* getKmerProbDelegate(int i_arg_k, bool b_arg_singleStrain, MarkovModel* arg_mrkvModel); 183 | 184 | private: 185 | IterFactory(){} 186 | static IterFactory* instance; 187 | }; 188 | 189 | 190 | class KmerModel 191 | { 192 | public: 193 | KmerModel(int i_arg_k, bool b_arg_singleStrain) { i_k = i_arg_k; b_singleStrain = b_arg_singleStrain; kmerCntUnorderMap = new std::unordered_map(); kmerVec = new std::vector(); } 194 | ~KmerModel() { delete kmerCntUnorderMap; delete kmerVec; } 195 | 196 | bool load(int i_arg_k, std::string str_arg_inputURL); 197 | bool saveFromLargerK(int i_arg_k, int i_arg_larger_k, std::string str_arg_inputURL, std::string str_arg_outputURL); 198 | bool saveFromJellyFish(std::string str_arg_jfTxtURL, std::string str_arg_outputURL); 199 | bool saveFromFasta(int i_arg_k, std::string str_arg_fastaFileURL, std::string str_arg_outputURL); 200 | 201 | unsigned long totalKmer(); 202 | MarkovModel* getMarkovModel(int i_arg_order, std::string str_arg_saveURLPrefix); 203 | 204 | private: 205 | bool save(std::map *kmerCntMap, std::string str_arg_outputURL); 206 | 207 | public: 208 | int 
i_k; 209 | bool b_singleStrain; 210 | std::unordered_map *kmerCntUnorderMap; 211 | std::vector* kmerVec; 212 | }; 213 | 214 | 215 | int getEstMarkovOrder(int i_arg_k, std::string str_arg_saveURLPrefix, std::string str_arg_seqName); 216 | 217 | #endif -------------------------------------------------------------------------------- /code/mtTkinter.py: -------------------------------------------------------------------------------- 1 | '''Thread-safe version of Tkinter. 2 | 3 | Copyright (c) 2009, Allen B. Taylor 4 | 5 | This module is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU Lesser Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU Lesser Public License for more details. 14 | 15 | You should have received a copy of the GNU Lesser Public License 16 | along with this program. If not, see . 17 | 18 | Usage: 19 | 20 | import mtTkinter as Tkinter 21 | # Use "Tkinter." as usual. 22 | 23 | or 24 | 25 | from mtTkinter import * 26 | # Use Tkinter module definitions as usual. 27 | 28 | This module modifies the original Tkinter module in memory, making all 29 | functionality thread-safe. It does this by wrapping the Tk class' tk 30 | instance with an object that diverts calls through an event queue when 31 | the call is issued from a thread other than the thread in which the Tk 32 | instance was created. The events are processed in the creation thread 33 | via an 'after' event. 34 | 35 | The modified Tk class accepts two additional keyword parameters on its 36 | __init__ method: 37 | mtDebug: 38 | 0 = No debug output (default) 39 | 1 = Minimal debug output 40 | ... 
41 | 9 = Full debug output 42 | mtCheckPeriod: 43 | Amount of time in milliseconds (default 10) between checks for 44 | out-of-thread events when things are otherwise idle. Decreasing 45 | this value can improve GUI responsiveness, but at the expense of 46 | consuming more CPU cycles. 47 | 48 | Note that, because it modifies the original Tkinter module (in memory), 49 | other modules that use Tkinter (e.g., Pmw) reap the benefits automagically 50 | as long as mtTkinter is imported at some point before extra threads are 51 | created. 52 | 53 | Author: Allen B. Taylor, a.b.taylor@gmail.com 54 | ''' 55 | 56 | from Tkinter import * 57 | import threading 58 | import Queue 59 | 60 | class _Tk(object): 61 | """ 62 | Wrapper for underlying attribute tk of class Tk. 63 | """ 64 | 65 | def __init__(self, tk, mtDebug = 0, mtCheckPeriod = 10): 66 | self._tk = tk 67 | 68 | # Create the incoming event queue. 69 | self._eventQueue = Queue.Queue(1) 70 | 71 | # Identify the thread from which this object is being created so we can 72 | # tell later whether an event is coming from another thread. 73 | self._creationThread = threading.currentThread() 74 | 75 | # Store remaining values. 76 | self._debug = mtDebug 77 | self._checkPeriod = mtCheckPeriod 78 | 79 | def __getattr__(self, name): 80 | # Divert attribute accesses to a wrapper around the underlying tk 81 | # object. 82 | return _TkAttr(self, getattr(self._tk, name)) 83 | 84 | class _TkAttr(object): 85 | """ 86 | Thread-safe callable attribute wrapper. 87 | """ 88 | 89 | def __init__(self, tk, attr): 90 | self._tk = tk 91 | self._attr = attr 92 | 93 | def __call__(self, *args, **kwargs): 94 | """ 95 | Thread-safe method invocation. 96 | Diverts out-of-thread calls through the event queue. 97 | Forwards all other method calls to the underlying tk object directly. 98 | """ 99 | 100 | # Check if we're in the creation thread. 
101 | if threading.currentThread() == self._tk._creationThread: 102 | # We're in the creation thread; just call the event directly. 103 | if self._tk._debug >= 8 or \ 104 | self._tk._debug >= 3 and self._attr.__name__ == 'call' and \ 105 | len(args) >= 1 and args[0] == 'after': 106 | print 'Calling event directly:', \ 107 | self._attr.__name__, args, kwargs 108 | return self._attr(*args, **kwargs) 109 | else: 110 | # We're in a different thread than the creation thread; enqueue 111 | # the event, and then wait for the response. 112 | responseQueue = Queue.Queue(1) 113 | if self._tk._debug >= 1: 114 | print 'Marshalling event:', self._attr.__name__, args, kwargs 115 | self._tk._eventQueue.put((self._attr, args, kwargs, responseQueue)) 116 | isException, response = responseQueue.get() 117 | 118 | # Handle the response, whether it's a normal return value or 119 | # an exception. 120 | if isException: 121 | exType, exValue, exTb = response 122 | raise exType, exValue, exTb 123 | else: 124 | return response 125 | 126 | # Define a hook for class Tk's __init__ method. 127 | def _Tk__init__(self, *args, **kwargs): 128 | # We support some new keyword arguments that the original __init__ method 129 | # doesn't expect, so separate those out before doing anything else. 130 | new_kwnames = ('mtCheckPeriod', 'mtDebug') 131 | new_kwargs = {} 132 | for name, value in kwargs.items(): # On Python 2 items() returns a list copy, so deleting from kwargs inside this loop is safe. 133 | if name in new_kwnames: 134 | new_kwargs[name] = value 135 | del kwargs[name] 136 | 137 | # Call the original __init__ method, creating the internal tk member. 138 | self.__original__init__mtTkinter(*args, **kwargs) 139 | 140 | # Replace the internal tk member with a wrapper that handles calls from 141 | # other threads. 142 | self.tk = _Tk(self.tk, **new_kwargs) 143 | 144 | # Set up the first event to check for out-of-thread events. 145 | self.after_idle(_CheckEvents, self) 146 | 147 | # Replace Tk's original __init__ with the hook. 
148 | Tk.__original__init__mtTkinter = Tk.__init__ 149 | Tk.__init__ = _Tk__init__ 150 | 151 | def _CheckEvents(tk): 152 | "Event checker event." 153 | 154 | used = False 155 | try: 156 | # Process all enqueued events, then exit. 157 | while True: 158 | try: 159 | # Get an event request from the queue. 160 | method, args, kwargs, responseQueue = \ 161 | tk.tk._eventQueue.get_nowait() 162 | except: 163 | # No more events to process. 164 | break 165 | else: 166 | # Call the event with the given arguments, and then return 167 | # the result back to the caller via the response queue. 168 | used = True 169 | if tk.tk._debug >= 2: 170 | print 'Calling event from main thread:', \ 171 | method.__name__, args, kwargs 172 | try: 173 | responseQueue.put((False, method(*args, **kwargs))) 174 | except SystemExit, ex: 175 | raise SystemExit, ex 176 | except Exception, ex: 177 | # Calling the event caused an exception; return the 178 | # exception back to the caller so that it can be raised 179 | # in the caller's thread. 180 | from sys import exc_info 181 | exType, exValue, exTb = exc_info() 182 | responseQueue.put((True, (exType, exValue, exTb))) 183 | finally: 184 | # Schedule to check again. If we just processed an event, check 185 | # immediately; if we didn't, check later. 186 | if used: 187 | tk.after_idle(_CheckEvents, tk) 188 | else: 189 | tk.after(tk.tk._checkPeriod, _CheckEvents, tk) 190 | 191 | # Test thread entry point. 
def _testThread(root):
    # Builds the demo window: a text label, a button that wraps its own
    # caption in brackets on every press, and a QUIT button.
    message = "This is Tcl/Tk version %s" % TclVersion
    if TclVersion >= 8.1:
        try:
            message += unicode("\nThis should be a cedilla: \347",
                               "iso-8859-1")
        except NameError:
            pass # no unicode support
    # Any failure while reading the variable counts the same as a false value.
    try:
        threaded = bool(root.globalgetvar('tcl_platform(threaded)'))
    except:
        threaded = False
    if threaded:
        message += "\nTcl is built with thread support"
    else:
        message += "\nTcl is NOT built with thread support"
    message += "\nmtTkinter works with or without Tcl thread support"
    Label(root, text=message).pack()
    clicker = Button(root, text="Click me!",
                     command=lambda root=root: root.button.configure(
                         text="[%s]" % root.button['text']))
    clicker.pack()
    root.button = clicker
    Button(root, text="QUIT", command=root.destroy).pack()
    # The following three commands are needed so the window pops
    # up on top on Windows...
    root.iconify()
    root.update()
    root.deiconify()
    # Simulate button presses...
    clicker.invoke()
    root.after(1000, _pressOk, root, clicker)

# Test button continuous press event.
def _pressOk(root, button):
    # Press once now, then ask Tk to call this function again in one second.
    button.invoke()
    try:
        root.after(1000, _pressOk, root, button)
    except:
        pass # Likely we're exiting

# Test. Mostly borrowed from the Tkinter module, but the important bits moved
# into a separate thread.
236 | if __name__ == '__main__': 237 | import threading 238 | root = Tk(mtDebug = 1) 239 | thread = threading.Thread(target = _testThread, args=(root,)) 240 | thread.start() 241 | root.mainloop() 242 | thread.join() 243 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # **CAFE** 2 | **aCcelerated Alignment-FrEe sequence analysis** 3 | 4 | =================== 5 | 6 | Thank you for downloading CAFE for molecular sequence analysis using state-of-art Alignment-Free methods. This software provides the well-optimized programs to compute overall **28** distance/dissimilarity measures including (1) conventional measures based on k-mer counts, (2) newly developed measures based on background adjusted k-mer counts, and (3) measures based on presence/absence of k-mers. The detailed definitions can be found in the paper. 7 | 8 | CAFE works with sequence data, both long genomic sequences and shotgun sequence reads from NGS technologies, and subsequently generates pairwise dissimilarities among the sequences as output. CAFE provides four types of visualized downstream analysis, including heatmap, two dimensional projection using principal coordinate analysis (PCoA), network display, and sequence clustering into a dendrogram by using the neighbour-joining algorithm. All the analysis can be performed by simply clicking through well-designed graphical user interface (GUI) on two common operating systems ( Mac and Windows) or invoking a stand-alone command line executable program on three common operating systems (Linux, Mac, and Windows) 9 | 10 | 11 | One-click Installation 12 | ============ 13 | (DO NOT download using the “clone or download” button above) 14 | 15 | Installation on Windows 16 | ------------------------ 17 | > 1. 
Download the Windows Version of CAFE from [**here**](https://www.dropbox.com/s/o2d6z5pze8ih7iw/CAFE_v071017.zip?dl=0) or [**here**](http://pan.baidu.com/s/1gfJ1mQB) 18 | > 2. Unzip it 19 | > 3. Due to permission settings of User Account Control in different versions of Windows, the default executable of **cafe_win.exe** may not work as expected. To test it, just double-click **cafe_win.exe** or run it through the command line. If there are any alerts or errors, please rename **cafe_win_2.exe** or **cafe_win_3.exe** to **cafe_win.exe** and repeat until the test passes. 20 | > 4. Within the folder, double-click **CAFEGUI.exe**. Be patient the first time. If it fails again, try to right-click **CAFEGUI.exe** and 'Run as administrator'. 21 | 22 | Installation on Mac 23 | ------------------------ 24 | > 1. Download the Mac Version of CAFE from [**here**](https://www.dropbox.com/s/i3rsu8steiwemzd/CAFE_mac_latest.zip?dl=0) or [**here**](http://pan.baidu.com/s/1kUIwHbL) 25 | > 2. Unzip it 26 | > 3. Within the folder, double-click **CAFEGUI**. If it fails, please use the terminal to execute "./CAFEGUI". 27 | 28 | 29 | Usage 30 | ===== 31 | 32 | Guidance on Graphical User Interface 33 | ------------------------ 34 | 35 |

36 | 37 |

38 | The graphical user interface has the layout shown in the above figure, containing six parts in terms of functionality: 39 | 40 | The red area corresponds to the Data Selection Toolbar. The sequence data can be either long genomic sequences or shotgun sequence reads from NGS technologies, with the file extension '.fasta', '.fa' or '.fna'. 41 | 42 | ![alt tag](https://raw.githubusercontent.com/younglululu/CAFE/master/code/image/load.gif) : Load Existing Results in Phylip format. 43 | 44 | ![alt tag](https://raw.githubusercontent.com/younglululu/CAFE/master/code/image/addFile.gif) : Add one genome sequence to the list. 45 | 46 | ![alt tag](https://raw.githubusercontent.com/younglululu/CAFE/master/code/image/addDir.gif) : Add all genome sequences from directory to the list. 47 | 48 | ![alt tag](https://raw.githubusercontent.com/younglululu/CAFE/master/code/image/remove.gif) : Remove Selected genome sequences in the list. 49 | 50 | ![alt tag](https://raw.githubusercontent.com/younglululu/CAFE/master/code/image/clear.gif) : Remove all genome sequences in the list. 51 | 52 | The yellow area involves parameter configuration related to various distance measures, including the selection of 28 distance measures, k-mer length, potential Markov Order encoding the sequence model, the threshold cutoff of the k-mer occurrences, and whether to consider the reverse complement of each k-mer, which is a common practice in dealing with shotgun sequence reads from NGS technologies. Usually the potential Markov Order remains unclear to the user. The simple yet time-consuming way is to choose '-1' as inferring the optimal Markov Order automatically by using the Bayesian Information Criterion (BIC). 53 | 54 | The pink area corresponds to the Image Toolbar. When the visualized results have been plotted, users can either zoom in or zoom out the figure by clicking the button or using the mouse wheel. 
Meanwhile, the figure can be saved locally by clicking the button or right-clicking the mouse. 55 | 56 | ![alt tag](https://raw.githubusercontent.com/younglululu/CAFE/master/code/image/zoomin.gif) : Zoom in the current figure. 57 | 58 | ![alt tag](https://raw.githubusercontent.com/younglululu/CAFE/master/code/image/zoomout.gif) : Zoom out the current figure. 59 | 60 | ![alt tag](https://raw.githubusercontent.com/younglululu/CAFE/master/code/image/save.gif) : Save the current figure. 61 | 62 | The green area contains the list of all sequences added from the Data Selection Toolbar. 63 | 64 | The blue area keeps track of the running information when calculating the distance measures. 65 | 66 | The purple area contains the key to visualize the relationship among the input sequences using different approaches. Specifically, CAFE provides four types of visualized downstream analysis, including heatmap, two dimensional projection using principal coordinate analysis (PCoA), network display, and sequence clustering into a dendrogram by using the neighbour-joining algorithm. Each analysis is shown in the respective tabbed window. 67 | 68 | 69 | A Usage Example of Graphical User Interface 70 | ------------------------ 71 | 72 | 73 | Here we go through a toy example step-by-step. You can find a folder named "example" in the unzipped folder. 74 | 75 | We first click the ![alt tag](https://raw.githubusercontent.com/younglululu/CAFE/master/code/image/addDir.gif) button of the Data Selection Toolbar and select the "data" folder, adding all the virus genome sequence files to the input list. 76 | 77 |

78 | 79 |

80 | 81 | 82 | We then specify the alignment-free distance. Here we choose Manhattan distance measure, and simply click the 'Run' button, with default k-mer length setting ( K=8 ). Then calculated pairwise distances will be saved into a file named 'result.Ma.phylip'. The file is saved in standard phylip format. Meanwhile, the result is available in visualized plots. Also, we can track the progress through the console in the left panel. 83 | 84 |

85 | 86 |

87 | 88 | Notice that users can always load previously saved phylip results for visualization by clicking the ![alt tag](https://raw.githubusercontent.com/younglululu/CAFE/master/code/image/load.gif) button of the Data Selection Toolbar! 89 | 90 |

91 | 92 |

93 | 94 | Once the visualized results have been plotted, users can either zoom in or zoom out the figure by clicking the ![alt tag](https://raw.githubusercontent.com/younglululu/CAFE/master/code/image/zoomin.gif) and ![alt tag](https://raw.githubusercontent.com/younglululu/CAFE/master/code/image/zoomout.gif) buttons or using the mouse wheel. Meanwhile, the figure can be saved locally by clicking the ![alt tag](https://raw.githubusercontent.com/younglululu/CAFE/master/code/image/save.gif) button or through the popup menu by right-clicking the mouse. 95 | 96 | Here is the dendrogram of the pairwise distances by using the neighbour-joining algorithm. 97 | 98 |

99 | 100 |

101 | 102 | Here is the two dimensional projection using principal coordinate analysis (PCoA). 103 | 104 |

105 | 106 |

107 | 108 | Here is the heatmap. 109 | 110 |

111 | 112 |

113 | 114 | Here is the network analysis with respect to the 10% quantile of the edges with smallest distance as weight. 115 | 116 |

117 | 118 |

119 | 120 | 121 | 122 | Usage of Stand-alone Executable Program 123 | ------------------------ 124 | 125 | > **Command:** ./cafe [options]* -D < dist > -I < fa_files > -K < intK > 126 | 127 | > - Main arguments: 128 | 129 | -D < dist >: Comma-separated list of distance measurements, **E.g.** -D D2star,Ma,CVtree. The options include: 130 | 131 | Conventional measures based on kmer counts : 132 | 133 | 1. Ch: Chebyshev distance 134 | 135 | 2. Canberra: Canberra distance 136 | 137 | 3. Chisq: Chi-Square distance 138 | 139 | 4. Cosine: Cosine distance 140 | 141 | 5. Co-phylog: Co-phylog distance 142 | 143 | 6. D2: D2 distance 144 | 145 | 7. Eu: Euclidean distance 146 | 147 | 8. FFP: Feature frequency profiles (FFP) 148 | 149 | 9. JS: Jensen-Shannon divergence 150 | 151 | 10. Ma: Manhattan distance 152 | 153 | 11. Pearson: Pearson distance 154 | 155 | Newly developed measures based on background adjusted kmer counts: 156 | 157 | 1. CVtree: CVtree distance 158 | 159 | 2. D2shepp: D2shepp distance 160 | 161 | 3. D2star: D2star distance 162 | 163 | Measures based on presence/absence of kmers: 164 | 165 | 1. Anderberg: Anderberg distance 166 | 167 | 2. Antidice: anti-Dice distance 168 | 169 | 3. Dice: Dice distance 170 | 171 | 4. Gower: Gower distance 172 | 173 | 5. Hamman: Hamman distance 174 | 175 | 6. Hamming: Hamming distance 176 | 177 | 7. Jaccard: Jaccard distance 178 | 179 | 8. Kulczynski: Kulczynski distance 180 | 181 | 9. Matching: Matching distance 182 | 183 | 10. Ochiai: Ochiai distance 184 | 185 | 11. Phi: Pearson Phi distance 186 | 187 | 12. Russel: Russel-Rao distance 188 | 189 | 13. Sneath: Sneath-Sokal distance 190 | 191 | 14. Tanimoto: Rogers-Tanimoto distance 192 | 193 | 15. Yule: Yule distance 194 | 195 | -F < fa_Dir >: Folder containing only fasta files with extension '.fasta', '.fa', and '.fna'. 196 | 197 | -I < fa_files >: Comma-separated list of sequence fasta files, e.g. -I speciesA.fa,speciesB.fa,speciesC.fa. 
Pairwise similarity is calculated based upon the sequences specified with this option. 198 | 199 | -K < intK >: Kmer Length. 200 | 201 | > - Options: 202 | 203 | -J < jfexe_path >: Use jellyfish to accelerate kmer counting. < jfexe_path > denotes the file path of the jellyfish executable file, e.g. jellyfish-2.2.4/bin/./jellyfish 204 | 205 | -L < lower >: Only consider k-mers with occurrence >= < lower >. The default value is 0. 206 | 207 | -M < order >: Markov Order involved in D2star, D2shepp and JS. There are two possible options. The first option is one single value indicating that all the sequences use the same order. The second option is comma-separated list of orders. Notice that the length of the list should match the number of fasta files. The order value could be non-negative integer but less than Kmer length or \"-1\" with the special intention to automatically infer the suitable order (not suitable for JS). The default Markov Order is -1 as inferring the optimal Markov Order automatically by using the Bayesian Information Criterion (BIC). 208 | 209 | -R: Consider Reverse Complement in kmer counting. 210 | 211 | -S < dir >: Save/Load calculated k-mer count binary files to the folder < dir >. Each input fasta file corresponds to particular model. 212 | 213 | -O < path >: Output results to file at < path >. 214 | 215 | -T < type >: The output type as the input to downstream analysis, including: plain, [phylip](http://evolution.genetics.washington.edu/phylip.html) (as hierarchical clustering), [cytoscape](http://www.cytoscape.org/) (as network analysis) and mds (Multidimensional Scaling as 2D plotting). E.g. -T mds. The default type is plain. 
216 | 217 | > - Examples: 218 | 219 | ./cafe -M 0 -O output_path -S model_dir -T plain -I speciesA.fa,speciesB.fa -J /panfs/cmb-panasas2/ylu465/jellyfish-2.2.4/bin/./jellyfish -K 10 -D D2star,Ma 220 | 221 | ./cafe -M 0 -S model_dir -I speciesA.fa,speciesB.fa -J /panfs/cmb-panasas2/ylu465/jellyfish-2.2.4/bin/./jellyfish -K 10 -D D2star,Ma 222 | 223 | ./cafe -M 0 -L 2 -I speciesA.fa,speciesB.fa -J /panfs/cmb-panasas2/ylu465/jellyfish-2.2.4/bin/./jellyfish -K 10 -D D2star,Ma -R 224 | 225 | 226 | 227 | Contacts and bug reports 228 | ======================== 229 | 230 | Please send bug reports, comments, or questions to 231 | 232 | Yang Lu: [ylu465@usc.edu](mailto:ylu465@usc.edu) 233 | 234 | Prof. Fengzhu Sun: [fsun@usc.edu](mailto:fsun@usc.edu) 235 | 236 | 237 | ---------- 238 | 239 | Copyright and License Information 240 | ================================= 241 | 242 | 243 | This software is Copyright © 2018 The University of Southern California. All Rights Reserved. 244 | 245 | This program is under the terms of USC-RL v1.0 License. 246 | 247 | Permission to use, copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. 248 | 249 | Permission to make commercial use of this software may be obtained by contacting: 250 | USC Stevens Center for Innovation 251 | University of Southern California 252 | 1150 S. Olive Street, Suite 2300 253 | Los Angeles, CA 90115, USA 254 | 255 | This software program and documentation are copyrighted by The University of Southern California. The software program and documentation are supplied "as is", without any accompanying services from USC. USC does not warrant that the operation of the program will be uninterrupted or error-free. 
The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason. 256 | 257 | Last update: 08-May-2018 258 | -------------------------------------------------------------------------------- /code/dist_model.h: -------------------------------------------------------------------------------- 1 | /*************************************************************** 2 | * Copyright (C) 2016 Yang Lu 3 | * Computational and Molecular Biology, Department of Biological Science 4 | * University of Southern California, LA, CA 90089, USA 5 | * 6 | * Related publication: 7 | * TBA 8 | ***************************************************************/ 9 | #ifndef _DIST_MODEL_H 10 | #define _DIST_MODEL_H 11 | 12 | #include "kmer.h" 13 | /* Identifiers for the distance measures. */ 14 | enum dist{ 15 | D2, D2STAR, D2SHEPP, CVtree, 16 | Ch, Eu, Ma, FFP, 17 | CHISQ, JS, Co_Phylog, 18 | COSINE, PEARSON, CANBERRA, HAMMING, 19 | MATCHING, JACCARD, TANIMOTO, DICE, ANTIDICE, SNEATH, HAMMAN, PHI, ANDERBERG, GOWER, RUSSEL, YULE, OCHIAI, KULCZYNSKI 20 | }; 21 | /* Accumulates the sum of absolute differences |src_X_w - trgt_X_w| over all tuples. */ 22 | class L1FreqStrategy : public AbsTupleDistStrategy 23 | { 24 | public: 25 | L1FreqStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsTupleDistStrategy(i_arg_k, b_arg_singleStrain){ d_result = 0; } 26 | void dealWithTuple(double src_X_w, double trgt_X_w) { double d_tmp_diff = src_X_w - trgt_X_w; if (d_tmp_diff < 0) d_tmp_diff = (0 - d_tmp_diff); d_result += d_tmp_diff; } 27 | double getDist(){ return d_result; } 28 | 29 | private: 30 | double d_result; 31 | }; 32 | /* Accumulates squared differences; getDist() returns the square root of the sum. */ 33 | class L2FreqStrategy : public AbsTupleDistStrategy 34 | { 35 | public: 36 | L2FreqStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsTupleDistStrategy(i_arg_k, b_arg_singleStrain){ d_result = 0; } 37 | void dealWithTuple(double src_X_w, double trgt_X_w) { double d_tmp_diff = src_X_w - trgt_X_w; d_tmp_diff = d_tmp_diff*d_tmp_diff; d_result += d_tmp_diff; } 38 | double getDist(){ return sqrt(d_result); } 39 | 40 | private: 41 | 
double d_result; 42 | }; 43 | /* Accumulates src*(log(src)-log(trgt))/LOG2 + trgt*(log(trgt)-log(src))/LOG2 over tuples where both values are non-zero (other tuples are skipped); getDist() returns half the sum. Presumably LOG2 is log(2) so the terms are base-2 logarithms - it is defined outside this view, TODO confirm. */ 44 | class FFPStrategy : public AbsTupleDistStrategy 45 | { 46 | public: 47 | FFPStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsTupleDistStrategy(i_arg_k, b_arg_singleStrain){ d_result = 0; } 48 | void dealWithTuple(double src_X_w, double trgt_X_w) 49 | { 50 | if(0 == src_X_w || 0 == trgt_X_w) return; 51 | d_result += src_X_w*(log(src_X_w)-log(trgt_X_w)) / LOG2; d_result += trgt_X_w*(log(trgt_X_w)-log(src_X_w)) / LOG2; 52 | } 53 | double getDist(){ return 0.5*d_result; } 54 | 55 | private: 56 | double d_result; 57 | }; 58 | /* Keeps the largest absolute difference |src_X_w - trgt_X_w| seen across all tuples. */ 59 | class LInfFreqStrategy : public AbsTupleDistStrategy 60 | { 61 | public: 62 | LInfFreqStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsTupleDistStrategy(i_arg_k, b_arg_singleStrain){ d_result = 0; } 63 | void dealWithTuple(double src_X_w, double trgt_X_w) { double d_tmp_diff = src_X_w - trgt_X_w; if (d_tmp_diff < 0) d_tmp_diff = (0 - d_tmp_diff); d_result = std::max(d_result, d_tmp_diff); } 64 | double getDist(){ return d_result; } 65 | 66 | private: 67 | double d_result; 68 | }; 69 | /* Keeps running sums of x, y, x*x, y*y, x*y and the tuple count; getDist() returns 1 minus the Pearson correlation built from them. NOTE(review): getDist() divides by sum_count and by both square-root terms without checking for zero. */ 70 | class PearsonStrategy : public AbsTupleDistStrategy 71 | { 72 | public: 73 | PearsonStrategy(int i_arg_k, bool b_arg_singleStrain) :AbsTupleDistStrategy(i_arg_k, b_arg_singleStrain){ sum_src_X_w_trgt_X_w = 0; sum_src_X_w_sq = 0; sum_trgt_X_w_sq = 0; sum_src_X_w = 0; sum_trgt_X_w = 0; sum_count=0; } 74 | void dealWithTuple(double src_X_w, double trgt_X_w) { sum_src_X_w_trgt_X_w += (src_X_w * trgt_X_w); sum_src_X_w_sq += (src_X_w * src_X_w); sum_trgt_X_w_sq += (trgt_X_w * trgt_X_w); sum_src_X_w += src_X_w; sum_trgt_X_w += trgt_X_w; sum_count++; } 75 | double getDist(){ return 1.0-(sum_src_X_w_trgt_X_w-sum_src_X_w*sum_trgt_X_w/sum_count)/(sqrt(sum_src_X_w_sq-sum_src_X_w*sum_src_X_w/sum_count)*sqrt(sum_trgt_X_w_sq-sum_trgt_X_w*sum_trgt_X_w/sum_count)); } 76 | 77 | private: 78 | double sum_src_X_w_trgt_X_w, sum_src_X_w_sq, sum_trgt_X_w_sq, sum_src_X_w, sum_trgt_X_w, sum_count; 79 | }; 80 | 81 | class CanberraStrategy : 
public AbsTupleDistStrategy 82 | { 83 | public: 84 | CanberraStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsTupleDistStrategy(i_arg_k, b_arg_singleStrain){ d_result = 0; } 85 | void dealWithTuple(double src_X_w, double trgt_X_w) { /* A tuple with both values zero would contribute 0/0 = NaN and poison the whole sum; the Canberra term for it is taken as 0, so skip it. */ if (0 == src_X_w && 0 == trgt_X_w) return; double d_tmp_diff = src_X_w - trgt_X_w; if (d_tmp_diff < 0) d_tmp_diff = (0 - d_tmp_diff); d_result += (d_tmp_diff/(src_X_w + trgt_X_w)); } 86 | double getDist(){ return d_result; } 87 | 88 | private: 89 | double d_result; 90 | }; 91 | /* Fraction of tuples in which one value is positive and the other is zero. */ 92 | class HammingStrategy : public AbsTupleDistStrategy 93 | { 94 | public: 95 | HammingStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsTupleDistStrategy(i_arg_k, b_arg_singleStrain){ d_result = 0; sum = 0; } 96 | void dealWithTuple(double src_X_w, double trgt_X_w) { sum++; if((src_X_w>0 && 0==trgt_X_w) || (trgt_X_w>0 && 0==src_X_w)) d_result++;} 97 | double getDist(){ return d_result/sum; } 98 | 99 | private: 100 | double d_result, sum; 101 | }; 102 | /* Builds a 2x2 presence/absence table: A = both values > 0, B = only source > 0, C = only target > 0, D = both == 0, N = tuples seen. Subclasses turn the table into a distance in getDist(). */ 103 | class AbsBinaryTupleStrategy : public AbsTupleDistStrategy 104 | { 105 | public: 106 | AbsBinaryTupleStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsTupleDistStrategy(i_arg_k, b_arg_singleStrain){ A = 0; B = 0; C = 0; D = 0; N = 0;} 107 | virtual void dealWithTuple(double src_X_w, double trgt_X_w) { N++; if(src_X_w>0 && trgt_X_w>0) A++; if(src_X_w>0 && 0==trgt_X_w) B++; if(0==src_X_w && trgt_X_w>0) C++; if(0==src_X_w && 0==trgt_X_w) D++;} 108 | 109 | public: 110 | double A,B,C,D,N; 111 | }; 112 | 113 | class MatchingStrategy : public AbsBinaryTupleStrategy 114 | { 115 | public: 116 | MatchingStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsBinaryTupleStrategy(i_arg_k, b_arg_singleStrain){ } 117 | double getDist(){ return 1-(A+D)/N; } 118 | }; 119 | 120 | class JaccardStrategy : public AbsBinaryTupleStrategy 121 | { 122 | public: 123 | JaccardStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsBinaryTupleStrategy(i_arg_k, b_arg_singleStrain){ } 124 | double getDist(){ return 1-A/(N-D); } 125 | }; 126 | 127 | class TanimotoStrategy : 
public AbsBinaryTupleStrategy 128 | { 129 | public: 130 | TanimotoStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsBinaryTupleStrategy(i_arg_k, b_arg_singleStrain){ } 131 | double getDist(){ return 1-(A+D)/((A+D)+2*(B+C)); } 132 | }; 133 | 134 | class DiceStrategy : public AbsBinaryTupleStrategy 135 | { 136 | public: 137 | DiceStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsBinaryTupleStrategy(i_arg_k, b_arg_singleStrain){ } 138 | double getDist(){ return 1-2*A/(2*A+B+C); } 139 | }; 140 | 141 | class AntidiceStrategy : public AbsBinaryTupleStrategy 142 | { 143 | public: 144 | AntidiceStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsBinaryTupleStrategy(i_arg_k, b_arg_singleStrain){ } 145 | double getDist(){ return 1-A/(A+B+C); } 146 | }; 147 | 148 | class SneathStrategy : public AbsBinaryTupleStrategy 149 | { 150 | public: 151 | SneathStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsBinaryTupleStrategy(i_arg_k, b_arg_singleStrain){ } 152 | double getDist(){ return 1-2*(A+D)/(2*(A+D)+(B+C)); } 153 | }; 154 | 155 | class HammanStrategy : public AbsBinaryTupleStrategy 156 | { 157 | public: 158 | HammanStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsBinaryTupleStrategy(i_arg_k, b_arg_singleStrain){ } 159 | double getDist(){ return 1-(((A+D)-(B+C))/N)*(((A+D)-(B+C))/N); } 160 | }; 161 | 162 | class PhiStrategy : public AbsBinaryTupleStrategy 163 | { 164 | public: 165 | PhiStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsBinaryTupleStrategy(i_arg_k, b_arg_singleStrain){ } 166 | double getDist(){ return 1-1-((A*D-B*C)/sqrt((A+B)*(A+C)*(D+B)*(D+C)))*((A*D-B*C)/sqrt((A+B)*(A+C)*(D+B)*(D+C))); } 167 | }; 168 | 169 | class AnderbergStrategy : public AbsBinaryTupleStrategy 170 | { 171 | public: 172 | AnderbergStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsBinaryTupleStrategy(i_arg_k, b_arg_singleStrain){ } 173 | double getDist(){ return 1-(A/(A+B)+A/(A+C)+D/(C+D)+D/(B+D))/4; } 174 | }; 175 | 176 | class GowerStrategy : public AbsBinaryTupleStrategy 177 
| { 178 | public: 179 | GowerStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsBinaryTupleStrategy(i_arg_k, b_arg_singleStrain){ } 180 | double getDist(){ return 1-A*D/sqrt((A+B)*(A+C)*(D+B*(D+C))); } 181 | }; 182 | 183 | class RusselStrategy : public AbsBinaryTupleStrategy 184 | { 185 | public: 186 | RusselStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsBinaryTupleStrategy(i_arg_k, b_arg_singleStrain){ } 187 | double getDist(){ return 1-A/N; } 188 | }; 189 | 190 | class YuleStrategy : public AbsBinaryTupleStrategy 191 | { 192 | public: 193 | YuleStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsBinaryTupleStrategy(i_arg_k, b_arg_singleStrain){ } 194 | double getDist(){ return 1-((A*D-B*C)/(A*D+B*C))*((A*D-B*C)/(A*D+B*C)); } 195 | }; 196 | 197 | class OchiaiStrategy : public AbsBinaryTupleStrategy 198 | { 199 | public: 200 | OchiaiStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsBinaryTupleStrategy(i_arg_k, b_arg_singleStrain){ } 201 | double getDist(){ return 1-A/sqrt((A+B)*(A+C)); } 202 | }; 203 | 204 | class KulczynskiStrategy : public AbsBinaryTupleStrategy 205 | { 206 | public: 207 | KulczynskiStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsBinaryTupleStrategy(i_arg_k, b_arg_singleStrain){ } 208 | double getDist(){ return 1-(A/(A+B)+A/(A+C))/2; } 209 | }; 210 | 211 | class D2Strategy : public AbsTupleDistStrategy 212 | { 213 | public: 214 | D2Strategy(int i_arg_k, bool b_arg_singleStrain) : AbsTupleDistStrategy(i_arg_k, b_arg_singleStrain){ sum_src_X_w_trgt_X_w = 0; sum_src_X_w_sq = 0; sum_trgt_X_w_sq = 0; } 215 | void dealWithTuple(double src_X_w, double trgt_X_w) { sum_src_X_w_trgt_X_w += (src_X_w * trgt_X_w); sum_src_X_w_sq += (src_X_w * src_X_w); sum_trgt_X_w_sq += (trgt_X_w * trgt_X_w); } 216 | double getDist(){ return 1.0 - sum_src_X_w_trgt_X_w / (sqrt(sum_src_X_w_sq)*sqrt(sum_trgt_X_w_sq)); } 217 | 218 | private: 219 | double sum_src_X_w_trgt_X_w, sum_src_X_w_sq, sum_trgt_X_w_sq; 220 | }; 221 | 222 | class D2starStrategy : public 
AbsQuadStrategy 223 | { 224 | public: 225 | D2starStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsQuadStrategy(i_arg_k, b_arg_singleStrain){ sum_numerator = 0; sum_src_X_w_tilde_sq_div_EX_w = 0; sum_trgt_X_w_tilde_sq_div_EX_w = 0; } 226 | void dealWithQuad(double src_X_w, double src_EX_w, double trgt_X_w, double trgt_EX_w); 227 | double getDist(){ return 0.5*(1.0 - sum_numerator / (sqrt(sum_src_X_w_tilde_sq_div_EX_w)*sqrt(sum_trgt_X_w_tilde_sq_div_EX_w))); } 228 | 229 | private: 230 | double sum_numerator, sum_src_X_w_tilde_sq_div_EX_w, sum_trgt_X_w_tilde_sq_div_EX_w; 231 | }; 232 | 233 | class D2sheppStrategy : public AbsQuadStrategy 234 | { 235 | public: 236 | D2sheppStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsQuadStrategy(i_arg_k, b_arg_singleStrain){ sum_numerator = 0; sum_src_X_w_tilde_sq_div_sqr_sum = 0; sum_trgt_X_w_tilde_sq_div_sqr_sum = 0; } 237 | void dealWithQuad(double src_X_w, double src_EX_w, double trgt_X_w, double trgt_EX_w); 238 | double getDist(){ return 0.5*(1.0 - sum_numerator / (sqrt(sum_src_X_w_tilde_sq_div_sqr_sum)*sqrt(sum_trgt_X_w_tilde_sq_div_sqr_sum))); } 239 | 240 | private: 241 | double sum_numerator, sum_src_X_w_tilde_sq_div_sqr_sum, sum_trgt_X_w_tilde_sq_div_sqr_sum; 242 | }; 243 | 244 | class HaoStrategy : public AbsQuadStrategy 245 | { 246 | public: 247 | HaoStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsQuadStrategy(i_arg_k, b_arg_singleStrain){ sum_numerator = 0; sum_sq_src_X_w_tilde_div_EX_w = 0; sum_sq_trgt_X_w_tilde_div_EX_w = 0; } 248 | void dealWithQuad(double src_X_w, double src_EX_w, double trgt_X_w, double trgt_EX_w); 249 | double getDist(){ return 0.5*(1.0 - sum_numerator / (sqrt(sum_sq_src_X_w_tilde_div_EX_w)*sqrt(sum_sq_trgt_X_w_tilde_div_EX_w))); } 250 | 251 | private: 252 | double sum_numerator, sum_sq_src_X_w_tilde_div_EX_w , sum_sq_trgt_X_w_tilde_div_EX_w; 253 | }; 254 | 255 | class ChiSqStrategy : public AbsQuadStrategy 256 | { 257 | public: 258 | ChiSqStrategy(int i_arg_k, bool b_arg_singleStrain) 
: AbsQuadStrategy(i_arg_k, b_arg_singleStrain){ sum = 0; } 259 | void dealWithQuad(double src_X_w, double src_EX_w, double trgt_X_w, double trgt_EX_w); 260 | double getDist(){ return sum; } 261 | private: 262 | double sum ; 263 | }; 264 | 265 | class JensenShannonStrategy : public AbsMrkvStrategy 266 | { 267 | public: 268 | JensenShannonStrategy(int i_arg_k, bool b_arg_singleStrain) : AbsMrkvStrategy(i_arg_k, b_arg_singleStrain){ sum_entropy = 0; src_entropy = 0; trgt_entropy = 0; } 269 | void dealWithMrkv(MarkovModel* src_mrkvModel, MarkovModel* trgt_mrkvModel); 270 | double getDist(){ return sqrt( - sum_entropy + (src_entropy + trgt_entropy) / 2); } 271 | 272 | private: 273 | double sum_entropy, src_entropy, trgt_entropy; 274 | }; 275 | 276 | 277 | class DistFactory 278 | { 279 | public: 280 | static DistFactory *getInstance(); 281 | 282 | double getL1dist(int i_arg_k, bool b_arg_singleStrain, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 283 | double getL2dist(int i_arg_k, bool b_arg_singleStrain, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 284 | double getLInfdist(int i_arg_k, bool b_arg_singleStrain, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 285 | double getChiSqdist(int i_arg_k, bool b_arg_singleStrain, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 286 | double getFFPdist(int i_arg_k, bool b_arg_singleStrain, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 287 | double getCoPhylogdist(int i_arg_k, bool b_arg_singleStrain, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 288 | 289 | double getPearsondist(int i_arg_k, bool b_arg_singleStrain, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 290 | double getCanberradist(int i_arg_k, bool b_arg_singleStrain, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 291 | double getHammingdist(int i_arg_k, bool b_arg_singleStrain, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 292 | 293 | double 
getMatchingdist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 294 | double getJaccarddist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 295 | double getTanimotodist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 296 | double getDicedist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 297 | double getAntidicedist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 298 | double getSneathdist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 299 | double getHammandist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 300 | double getPhidist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 301 | double getAnderbergdist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 302 | double getGowerdist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 303 | double getRusseldist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 304 | double getYuledist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 305 | double getOchiaidist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 306 | double getKulczynskidist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, 
KmerModel* arg_trgtKmerModel); 307 | 308 | 309 | double getD2dist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 310 | 311 | double getD2stardist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, 312 | int i_src_order, int i_trgt_order, std::string str_src_saveURLPrefix, std::string str_trgt_saveURLPrefix, 313 | KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 314 | 315 | double getD2sheppdist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, 316 | int i_src_order, int i_trgt_order, std::string str_src_saveURLPrefix, std::string str_trgt_saveURLPrefix, 317 | KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 318 | 319 | double getHaodist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, 320 | std::string str_src_saveURLPrefix, std::string str_trgt_saveURLPrefix, 321 | KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 322 | 323 | double getJensenShannondist(int i_arg_k, bool b_arg_singleStrain, 324 | int i_order, std::string str_src_saveURLPrefix, std::string str_trgt_saveURLPrefix, 325 | KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel); 326 | 327 | 328 | private: 329 | DistFactory(){} 330 | static DistFactory* instance; 331 | }; 332 | 333 | 334 | 335 | 336 | #endif -------------------------------------------------------------------------------- /code/dist_model.cpp: -------------------------------------------------------------------------------- 1 | #include "dist_model.h" 2 | 3 | DistFactory* DistFactory::instance = 0; 4 | 5 | DistFactory* DistFactory::getInstance() 6 | { 7 | if (!instance) instance = new DistFactory(); 8 | return instance; 9 | } 10 | 11 | void D2starStrategy::dealWithQuad(double src_X_w, double src_EX_w, double trgt_X_w, double trgt_EX_w) 12 | { 13 | if (0 == src_EX_w || 0 == trgt_EX_w) return; 14 | 15 | double src_X_w_tilde = src_X_w - src_EX_w; 16 | double trgt_X_w_tilde = trgt_X_w - trgt_EX_w; 17 | double 
numerator = src_X_w_tilde * trgt_X_w_tilde / sqrt(src_EX_w*trgt_EX_w); 18 | double src_X_w_tilde_sq_div_EX_w = src_X_w_tilde*src_X_w_tilde / src_EX_w; 19 | double trgt_X_w_tilde_sq_div_EX_w = trgt_X_w_tilde*trgt_X_w_tilde / trgt_EX_w; 20 | 21 | sum_numerator += numerator; 22 | sum_src_X_w_tilde_sq_div_EX_w += src_X_w_tilde_sq_div_EX_w; 23 | sum_trgt_X_w_tilde_sq_div_EX_w += trgt_X_w_tilde_sq_div_EX_w; 24 | } 25 | 26 | void D2sheppStrategy::dealWithQuad(double src_X_w, double src_EX_w, double trgt_X_w, double trgt_EX_w) 27 | { 28 | double src_X_w_tilde = src_X_w - src_EX_w; 29 | double trgt_X_w_tilde = trgt_X_w - trgt_EX_w; 30 | double src_X_w_tilde_sq = src_X_w_tilde*src_X_w_tilde; 31 | double trgt_X_w_tilde_sq = trgt_X_w_tilde*trgt_X_w_tilde; 32 | double denominator = sqrt(src_X_w_tilde_sq + trgt_X_w_tilde_sq); 33 | 34 | if (0 == denominator) return; 35 | 36 | sum_numerator += src_X_w_tilde * trgt_X_w_tilde / denominator; 37 | sum_src_X_w_tilde_sq_div_sqr_sum += src_X_w_tilde_sq / denominator; 38 | sum_trgt_X_w_tilde_sq_div_sqr_sum += trgt_X_w_tilde_sq / denominator; 39 | } 40 | 41 | void HaoStrategy::dealWithQuad(double src_X_w, double src_EX_w, double trgt_X_w, double trgt_EX_w) 42 | { 43 | if (0 == src_EX_w || 0 == trgt_EX_w) return; 44 | 45 | double src_X_w_tilde_div_EX_w = ((double)src_X_w - src_EX_w) / src_EX_w; 46 | double trgt_X_w_tilde_div_EX_w = ((double)trgt_X_w - trgt_EX_w) / trgt_EX_w; 47 | 48 | sum_numerator += src_X_w_tilde_div_EX_w*trgt_X_w_tilde_div_EX_w; 49 | sum_sq_src_X_w_tilde_div_EX_w += src_X_w_tilde_div_EX_w*src_X_w_tilde_div_EX_w; 50 | sum_sq_trgt_X_w_tilde_div_EX_w += trgt_X_w_tilde_div_EX_w*trgt_X_w_tilde_div_EX_w; 51 | } 52 | 53 | void ChiSqStrategy::dealWithQuad(double src_X_w, double src_EX_w, double trgt_X_w, double trgt_EX_w) 54 | { 55 | //double tmp1 = src_X_w_1 - src_X_w*all_X_w_1 / all_X_w; sum += (tmp1*tmp1*all_X_w / (src_X_w*all_X_w_1)); 56 | double tmp1 = src_EX_w - src_X_w*trgt_EX_w / trgt_X_w; sum += (tmp1*tmp1*trgt_X_w / 
(src_X_w*trgt_EX_w)); 57 | } 58 | 59 | void JensenShannonStrategy::dealWithMrkv(MarkovModel* src_mrkvModel, MarkovModel* trgt_mrkvModel) 60 | { 61 | int i_src_order = src_mrkvModel->getOrder(); 62 | 63 | for (unsigned long long i = 0; i < (unsigned long long)pow(BASE, i_src_order); ++i) 64 | { 65 | double sum_entropyOverCol = 0, src_entropyOverCol = 0, trgt_entropyOverCol = 0; 66 | 67 | for (unsigned int j = 0; j < BASE; ++j) 68 | { 69 | if (0 != src_mrkvModel->getTransProb(i, j)) src_entropyOverCol += exp(src_mrkvModel->getTransProb(i, j)) * src_mrkvModel->getTransProb(i, j) / LOG2; 70 | if (0 != trgt_mrkvModel->getTransProb(i, j)) trgt_entropyOverCol += exp(trgt_mrkvModel->getTransProb(i, j)) * trgt_mrkvModel->getTransProb(i, j) / LOG2; 71 | double d_tmp_sumTransProb = (exp(src_mrkvModel->getTransProb(i, j)) + exp(trgt_mrkvModel->getTransProb(i, j))) / 2; 72 | 73 | if (0 != d_tmp_sumTransProb) sum_entropyOverCol += d_tmp_sumTransProb * log(d_tmp_sumTransProb) / LOG2; 74 | } 75 | src_entropy += exp(src_mrkvModel->getMargProb(i)) * src_entropyOverCol; 76 | trgt_entropy += exp(trgt_mrkvModel->getMargProb(i)) * trgt_entropyOverCol; 77 | sum_entropy += (exp(src_mrkvModel->getMargProb(i)) + exp(trgt_mrkvModel->getMargProb(i))) / 2 * sum_entropyOverCol; 78 | } 79 | } 80 | 81 | 82 | double DistFactory::getL1dist(int i_arg_k, bool b_arg_singleStrain, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 83 | { 84 | L1FreqStrategy* strategy = new L1FreqStrategy(i_arg_k, b_arg_singleStrain); 85 | double dist = IterFactory::getInstance()->getFreqDist(strategy, 86 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, arg_srcKmerModel->totalKmer(), 87 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec, arg_trgtKmerModel->totalKmer()); 88 | 89 | delete strategy; 90 | return dist; 91 | } 92 | 93 | double DistFactory::getL2dist(int i_arg_k, bool b_arg_singleStrain, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 94 | { 95 | 
L2FreqStrategy* strategy = new L2FreqStrategy(i_arg_k, b_arg_singleStrain); 96 | double dist = IterFactory::getInstance()->getFreqDist(strategy, 97 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, arg_srcKmerModel->totalKmer(), 98 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec, arg_trgtKmerModel->totalKmer()); 99 | 100 | delete strategy; 101 | return dist; 102 | } 103 | 104 | double DistFactory::getLInfdist(int i_arg_k, bool b_arg_singleStrain, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 105 | { 106 | LInfFreqStrategy* strategy = new LInfFreqStrategy(i_arg_k, b_arg_singleStrain); 107 | 108 | double dist = IterFactory::getInstance()->getFreqDist(strategy, 109 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, arg_srcKmerModel->totalKmer(), 110 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec, arg_trgtKmerModel->totalKmer()); 111 | 112 | delete strategy; 113 | return dist; 114 | } 115 | 116 | double DistFactory::getFFPdist(int i_arg_k, bool b_arg_singleStrain, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 117 | { 118 | FFPStrategy* strategy = new FFPStrategy(i_arg_k, b_arg_singleStrain); 119 | double dist = IterFactory::getInstance()->getFreqDist(strategy, 120 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, arg_srcKmerModel->totalKmer(), 121 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec, arg_trgtKmerModel->totalKmer()); 122 | 123 | delete strategy; 124 | return dist; 125 | } 126 | 127 | double DistFactory::getCoPhylogdist(int i_arg_k, bool b_arg_singleStrain, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 128 | { 129 | double dist = IterFactory::getInstance()->getCoPhylogDist(i_arg_k, 130 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, 131 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec); 132 | return dist; 133 | } 134 | 135 | double DistFactory::getPearsondist(int i_arg_k, bool 
b_arg_singleStrain, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 136 | { 137 | PearsonStrategy* strategy = new PearsonStrategy(i_arg_k, b_arg_singleStrain); 138 | double dist = IterFactory::getInstance()->getFreqDist(strategy, 139 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, arg_srcKmerModel->totalKmer(), 140 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec, arg_trgtKmerModel->totalKmer()); 141 | 142 | delete strategy; 143 | return dist; 144 | } 145 | 146 | double DistFactory::getCanberradist(int i_arg_k, bool b_arg_singleStrain, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 147 | { 148 | CanberraStrategy* strategy = new CanberraStrategy(i_arg_k, b_arg_singleStrain); 149 | double dist = IterFactory::getInstance()->getFreqDist(strategy, 150 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, arg_srcKmerModel->totalKmer(), 151 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec, arg_trgtKmerModel->totalKmer()); 152 | 153 | delete strategy; 154 | return dist; 155 | } 156 | 157 | double DistFactory::getHammingdist(int i_arg_k, bool b_arg_singleStrain, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 158 | { 159 | HammingStrategy* strategy = new HammingStrategy(i_arg_k, b_arg_singleStrain); 160 | double dist = IterFactory::getInstance()->getFreqDist(strategy, 161 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, arg_srcKmerModel->totalKmer(), 162 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec, arg_trgtKmerModel->totalKmer()); 163 | 164 | delete strategy; 165 | return dist; 166 | } 167 | 168 | double DistFactory::getMatchingdist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 169 | { 170 | MatchingStrategy* strategy = new MatchingStrategy(i_arg_k, b_arg_singleStrain); 171 | double dist = IterFactory::getInstance()->getCntDist(strategy, i_arg_lowerCnt, 172 | 
arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, 173 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec); 174 | 175 | delete strategy; 176 | return dist; 177 | } 178 | 179 | double DistFactory::getJaccarddist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 180 | { 181 | JaccardStrategy* strategy = new JaccardStrategy(i_arg_k, b_arg_singleStrain); 182 | double dist = IterFactory::getInstance()->getCntDist(strategy, i_arg_lowerCnt, 183 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, 184 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec); 185 | 186 | delete strategy; 187 | return dist; 188 | } 189 | 190 | double DistFactory::getTanimotodist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 191 | { 192 | TanimotoStrategy* strategy = new TanimotoStrategy(i_arg_k, b_arg_singleStrain); 193 | double dist = IterFactory::getInstance()->getCntDist(strategy, i_arg_lowerCnt, 194 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, 195 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec); 196 | 197 | delete strategy; 198 | return dist; 199 | } 200 | 201 | double DistFactory::getDicedist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 202 | { 203 | DiceStrategy* strategy = new DiceStrategy(i_arg_k, b_arg_singleStrain); 204 | double dist = IterFactory::getInstance()->getCntDist(strategy, i_arg_lowerCnt, 205 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, 206 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec); 207 | 208 | delete strategy; 209 | return dist; 210 | } 211 | 212 | double DistFactory::getAntidicedist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 213 | { 214 | 
AntidiceStrategy* strategy = new AntidiceStrategy(i_arg_k, b_arg_singleStrain); 215 | double dist = IterFactory::getInstance()->getCntDist(strategy, i_arg_lowerCnt, 216 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, 217 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec); 218 | 219 | delete strategy; 220 | return dist; 221 | } 222 | 223 | double DistFactory::getSneathdist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 224 | { 225 | SneathStrategy* strategy = new SneathStrategy(i_arg_k, b_arg_singleStrain); 226 | double dist = IterFactory::getInstance()->getCntDist(strategy, i_arg_lowerCnt, 227 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, 228 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec); 229 | 230 | delete strategy; 231 | return dist; 232 | } 233 | 234 | double DistFactory::getHammandist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 235 | { 236 | HammanStrategy* strategy = new HammanStrategy(i_arg_k, b_arg_singleStrain); 237 | double dist = IterFactory::getInstance()->getCntDist(strategy, i_arg_lowerCnt, 238 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, 239 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec); 240 | 241 | delete strategy; 242 | return dist; 243 | } 244 | 245 | double DistFactory::getPhidist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 246 | { 247 | PhiStrategy* strategy = new PhiStrategy(i_arg_k, b_arg_singleStrain); 248 | double dist = IterFactory::getInstance()->getCntDist(strategy, i_arg_lowerCnt, 249 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, 250 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec); 251 | 252 | delete strategy; 253 | return dist; 254 | } 255 | 256 | double 
DistFactory::getAnderbergdist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 257 | { 258 | AnderbergStrategy* strategy = new AnderbergStrategy(i_arg_k, b_arg_singleStrain); 259 | double dist = IterFactory::getInstance()->getCntDist(strategy, i_arg_lowerCnt, 260 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, 261 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec); 262 | 263 | delete strategy; 264 | return dist; 265 | } 266 | 267 | double DistFactory::getGowerdist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 268 | { 269 | GowerStrategy* strategy = new GowerStrategy(i_arg_k, b_arg_singleStrain); 270 | double dist = IterFactory::getInstance()->getCntDist(strategy, i_arg_lowerCnt, 271 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, 272 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec); 273 | 274 | delete strategy; 275 | return dist; 276 | } 277 | 278 | double DistFactory::getRusseldist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 279 | { 280 | RusselStrategy* strategy = new RusselStrategy(i_arg_k, b_arg_singleStrain); 281 | double dist = IterFactory::getInstance()->getCntDist(strategy, i_arg_lowerCnt, 282 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, 283 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec); 284 | 285 | delete strategy; 286 | return dist; 287 | } 288 | 289 | double DistFactory::getYuledist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 290 | { 291 | YuleStrategy* strategy = new YuleStrategy(i_arg_k, b_arg_singleStrain); 292 | double dist = IterFactory::getInstance()->getCntDist(strategy, i_arg_lowerCnt, 293 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, 
294 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec); 295 | 296 | delete strategy; 297 | return dist; 298 | } 299 | 300 | double DistFactory::getOchiaidist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 301 | { 302 | OchiaiStrategy* strategy = new OchiaiStrategy(i_arg_k, b_arg_singleStrain); 303 | double dist = IterFactory::getInstance()->getCntDist(strategy, i_arg_lowerCnt, 304 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, 305 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec); 306 | 307 | delete strategy; 308 | return dist; 309 | } 310 | 311 | double DistFactory::getKulczynskidist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 312 | { 313 | KulczynskiStrategy* strategy = new KulczynskiStrategy(i_arg_k, b_arg_singleStrain); 314 | double dist = IterFactory::getInstance()->getCntDist(strategy, i_arg_lowerCnt, 315 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, 316 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec); 317 | 318 | delete strategy; 319 | return dist; 320 | } 321 | 322 | double DistFactory::getD2dist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 323 | { 324 | D2Strategy* strategy = new D2Strategy(i_arg_k, b_arg_singleStrain); 325 | double dist = IterFactory::getInstance()->getCntDist(strategy, i_arg_lowerCnt, 326 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, 327 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec); 328 | 329 | delete strategy; 330 | return dist; 331 | } 332 | 333 | double DistFactory::getChiSqdist(int i_arg_k, bool b_arg_singleStrain, KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 334 | { 335 | ChiSqStrategy* strategy = new ChiSqStrategy(i_arg_k, b_arg_singleStrain); 336 | 337 | double dist = 
IterFactory::getInstance()->getCntDist(strategy, 338 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, 339 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec); 340 | 341 | delete strategy; 342 | return dist; 343 | } 344 | 345 | double DistFactory::getD2stardist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, 346 | int i_src_order, int i_trgt_order, std::string str_src_saveURLPrefix, std::string str_trgt_saveURLPrefix, 347 | KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 348 | { 349 | D2starStrategy* strategy = new D2starStrategy(i_arg_k, b_arg_singleStrain); 350 | 351 | MarkovModel* src_markovModel = arg_srcKmerModel->getMarkovModel(i_src_order, str_src_saveURLPrefix); 352 | MarkovModel* trgt_markovModel = arg_trgtKmerModel->getMarkovModel(i_trgt_order, str_trgt_saveURLPrefix); 353 | 354 | double dist = IterFactory::getInstance()->getCntExpDist(strategy, i_arg_lowerCnt, 355 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, src_markovModel, arg_srcKmerModel->totalKmer(), 356 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec, trgt_markovModel, arg_trgtKmerModel->totalKmer()); 357 | 358 | delete strategy; delete src_markovModel; delete trgt_markovModel; 359 | return dist; 360 | } 361 | 362 | double DistFactory::getD2sheppdist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, 363 | int i_src_order, int i_trgt_order, std::string str_src_saveURLPrefix, std::string str_trgt_saveURLPrefix, 364 | KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 365 | { 366 | D2sheppStrategy* strategy = new D2sheppStrategy(i_arg_k, b_arg_singleStrain); 367 | 368 | MarkovModel* src_markovModel = arg_srcKmerModel->getMarkovModel(i_src_order, str_src_saveURLPrefix); 369 | MarkovModel* trgt_markovModel = arg_trgtKmerModel->getMarkovModel(i_trgt_order, str_trgt_saveURLPrefix); 370 | 371 | double dist = IterFactory::getInstance()->getCntExpDist(strategy, i_arg_lowerCnt, 372 | 
arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, src_markovModel, arg_srcKmerModel->totalKmer(), 373 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec, trgt_markovModel, arg_trgtKmerModel->totalKmer()); 374 | 375 | delete strategy; delete src_markovModel; delete trgt_markovModel; 376 | return dist; 377 | } 378 | 379 | double DistFactory::getHaodist(int i_arg_k, bool b_arg_singleStrain, int i_arg_lowerCnt, 380 | std::string str_src_saveURLPrefix, std::string str_trgt_saveURLPrefix, 381 | KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 382 | { 383 | HaoStrategy* strategy = new HaoStrategy(i_arg_k, b_arg_singleStrain); 384 | 385 | MarkovModel* src_markovModel = arg_srcKmerModel->getMarkovModel(i_arg_k - 2, str_src_saveURLPrefix); 386 | MarkovModel* trgt_markovModel = arg_trgtKmerModel->getMarkovModel(i_arg_k - 2, str_trgt_saveURLPrefix); 387 | 388 | double dist = IterFactory::getInstance()->getCntExpDist(strategy, i_arg_lowerCnt, 389 | arg_srcKmerModel->kmerCntUnorderMap, arg_srcKmerModel->kmerVec, src_markovModel, arg_srcKmerModel->totalKmer(), 390 | arg_trgtKmerModel->kmerCntUnorderMap, arg_trgtKmerModel->kmerVec, trgt_markovModel, arg_trgtKmerModel->totalKmer()); 391 | 392 | delete strategy; delete src_markovModel; delete trgt_markovModel; 393 | return dist; 394 | } 395 | 396 | double DistFactory::getJensenShannondist(int i_arg_k, bool b_arg_singleStrain, 397 | int i_order, std::string str_src_saveURLPrefix, std::string str_trgt_saveURLPrefix, 398 | KmerModel* arg_srcKmerModel, KmerModel* arg_trgtKmerModel) 399 | { 400 | JensenShannonStrategy* strategy = new JensenShannonStrategy(i_arg_k, b_arg_singleStrain); 401 | 402 | MarkovModel* src_markovModel = arg_srcKmerModel->getMarkovModel(i_order, str_src_saveURLPrefix); 403 | MarkovModel* trgt_markovModel = arg_trgtKmerModel->getMarkovModel(i_order, str_trgt_saveURLPrefix); 404 | 405 | double dist = IterFactory::getInstance()->getMrkvDist(strategy, src_markovModel, 
trgt_markovModel); 406 | 407 | delete strategy; delete src_markovModel; delete trgt_markovModel; 408 | return dist; 409 | } 410 | 411 | -------------------------------------------------------------------------------- /code/main.cpp: -------------------------------------------------------------------------------- 1 | /*************************************************************** 2 | * Copyright (C) 2016 Yang Lu 3 | * Computational and Molecular Biology, Department of Biological Science 4 | * University of Southern California, LA, CA 90089, USA 5 | * 6 | * Related publication: 7 | * TBA 8 | ***************************************************************/ 9 | #include "utils.h" 10 | #include "seq_model.h" 11 | #include "kmer.h" 12 | #include "dist_model.h" 13 | #include "output.h" 14 | 15 | 16 | #if defined(__unix__) || defined(__APPLE__) || defined(__MACH__) 17 | #include 18 | int getdir(std::string dir, std::vector &files) 19 | { 20 | std::string str_currDir = dir; 21 | if (!str_currDir.empty() && !endsWith(str_currDir, "/")) str_currDir.append("/"); 22 | 23 | DIR *dp; 24 | struct dirent *dirp; 25 | if ((dp = opendir(dir.c_str())) == NULL) { 26 | std::cout << "Error(" << errno << ") opening " << dir << std::endl; 27 | return errno; 28 | } 29 | 30 | while ((dirp = readdir(dp)) != NULL) 31 | { 32 | if (endsWith(dirp->d_name, ".fasta") || endsWith(dirp->d_name, ".fa") || endsWith(dirp->d_name, ".fna")) 33 | { 34 | std::string fastaURL = str_currDir; 35 | fastaURL.append(std::string(dirp->d_name)); 36 | files.push_back(fastaURL); 37 | } 38 | } 39 | closedir(dp); 40 | return 0; 41 | } 42 | #elif defined(_WIN32) || defined(WIN32) 43 | #include 44 | std::string wchar_t2string(const wchar_t *wchar) 45 | { 46 | std::string str = ""; 47 | int index = 0; 48 | while (wchar[index] != 0) 49 | { 50 | str += (char)wchar[index]; 51 | ++index; 52 | } 53 | return str; 54 | } 55 | 56 | wchar_t *string2wchar_t(const std::string &str) 57 | { 58 | wchar_t wchar[260]; 59 | int index = 0; 
60 | while (index < str.size()) 61 | { 62 | wchar[index] = (wchar_t)str[index]; 63 | ++index; 64 | } 65 | wchar[index] = 0; 66 | return wchar; 67 | } 68 | int getdir(std::string dir, std::vector &files) 69 | { 70 | std::string str_currDir = dir; 71 | if (!str_currDir.empty() && !endsWith(str_currDir, "/")) str_currDir.append("/"); 72 | 73 | std::string searchDir = str_currDir; searchDir.append("*.*"); 74 | WIN32_FIND_DATA FindFileData; 75 | wchar_t * FileName = string2wchar_t(searchDir); 76 | HANDLE hFind = FindFirstFile(FileName, &FindFileData); 77 | 78 | while (FindNextFile(hFind, &FindFileData)) 79 | { 80 | std::string file = wchar_t2string(FindFileData.cFileName); 81 | if (endsWith(file, ".fasta") || endsWith(file, ".fa") || endsWith(file, ".fna")) 82 | { 83 | std::string fastaURL = str_currDir; 84 | fastaURL.append(std::string(file)); 85 | files.push_back(fastaURL); 86 | } 87 | } 88 | return 0; 89 | } 90 | #endif 91 | 92 | void print_usage_and_exit() 93 | { 94 | printf("CAFE:\t aCcelerated Alignment-FrEe sequence analysis\n"); 95 | printf("Description:\t The program provides 29 alignment-free sequence distance measures.\n"); 96 | printf("Authors:\t Yang Lu and Prof. Fengzhu Sun, Computational and Molecular Biology, University of Southern California.\n"); 97 | printf("\nusage:\n"); 98 | printf("./cafe [options]* -D -I -K \n"); 99 | printf("\nMain arguments\n"); 100 | printf("\t-D \tComma-separated list of distance measurements, E.g. -D D2star,Ma,CVtree. 
/**
 * Print the command-line help for CAFE to stdout and terminate the process.
 *
 * The text lists every supported distance name and option. The defaults
 * quoted here mirror what main() actually uses: Markov order 0 and console
 * output type phylip.
 *
 * Never returns. The process exits with status 0, including when this is
 * called after an argument error has been reported.
 */
void print_usage_and_exit()
{
	printf("CAFE:\t aCcelerated Alignment-FrEe sequence analysis\n");
	printf("Description:\t The program provides 29 alignment-free sequence distance measures.\n");
	printf("Authors:\t Yang Lu and Prof. Fengzhu Sun, Computational and Molecular Biology, University of Southern California.\n");
	printf("\nusage:\n");
	printf("./cafe [options]* -D <dist> -I <fa_files> -K <intK>\n");
	printf("\nMain arguments\n");
	printf("\t-D <dist>\tComma-separated list of distance measurements, E.g. -D D2star,Ma,CVtree. The options include: \n");
	printf("\t Conventional measures based on k-mer counts : \n");

	printf("\t\t Ch: Chebyshev distance \n");
	printf("\t\t Canberra: Canberra distance \n");
	printf("\t\t Chisq: Chi-Square distance \n");
	printf("\t\t Cosine: Cosine distance \n");
	printf("\t\t Co-phylog: Co-phylog distance with the seed C_{(k-1)/2,(k-1)/2}O_{1} when k is odd or C_{k/2-1,k/2}O_{1} when k is even \n");
	printf("\t\t D2: D2 distance \n");
	printf("\t\t Eu: Euclidean distance \n");
	printf("\t\t FFP: Feature frequency profiles (FFP) \n");
	printf("\t\t JS: Jensen-Shannon divergence \n");
	printf("\t\t Ma: Manhattan distance \n");
	printf("\t\t Pearson: Pearson distance \n");

	printf("\t Newly developed measures based on background adjusted k-mer counts: \n");
	printf("\t\t CVtree: CVtree distance \n");
	printf("\t\t D2shepp: D2shepp distance \n");
	printf("\t\t D2star: D2star distance \n");

	printf("\t Measures based on presence/absence of k-mers: \n");
	printf("\t\t Anderberg: Anderberg distance \n");
	printf("\t\t Antidice: anti-Dice distance \n");
	printf("\t\t Dice: Dice distance \n");
	printf("\t\t Gower: Gower distance \n");
	printf("\t\t Hamman: Hamman distance \n");
	printf("\t\t Hamming: Hamming distance \n");
	printf("\t\t Jaccard: Jaccard distance \n");
	printf("\t\t Kulczynski: Kulczynski distance \n");
	printf("\t\t Matching: Matching distance \n");
	printf("\t\t Ochiai: Ochiai distance \n");
	printf("\t\t Phi: Pearson Phi distance \n");
	printf("\t\t Russel: Russel-Rao distance \n");
	printf("\t\t Sneath: Sneath-Sokal distance \n");
	printf("\t\t Tanimoto: Rogers-Tanimoto distance \n");
	printf("\t\t Yule: Yule distance \n");

	printf("\t-F <fa_dir>\tFolder containing only fasta files with extension '.fasta', '.fa', and '.fna'. \n");
	printf("\t-I <fa_files>\tComma-separated list of sequence fasta files, e.g. -I speciesA.fa,speciesB.fa,speciesC.fa. Pairwise similarity is calculated based upon the sequences specified with this option. \n");
	printf("\t-K <intK>\tKmer Length\n");
	printf("\nOptions\n");
	printf("\t-J <jellyfish_path>\tUse jellyfish to accelerate kmer counting. <jellyfish_path> denotes the file path of jellyfish executable file, e.g. jellyfish-2.2.4/bin/./jellyfish \n");
	printf("\t-L <lower>\tOnly consider k-mer with occurrence >= <lower>. The default value is 0. \n");
	// The default quoted below matches main(), which falls back to order "0" when -M is absent.
	printf("\t-M <order>\tMarkov Order involved in D2star and D2shepp. There are two possible options. The first option is one single value indicating that all the sequences use the same order. The second option is comma-separated list of orders. Notice that the length of the list should match the number of fasta files. The order value could be non-negative integer but less than Kmer length or \"-1\" with the special intention to automatically infer the suitable order (not suitable for JS). The default Markov Order is 0; use \"-1\" to determine it automatically by BIC.\n");
	printf("\t-R \t\tConsider Reverse Complement in kmer counting. \n");
	printf("\t-S <dir>\tSave/Load calculated k-mer count binary files to the folder <dir>. Each input fasta file corresponds to particular model. \n");
	printf("\t-O <output_path>\tOutput results to file at <output_path>\n");
	// main() initialises the console output type to PHYLIP, so that is the documented default.
	printf("\t-T <type>\tThe output type as the input to downstream analysis, including: plain, phylip (as hierarchical clustering), cytoscape (as network analysis) and mds (Multidimensional Scaling as 2D plotting). E.g. -T mds. The default type is phylip. \n");
	//printf("\t-V <dir>\tSave visualization result to the folder <dir>. \n");
	printf("\nExamples:\n");
	printf("\t./cafe -M 0 -O output_path -S model_dir -T plain -I speciesA.fa,speciesB.fa -J jellyfish-2.2.4/bin/./jellyfish -K 10 -D D2star,Ma\n");
	printf("\t./cafe -M 0 -S model_dir -I speciesA.fa,speciesB.fa -J jellyfish-2.2.4/bin/./jellyfish -K 10 -D D2star,Ma\n");
	printf("\t./cafe -M 0 -L 2 -I speciesA.fa,speciesB.fa -J jellyfish-2.2.4/bin/./jellyfish -K 10 -D D2star,Ma -R\n");
	printf("\n");

	exit(0);
}
\n"); 181 | if (argc < 2 || !strcmp(argv[1], "-help") || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--usage")) print_usage_and_exit(); 182 | for (int i = 1; i < argc; ++i) 183 | { 184 | if (!strcmp(argv[i], "-I") || !strcmp(argv[i], "-i")) split(std::string(argv[++i]), ",", vec_fastaFiles); 185 | if (!strcmp(argv[i], "-F") || !strcmp(argv[i], "-f")) 186 | { 187 | std::string fastaDir = std::string(argv[++i]); 188 | std::cout << fastaDir << std::endl; 189 | getdir(fastaDir, vec_fastaFiles); 190 | for (int idx = 0; idx < vec_fastaFiles.size(); ++idx) std::cout << "Append: " << vec_fastaFiles[idx] << std::endl; 191 | } 192 | else if (!strcmp(argv[i], "-K") || !strcmp(argv[i], "-k")) i_k = atoi(argv[++i]); 193 | else if (!strcmp(argv[i], "-M") || !strcmp(argv[i], "-m")) split(std::string(argv[++i]), ",", vec_orderStr); 194 | else if (!strcmp(argv[i], "-L") || !strcmp(argv[i], "-l")) i_lowerCnt = atoi(argv[++i]); 195 | else if (!strcmp(argv[i], "-S") || !strcmp(argv[i], "-s")) str_save_modelDir = std::string(argv[++i]); 196 | else if (!strcmp(argv[i], "-O") || !strcmp(argv[i], "-o")) str_outputFileURL = std::string(argv[++i]); 197 | else if (!strcmp(argv[i], "-V") || !strcmp(argv[i], "-v")) str_save_vizDir = std::string(argv[++i]); 198 | else if (!strcmp(argv[i], "-J") || !strcmp(argv[i], "-j")) 199 | { 200 | str_jellyfishExeURL = std::string(argv[++i]); 201 | if (!file_exists(str_jellyfishExeURL)) 202 | { 203 | jellyfishValid = false; 204 | std::cout << "[error]: Jellyfish executable file not exist at " << str_jellyfishExeURL << std::endl; 205 | //print_usage_and_exit(); 206 | } 207 | } 208 | else if (!strcmp(argv[i], "-T") || !strcmp(argv[i], "-t")) 209 | { 210 | ++i; 211 | if (!strcmp(toLowerCase(std::string(argv[i])).c_str(), "plain")) outputType = PLAIN; 212 | else if (!strcmp(toLowerCase(std::string(argv[i])).c_str(), "phylip")) outputType = PHYLIP; 213 | else if (!strcmp(toLowerCase(std::string(argv[i])).c_str(), "cytoscape")) outputType = CYTOSCAPE; 214 | else 
if (!strcmp(toLowerCase(std::string(argv[i])).c_str(), "mds")) outputType = MDS; 215 | else printf("[warning]: The output type is unrecognized! \n"); 216 | } 217 | else if (!strcmp(argv[i], "-D") || !strcmp(argv[i], "-d")) 218 | { 219 | split(std::string(argv[++i]), ",", vec_distStr); 220 | } 221 | else if (!strcmp(argv[i], "-R") || !strcmp(argv[i], "-r")) { singleStrain = false; } 222 | } 223 | 224 | for (int j = 0; j < vec_distStr.size(); ++j) 225 | { 226 | if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "d2")) vec_dist.push_back(D2); 227 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "d2star")) vec_dist.push_back(D2STAR); 228 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "d2shepp")) vec_dist.push_back(D2SHEPP); 229 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "ma")) vec_dist.push_back(Ma); 230 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "eu")) vec_dist.push_back(Eu); 231 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "ch")) vec_dist.push_back(Ch); 232 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "ffp")) vec_dist.push_back(FFP); 233 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "co-phylog")) vec_dist.push_back(Co_Phylog); 234 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "cvtree")) { containCvtree = true; vec_dist.push_back(CVtree); } 235 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "js")) vec_dist.push_back(JS); 236 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "chisq")) { containChiSq = true; vec_dist.push_back(CHISQ); } 237 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "cosine")) vec_dist.push_back(COSINE); 238 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "pearson")) vec_dist.push_back(PEARSON); 239 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "canberra")) vec_dist.push_back(CANBERRA); 240 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "hamming")) vec_dist.push_back(HAMMING); 241 | else if 
(!strcmp(toLowerCase(vec_distStr[j]).c_str(), "matching")) vec_dist.push_back(MATCHING); 242 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "jaccard")) vec_dist.push_back(JACCARD); 243 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "tanimoto")) vec_dist.push_back(TANIMOTO); 244 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "dice")) vec_dist.push_back(DICE); 245 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "antidice")) vec_dist.push_back(ANTIDICE); 246 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "sneath")) vec_dist.push_back(SNEATH); 247 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "hamman")) vec_dist.push_back(HAMMAN); 248 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "phi")) vec_dist.push_back(PHI); 249 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "anderberg")) vec_dist.push_back(ANDERBERG); 250 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "gower")) vec_dist.push_back(GOWER); 251 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "russel")) vec_dist.push_back(RUSSEL); 252 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "yule")) vec_dist.push_back(YULE); 253 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "ochiai")) vec_dist.push_back(OCHIAI); 254 | else if (!strcmp(toLowerCase(vec_distStr[j]).c_str(), "kulczynski")) vec_dist.push_back(KULCZYNSKI); 255 | else 256 | { 257 | printf("[warning]: The distance measurement %s is unrecognized!\n ", vec_distStr[j].c_str()); 258 | vec_distStr.erase(vec_distStr.begin() + j); 259 | } 260 | } 261 | if (!str_save_modelDir.empty() && !dir_exists(str_save_modelDir)) { std::string cmd = "mkdir " + str_save_modelDir; system(cmd.c_str()); } 262 | 263 | // validity check for Kmer length 264 | if (i_k <= 0) 265 | { 266 | printf("[error]: Kmer length should be positive! \n"); 267 | print_usage_and_exit(); 268 | } 269 | 270 | if (i_k <= 2 && containCvtree) 271 | { 272 | printf("[error]: Kmer length must exceed 2 in CVTree! 
\n"); 273 | print_usage_and_exit(); 274 | } 275 | 276 | // validity check for possible markov order 277 | if (vec_orderStr.empty()) vec_orderStr.push_back("0"); 278 | if (vec_orderStr.size() != vec_fastaFiles.size()) 279 | { 280 | if (vec_orderStr.size() > 1) 281 | { 282 | printf("[error]: The length of the order list should match the number of fasta files! \n"); 283 | print_usage_and_exit(); 284 | } 285 | else if (vec_orderStr.size() == 1) 286 | { 287 | while (vec_orderStr.size() < vec_fastaFiles.size()) vec_orderStr.push_back(vec_orderStr.at(0)); 288 | } 289 | } 290 | for (int i = 0; i < vec_orderStr.size(); ++i) 291 | { 292 | int order = atoi(vec_orderStr[i].c_str()); 293 | if (order >= i_k || order < -1) 294 | { 295 | printf("[error]: Markov Order should be non-negative and less than k or '-1' indicating auto inference! \n"); 296 | std::cout << "k = " << i_k << " order = " << order << std::endl; 297 | print_usage_and_exit(); 298 | } 299 | } 300 | 301 | // validity check for input fasta files 302 | for (int i = 0; i < vec_fastaFiles.size(); ++i) 303 | { 304 | if (!file_exists(vec_fastaFiles[i])) 305 | { 306 | std::cout << "Input file not exist! 
Skip: " << vec_fastaFiles[i] << std::endl; 307 | vec_fastaFiles.erase(vec_fastaFiles.begin() + i); 308 | vec_orderStr.erase(vec_orderStr.begin() + i); 309 | } 310 | } 311 | printf("Finish parsing the arguments.\n"); 312 | 313 | printf("Start pre-processing the hash of input fasta files...\n"); 314 | // validity check and pre-processing for hashs of input fasta files 315 | for (int i = 0; i < vec_fastaFiles.size(); ++i) 316 | { 317 | int order = atoi(vec_orderStr[i].c_str()); 318 | std::vector cachedOrderVec; 319 | if (containChiSq) cachedOrderVec.push_back(i_k + 1); 320 | cachedOrderVec.push_back(i_k); 321 | if (containCvtree) { cachedOrderVec.push_back(i_k - 1); cachedOrderVec.push_back(i_k - 2); } 322 | 323 | if (0 == order) cachedOrderVec.push_back(1); 324 | else if (order > 0) { cachedOrderVec.push_back(order + 1); cachedOrderVec.push_back(order); } 325 | else { for (int minOrder = i_k-1; minOrder > 0 ; --minOrder) cachedOrderVec.push_back(minOrder); } 326 | 327 | std::string str_seqName = getFileName(vec_fastaFiles[i]); 328 | std::string str_saveDir = str_save_modelDir; 329 | if (!str_saveDir.empty() && !endsWith(str_saveDir, "/")) str_saveDir.append("/"); 330 | str_saveDir.append("hash_").append(str_seqName).append("_L_").append(patch::to_string(i_lowerCnt)).append("_k_"); 331 | 332 | for (int orderIdx = 0; orderIdx < cachedOrderVec.size(); ++orderIdx) 333 | { 334 | int currK = cachedOrderVec[orderIdx]; 335 | std::string str_saveURL = str_saveDir + patch::to_string(currK); if (file_exists(str_saveURL)) continue; 336 | 337 | KmerModel* kmerModel = new KmerModel(currK, true); 338 | if (0 == orderIdx) 339 | { 340 | bool jellyfishSucceed = false; 341 | if (jellyfishValid) 342 | { 343 | std::string str_jfBinURL = str_saveURL; str_jfBinURL.append(".jf"); 344 | std::string str_jfTabTxtURL = str_saveURL; str_jfTabTxtURL.append(".cnt"); 345 | 346 | std::string lowerCntStr = " -L " + patch::to_string(i_lowerCnt); 347 | if (i_lowerCnt < 2) lowerCntStr = ""; 348 | 349 | 
std::string cmd1 = str_jellyfishExeURL + " count -m " + patch::to_string(currK) + " -s 500M -t 20" + lowerCntStr + " -o " + str_jfBinURL + " " + vec_fastaFiles[i]; 350 | system(cmd1.c_str()); std::cout << "Execute Command: " << cmd1 << std::endl; 351 | 352 | if (file_exists(str_jfBinURL)) 353 | { 354 | std::cout << "Jellyfish succeed in count!" << std::endl; 355 | std::string cmd2 = str_jellyfishExeURL + " dump -t " + str_jfBinURL + lowerCntStr + " > " + str_jfTabTxtURL; 356 | system(cmd2.c_str()); std::cout << "Execute Command: " << cmd2 << std::endl; 357 | 358 | if (file_exists(str_jfTabTxtURL)) 359 | { 360 | std::cout << "Jellyfish succeed in dump!" << std::endl; 361 | kmerModel->saveFromJellyFish(str_jfTabTxtURL, str_saveURL); 362 | jellyfishSucceed = true; 363 | } 364 | } 365 | } 366 | 367 | if (!jellyfishSucceed) 368 | { 369 | std::cout << "Jellyfish not succeed! Now use slow counting!" << std::endl; 370 | kmerModel->saveFromFasta(currK, vec_fastaFiles[i], str_saveURL); 371 | } 372 | } 373 | else 374 | { 375 | int prevK = cachedOrderVec[orderIdx - 1]; 376 | std::string str_prevOrderURL = str_saveDir + patch::to_string(prevK); 377 | kmerModel->saveFromLargerK(currK, prevK, str_prevOrderURL, str_saveURL); 378 | } 379 | delete kmerModel; std::cout << "Now save model to " << str_saveURL << std::endl; 380 | } 381 | } 382 | 383 | for (int i = 0; i < vec_fastaFiles.size(); ++i) 384 | { 385 | std::string str_seqName = getFileName(vec_fastaFiles[i]); 386 | 387 | std::string str_saveDir = str_save_modelDir; 388 | if (!str_saveDir.empty() && !endsWith(str_saveDir, "/")) 389 | str_saveDir.append("/"); 390 | str_saveDir.append("hash_").append(str_seqName).append("_L_").append(patch::to_string(i_lowerCnt)).append("_k_"); 391 | 392 | int order = atoi(vec_orderStr[i].c_str()); 393 | if (order < 0) order = getEstMarkovOrder(i_k, str_saveDir, str_seqName); // need auto infer order 394 | 395 | vec_order.push_back(order); 396 | vec_saveURLlist.push_back(str_saveDir); 397 | 
vec_namelist.push_back(str_seqName); 398 | } 399 | printf("Finish pre-processing the hash of input fasta files.\n"); 400 | 401 | endTime = clock(); 402 | std::cout << "Time Elapsed: " << ((float)endTime - (float)startTime) / CLOCKS_PER_SEC << " seconds" << std::endl; 403 | startTime = clock(); 404 | 405 | printf("Start calculating the distance...\n"); 406 | for (int distIdx = 0; distIdx < vec_dist.size(); ++distIdx) 407 | { 408 | dist currDist = vec_dist[distIdx]; 409 | int hashK = i_k; if (CHISQ == currDist) hashK = i_k + 1; 410 | 411 | smat::Matrix* simMat = new smat::Matrix(vec_fastaFiles.size(), vec_fastaFiles.size(), 0); 412 | KmerModel* src_kmerModel, *trgt_kmerModel; 413 | 414 | for (int i = 0; i < vec_fastaFiles.size() - 1; ++i) 415 | { 416 | src_kmerModel = new KmerModel(hashK, singleStrain); 417 | src_kmerModel->load(hashK, vec_saveURLlist[i] + patch::to_string(hashK)); 418 | 419 | for (int j = i + 1; j < vec_fastaFiles.size(); ++j) 420 | { 421 | trgt_kmerModel = new KmerModel(hashK, singleStrain); 422 | trgt_kmerModel->load(hashK, vec_saveURLlist[j] + patch::to_string(hashK)); 423 | 424 | double distVal = 0; 425 | 426 | if (D2 == currDist || COSINE == currDist) distVal = DistFactory::getInstance()->getD2dist(hashK, singleStrain, i_lowerCnt, src_kmerModel, trgt_kmerModel); 427 | else if (MATCHING == currDist) distVal = DistFactory::getInstance()->getMatchingdist(hashK, singleStrain, i_lowerCnt, src_kmerModel, trgt_kmerModel); 428 | else if (JACCARD == currDist) distVal = DistFactory::getInstance()->getJaccarddist(hashK, singleStrain, i_lowerCnt, src_kmerModel, trgt_kmerModel); 429 | else if (TANIMOTO == currDist) distVal = DistFactory::getInstance()->getTanimotodist(hashK, singleStrain, i_lowerCnt, src_kmerModel, trgt_kmerModel); 430 | else if (DICE == currDist) distVal = DistFactory::getInstance()->getDicedist(hashK, singleStrain, i_lowerCnt, src_kmerModel, trgt_kmerModel); 431 | else if (ANTIDICE == currDist) distVal = 
DistFactory::getInstance()->getAntidicedist(hashK, singleStrain, i_lowerCnt, src_kmerModel, trgt_kmerModel); 432 | else if (SNEATH == currDist) distVal = DistFactory::getInstance()->getSneathdist(hashK, singleStrain, i_lowerCnt, src_kmerModel, trgt_kmerModel); 433 | else if (HAMMAN == currDist) distVal = DistFactory::getInstance()->getHammandist(hashK, singleStrain, i_lowerCnt, src_kmerModel, trgt_kmerModel); 434 | else if (PHI == currDist) distVal = DistFactory::getInstance()->getPhidist(hashK, singleStrain, i_lowerCnt, src_kmerModel, trgt_kmerModel); 435 | else if (ANDERBERG == currDist) distVal = DistFactory::getInstance()->getAnderbergdist(hashK, singleStrain, i_lowerCnt, src_kmerModel, trgt_kmerModel); 436 | else if (GOWER == currDist) distVal = DistFactory::getInstance()->getGowerdist(hashK, singleStrain, i_lowerCnt, src_kmerModel, trgt_kmerModel); 437 | else if (RUSSEL == currDist) distVal = DistFactory::getInstance()->getRusseldist(hashK, singleStrain, i_lowerCnt, src_kmerModel, trgt_kmerModel); 438 | else if (YULE == currDist) distVal = DistFactory::getInstance()->getYuledist(hashK, singleStrain, i_lowerCnt, src_kmerModel, trgt_kmerModel); 439 | else if (OCHIAI == currDist) distVal = DistFactory::getInstance()->getOchiaidist(hashK, singleStrain, i_lowerCnt, src_kmerModel, trgt_kmerModel); 440 | else if (KULCZYNSKI == currDist) distVal = DistFactory::getInstance()->getKulczynskidist(hashK, singleStrain, i_lowerCnt, src_kmerModel, trgt_kmerModel); 441 | else if (D2STAR == currDist) 442 | distVal = DistFactory::getInstance()->getD2stardist(hashK, singleStrain, i_lowerCnt, vec_order.at(i), vec_order.at(j), vec_saveURLlist[i], vec_saveURLlist[j], src_kmerModel, trgt_kmerModel); 443 | else if (D2SHEPP == currDist) 444 | distVal = DistFactory::getInstance()->getD2sheppdist(hashK, singleStrain, i_lowerCnt, vec_order.at(i), vec_order.at(j), vec_saveURLlist[i], vec_saveURLlist[j], src_kmerModel, trgt_kmerModel); 445 | else if (CVtree == currDist) 446 | distVal = 
DistFactory::getInstance()->getHaodist(hashK, singleStrain, i_lowerCnt, vec_saveURLlist[i], vec_saveURLlist[j], src_kmerModel, trgt_kmerModel); 447 | else if (Ma == currDist) distVal = DistFactory::getInstance()->getL1dist(hashK, singleStrain, src_kmerModel, trgt_kmerModel); 448 | else if (Eu == currDist) distVal = DistFactory::getInstance()->getL2dist(hashK, singleStrain, src_kmerModel, trgt_kmerModel); 449 | else if (Ch == currDist) distVal = DistFactory::getInstance()->getLInfdist(hashK, singleStrain, src_kmerModel, trgt_kmerModel); 450 | else if (FFP == currDist) distVal = DistFactory::getInstance()->getFFPdist(hashK, singleStrain, src_kmerModel, trgt_kmerModel); 451 | else if (Co_Phylog == currDist) distVal = DistFactory::getInstance()->getCoPhylogdist(hashK, singleStrain, src_kmerModel, trgt_kmerModel); 452 | else if (CHISQ == currDist) distVal = DistFactory::getInstance()->getChiSqdist(hashK, singleStrain, src_kmerModel, trgt_kmerModel); 453 | else if (PEARSON == currDist) distVal = DistFactory::getInstance()->getPearsondist(hashK, singleStrain, src_kmerModel, trgt_kmerModel); 454 | else if (CANBERRA == currDist) distVal = DistFactory::getInstance()->getCanberradist(hashK, singleStrain, src_kmerModel, trgt_kmerModel); 455 | else if (HAMMING == currDist) distVal = DistFactory::getInstance()->getHammingdist(hashK, singleStrain, src_kmerModel, trgt_kmerModel); 456 | else if (JS == currDist) 457 | { 458 | //if (vec_order.at(i) != vec_order.at(j)) throw std::runtime_error("JS expect same order! 
"); 459 | int maxOrder = vec_order.at(i); if(maxOrder < vec_order.at(j)) maxOrder = vec_order.at(j); 460 | distVal = DistFactory::getInstance()->getJensenShannondist(hashK, singleStrain, maxOrder, vec_saveURLlist[i], vec_saveURLlist[j], src_kmerModel, trgt_kmerModel); 461 | } 462 | simMat->set(i, j, distVal); simMat->set(j, i, distVal); 463 | delete trgt_kmerModel; 464 | } 465 | 466 | delete src_kmerModel; 467 | } 468 | 469 | std::cout << "-------------------------------------------------" << std::endl; 470 | std::cout << "Dist: \t" << vec_distStr[distIdx] << std::endl; 471 | 472 | if (!str_outputFileURL.empty()) 473 | { 474 | std::string postfixedOutputURL1 = str_outputFileURL; 475 | postfixedOutputURL1.append(".").append(vec_distStr[distIdx]).append(".plain"); 476 | OutputWriter::getInstance()->writeToFile(PLAIN, simMat, &vec_namelist, postfixedOutputURL1); 477 | 478 | std::string postfixedOutputURL2 = str_outputFileURL; 479 | postfixedOutputURL2.append(".").append(vec_distStr[distIdx]).append(".phylip"); 480 | OutputWriter::getInstance()->writeToFile(PHYLIP, simMat, &vec_namelist, postfixedOutputURL2); 481 | 482 | } 483 | OutputWriter::getInstance()->writeToConsole(outputType, simMat, &vec_namelist); 484 | 485 | endTime = clock(); 486 | std::cout << "Time Elapsed: " << ((float)endTime - (float)startTime) / CLOCKS_PER_SEC << " seconds" << std::endl; 487 | startTime = clock(); 488 | } 489 | printf("Finish calculating the distance.\n"); 490 | printf("Done"); 491 | //system("pause"); 492 | return 0; 493 | } 494 | -------------------------------------------------------------------------------- /code/SimpleMatrix.h: -------------------------------------------------------------------------------- 1 | /*************************************************************** 2 | * Copyright (C) 2013 Quan Wang 3 | * Signal Analysis and Machine Perception Laboratory 4 | * Department of Electrical, Computer, and Systems Engineering 5 | * Rensselaer Polytechnic Institute, Troy, NY 
/**
 * Dense row-by-column matrix of element type T.
 *
 * Elements are addressed as (row, column). Methods whose comment says "Self"
 * modify this matrix; methods that return `Matrix<T> *` produce a separate
 * matrix object.
 * NOTE(review): the returned matrices are presumably heap-allocated and owned
 * by the caller - confirm against the member definitions.
 */
template <class T>
class Matrix
{
public:
	Matrix(int rows, int columns); // initialization without assigning values
	Matrix(int rows, int columns, T value); // initialization with all same values
	Matrix(int rows, int columns, std::string type); // special matrix such as I
	Matrix(const char * filename); // load matrix from txt file
	~Matrix(); // destruction

	void set(int r, int c, T value); // row, column, value
	T get(int r, int c); // row, column
	int rows(); // number of rows
	int columns(); // number of columns

	void print(); // print the matrix
	Matrix<T> * copy(); // copy itself to a new matrix

	void saveTxt(const char * filename); // save matrix to txt file

	// B=M'
	Matrix<T> * transpose();
	// B=M(r1:r2,c1:c2)
	Matrix<T> * sub(int r1, int r2, int c1, int c2); // submatrix
	// B=|M|
	Matrix<T> * abs(); // absolute values

	// numbers of matrix
	T trace(); // trace
	double fnorm(); // Frobenius norm
	double pnorm(double p); // p-norm
	T maxEl(int &r, int &c); // max element; r and c are output parameters
	T minEl(int &r, int &c); // min element; r and c are output parameters
	double mean(); // mean of elements
	T sum(); // sum of elements
	double std(); // standard deviation of elements

	// M=M+a
	void addNumberSelf(T value); // add a number to itself in place
	// M=M*a
	void multiplyNumberSelf(T value); // multiply itself by a number in place

	// M=M+A
	void addMatrixSelf(Matrix<T> * A); // add a matrix to itself in place
	// M=M.*A
	void dotMultiplyMatrixSelf(Matrix<T> * A); // element-wise multiply itself by a matrix in place

	// B=M+A
	Matrix<T> * addMatrixNew(Matrix<T> * A); // sum with a matrix, returned as a new matrix
	// B=M.*A
	Matrix<T> * dotMultiplyMatrixNew(Matrix<T> * A); // element-wise product with a matrix, returned as a new matrix
	// B=M*A
	Matrix<T> * multiplyMatrixNew(Matrix<T> * A); // matrix product, returned as a new matrix

	// Multidimensional scaling (MDS)
	// This function re-implements Laurens van der Maaten's MDS in his Matlab Toolbox for Dimensionality Reduction
	// The Matlab MDS can be downloaded at http://crcv.ucf.edu/source/dimension
	Matrix<T> * MDS_UCF(int dim, int iter);

private:
	int rows_; // number of rows
	int columns_; // number of columns
	T **v; // element storage, indexed as v[row][column]
};
template 145 | Matrix::Matrix(int rows, int columns) // initialization without assigning values 146 | { 147 | if (rows<1 || columns<1) 148 | { 149 | printf("Invalid construction arguments: rows=%d, columns=%d\n", rows, columns); 150 | exit(1); 151 | } 152 | 153 | rows_ = rows; 154 | columns_ = columns; 155 | 156 | v = new T *[rows]; 157 | for (int i = 0; i 164 | Matrix::Matrix(int rows, int columns, T value) // initialization with all same values 165 | { 166 | if (rows<1 || columns<1) 167 | { 168 | printf("Invalid construction arguments: rows=%d, columns=%d\n", rows, columns); 169 | exit(1); 170 | } 171 | 172 | rows_ = rows; 173 | columns_ = columns; 174 | 175 | v = new T *[rows]; 176 | for (int i = 0; i 188 | Matrix::Matrix(int rows, int columns, std::string type) // special matrix such as I 189 | { 190 | if (rows<1 || columns<1) 191 | { 192 | printf("Invalid construction arguments: rows=%d, columns=%d\n", rows, columns); 193 | exit(1); 194 | } 195 | rows_ = rows; 196 | columns_ = columns; 197 | 198 | v = new T *[rows]; 199 | for (int i = 0; i= rows || k<0) 260 | { 261 | printf("Invalid row index: %d\n", k); 262 | exit(1); 263 | } 264 | T temp = v[i][j]; 265 | v[i][j] = v[k][j]; 266 | v[k][j] = temp; 267 | } 268 | } 269 | } 270 | 271 | else 272 | { 273 | printf("Undefined matrix type: %s\n", type.c_str()); 274 | exit(1); 275 | } 276 | } 277 | 278 | template 279 | Matrix::Matrix(const char * filename) 280 | { 281 | FILE * pFile; 282 | // first pass: matrix size 283 | int rows = 0; 284 | int columns = 0; 285 | 286 | pFile = fopen(filename, "r"); 287 | if (pFile == NULL) 288 | { 289 | printf("File \"%s\" cannot be found.\n", filename); 290 | exit(1); 291 | } 292 | char line[MAX_LINE_LENGTH]; 293 | char * token = NULL; 294 | while (fgets(line, MAX_LINE_LENGTH, pFile) != NULL) 295 | { 296 | rows++; 297 | if (rows == 1) // count the number of columns 298 | { 299 | token = strtok(line, " ,\t"); 300 | while (token != NULL && token[0] >= 32) 301 | { 302 | columns++; 303 | 
token = strtok(NULL, " ,\t"); 304 | } 305 | } 306 | else // check whether every row contains the same number of elements with the first row 307 | { 308 | int check = 0; 309 | token = strtok(line, " ,\t"); 310 | while (token != NULL && token[0] >= 32) 311 | { 312 | check++; 313 | token = strtok(NULL, " ,\t"); 314 | } 315 | if (check= rows) break; 344 | for (int j = 0; j 356 | Matrix::~Matrix() // destruction 357 | { 358 | for (int i = 0; i 366 | void Matrix::set(int r, int c, T value) // row, column, value 367 | { 368 | if (r<0 || r >= rows_ || c<0 || c >= columns_) 369 | { 370 | printf("Invalid index in set(): r=%d, c=%d\n", r, c); 371 | exit(1); 372 | } 373 | v[r][c] = value; 374 | } 375 | 376 | template 377 | T Matrix::get(int r, int c) // row, column 378 | { 379 | if (r<0 || r >= rows_ || c<0 || c >= columns_) 380 | { 381 | printf("Invalid index in get(): r=%d, c=%d\n", r, c); 382 | exit(1); 383 | } 384 | return v[r][c]; 385 | } 386 | 387 | template 388 | int Matrix::rows() // number of rows 389 | { 390 | return rows_; 391 | } 392 | 393 | template 394 | int Matrix::columns() // number of columns 395 | { 396 | return columns_; 397 | } 398 | 399 | template 400 | void Matrix::print() // print the matrix 401 | { 402 | printf("\n"); 403 | for (int i = 0; i 415 | Matrix * Matrix::copy() // copy itself to a new matrix 416 | { 417 | Matrix * A = new Matrix(rows_, columns_); 418 | for (int i = 0; iset(i, j, v[i][j]); 423 | } 424 | } 425 | return A; 426 | } 427 | 428 | template 429 | void Matrix::saveTxt(const char * filename) 430 | { 431 | FILE * pFile; 432 | pFile = fopen(filename, "w"); 433 | if (pFile == NULL) 434 | { 435 | printf("Cannot save to file \"%s\".\n", filename); 436 | exit(1); 437 | } 438 | for (int i = 0; i 451 | Matrix * Matrix::transpose() 452 | { 453 | Matrix * A = new Matrix(columns_, rows_); 454 | for (int i = 0; iset(i, j, v[j][i]); 459 | } 460 | } 461 | return A; 462 | } 463 | 464 | template 465 | Matrix * Matrix::sub(int r1, int r2, int c1, int 
c2) // submatrix 466 | { 467 | if (r1<0 || r1 >= rows_ || r2<0 || r2 >= rows_ || r2= columns_ || c2<0 || c2>columns_ || c2 * A = new Matrix(newRows, newColumns); 476 | for (int i = 0; iset(i, j, v[i + r1][j + c1]); 481 | } 482 | } 483 | return A; 484 | } 485 | 486 | template 487 | Matrix * Matrix::abs() // absolute values 488 | { 489 | Matrix * A = new Matrix(rows_, columns_); 490 | for (int i = 0; iset(i, j, v[i][j]>0 ? v[i][j] : -v[i][j]); 495 | } 496 | } 497 | return A; 498 | } 499 | 500 | template 501 | T Matrix::trace() // trace 502 | { 503 | T x = 0; 504 | for (int i = 0; i(rows_, columns_); i++) 505 | { 506 | x += v[i][i]; 507 | } 508 | return x; 509 | } 510 | 511 | template 512 | double Matrix::fnorm() // Frobenius norm 513 | { 514 | double x = 0; 515 | for (int i = 0; i 526 | double Matrix::pnorm(double p) // p-norm 527 | { 528 | double x = 0; 529 | for (int i = 0; i 540 | T Matrix::maxEl(int &r, int &c) // max element 541 | { 542 | T x = v[0][0]; 543 | r = 0; 544 | c = 0; 545 | for (int i = 0; ix) 550 | { 551 | x = v[i][j]; 552 | r = i; 553 | c = j; 554 | } 555 | } 556 | } 557 | return x; 558 | } 559 | 560 | template 561 | T Matrix::minEl(int &r, int &c) // min element 562 | { 563 | T x = v[0][0]; 564 | r = 0; 565 | c = 0; 566 | for (int i = 0; i 582 | double Matrix::mean() // mean of elements 583 | { 584 | double x = 0; 585 | for (int i = 0; i 596 | T Matrix::sum() // sum of elements 597 | { 598 | T x = 0; 599 | for (int i = 0; i 610 | double Matrix::std() // standard deviation of elements 611 | { 612 | double m = mean(); 613 | double s = 0; 614 | for (int i = 0; i 626 | void Matrix::addNumberSelf(T value) // add a number to itself in space 627 | { 628 | for (int i = 0; i 638 | void Matrix::multiplyNumberSelf(T value) // add a number to itself in space 639 | { 640 | for (int i = 0; i 650 | void Matrix::addMatrixSelf(Matrix * A) // add a matrix to itself in space 651 | { 652 | if (rows_ != A->rows() || columns_ != A->columns()) 653 | { 654 | 
printf("Unmatched matrix sizes in matrix summation.\n"); 655 | exit(1); 656 | } 657 | 658 | for (int i = 0; iget(i, j); 663 | } 664 | } 665 | } 666 | 667 | template 668 | void Matrix::dotMultiplyMatrixSelf(Matrix * A) // dot multiply a matrix to itself in space 669 | { 670 | if (rows_ != A->rows() || columns_ != A->columns()) 671 | { 672 | printf("Unmatched matrix sizes in matrix dot multiplication.\n"); 673 | exit(1); 674 | } 675 | 676 | for (int i = 0; iget(i, j); 681 | } 682 | } 683 | } 684 | 685 | template 686 | Matrix * Matrix::addMatrixNew(Matrix * A) // add a matrix to itself with new matrix 687 | { 688 | if (rows_ != A->rows() || columns_ != A->columns()) 689 | { 690 | printf("Unmatched matrix sizes in matrix summation.\n"); 691 | exit(1); 692 | } 693 | 694 | Matrix * B = new Matrix(rows_, columns_); 695 | for (int i = 0; iset(i, j, v[i][j] + A->get(i, j)); 700 | } 701 | } 702 | return B; 703 | } 704 | 705 | template 706 | Matrix * Matrix::dotMultiplyMatrixNew(Matrix * A) // dot multiply a matrix to itself with new matrix 707 | { 708 | if (rows_ != A->rows() || columns_ != A->columns()) 709 | { 710 | printf("Unmatched matrix sizes in matrix dot multiplication.\n"); 711 | exit(1); 712 | } 713 | 714 | Matrix * B = new Matrix(rows_, columns_); 715 | for (int i = 0; iset(i, j, v[i][j] * A->get(i, j)); 720 | } 721 | } 722 | return B; 723 | } 724 | 725 | template 726 | Matrix * Matrix::multiplyMatrixNew(Matrix * A) // multiply a matrix to itself with new matrix 727 | { 728 | if (columns_ != A->rows()) 729 | { 730 | printf("Unmatched matrix sizes in matrix multiplication.\n"); 731 | exit(1); 732 | } 733 | 734 | Matrix * B = new Matrix(rows_, A->columns()); 735 | T temp; 736 | for (int i = 0; icolumns(); j++) 739 | { 740 | temp = 0; 741 | for (int k = 0; kget(k, j)); 744 | } 745 | B->set(i, j, temp); 746 | } 747 | } 748 | return B; 749 | } 750 | 751 | template 752 | Matrix * Matrix::MDS_UCF(int dim, int iter) 753 | { 754 | if (rows() != columns()) { printf("Input 
distance matrix to MDS is not square.\n"); exit(1); } 755 | if (dim<1) { printf("Invalid dimension for MDS.\n"); exit(1); } 756 | if (iter<1){ printf("Invalid number of iterations for MDS.\n"); exit(1); } 757 | 758 | Matrix * X = new Matrix(rows(), dim, "rand"); 759 | double D_mean = mean(); // mean value of distance matrix 760 | X->addNumberSelf(-0.5); // move to the center 761 | X->multiplyNumberSelf(0.1*D_mean / (1.0 / 3.0*sqrt((double)dim))); // before this step, mean distance is 1/3*sqrt(d) 762 | 763 | double lr = 0.05; // learning rate 764 | double r = 2; // metric 765 | int n = rows(); // number of vectors 766 | 767 | Matrix * dh = new Matrix(n, n, 0.0); 768 | Matrix * pmat = new Matrix(n, dim); 769 | Matrix * dhdum = new Matrix(n, 1); 770 | Matrix * dhmat = new Matrix(n - 1, dim, 0); 771 | 772 | Matrix * RP = new Matrix(n, iter, "randperm"); // the matrix for random permutation numbers 773 | int i, j; 774 | double temp; 775 | int m; 776 | 777 | //printf("MDS iteration:"); 778 | for (int it = 0; itget(rp, it) - 1; 785 | 786 | for (i = 0; iset(i, j, X->get(m, j) - X->get(i, j)); 791 | } 792 | } 793 | 794 | for (i = 0; iget(i, j)), r); 800 | } 801 | dhdum->set(i, 0, pow(temp, 1 / r)); 802 | } 803 | 804 | for (i = 0; iset(m, i, dhdum->get(i, 0)); 809 | dh->set(i, m, dhdum->get(i, 0)); 810 | } 811 | 812 | for (i = 0; i= m) ii = i + 1; 816 | temp = lr * (dhdum->get(ii, 0) - get(ii, m)) * pow(dhdum->get(ii, 0), 1 - r); 817 | for (j = 0; jset(i, j, temp); 820 | } 821 | } 822 | 823 | for (i = 0; i= m) ii = i + 1; 827 | for (j = 0; jget(ii, j); 830 | temp += dhmat->get(i, j) * pow(fabs(pmat->get(ii, j)), r - 1) * sign(pmat->get(ii, j)); 831 | 832 | X->set(ii, j, temp); 833 | } 834 | } 835 | } 836 | } 837 | 838 | delete dh; 839 | delete pmat; 840 | delete dhdum; 841 | delete dhmat; 842 | delete RP; 843 | 844 | return X; 845 | } 846 | 847 | /********************************************** 848 | * Algorithm part 849 | **********************************************/ 850 | 
851 | // Calculate the pairwise interpoint Euclidean distances 852 | // X is data matrix, D is distance matrix 853 | /*void EuclideanDistanceMatrix(Matrix * X, Matrix * D) 854 | { 855 | int i, j, k; 856 | double temp; 857 | if (D == NULL) 858 | { 859 | printf("Input matrix pointer is NULL.\n"); 860 | exit(1); 861 | } 862 | else if (X->rows() != D->rows() || X->rows() != D->columns()) 863 | { 864 | printf("Invalid distance matrix dimension.\n"); 865 | exit(1); 866 | } 867 | 868 | for (i = 0; irows(); i++) D->set(i, i, 0.0); 869 | 870 | for (i = 0; irows() - 1; i++) 871 | { 872 | for (j = i + 1; jcolumns(); j++) 873 | { 874 | temp = 0; 875 | for (k = 0; kcolumns(); k++) 876 | { 877 | temp += pow(X->get(i, k) - X->get(j, k), 2); 878 | } 879 | D->set(i, j, sqrt(temp)); 880 | } 881 | } 882 | 883 | for (i = 1; irows(); i++) 884 | { 885 | for (j = 0; jset(i, j, D->get(j, i)); 888 | } 889 | } 890 | }*/ 891 | 892 | // Copy all elements of X to Y 893 | /*void ElementCopy(Matrix * X, Matrix * Y) 894 | { 895 | if (Y == NULL) 896 | { 897 | printf("Input matrix pointer is NULL.\n"); 898 | exit(1); 899 | } 900 | else if (X->rows() != Y->rows() || X->columns() != Y->columns()) 901 | { 902 | printf("Invalid matrix dimension.\n"); 903 | exit(1); 904 | } 905 | for (int i = 0; irows(); i++) 906 | { 907 | for (int j = 0; jcolumns(); j++) 908 | { 909 | Y->set(i, j, X->get(i, j)); 910 | } 911 | } 912 | }*/ 913 | 914 | // Multidimensional scaling (MDS) 915 | // This function re-implements Laurens van der Maaten's MDS in his Matlab Toolbox for Dimensionality Reduction 916 | // The Matlab MDS can be downloaded at http://crcv.ucf.edu/source/dimension 917 | //Matrix * MDS_UCF(Matrix * D, Matrix * X0, int dim, int iter) 918 | //{ 919 | // if (D->rows() != D->columns()) 920 | // { 921 | // printf("Input distance matrix to MDS is not square.\n"); 922 | // exit(1); 923 | // } 924 | // if (dim<1) 925 | // { 926 | // printf("Invalid dimension for MDS.\n"); 927 | // exit(1); 928 | // } 929 | // if 
(iter<1) 930 | // { 931 | // printf("Invalid number of iterations for MDS.\n"); 932 | // exit(1); 933 | // } 934 | 935 | // Matrix * X = NULL; 936 | 937 | // // with initialization 938 | // if (X0 != NULL) 939 | // { 940 | // if (X0->rows() != D->rows() || X0->columns() != dim) 941 | // { 942 | // printf("Input initialization to MDS has invalid dimension.\n"); 943 | // exit(1); 944 | // } 945 | // X = X0->copy(); 946 | // } 947 | // // without initialization 948 | // else 949 | // { 950 | // X = new Matrix(D->rows(), dim, "rand"); 951 | // double D_mean = D->mean(); // mean value of distance matrix 952 | // X->addNumberSelf(-0.5); // move to the center 953 | // X->multiplyNumberSelf(0.1*D_mean / (1.0 / 3.0*sqrt((double)dim))); // before this step, mean distance is 1/3*sqrt(d) 954 | // } 955 | 956 | // double lr = 0.05; // learning rate 957 | // double r = 2; // metric 958 | // int n = D->rows(); // number of vectors 959 | 960 | 961 | // Matrix * dh = new Matrix(n, n, 0.0); 962 | // Matrix * pmat = new Matrix(n, dim); 963 | // Matrix * dhdum = new Matrix(n, 1); 964 | // Matrix * dhmat = new Matrix(n - 1, dim, 0); 965 | 966 | // Matrix * RP = new Matrix(n, iter, "randperm"); // the matrix for random permutation numbers 967 | // int i, j; 968 | // double temp; 969 | // int m; 970 | 971 | // printf("MDS iteration:"); 972 | // for (int it = 0; itget(rp, it) - 1; 979 | 980 | // for (i = 0; iset(i, j, X->get(m, j) - X->get(i, j)); 985 | // } 986 | // } 987 | 988 | // for (i = 0; iget(i, j)), r); 994 | // } 995 | // dhdum->set(i, 0, pow(temp, 1 / r)); 996 | // } 997 | 998 | // for (i = 0; iset(m, i, dhdum->get(i, 0)); 1003 | // dh->set(i, m, dhdum->get(i, 0)); 1004 | // } 1005 | 1006 | // for (i = 0; i= m) ii = i + 1; 1010 | // temp = lr * (dhdum->get(ii, 0) - D->get(ii, m)) * pow(dhdum->get(ii, 0), 1 - r); 1011 | // for (j = 0; jset(i, j, temp); 1014 | // } 1015 | // } 1016 | 1017 | // for (i = 0; i= m) ii = i + 1; 1021 | // for (j = 0; jget(ii, j); 1024 | // temp += 
dhmat->get(i, j) * pow(fabs(pmat->get(ii, j)), r - 1) * sign(pmat->get(ii, j)); 1025 | 1026 | // X->set(ii, j, temp); 1027 | // } 1028 | // } 1029 | // } 1030 | // } 1031 | 1032 | // printf("\n"); 1033 | 1034 | // delete dh; 1035 | // delete pmat; 1036 | // delete dhdum; 1037 | // delete dhmat; 1038 | // delete RP; 1039 | 1040 | // return X; 1041 | //} 1042 | 1043 | // Multidimensional scaling (MDS) with SMACOF 1044 | // This code re-implements Michael Bronstein's SMACOF in his Matlab Toolbox for Surface Comparison and Analysis 1045 | // The Matlab SMACOF can be downloaded at http://tosca.cs.technion.ac.il/ 1046 | //Matrix * MDS_SMACOF(Matrix * D, Matrix * X0, int dim, int iter) 1047 | //{ 1048 | // if (D->rows() != D->columns()) 1049 | // { 1050 | // printf("Input distance matrix to MDS is not square.\n"); 1051 | // exit(1); 1052 | // } 1053 | // if (dim<1) 1054 | // { 1055 | // printf("Invalid dimension for MDS.\n"); 1056 | // exit(1); 1057 | // } 1058 | // if (iter<1) 1059 | // { 1060 | // printf("Invalid number of iterations for MDS.\n"); 1061 | // exit(1); 1062 | // } 1063 | 1064 | // Matrix * X = NULL; 1065 | 1066 | // // with initialization 1067 | // if (X0 != NULL) 1068 | // { 1069 | // if (X0->rows() != D->rows() || X0->columns() != dim) 1070 | // { 1071 | // printf("Input initialization to MDS has invalid dimension.\n"); 1072 | // exit(1); 1073 | // } 1074 | // X = X0->copy(); 1075 | // } 1076 | // // without initialization 1077 | // else 1078 | // { 1079 | // X = new Matrix(D->rows(), dim, "rand"); 1080 | // double D_mean = D->mean(); // mean value of distance matrix 1081 | // X->addNumberSelf(-0.5); // move to the center 1082 | // X->multiplyNumberSelf(0.1*D_mean / (1.0 / 3.0*sqrt((double)dim))); // before this step, mean distance is 1/3*sqrt(d) 1083 | // } 1084 | 1085 | 1086 | // Matrix * Z = X->copy(); 1087 | // Matrix * D_ = new Matrix(D->rows(), D->columns()); 1088 | // Matrix * B = new Matrix(D->rows(), D->columns()); 1089 | // int i, j, k; 1090 | 
// double temp; 1091 | 1092 | // EuclideanDistanceMatrix(X, D_); 1093 | 1094 | // printf("MDS iteration:"); 1095 | // for (int it = 0; itrows(); i++) 1102 | // { 1103 | // for (j = 0; jcolumns(); j++) 1104 | // { 1105 | // if (i == j || fabs(D_->get(i, j))set(i, j, 0.0); 1108 | // } 1109 | // else 1110 | // { 1111 | // B->set(i, j, -D->get(i, j) / D_->get(i, j)); 1112 | // } 1113 | // } 1114 | // } 1115 | 1116 | // for (j = 0; jcolumns(); j++) 1117 | // { 1118 | // temp = 0; 1119 | // for (i = 0; irows(); i++) 1120 | // { 1121 | // temp += B->get(i, j); 1122 | // } 1123 | // B->set(j, j, -temp); 1124 | // } 1125 | 1126 | // // X = B*Z/size(D,1); 1127 | // for (i = 0; irows(); i++) 1128 | // { 1129 | // for (j = 0; jcolumns(); j++) 1130 | // { 1131 | // temp = 0; 1132 | // for (k = 0; kcolumns(); k++) 1133 | // { 1134 | // temp += (B->get(i, k)*Z->get(k, j)); 1135 | // } 1136 | // X->set(i, j, temp / (double)D->rows()); 1137 | // } 1138 | // } 1139 | 1140 | // // D_ = calc_D (X); 1141 | // EuclideanDistanceMatrix(X, D_); 1142 | 1143 | // // Z = X; 1144 | // ElementCopy(X, Z); 1145 | // } 1146 | 1147 | // printf("\n"); 1148 | 1149 | // delete Z; 1150 | // delete D_; 1151 | // delete B; 1152 | 1153 | // return X; 1154 | //} 1155 | } 1156 | 1157 | #endif 1158 | -------------------------------------------------------------------------------- /code/kmer.cpp: -------------------------------------------------------------------------------- 1 | #include "kmer.h" 2 | #include 3 | 4 | IterFactory* IterFactory::instance = 0; 5 | 6 | IterFactory* IterFactory::getInstance() 7 | { 8 | if (!instance) instance = new IterFactory(); 9 | return instance; 10 | } 11 | 12 | AbsIter* IterFactory::getKmerCntIterator(int i_arg_k, std::unordered_map* arg_kmerCntUnorderMap, std::vector* arg_kmerVec, int i_arg_lowerCnt) 13 | { 14 | if (i_arg_lowerCnt <= 0) return new KmerCntTraverseIter(i_arg_k, arg_kmerCntUnorderMap); 15 | else return new KmerCntHashIter(i_arg_k, arg_kmerCntUnorderMap, 
arg_kmerVec); 16 | } 17 | 18 | AbsIter* IterFactory::getKmerFreqIterator(int i_arg_k, std::unordered_map* arg_kmerCntUnorderMap, std::vector* arg_kmerVec, unsigned long l_arg_totalKmer) 19 | { 20 | return new KmerFreqHashIter(i_arg_k, arg_kmerCntUnorderMap, arg_kmerVec, l_arg_totalKmer); 21 | } 22 | 23 | KmerProbEnsembDelegate* IterFactory::getKmerProbDelegate(int i_arg_k, bool b_arg_singleStrain, MarkovModel* arg_mrkvModel) 24 | { 25 | KmerProbEnsembDelegate* kmerProbDelegate = new KmerProbEnsembDelegate(i_arg_k, arg_mrkvModel, b_arg_singleStrain); 26 | kmerProbDelegate->init(); 27 | return kmerProbDelegate; 28 | } 29 | 30 | double IterFactory::getFreqDist(AbsTupleDistStrategy* distStrategy, 31 | std::unordered_map* src_kmerCntUnorderMap, std::vector* src_kmerVec, unsigned long src_totalKmer, 32 | std::unordered_map* trgt_kmerCntUnorderMap, std::vector* trgt_kmerVec, unsigned long trgt_totalKmer) 33 | { 34 | AbsIter* src_freqIter = getKmerFreqIterator(distStrategy->i_k, src_kmerCntUnorderMap, src_kmerVec, src_totalKmer); 35 | AbsIter* trgt_freqIter = getKmerFreqIterator( distStrategy->i_k, trgt_kmerCntUnorderMap, trgt_kmerVec, trgt_totalKmer); 36 | 37 | std::queue src_kmer_queue, trgt_kmer_queue; 38 | std::queue src_freq_queue, trgt_freq_queue; 39 | 40 | while (src_freqIter->hasNext() || trgt_freqIter->hasNext()) 41 | { 42 | if (src_freqIter->hasNext()) 43 | { 44 | double src_X_w = *(*src_freqIter); double src_kmer = src_freqIter->getCurrKmer(); (*src_freqIter)++; 45 | src_kmer_queue.push(src_kmer); src_freq_queue.push(src_X_w); 46 | } 47 | 48 | if (trgt_freqIter->hasNext()) 49 | { 50 | double trgt_X_w = *(*trgt_freqIter); double trgt_kmer = trgt_freqIter->getCurrKmer(); (*trgt_freqIter)++; 51 | trgt_kmer_queue.push(trgt_kmer); trgt_freq_queue.push(trgt_X_w); 52 | } 53 | 54 | while (!src_kmer_queue.empty() && !trgt_kmer_queue.empty()) 55 | { 56 | if (src_kmer_queue.front() < trgt_kmer_queue.front()) 57 | { 58 | distStrategy->dealWithTuple(src_freq_queue.front(), 
0); 59 | src_kmer_queue.pop(); src_freq_queue.pop(); 60 | } 61 | else if (src_kmer_queue.front() > trgt_kmer_queue.front()) 62 | { 63 | distStrategy->dealWithTuple(0, trgt_freq_queue.front()); 64 | trgt_kmer_queue.pop(); trgt_freq_queue.pop(); 65 | } 66 | else 67 | { 68 | distStrategy->dealWithTuple(src_freq_queue.front(), trgt_freq_queue.front()); 69 | src_kmer_queue.pop(); src_freq_queue.pop(); trgt_kmer_queue.pop(); trgt_freq_queue.pop(); 70 | } 71 | } 72 | } 73 | while (!src_kmer_queue.empty()) 74 | { 75 | distStrategy->dealWithTuple(src_freq_queue.front(), 0); 76 | src_kmer_queue.pop(); src_freq_queue.pop(); 77 | } 78 | while (!trgt_kmer_queue.empty()) 79 | { 80 | distStrategy->dealWithTuple(0, trgt_freq_queue.front()); 81 | trgt_kmer_queue.pop(); trgt_freq_queue.pop(); 82 | } 83 | 84 | delete src_freqIter; delete trgt_freqIter; 85 | return distStrategy->getDist(); 86 | } 87 | 88 | double IterFactory::getCoPhylogDist(int i_arg_k, 89 | std::unordered_map* src_kmerCntUnorderMap, std::vector* src_kmerVec, 90 | std::unordered_map* trgt_kmerCntUnorderMap, std::vector* trgt_kmerVec) 91 | { 92 | int maskLen = i_arg_k; maskLen = (maskLen >> 1); maskLen = (maskLen << 1); 93 | unsigned long long mask_low = ((unsigned long long)1 << maskLen) - 1; 94 | 95 | std::unordered_map* src_context_dup = new std::unordered_map(); 96 | std::unordered_map* src_context_obj = new std::unordered_map(); 97 | //std::unordered_map* src_context_cnt = new std::unordered_map(); 98 | std::unordered_map* trgt_context_dup = new std::unordered_map(); 99 | std::unordered_map* trgt_context_obj = new std::unordered_map(); 100 | //std::unordered_map* trgt_context_cnt = new std::unordered_map(); 101 | 102 | AbsIter* src_CntIter = getKmerCntIterator(i_arg_k, src_kmerCntUnorderMap, src_kmerVec, 0); 103 | AbsIter* trgt_CntIter = getKmerCntIterator(i_arg_k, trgt_kmerCntUnorderMap, trgt_kmerVec, 0); 104 | 105 | while (src_CntIter->hasNext()) 106 | { 107 | double src_X_w = *(*src_CntIter); unsigned long 
long src_kmer = src_CntIter->getCurrKmer(); (*src_CntIter)++; if (src_X_w <= 0) continue; 108 | 109 | unsigned long long kmer_low = (src_kmer & mask_low); 110 | unsigned long long kmer_high = (src_kmer >> maskLen); 111 | int objVal = (kmer_high % BASE); kmer_high = (kmer_high >> 2); 112 | unsigned long long kmer_new = ((kmer_high << maskLen) + kmer_low); 113 | 114 | (*src_context_dup)[kmer_new] ++; 115 | (*src_context_obj)[kmer_new] = objVal; 116 | //(*src_context_cnt)[kmer_new] = src_X_w; 117 | } 118 | 119 | while (trgt_CntIter->hasNext()) 120 | { 121 | double trgt_X_w = *(*trgt_CntIter); unsigned long long trgt_kmer = trgt_CntIter->getCurrKmer(); (*trgt_CntIter)++; if (trgt_X_w <= 0) continue; 122 | 123 | unsigned long long kmer_low = (trgt_kmer & mask_low); 124 | unsigned long long kmer_high = (trgt_kmer >> maskLen); 125 | int objVal = (kmer_high % BASE); kmer_high = (kmer_high >> 2); 126 | unsigned long long kmer_new = ((kmer_high << maskLen) + kmer_low); 127 | 128 | (*trgt_context_dup)[kmer_new] ++; 129 | (*trgt_context_obj)[kmer_new] = objVal; 130 | //(*trgt_context_cnt)[kmer_new] = trgt_X_w; 131 | } 132 | 133 | double sum=0, hit=0; 134 | for (std::unordered_map::iterator iter = src_context_dup->begin(); iter != src_context_dup->end(); iter++) 135 | { 136 | unsigned long long currKmerIdx = iter->first; 137 | int objDup_src = (*src_context_dup)[currKmerIdx]; int objDup_trgt = (*trgt_context_dup)[currKmerIdx]; 138 | 139 | if(1 == objDup_src && 1 == objDup_trgt) 140 | { 141 | sum ++; 142 | int obj_src = (*src_context_obj)[currKmerIdx]; int obj_trgt = (*trgt_context_obj)[currKmerIdx]; 143 | 144 | if(obj_src != obj_trgt) hit ++; 145 | } 146 | } 147 | 148 | delete src_CntIter; delete trgt_CntIter; 149 | delete src_context_dup; delete src_context_obj; //delete src_context_cnt; 150 | delete trgt_context_dup; delete trgt_context_obj; //delete trgt_context_cnt; 151 | 152 | if(0 == sum) return 0; 153 | else return hit/sum; 154 | } 155 | 156 | double 
IterFactory::getCntDist(AbsTupleDistStrategy* distStrategy, int i_arg_lowerCnt, 157 | std::unordered_map* src_kmerCntUnorderMap, std::vector* src_kmerVec, 158 | std::unordered_map* trgt_kmerCntUnorderMap, std::vector* trgt_kmerVec) 159 | { 160 | AbsIter* src_CntIter = getKmerCntIterator(distStrategy->i_k, src_kmerCntUnorderMap, src_kmerVec, i_arg_lowerCnt); 161 | AbsIter* trgt_CntIter = getKmerCntIterator(distStrategy->i_k, trgt_kmerCntUnorderMap, trgt_kmerVec, i_arg_lowerCnt); 162 | 163 | std::queue src_kmer_queue, trgt_kmer_queue; 164 | std::queue src_cnt_queue, trgt_cnt_queue; 165 | 166 | while (src_CntIter->hasNext() || trgt_CntIter->hasNext()) 167 | { 168 | if (src_CntIter->hasNext()) 169 | { 170 | double src_X_w = *(*src_CntIter); double src_kmer = src_CntIter->getCurrKmer(); (*src_CntIter)++; 171 | if (src_X_w >= i_arg_lowerCnt) { src_kmer_queue.push(src_kmer); src_cnt_queue.push(src_X_w); } 172 | } 173 | 174 | if (trgt_CntIter->hasNext()) 175 | { 176 | double trgt_X_w = *(*trgt_CntIter); double trgt_kmer = trgt_CntIter->getCurrKmer(); (*trgt_CntIter)++; 177 | if (trgt_X_w >= i_arg_lowerCnt) { trgt_kmer_queue.push(trgt_kmer); trgt_cnt_queue.push(trgt_X_w); } 178 | } 179 | 180 | while (!src_kmer_queue.empty() && !trgt_kmer_queue.empty()) 181 | { 182 | if (src_kmer_queue.front() < trgt_kmer_queue.front()) 183 | { 184 | src_kmer_queue.pop(); src_cnt_queue.pop(); 185 | } 186 | else if (src_kmer_queue.front() > trgt_kmer_queue.front()) 187 | { 188 | trgt_kmer_queue.pop(); trgt_cnt_queue.pop(); 189 | } 190 | else 191 | { 192 | distStrategy->dealWithTuple(src_cnt_queue.front(), trgt_cnt_queue.front()); 193 | src_kmer_queue.pop(); src_cnt_queue.pop(); trgt_kmer_queue.pop(); trgt_cnt_queue.pop(); 194 | } 195 | } 196 | } 197 | delete src_CntIter; delete trgt_CntIter; 198 | return distStrategy->getDist(); 199 | } 200 | 201 | double IterFactory::getCntDist(AbsQuadStrategy* distStrategy, 202 | std::unordered_map* src_kmerCntUnorderMap, std::vector* src_kmerVec, 203 | 
std::unordered_map* trgt_kmerCntUnorderMap, std::vector* trgt_kmerVec) 204 | { 205 | AbsIter* src_kmerCntIter = getKmerCntIterator(distStrategy->i_k, src_kmerCntUnorderMap, src_kmerVec, 0); 206 | AbsIter* trgt_kmerCntIter = getKmerCntIterator(distStrategy->i_k, trgt_kmerCntUnorderMap, trgt_kmerVec, 0); 207 | 208 | double src_X_w_1, src_X_w_2, src_X_w_3, src_X_w_4, src_X_w; 209 | double trgt_X_w_1, trgt_X_w_2, trgt_X_w_3, trgt_X_w_4, trgt_X_w; 210 | double all_X_w_1, all_X_w_2, all_X_w_3, all_X_w_4, all_X_w; 211 | 212 | while (true) 213 | { 214 | if (src_kmerCntIter->hasNext() && trgt_kmerCntIter->hasNext()) { src_X_w_1 = *(*src_kmerCntIter); (*src_kmerCntIter)++; trgt_X_w_1 = *(*trgt_kmerCntIter); (*trgt_kmerCntIter)++; } else break; 215 | if (src_kmerCntIter->hasNext() && trgt_kmerCntIter->hasNext()) { src_X_w_2 = *(*src_kmerCntIter); (*src_kmerCntIter)++; trgt_X_w_2 = *(*trgt_kmerCntIter); (*trgt_kmerCntIter)++; } else break; 216 | if (src_kmerCntIter->hasNext() && trgt_kmerCntIter->hasNext()) { src_X_w_3 = *(*src_kmerCntIter); (*src_kmerCntIter)++; trgt_X_w_3 = *(*trgt_kmerCntIter); (*trgt_kmerCntIter)++; } else break; 217 | if (src_kmerCntIter->hasNext() && trgt_kmerCntIter->hasNext()) { src_X_w_4 = *(*src_kmerCntIter); (*src_kmerCntIter)++; trgt_X_w_4 = *(*trgt_kmerCntIter); (*trgt_kmerCntIter)++; } else break; 218 | 219 | src_X_w = src_X_w_1 + src_X_w_2 + src_X_w_3 + src_X_w_4; 220 | trgt_X_w = trgt_X_w_1 + trgt_X_w_2 + trgt_X_w_3 + trgt_X_w_4; 221 | all_X_w_1 = src_X_w_1 + trgt_X_w_1; 222 | all_X_w_2 = src_X_w_2 + trgt_X_w_2; 223 | all_X_w_3 = src_X_w_3 + trgt_X_w_3; 224 | all_X_w_4 = src_X_w_4 + trgt_X_w_4; 225 | all_X_w = src_X_w + trgt_X_w; 226 | 227 | if (0 == all_X_w) continue; 228 | 229 | if (all_X_w_1 > 0) 230 | { 231 | if (src_X_w > 0){ distStrategy->dealWithQuad(src_X_w, src_X_w_1, all_X_w, all_X_w_1); } 232 | if (trgt_X_w > 0){ distStrategy->dealWithQuad(trgt_X_w, trgt_X_w_1, all_X_w, all_X_w_1); } 233 | } 234 | if (all_X_w_2 > 0) 235 | { 236 | if 
(src_X_w > 0){ distStrategy->dealWithQuad(src_X_w, src_X_w_2, all_X_w, all_X_w_2); } 237 | if (trgt_X_w > 0){ distStrategy->dealWithQuad(trgt_X_w, trgt_X_w_2, all_X_w, all_X_w_2); } 238 | } 239 | if (all_X_w_3 > 0) 240 | { 241 | if (src_X_w > 0){ distStrategy->dealWithQuad(src_X_w, src_X_w_3, all_X_w, all_X_w_3); } 242 | if (trgt_X_w > 0){ distStrategy->dealWithQuad(trgt_X_w, trgt_X_w_3, all_X_w, all_X_w_3); } 243 | } 244 | if (all_X_w_4 > 0) 245 | { 246 | if (src_X_w > 0){ distStrategy->dealWithQuad(src_X_w, src_X_w_4, all_X_w, all_X_w_4); } 247 | if (trgt_X_w > 0){ distStrategy->dealWithQuad(trgt_X_w, trgt_X_w_4, all_X_w, all_X_w_4); } 248 | } 249 | } 250 | 251 | delete src_kmerCntIter; delete trgt_kmerCntIter; 252 | return distStrategy->getDist(); 253 | } 254 | 255 | double IterFactory::getCntExpDist(AbsQuadStrategy* distStrategy, int i_arg_lowerCnt, 256 | std::unordered_map* src_kmerCntUnorderMap, std::vector* src_kmerVec, MarkovModel* src_mrkvModel, unsigned long src_totalKmer, 257 | std::unordered_map* trgt_kmerCntUnorderMap, std::vector* trgt_kmerVec, MarkovModel* trgt_mrkvModel, unsigned long trgt_totalKmer) 258 | { 259 | AbsIter* src_CntIter = getKmerCntIterator(distStrategy->i_k, src_kmerCntUnorderMap, src_kmerVec, i_arg_lowerCnt); 260 | AbsIter* trgt_CntIter = getKmerCntIterator(distStrategy->i_k, trgt_kmerCntUnorderMap, trgt_kmerVec, i_arg_lowerCnt); 261 | 262 | KmerProbEnsembDelegate* src_kmerProbDelegate = getKmerProbDelegate(distStrategy->i_k, distStrategy->b_singleStrain, src_mrkvModel); 263 | KmerProbEnsembDelegate* trgt_kmerProbDelegate = getKmerProbDelegate(distStrategy->i_k, distStrategy->b_singleStrain, trgt_mrkvModel); 264 | 265 | double log_src_totalKmerLen = log(src_totalKmer), log_trgt_totalKmerLen = log(trgt_totalKmer); 266 | 267 | std::queue src_kmer_queue, trgt_kmer_queue; 268 | std::queue src_cnt_queue, trgt_cnt_queue; 269 | 270 | while (src_CntIter->hasNext() || trgt_CntIter->hasNext()) 271 | { 272 | if (src_CntIter->hasNext()) 273 | { 
274 | double src_X_w = *(*src_CntIter); double src_kmer = src_CntIter->getCurrKmer(); (*src_CntIter)++; 275 | if (src_X_w >= i_arg_lowerCnt) { src_kmer_queue.push(src_kmer); src_cnt_queue.push(src_X_w); } 276 | } 277 | 278 | if (trgt_CntIter->hasNext()) 279 | { 280 | double trgt_X_w = *(*trgt_CntIter); double trgt_kmer = trgt_CntIter->getCurrKmer(); (*trgt_CntIter)++; 281 | if (trgt_X_w >= i_arg_lowerCnt) { trgt_kmer_queue.push(trgt_kmer); trgt_cnt_queue.push(trgt_X_w); } 282 | } 283 | 284 | while (!src_kmer_queue.empty() && !trgt_kmer_queue.empty()) 285 | { 286 | if (src_kmer_queue.front() < trgt_kmer_queue.front()) 287 | { 288 | src_kmer_queue.pop(); src_cnt_queue.pop(); 289 | } 290 | else if (src_kmer_queue.front() > trgt_kmer_queue.front()) 291 | { 292 | trgt_kmer_queue.pop(); trgt_cnt_queue.pop(); 293 | } 294 | else 295 | { 296 | double src_X_w = src_cnt_queue.front(); double src_prob_X_w = src_kmerProbDelegate->getKmerlogProb(src_kmer_queue.front()); 297 | double trgt_X_w = trgt_cnt_queue.front(); double trgt_prob_X_w = trgt_kmerProbDelegate->getKmerlogProb(trgt_kmer_queue.front()); 298 | 299 | src_kmer_queue.pop(); src_cnt_queue.pop(); trgt_kmer_queue.pop(); trgt_cnt_queue.pop(); 300 | 301 | double src_EX_w = ((0 == src_prob_X_w) ? 0 : exp(log_src_totalKmerLen + src_prob_X_w)); 302 | double trgt_EX_w = ((0 == trgt_prob_X_w) ? 
0 : exp(log_trgt_totalKmerLen + trgt_prob_X_w)); 303 | 304 | distStrategy->dealWithQuad(src_X_w, src_EX_w, trgt_X_w, trgt_EX_w); 305 | } 306 | } 307 | } 308 | 309 | delete src_CntIter; delete trgt_CntIter; delete src_kmerProbDelegate; delete trgt_kmerProbDelegate; 310 | return distStrategy->getDist(); 311 | } 312 | 313 | double IterFactory::getMrkvDist(AbsMrkvStrategy* distStrategy, MarkovModel* src_mrkvModel, MarkovModel* trgt_mrkvModel) 314 | { 315 | distStrategy->dealWithMrkv(src_mrkvModel, trgt_mrkvModel); 316 | return distStrategy->getDist(); 317 | } 318 | 319 | KmerProbDelegate::KmerProbDelegate(int i_arg_k, MarkovModel* arg_mrkvModel, bool b_arg_isRevCompl) 320 | { 321 | i_k = i_arg_k; 322 | maxAllowedIdx = (unsigned long long)pow(BASE, i_arg_k) - 1; 323 | markovModel = arg_mrkvModel; 324 | isRevCompl = b_arg_isRevCompl; 325 | 326 | //markovModel->print(); 327 | } 328 | 329 | void KmerProbDelegate::init() 330 | { 331 | nextPosKmerIdx = 0; 332 | isEnd = false; 333 | 334 | while (!kmer_traceStack.empty()) kmer_traceStack.pop(); 335 | while (!orderIdx_traceStack.empty()) orderIdx_traceStack.pop(); 336 | while (!logProbProd_traceStack.empty()) logProbProd_traceStack.pop(); 337 | 338 | while (!lowerIdx_traceStack.empty()) lowerIdx_traceStack.pop(); 339 | while (!upperIdx_traceStack.empty()) upperIdx_traceStack.pop(); 340 | 341 | //while (kmer_traceStack.size() < i_k) push(0); 342 | std::stack route_traceStack; 343 | for (int i = 0; i < i_k; ++i) route_traceStack.push(0); 344 | push(route_traceStack); 345 | } 346 | 347 | bool KmerProbDelegate::push(unsigned long long idx) 348 | { 349 | int i_order = markovModel->getOrder(); 350 | 351 | unsigned long long l_tmp_currOrderIdx = 0; 352 | if (!orderIdx_traceStack.empty()) l_tmp_currOrderIdx = orderIdx_traceStack.top(); 353 | 354 | if (!isRevCompl) //regular case 355 | { 356 | if (kmer_traceStack.size() == i_order) 357 | { 358 | if (0 == markovModel->getMargProb(l_tmp_currOrderIdx) && 0 != i_order) return false; 359 | } 
360 | 361 | if (kmer_traceStack.size() >= i_order) 362 | { 363 | if (0 == markovModel->getTransProb(l_tmp_currOrderIdx, idx)) return false; 364 | } 365 | 366 | unsigned long long l_tmp_newOrderIdx = 0; // when i_order = 0 367 | if (1 == i_order) 368 | l_tmp_newOrderIdx = idx; 369 | else if (i_order > 1) l_tmp_newOrderIdx = ((~((unsigned long long)3 << ((i_order - 1) * 2)) & l_tmp_currOrderIdx) << 2) + idx; 370 | orderIdx_traceStack.push(l_tmp_newOrderIdx); 371 | 372 | double d_tmp_newLogProbProd = 0; 373 | if (!logProbProd_traceStack.empty()) d_tmp_newLogProbProd = logProbProd_traceStack.top(); 374 | 375 | //if (kmer_traceStack.size() == i_order) d_tmp_newLogProbProd += log(markovModel->getMargProb(l_tmp_currOrderIdx)); 376 | //if (kmer_traceStack.size() >= i_order) d_tmp_newLogProbProd += log(markovModel->getTransProb(l_tmp_currOrderIdx, idx)); 377 | if (kmer_traceStack.size() == i_order) d_tmp_newLogProbProd += markovModel->getMargProb(l_tmp_currOrderIdx); 378 | if (kmer_traceStack.size() >= i_order) d_tmp_newLogProbProd += markovModel->getTransProb(l_tmp_currOrderIdx, idx); 379 | 380 | logProbProd_traceStack.push(d_tmp_newLogProbProd); 381 | } 382 | else //reverse complement case 383 | { 384 | unsigned long long compl_idx = BASE - 1 - idx; 385 | unsigned long long compl_idx_toRemove = l_tmp_currOrderIdx % BASE; 386 | 387 | unsigned long long l_tmp_newOrderIdx = 0; // when i_order = 0 388 | if (0 == i_order) 389 | compl_idx_toRemove = compl_idx; 390 | else if (1 == i_order) 391 | l_tmp_newOrderIdx = compl_idx; 392 | else if (i_order > 1) 393 | { 394 | if (kmer_traceStack.size() < i_order) 395 | l_tmp_newOrderIdx = (compl_idx << (kmer_traceStack.size() * 2)) + l_tmp_currOrderIdx; 396 | else 397 | l_tmp_newOrderIdx = (compl_idx << ((i_order - 1) * 2)) + (l_tmp_currOrderIdx >> 2); 398 | } 399 | 400 | if (kmer_traceStack.size() >= i_order) 401 | { 402 | if (0 == markovModel->getTransProb(l_tmp_newOrderIdx, compl_idx_toRemove)) return false; 403 | } 404 | 405 | if 
(kmer_traceStack.size() == i_k - 1) 406 | { 407 | if (0 == markovModel->getMargProb(l_tmp_newOrderIdx) && 0 != i_order) return false; 408 | } 409 | 410 | orderIdx_traceStack.push(l_tmp_newOrderIdx); 411 | double d_tmp_newLogProbProd = 0; 412 | if (!logProbProd_traceStack.empty()) d_tmp_newLogProbProd = logProbProd_traceStack.top(); 413 | 414 | //if (kmer_traceStack.size() >= i_order) d_tmp_newLogProbProd += log(markovModel->getTransProb(l_tmp_newOrderIdx, compl_idx_toRemove)); 415 | //if (kmer_traceStack.size() == i_k - 1) d_tmp_newLogProbProd += log(markovModel->getMargProb(l_tmp_newOrderIdx)); 416 | if (kmer_traceStack.size() >= i_order) d_tmp_newLogProbProd += markovModel->getTransProb(l_tmp_newOrderIdx, compl_idx_toRemove); 417 | if (kmer_traceStack.size() == i_k - 1) d_tmp_newLogProbProd += markovModel->getMargProb(l_tmp_newOrderIdx); 418 | logProbProd_traceStack.push(d_tmp_newLogProbProd); 419 | } 420 | 421 | 422 | kmer_traceStack.push(idx); 423 | nextPosKmerIdx = (nextPosKmerIdx << 2) + idx; 424 | 425 | unsigned long long idxLowBound = 0; 426 | unsigned long long gap = (unsigned long long)pow(BASE, i_k - lowerIdx_traceStack.size() - 1); 427 | if (!lowerIdx_traceStack.empty()) idxLowBound = lowerIdx_traceStack.top(); 428 | lowerIdx_traceStack.push(idxLowBound + idx*gap); 429 | upperIdx_traceStack.push(idxLowBound + (idx + 1)*gap - 1); 430 | 431 | return true; 432 | } 433 | 434 | unsigned long long KmerProbDelegate::pop() 435 | { 436 | if (kmer_traceStack.empty()) throw std::runtime_error(" cannot pop from empty stack! 
"); 437 | 438 | unsigned long long i_tmp_curr = kmer_traceStack.top(); 439 | nextPosKmerIdx = (nextPosKmerIdx >> 2); 440 | 441 | kmer_traceStack.pop(); 442 | orderIdx_traceStack.pop(); 443 | logProbProd_traceStack.pop(); 444 | lowerIdx_traceStack.pop(); 445 | upperIdx_traceStack.pop(); 446 | 447 | return i_tmp_curr; 448 | } 449 | 450 | bool KmerProbDelegate::push(std::stack & indices) 451 | { 452 | while (kmer_traceStack.size() < i_k) 453 | { 454 | unsigned long long i_tmp_nextIdx = 0; 455 | if (!indices.empty()) { i_tmp_nextIdx = indices.top(); indices.pop(); } 456 | 457 | bool b_tmp_succeed = push(i_tmp_nextIdx); 458 | while (!b_tmp_succeed) 459 | { 460 | while (!indices.empty()) indices.pop(); 461 | if (i_tmp_nextIdx >= BASE - 1) break; 462 | i_tmp_nextIdx++; 463 | b_tmp_succeed = push(i_tmp_nextIdx); 464 | } 465 | if (!b_tmp_succeed) 466 | { 467 | bool b_tmp_succeed1 = increment(); 468 | if (!b_tmp_succeed1) return false; 469 | } 470 | } 471 | return true; 472 | } 473 | 474 | bool KmerProbDelegate::increment() 475 | { 476 | while (!kmer_traceStack.empty() && kmer_traceStack.top() >= BASE - 1) pop(); 477 | if (kmer_traceStack.empty()) return false; 478 | 479 | unsigned long long i_tmp_currIdx = pop(); 480 | unsigned long long i_tmp_nextIdx = i_tmp_currIdx + 1; 481 | 482 | bool b_tmp_succeed = push(i_tmp_nextIdx); 483 | while (!b_tmp_succeed) 484 | { 485 | if (i_tmp_nextIdx >= BASE - 1) break; 486 | 487 | i_tmp_nextIdx++; 488 | b_tmp_succeed = push(i_tmp_nextIdx); 489 | } 490 | 491 | if (!b_tmp_succeed) return increment(); 492 | return true; 493 | } 494 | 495 | double KmerProbDelegate::getKmerlogProb(unsigned long long queryNextKmerIdx) 496 | { 497 | if (isEnd) return 0; 498 | 499 | if (nextPosKmerIdx < queryNextKmerIdx) 500 | { 501 | unsigned long long currUpperIdx = upperIdx_traceStack.top(), currLowerIdx = lowerIdx_traceStack.top(); 502 | while (currUpperIdx < queryNextKmerIdx) 503 | { 504 | pop(); 505 | if (!upperIdx_traceStack.empty()) { currUpperIdx = 
upperIdx_traceStack.top(); currLowerIdx = lowerIdx_traceStack.top(); } 506 | else { currUpperIdx = maxAllowedIdx; currLowerIdx = 0; } 507 | } 508 | 509 | unsigned long long offset = queryNextKmerIdx - currLowerIdx; 510 | std::stack route_traceStack; 511 | for (int i = 0; i < i_k - lowerIdx_traceStack.size(); ++i) 512 | { 513 | unsigned long long i_tmp_route = offset % BASE; 514 | route_traceStack.push(i_tmp_route); 515 | offset = (offset >> 2); 516 | } 517 | 518 | bool b_tmp_succeed = push(route_traceStack); 519 | if (!b_tmp_succeed) { isEnd = true; return 0; } 520 | } 521 | //std::cout << "nextPosKmerIdx\t" << nextPosKmerIdx << std::endl; 522 | 523 | if (nextPosKmerIdx > queryNextKmerIdx) return 0; 524 | else if (queryNextKmerIdx == nextPosKmerIdx) { return logProbProd_traceStack.empty()?0:logProbProd_traceStack.top(); } 525 | else throw std::runtime_error("nextPosKmerIdx < queryNextKmerIdx cannot be true! "); 526 | } 527 | 528 | KmerProbEnsembDelegate::KmerProbEnsembDelegate(int i_arg_k, MarkovModel* arg_mrkvModel, bool b_arg_singleStrain) 529 | { 530 | i_k = i_arg_k; 531 | markovModel = arg_mrkvModel; 532 | b_singleStrain = b_arg_singleStrain; 533 | 534 | kmerProbDelegate = new KmerProbDelegate(i_k, markovModel, false); 535 | if (!b_singleStrain) revComplKmerProbDelegate = new KmerProbDelegate(i_k, markovModel, true); 536 | } 537 | 538 | KmerProbEnsembDelegate::~KmerProbEnsembDelegate() 539 | { 540 | delete kmerProbDelegate; 541 | if (!b_singleStrain) delete revComplKmerProbDelegate; 542 | //delete markovModel; 543 | } 544 | 545 | void KmerProbEnsembDelegate::init() 546 | { 547 | kmerProbDelegate->init(); 548 | if (!b_singleStrain) revComplKmerProbDelegate->init(); 549 | } 550 | 551 | double KmerProbEnsembDelegate::getKmerlogProb(unsigned long long queryNextKmerIdx) 552 | { 553 | double d_log_kmerProb = kmerProbDelegate->getKmerlogProb(queryNextKmerIdx); 554 | if (!b_singleStrain) 555 | { 556 | double d_log_revComplKmerProb = 
revComplKmerProbDelegate->getKmerlogProb(queryNextKmerIdx); 557 | 558 | if (0 == d_log_kmerProb && 0 == d_log_revComplKmerProb) return 0; 559 | else if (0 == d_log_kmerProb || 0 == d_log_revComplKmerProb) return d_log_revComplKmerProb + d_log_kmerProb - LOG2; 560 | else return log_sum(d_log_kmerProb, d_log_revComplKmerProb) - LOG2; 561 | } 562 | return d_log_kmerProb; 563 | } 564 | 565 | bool KmerModel::load(int i_arg_k, std::string str_arg_inputURL) 566 | { 567 | kmerCntUnorderMap->clear(); kmerVec->clear(); 568 | 569 | unsigned long long i_tmp_rowDim = 0; 570 | std::ifstream tmp_finPipe(str_arg_inputURL.c_str(), std::ios::in | std::ios::binary); 571 | tmp_finPipe.read((char*)&i_tmp_rowDim, sizeof(unsigned long long)); 572 | 573 | unsigned long long* vec_kmerIdx = new unsigned long long[i_tmp_rowDim]; memset(vec_kmerIdx, 0, sizeof(unsigned long long) * i_tmp_rowDim); 574 | unsigned long* vec_kmerCnt = new unsigned long[i_tmp_rowDim]; memset(vec_kmerCnt, 0, sizeof(unsigned long) * i_tmp_rowDim); 575 | 576 | tmp_finPipe.read((char*)vec_kmerIdx, sizeof(unsigned long long) * i_tmp_rowDim); 577 | tmp_finPipe.read((char*)vec_kmerCnt, sizeof(unsigned long) * i_tmp_rowDim); 578 | tmp_finPipe.close(); 579 | 580 | for (unsigned long long rowIdx = 0; rowIdx < i_tmp_rowDim; ++rowIdx) 581 | { 582 | unsigned long long currKmerIdx = vec_kmerIdx[rowIdx]; 583 | unsigned long currKmerCnt = vec_kmerCnt[rowIdx]; 584 | 585 | (*kmerCntUnorderMap)[currKmerIdx] += currKmerCnt; 586 | if (!b_singleStrain) (*kmerCntUnorderMap)[index2revCompleIdx(currKmerIdx, i_arg_k)] += currKmerCnt; 587 | 588 | kmerVec->push_back(currKmerIdx); 589 | 590 | //std::cout << rowIdx << " = " << currKmerIdx << " = " << currKmerCnt << std::endl; 591 | } 592 | 593 | //std::cout << i_tmp_rowDim << " ; "<size() << " ; " << kmerVec->size() << std::endl; 594 | 595 | delete[] vec_kmerIdx; delete[] vec_kmerCnt; 596 | return true; 597 | } 598 | 599 | bool KmerModel::save(std::map *kmerCntMap, std::string 
str_arg_outputURL) 600 | { 601 | unsigned long long i_tmp_rowDim = kmerCntMap->size(); 602 | unsigned long long* vec_kmerIdx = new unsigned long long[i_tmp_rowDim]; memset(vec_kmerIdx, 0, sizeof(unsigned long long) * i_tmp_rowDim); 603 | unsigned long* vec_kmerCnt = new unsigned long[i_tmp_rowDim]; memset(vec_kmerCnt, 0, sizeof(unsigned long) * i_tmp_rowDim); 604 | 605 | unsigned long long currIdx = 0; 606 | for (std::map::iterator iter = kmerCntMap->begin(); iter != kmerCntMap->end(); iter++) 607 | { 608 | vec_kmerIdx[currIdx] = iter->first; 609 | vec_kmerCnt[currIdx] = iter->second; 610 | currIdx++; 611 | } 612 | 613 | std::ofstream tmp_ofsPipe(str_arg_outputURL.c_str(), std::ios::out | std::ios::binary); 614 | tmp_ofsPipe.write((char*)&i_tmp_rowDim, sizeof(unsigned long long)); 615 | tmp_ofsPipe.write((char*)vec_kmerIdx, sizeof(unsigned long long)*i_tmp_rowDim); 616 | tmp_ofsPipe.write((char*)vec_kmerCnt, sizeof(unsigned long)*i_tmp_rowDim); 617 | tmp_ofsPipe.flush(); 618 | tmp_ofsPipe.close(); 619 | 620 | delete[] vec_kmerIdx; delete[] vec_kmerCnt; 621 | return true; 622 | } 623 | 624 | bool KmerModel::saveFromLargerK(int i_arg_k, int i_arg_larger_k, std::string str_arg_inputURL, std::string str_arg_outputURL) 625 | { 626 | if (i_arg_larger_k < i_arg_k) 627 | { 628 | std::cout << "[error]: i_arg_larger_k < i_arg_k in KmerModel::loadFromLargerK. 
" << std::endl; 629 | return false; 630 | } 631 | if (i_arg_larger_k == i_arg_k) return load(i_arg_k, str_arg_inputURL); 632 | 633 | std::map *kmerCntMap = new std::map(); 634 | 635 | unsigned long long i_tmp_rowDim = 0; 636 | std::ifstream tmp_finPipe(str_arg_inputURL.c_str(), std::ios::in | std::ios::binary); 637 | tmp_finPipe.read((char*)&i_tmp_rowDim, sizeof(unsigned long long)); 638 | 639 | unsigned long long* vec_kmerIdx = new unsigned long long[i_tmp_rowDim]; memset(vec_kmerIdx, 0, sizeof(unsigned long long) * i_tmp_rowDim); 640 | unsigned long* vec_kmerCnt = new unsigned long[i_tmp_rowDim]; memset(vec_kmerCnt, 0, sizeof(unsigned long) * i_tmp_rowDim); 641 | 642 | tmp_finPipe.read((char*)vec_kmerIdx, sizeof(unsigned long long) * i_tmp_rowDim); 643 | tmp_finPipe.read((char*)vec_kmerCnt, sizeof(unsigned long) * i_tmp_rowDim); 644 | tmp_finPipe.close(); 645 | 646 | for (unsigned long long rowIdx = 0; rowIdx < i_tmp_rowDim; ++rowIdx) 647 | { 648 | unsigned long long currKmerIdx = vec_kmerIdx[rowIdx]; 649 | unsigned long currKmerCnt = vec_kmerCnt[rowIdx]; 650 | 651 | unsigned long long newKmerIdx = (currKmerIdx >> (2 * (i_arg_larger_k - i_arg_k))); 652 | (*kmerCntMap)[newKmerIdx] += currKmerCnt; 653 | //if (!b_singleStrain) (*kmerCntMap)[index2revCompleIdx(newKmerIdx, i_arg_k)] += currKmerCnt; 654 | } 655 | 656 | delete[] vec_kmerIdx; delete[] vec_kmerCnt; 657 | save(kmerCntMap, str_arg_outputURL); 658 | delete kmerCntMap; 659 | 660 | return true; 661 | } 662 | 663 | bool KmerModel::saveFromJellyFish(std::string str_arg_jfTxtURL, std::string str_arg_outputURL) 664 | { 665 | std::map *kmerCntMap = new std::map(); 666 | 667 | std::ifstream fastaStream(str_arg_jfTxtURL.c_str()); 668 | if (!fastaStream.is_open()) return false; 669 | 670 | std::string str_line1 = "", currKmer = ""; 671 | while (getline(fastaStream, str_line1)) 672 | { 673 | if (str_line1.empty() || "" == str_line1) continue; 674 | getline(fastaStream, currKmer); 675 | 676 | 
str_line1.erase(str_line1.begin()); 677 | long currKmerCnt = atol(str_line1.c_str()); 678 | 679 | unsigned long long currKmerIdx = nt2index(currKmer); 680 | (*kmerCntMap)[currKmerIdx] += currKmerCnt; 681 | //if (!b_singleStrain) (*kmerCntTable)[index2revCompleIdx(currKmerIdx, currKmer.length())] += currKmerCnt; 682 | } 683 | this->i_k = currKmer.length(); 684 | fastaStream.close(); 685 | 686 | save(kmerCntMap, str_arg_outputURL); 687 | delete kmerCntMap; 688 | return true; 689 | } 690 | 691 | bool KmerModel::saveFromFasta(int i_arg_k, std::string str_arg_fastaFileURL, std::string str_arg_outputURL) 692 | { 693 | std::map *kmerCntMap = new std::map(); 694 | 695 | std::ifstream fastaStream(str_arg_fastaFileURL.c_str()); 696 | if (fastaStream.is_open()) 697 | { 698 | std::string str_tmp_line = ""; 699 | int totalCharCnt = 0; 700 | unsigned long long currKmerIdx = 0; 701 | 702 | while (getline(fastaStream, str_tmp_line)) 703 | { 704 | if (str_tmp_line.empty() || "" == str_tmp_line) continue; 705 | if (str_tmp_line.substr(0, 1) == ">") 706 | { 707 | totalCharCnt = 0; currKmerIdx = 0; 708 | continue; 709 | } 710 | 711 | for (int i = 0; i < str_tmp_line.length(); ++i) 712 | { 713 | int idx = nt2int(str_tmp_line.at(i)); if (idx < 0) continue; 714 | if (totalCharCnt < i_arg_k) totalCharCnt++; 715 | currKmerIdx = ((~((unsigned long)3 << ((i_arg_k - 1) * 2)) & currKmerIdx) << 2) + idx; 716 | 717 | if (totalCharCnt < i_arg_k) continue; 718 | (*kmerCntMap)[currKmerIdx] ++; 719 | } 720 | } 721 | fastaStream.close(); 722 | } 723 | this->i_k = i_arg_k; 724 | 725 | save(kmerCntMap, str_arg_outputURL); 726 | delete kmerCntMap; 727 | return true; 728 | } 729 | 730 | unsigned long KmerModel::totalKmer() 731 | { 732 | unsigned long totalKmerCnt = 0; 733 | for (std::unordered_map::iterator iter = kmerCntUnorderMap->begin(); iter != kmerCntUnorderMap->end(); iter++) totalKmerCnt += (*kmerCntUnorderMap)[iter->first]; 734 | return totalKmerCnt; 735 | } 736 | 737 | MarkovModel* 
KmerModel::getMarkovModel(int i_arg_order, std::string str_arg_saveURLPrefix) 738 | { 739 | MarkovModel* tmp_mrkvModel = new MarkovModel(i_arg_order); 740 | 741 | std::string str_orderURL = str_arg_saveURLPrefix + patch::to_string(i_arg_order); 742 | std::string str_orderPlusOneURL = str_arg_saveURLPrefix + patch::to_string(i_arg_order + 1); 743 | 744 | if ((!file_exists(str_orderURL) && i_arg_order > 0) || !file_exists(str_orderPlusOneURL)) 745 | { 746 | std::cout << "[error]: file not exists! in KmerModel::getMarkovModel !!!" << std::endl; 747 | return NULL; 748 | } 749 | 750 | unsigned long long i_orderDim = 0; 751 | unsigned long long *vec_orderkmerIdx, *vec_orderkmerCnt; 752 | if (i_arg_order <= 0) 753 | { 754 | vec_orderkmerIdx = new unsigned long long[1]; vec_orderkmerIdx[0] = 0; 755 | vec_orderkmerCnt = new unsigned long long[1]; vec_orderkmerCnt[0] = 1; 756 | i_orderDim = 1; 757 | } 758 | else 759 | { 760 | std::ifstream tmp_finPipe(str_orderURL.c_str(), std::ios::in | std::ios::binary); 761 | tmp_finPipe.read((char*)&i_orderDim, sizeof(unsigned long long)); 762 | 763 | vec_orderkmerIdx = new unsigned long long[i_orderDim]; memset(vec_orderkmerIdx, 0, sizeof(unsigned long long) * i_orderDim); 764 | vec_orderkmerCnt = new unsigned long long[i_orderDim]; memset(vec_orderkmerCnt, 0, sizeof(unsigned long long) * i_orderDim); 765 | 766 | tmp_finPipe.read((char*)vec_orderkmerIdx, sizeof(unsigned long long) * i_orderDim); 767 | tmp_finPipe.read((char*)vec_orderkmerCnt, sizeof(unsigned long long) * i_orderDim); 768 | tmp_finPipe.close(); 769 | } 770 | tmp_mrkvModel->constructMarg(vec_orderkmerIdx, vec_orderkmerCnt, i_orderDim); 771 | delete[] vec_orderkmerIdx; delete[] vec_orderkmerCnt; 772 | 773 | 774 | unsigned long long i_orderPlusOneDim = 0; 775 | std::ifstream tmp_finPipePlusOne(str_orderPlusOneURL.c_str(), std::ios::in | std::ios::binary); 776 | tmp_finPipePlusOne.read((char*)&i_orderPlusOneDim, sizeof(unsigned long long)); 777 | 778 | unsigned long long 
*vec_orderPlusOnekmerIdx = new unsigned long long[i_orderPlusOneDim]; memset(vec_orderPlusOnekmerIdx, 0, sizeof(unsigned long long) * i_orderPlusOneDim); 779 | unsigned long long *vec_orderPlusOnekmerCnt = new unsigned long long[i_orderPlusOneDim]; memset(vec_orderPlusOnekmerCnt, 0, sizeof(unsigned long long) * i_orderPlusOneDim); 780 | 781 | tmp_finPipePlusOne.read((char*)vec_orderPlusOnekmerIdx, sizeof(unsigned long long) * i_orderPlusOneDim); 782 | tmp_finPipePlusOne.read((char*)vec_orderPlusOnekmerCnt, sizeof(unsigned long long) * i_orderPlusOneDim); 783 | tmp_finPipePlusOne.close(); 784 | 785 | for (unsigned long long rowIdx = 0; rowIdx < i_orderPlusOneDim; ++rowIdx) 786 | { 787 | unsigned long long currKmerIdx = vec_orderPlusOnekmerIdx[rowIdx]; 788 | unsigned long long i_tmp_route = currKmerIdx % BASE; 789 | unsigned long long newKmerIdx = (currKmerIdx >> 2); 790 | tmp_mrkvModel->addTransProb(newKmerIdx, i_tmp_route, vec_orderPlusOnekmerCnt[rowIdx]); 791 | } 792 | delete[] vec_orderPlusOnekmerIdx; delete[] vec_orderPlusOnekmerCnt; 793 | 794 | tmp_mrkvModel->normalize(); 795 | return tmp_mrkvModel; 796 | } 797 | 798 | 799 | int getEstMarkovOrder(int i_arg_k, std::string str_arg_saveURLPrefix, std::string str_arg_seqName) 800 | { 801 | std::cout << "Now estimating markov order for " << str_arg_seqName << " ..." 
<< std::endl; 802 | 803 | KmerModel* kmerModel = new KmerModel(i_arg_k, true); 804 | kmerModel->load(i_arg_k, str_arg_saveURLPrefix + patch::to_string(i_arg_k)); 805 | unsigned long l_totalKmer = kmerModel->totalKmer(); 806 | delete kmerModel; 807 | kmerModel = new KmerModel(i_arg_k, true); 808 | 809 | int maxOrder = i_arg_k; if(maxOrder > MAX_ORDER) maxOrder = MAX_ORDER; 810 | std::vector result_traceVec; 811 | for (int currOrder = 0; currOrder < maxOrder; ++currOrder) 812 | { 813 | double BIC = (BASE - 1) * pow(BASE, currOrder) * log(l_totalKmer + i_arg_k - currOrder); 814 | 815 | MarkovModel* markovModel = kmerModel->getMarkovModel(currOrder, str_arg_saveURLPrefix); 816 | 817 | KmerModel* tmpKmerModel = new KmerModel(currOrder + 1, true); 818 | tmpKmerModel->load(currOrder + 1, str_arg_saveURLPrefix + patch::to_string(currOrder + 1)); 819 | 820 | AbsIter* tmpkmerCntIter = IterFactory::getInstance()->getKmerCntIterator(i_arg_k, tmpKmerModel->kmerCntUnorderMap, tmpKmerModel->kmerVec, 1); 821 | 822 | double logLH = 0; 823 | while (tmpkmerCntIter->hasNext()) 824 | { 825 | double src_X_w = *(*tmpkmerCntIter); (*tmpkmerCntIter)++; if (0 == src_X_w) continue; 826 | unsigned long long kmerIdx = tmpkmerCntIter->getCurrKmer(); 827 | 828 | int route = kmerIdx % BASE; 829 | unsigned long long idx_new = (kmerIdx >> 2); 830 | 831 | //if (0 != markovModel->getTransProb(idx_new, route)) logLH += src_X_w*log(markovModel->getTransProb(idx_new, route)); 832 | if (0 != markovModel->getTransProb(idx_new, route)) logLH += src_X_w*markovModel->getTransProb(idx_new, route); 833 | } 834 | BIC = -2 * logLH + BIC; 835 | result_traceVec.push_back(BIC); 836 | 837 | delete tmpkmerCntIter; delete tmpKmerModel; delete markovModel; 838 | } 839 | delete kmerModel; 840 | 841 | 842 | if (result_traceVec.size() <= 1) return 0; 843 | 844 | int min_idx = 0; 845 | min_vec(result_traceVec, result_traceVec.size(), &min_idx); 846 | for (int i = 0; i < result_traceVec.size(); ++i) std::cout << "order = " 
<< i << " BIC = " << result_traceVec.at(i) << std::endl; 847 | std::cout << "The selected order = " << min_idx << std::endl; 848 | return min_idx; 849 | } --------------------------------------------------------------------------------