├── .DS_Store ├── dest ├── 1.txt ├── 10.txt ├── 12.txt ├── 13.txt ├── 14.txt ├── 15.txt ├── 16.txt ├── 17.txt ├── 18.txt ├── 19.txt ├── 2.txt ├── 21.txt ├── 22.txt ├── 23.txt ├── 24.txt ├── 25.txt ├── 26.txt ├── 27.txt ├── 28.txt ├── 29.txt ├── 3.txt ├── 30.txt ├── 31.txt ├── 32.txt ├── 33.txt ├── 34.txt ├── 35.txt ├── 37.txt ├── 38.txt ├── 39.txt ├── 4.txt ├── 40.txt ├── 41.txt ├── 42.txt ├── 44.txt ├── 45.txt ├── 46.txt ├── 47.txt ├── 48.txt ├── 49.txt ├── 5.txt ├── 50.txt ├── 51.txt ├── 52.txt ├── 53.txt ├── 54.txt ├── 55.txt ├── 56.txt ├── 57.txt ├── 58.txt ├── 59.txt ├── 60.txt ├── 7.txt ├── 8.txt ├── 9.txt ├── 6.txt ├── 20.txt ├── 11.txt ├── 43.txt └── 36.txt ├── source ├── 1.txt ├── 10.txt ├── 12.txt ├── 13.txt ├── 14.txt ├── 15.txt ├── 16.txt ├── 17.txt ├── 18.txt ├── 19.txt ├── 2.txt ├── 21.txt ├── 22.txt ├── 23.txt ├── 24.txt ├── 25.txt ├── 26.txt ├── 27.txt ├── 28.txt ├── 29.txt ├── 3.txt ├── 30.txt ├── 31.txt ├── 32.txt ├── 33.txt ├── 34.txt ├── 35.txt ├── 37.txt ├── 38.txt ├── 39.txt ├── 4.txt ├── 40.txt ├── 41.txt ├── 42.txt ├── 44.txt ├── 45.txt ├── 46.txt ├── 47.txt ├── 48.txt ├── 49.txt ├── 5.txt ├── 50.txt ├── 51.txt ├── 52.txt ├── 53.txt ├── 54.txt ├── 55.txt ├── 56.txt ├── 57.txt ├── 58.txt ├── 59.txt ├── 60.txt ├── 7.txt ├── 8.txt ├── 9.txt ├── 6.txt ├── 20.txt ├── 11.txt ├── 43.txt └── 36.txt ├── stops_removed ├── 1.txt ├── 2.txt ├── 3.txt ├── 4.txt ├── 5.txt ├── 7.txt ├── 8.txt ├── 9.txt ├── 10.txt ├── 12.txt ├── 13.txt ├── 14.txt ├── 15.txt ├── 16.txt ├── 17.txt ├── 18.txt ├── 19.txt ├── 21.txt ├── 22.txt ├── 23.txt ├── 24.txt ├── 25.txt ├── 26.txt ├── 27.txt ├── 28.txt ├── 29.txt ├── 30.txt ├── 31.txt ├── 32.txt ├── 33.txt ├── 34.txt ├── 35.txt ├── 37.txt ├── 38.txt ├── 39.txt ├── 40.txt ├── 41.txt ├── 42.txt ├── 44.txt ├── 45.txt ├── 46.txt ├── 47.txt ├── 48.txt ├── 49.txt ├── 50.txt ├── 51.txt ├── 52.txt ├── 53.txt ├── 54.txt ├── 55.txt ├── 56.txt ├── 57.txt ├── 58.txt ├── 59.txt ├── 60.txt ├── 6.txt ├── 20.txt ├── 11.txt ├── 43.txt └── 
36.txt ├── plot.py ├── plot1.py ├── 5_result_evaluation.py ├── 6_main_module.py ├── 1_rempunct.py ├── 4_cluster.py ├── README.md ├── 2_alltfidf.py └── 3_distance.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/.DS_Store -------------------------------------------------------------------------------- /dest/1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/1.txt -------------------------------------------------------------------------------- /dest/10.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/10.txt -------------------------------------------------------------------------------- /dest/12.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/12.txt -------------------------------------------------------------------------------- /dest/13.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/13.txt -------------------------------------------------------------------------------- /dest/14.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/14.txt -------------------------------------------------------------------------------- /dest/15.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/15.txt -------------------------------------------------------------------------------- 
/dest/16.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/16.txt -------------------------------------------------------------------------------- /dest/17.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/17.txt -------------------------------------------------------------------------------- /dest/18.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/18.txt -------------------------------------------------------------------------------- /dest/19.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/19.txt -------------------------------------------------------------------------------- /dest/2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/2.txt -------------------------------------------------------------------------------- /dest/21.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/21.txt -------------------------------------------------------------------------------- /dest/22.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/22.txt -------------------------------------------------------------------------------- /dest/23.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/23.txt -------------------------------------------------------------------------------- /dest/24.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/24.txt -------------------------------------------------------------------------------- /dest/25.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/25.txt -------------------------------------------------------------------------------- /dest/26.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/26.txt -------------------------------------------------------------------------------- /dest/27.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/27.txt -------------------------------------------------------------------------------- /dest/28.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/28.txt -------------------------------------------------------------------------------- /dest/29.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/29.txt -------------------------------------------------------------------------------- /dest/3.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/3.txt -------------------------------------------------------------------------------- 
/dest/30.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/30.txt -------------------------------------------------------------------------------- /dest/31.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/31.txt -------------------------------------------------------------------------------- /dest/32.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/32.txt -------------------------------------------------------------------------------- /dest/33.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/33.txt -------------------------------------------------------------------------------- /dest/34.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/34.txt -------------------------------------------------------------------------------- /dest/35.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/35.txt -------------------------------------------------------------------------------- /dest/37.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/37.txt -------------------------------------------------------------------------------- /dest/38.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/38.txt -------------------------------------------------------------------------------- /dest/39.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/39.txt -------------------------------------------------------------------------------- /dest/4.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/4.txt -------------------------------------------------------------------------------- /dest/40.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/40.txt -------------------------------------------------------------------------------- /dest/41.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/41.txt -------------------------------------------------------------------------------- /dest/42.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/42.txt -------------------------------------------------------------------------------- /dest/44.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/44.txt -------------------------------------------------------------------------------- /dest/45.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/45.txt -------------------------------------------------------------------------------- 
/dest/46.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/46.txt -------------------------------------------------------------------------------- /dest/47.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/47.txt -------------------------------------------------------------------------------- /dest/48.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/48.txt -------------------------------------------------------------------------------- /dest/49.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/49.txt -------------------------------------------------------------------------------- /dest/5.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/5.txt -------------------------------------------------------------------------------- /dest/50.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/50.txt -------------------------------------------------------------------------------- /dest/51.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/51.txt -------------------------------------------------------------------------------- /dest/52.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/52.txt -------------------------------------------------------------------------------- /dest/53.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/53.txt -------------------------------------------------------------------------------- /dest/54.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/54.txt -------------------------------------------------------------------------------- /dest/55.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/55.txt -------------------------------------------------------------------------------- /dest/56.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/56.txt -------------------------------------------------------------------------------- /dest/57.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/57.txt -------------------------------------------------------------------------------- /dest/58.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/58.txt -------------------------------------------------------------------------------- /dest/59.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/59.txt 
-------------------------------------------------------------------------------- /dest/60.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/60.txt -------------------------------------------------------------------------------- /dest/7.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/7.txt -------------------------------------------------------------------------------- /dest/8.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/8.txt -------------------------------------------------------------------------------- /dest/9.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/dest/9.txt -------------------------------------------------------------------------------- /source/1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/1.txt -------------------------------------------------------------------------------- /source/10.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/10.txt -------------------------------------------------------------------------------- /source/12.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/12.txt -------------------------------------------------------------------------------- /source/13.txt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/13.txt -------------------------------------------------------------------------------- /source/14.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/14.txt -------------------------------------------------------------------------------- /source/15.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/15.txt -------------------------------------------------------------------------------- /source/16.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/16.txt -------------------------------------------------------------------------------- /source/17.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/17.txt -------------------------------------------------------------------------------- /source/18.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/18.txt -------------------------------------------------------------------------------- /source/19.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/19.txt -------------------------------------------------------------------------------- /source/2.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/2.txt -------------------------------------------------------------------------------- /source/21.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/21.txt -------------------------------------------------------------------------------- /source/22.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/22.txt -------------------------------------------------------------------------------- /source/23.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/23.txt -------------------------------------------------------------------------------- /source/24.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/24.txt -------------------------------------------------------------------------------- /source/25.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/25.txt -------------------------------------------------------------------------------- /source/26.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/26.txt -------------------------------------------------------------------------------- /source/27.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/27.txt 
-------------------------------------------------------------------------------- /source/28.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/28.txt -------------------------------------------------------------------------------- /source/29.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/29.txt -------------------------------------------------------------------------------- /source/3.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/3.txt -------------------------------------------------------------------------------- /source/30.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/30.txt -------------------------------------------------------------------------------- /source/31.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/31.txt -------------------------------------------------------------------------------- /source/32.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/32.txt -------------------------------------------------------------------------------- /source/33.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/33.txt -------------------------------------------------------------------------------- /source/34.txt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/34.txt -------------------------------------------------------------------------------- /source/35.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/35.txt -------------------------------------------------------------------------------- /source/37.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/37.txt -------------------------------------------------------------------------------- /source/38.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/38.txt -------------------------------------------------------------------------------- /source/39.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/39.txt -------------------------------------------------------------------------------- /source/4.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/4.txt -------------------------------------------------------------------------------- /source/40.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/40.txt -------------------------------------------------------------------------------- /source/41.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/41.txt -------------------------------------------------------------------------------- /source/42.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/42.txt -------------------------------------------------------------------------------- /source/44.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/44.txt -------------------------------------------------------------------------------- /source/45.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/45.txt -------------------------------------------------------------------------------- /source/46.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/46.txt -------------------------------------------------------------------------------- /source/47.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/47.txt -------------------------------------------------------------------------------- /source/48.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/48.txt -------------------------------------------------------------------------------- /source/49.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/49.txt 
-------------------------------------------------------------------------------- /source/5.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/5.txt -------------------------------------------------------------------------------- /source/50.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/50.txt -------------------------------------------------------------------------------- /source/51.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/51.txt -------------------------------------------------------------------------------- /source/52.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/52.txt -------------------------------------------------------------------------------- /source/53.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/53.txt -------------------------------------------------------------------------------- /source/54.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/54.txt -------------------------------------------------------------------------------- /source/55.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/55.txt -------------------------------------------------------------------------------- /source/56.txt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/56.txt -------------------------------------------------------------------------------- /source/57.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/57.txt -------------------------------------------------------------------------------- /source/58.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/58.txt -------------------------------------------------------------------------------- /source/59.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/59.txt -------------------------------------------------------------------------------- /source/60.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/60.txt -------------------------------------------------------------------------------- /source/7.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/7.txt -------------------------------------------------------------------------------- /source/8.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/8.txt -------------------------------------------------------------------------------- /source/9.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/source/9.txt -------------------------------------------------------------------------------- /stops_removed/1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/1.txt -------------------------------------------------------------------------------- /stops_removed/2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/2.txt -------------------------------------------------------------------------------- /stops_removed/3.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/3.txt -------------------------------------------------------------------------------- /stops_removed/4.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/4.txt -------------------------------------------------------------------------------- /stops_removed/5.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/5.txt -------------------------------------------------------------------------------- /stops_removed/7.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/7.txt -------------------------------------------------------------------------------- /stops_removed/8.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/8.txt -------------------------------------------------------------------------------- /stops_removed/9.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/9.txt -------------------------------------------------------------------------------- /stops_removed/10.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/10.txt -------------------------------------------------------------------------------- /stops_removed/12.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/12.txt -------------------------------------------------------------------------------- /stops_removed/13.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/13.txt -------------------------------------------------------------------------------- /stops_removed/14.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/14.txt -------------------------------------------------------------------------------- /stops_removed/15.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/15.txt -------------------------------------------------------------------------------- /stops_removed/16.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/16.txt -------------------------------------------------------------------------------- /stops_removed/17.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/17.txt -------------------------------------------------------------------------------- /stops_removed/18.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/18.txt -------------------------------------------------------------------------------- /stops_removed/19.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/19.txt -------------------------------------------------------------------------------- /stops_removed/21.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/21.txt -------------------------------------------------------------------------------- /stops_removed/22.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/22.txt -------------------------------------------------------------------------------- /stops_removed/23.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/23.txt -------------------------------------------------------------------------------- /stops_removed/24.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/24.txt -------------------------------------------------------------------------------- /stops_removed/25.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/25.txt -------------------------------------------------------------------------------- /stops_removed/26.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/26.txt -------------------------------------------------------------------------------- /stops_removed/27.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/27.txt -------------------------------------------------------------------------------- /stops_removed/28.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/28.txt -------------------------------------------------------------------------------- /stops_removed/29.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/29.txt -------------------------------------------------------------------------------- /stops_removed/30.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/30.txt -------------------------------------------------------------------------------- /stops_removed/31.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/31.txt -------------------------------------------------------------------------------- /stops_removed/32.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/32.txt -------------------------------------------------------------------------------- /stops_removed/33.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/33.txt -------------------------------------------------------------------------------- /stops_removed/34.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/34.txt -------------------------------------------------------------------------------- /stops_removed/35.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/35.txt -------------------------------------------------------------------------------- /stops_removed/37.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/37.txt -------------------------------------------------------------------------------- /stops_removed/38.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/38.txt -------------------------------------------------------------------------------- /stops_removed/39.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/39.txt -------------------------------------------------------------------------------- /stops_removed/40.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/40.txt -------------------------------------------------------------------------------- /stops_removed/41.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/41.txt -------------------------------------------------------------------------------- /stops_removed/42.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/42.txt -------------------------------------------------------------------------------- /stops_removed/44.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/44.txt -------------------------------------------------------------------------------- /stops_removed/45.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/45.txt -------------------------------------------------------------------------------- /stops_removed/46.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/46.txt -------------------------------------------------------------------------------- /stops_removed/47.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/47.txt -------------------------------------------------------------------------------- /stops_removed/48.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/48.txt -------------------------------------------------------------------------------- /stops_removed/49.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/49.txt -------------------------------------------------------------------------------- /stops_removed/50.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/50.txt -------------------------------------------------------------------------------- /stops_removed/51.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/51.txt -------------------------------------------------------------------------------- /stops_removed/52.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/52.txt -------------------------------------------------------------------------------- /stops_removed/53.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/53.txt -------------------------------------------------------------------------------- /stops_removed/54.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/54.txt -------------------------------------------------------------------------------- /stops_removed/55.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/55.txt -------------------------------------------------------------------------------- /stops_removed/56.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/56.txt -------------------------------------------------------------------------------- /stops_removed/57.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/57.txt -------------------------------------------------------------------------------- /stops_removed/58.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/58.txt -------------------------------------------------------------------------------- /stops_removed/59.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/59.txt -------------------------------------------------------------------------------- /stops_removed/60.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinavthomas/textclusteringDBSCAN/HEAD/stops_removed/60.txt -------------------------------------------------------------------------------- /plot.py: -------------------------------------------------------------------------------- 1 | from matplotlib.pyplot import plt 2 | 3 | a = 
[1,2,3,4] 4 | b = [1,2,4,3] 5 | 6 | plt.plot(a,b) 7 | plt.show() -------------------------------------------------------------------------------- /plot1.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | infile1 = open('txt1.txt','r') 4 | inlines = infile1.readlines() 5 | lenx = len(inlines) 6 | 7 | arr1 = [] 8 | arr2 = [] 9 | arr3 = [] 10 | arr4 = [] 11 | 12 | for k in range(lenx): 13 | linex = inlines[k].split() 14 | print linex 15 | arr1.append(float(linex[0])) 16 | arr2.append(float(linex[1])) 17 | arr3.append(float(linex[2])) 18 | arr4.append(float(linex[3])) 19 | ''' 20 | plt.plot(arr1,arr4) 21 | plt.show()''' -------------------------------------------------------------------------------- /stops_removed/6.txt: -------------------------------------------------------------------------------- 1 | babita shivdasani born april former bollywood actress relatively short career starred nineteen films including boxoffice successes haseena maan jayegi farz kismat married randhir kapoor two daughters actresses karisma kareena kapoor 2 | backgroundedit 3 | babita born actor hari shivdasani hindu sindhi family migrated pakistan india british christian mother cousin actress sadhana shivdasani 4 | 5 | careeredit 6 | short career appeared films including dus lakh raaz farz aulad haseena maan jayegi kab kyon aur kahan starred future husband randhir kapoor hit film kal aaj aur kal biggest boxoffice successes haseena maan jayegi future uncleinlaw shashi kapoor farz jeetendra kismat biswajeet 7 | 8 | personal lifeedit 9 | babita married randhir kapoor november citation needed marriage produced two children karisma kapoor kareena kapoor 10 | -------------------------------------------------------------------------------- /source/6.txt: -------------------------------------------------------------------------------- 1 | Babita Shivdasani (born 20 April 1948[1][2][3]) is a former Bollywood actress. 
In a relatively short career, from 1966 to 1973, she starred in nineteen films, including the box-office successes Haseena Maan Jayegi (1968), Farz (1967), and Kismat (1968). She is married to Randhir Kapoor; her two daughters are the actresses Karisma and Kareena Kapoor. 2 | Background[edit] 3 | Babita was born to actor Hari Shivdasani, who was of a Hindu Sindhi family which had migrated from Pakistan to India, and a British Christian mother.[4][5] Her cousin is actress Sadhana Shivdasani [6] 4 | 5 | Career[edit] 6 | In her short career she appeared in 19 films including: Dus Lakh (1966), Raaz (1967), Farz (1967), Aulad (1968), Haseena Maan Jayegi (1969), and Kab Kyon Aur Kahan (1970). She starred with her future husband Randhir Kapoor in the hit film Kal Aaj Aur Kal. Her biggest box-office successes were Haseena Maan Jayegi (1968) with her future uncle-in-law Shashi Kapoor, Farz with Jeetendra and Kismat with Biswajeet. 7 | 8 | Personal life[edit] 9 | Babita married Randhir Kapoor on 6 November 1971.[citation needed] The marriage produced two children: Karisma Kapoor and Kareena Kapoor. 
# File: /5_result_evaluation.py
# Cluster performance measurement using the Rand index: the fraction of item
# pairs on which two clusterings agree (both put the pair in one cluster, or
# both keep the pair apart).


def _share_cluster(p, q, clusters):
    """Return 1 if any cluster in `clusters` contains both p and q, else 0."""
    for members in clusters:
        if p in members and q in members:
            return 1
    return 0


def check_items_correct(p, q, arrax):
    """Return 1 if items p and q appear together in one cluster of `arrax`.

    `arrax` is a sequence of clusters, each a container of item ids.
    Returns 0 when no single cluster holds both items.
    """
    return _share_cluster(p, q, arrax)


def check_items_computed(p, q, corrx):
    """Return 1 if items p and q appear together in one cluster of `corrx`.

    Same contract as check_items_correct; kept as a separate name so existing
    callers of either function keep working.
    """
    return _share_cluster(p, q, corrx)


def find_all(nof, arrax, corrx):
    """Return the pairwise agreement (Rand index) between two clusterings.

    nof   -- number of items; items are identified by the ids 1..nof.
    arrax -- first clustering, a sequence of clusters (containers of ids).
    corrx -- second clustering, same layout.

    Every unordered pair (p, q) with 1 <= p < q <= nof is examined.  A pair
    counts as an agreement when both clusterings place it in a common cluster
    or both keep it apart.  The result is agreements / C(nof, 2), a float in
    [0, 1].

    Raises ZeroDivisionError when nof < 2, because there are no pairs.
    """
    agreements = 0
    for p in range(1, nof + 1):
        for q in range(p + 1, nof + 1):
            # Compare the pair directly.  Keying pairs by str(p) + str(q) is
            # ambiguous ((1, 112) and (11, 12) both become "1112") and would
            # miscount once ids reach three digits.
            together_first = check_items_correct(p, q, arrax)
            together_second = check_items_computed(p, q, corrx)
            if together_first == together_second:
                agreements += 1
    pair_count = nof * (nof - 1) // 2
    return agreements / float(pair_count)


# File: /6_main_module.py
# evaluating the best cluster result.
import time
cluster_file = __import__('4_cluster')
comparison_file = __import__('5_result_evaluation')

# Reference clustering the computed clusters are scored against.
# Write the correct cluster pattern here.
correct = [
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    [21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
    [31, 32, 33, 34, 35, 36, 37, 38, 39, 40],
    [41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
    [51, 52, 53, 54, 55, 56, 57, 58, 59, 60],
]

# Read the score file once and close it straight away.
with open('scores.txt', 'r') as infile1:
    inlines = infile1.readlines()
lenx = len(inlines)
nof = 60  # input the number of files here


def _build_score_index(lines):
    """Index score lines of the form '<id1> <id2> <score>' by (id1, id2).

    Lines with fewer than three whitespace-separated fields are ignored.
    When a pair occurs more than once the first occurrence wins, matching a
    top-to-bottom scan of the file.
    """
    index = {}
    for line in lines:
        fields = line.split()
        if len(fields) < 3:
            continue
        index.setdefault((fields[0], fields[1]), fields[2])
    return index


# Built once so each lookup is a dict access instead of a rescan of the file.
_score_index = _build_score_index(inlines)


def findscore(fid1, fid2):
    """Return the score recorded for the ordered pair (fid1, fid2).

    The ids are compared as strings against the first two fields of each
    line of scores.txt; the third field is returned as a float.  Returns
    None when the pair is not present in the file.
    """
    raw_score = _score_index.get((str(fid1), str(fid2)))
    if raw_score is None:
        return None
    return float(raw_score)


def find_near_threshold():
    """Return the mean score over all pairs (k, j) with 1 <= k < j <= nof.

    The mean is used as the centre of the threshold sweep below.

    Raises ValueError when a pair has no entry in scores.txt (previously this
    surfaced as a TypeError from adding None), and ZeroDivisionError when
    nof < 2 because no pairs exist.
    """
    num = 0
    scorex = 0
    for k in range(nof):
        for j in range(k + 1, nof):
            scr = findscore(k + 1, j + 1)
            if scr is None:
                raise ValueError(
                    "no score for pair (%s, %s) in scores.txt" % (k + 1, j + 1))
            num = num + 1
            scorex = scorex + scr
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("%s %s" % (scorex, num))
    nt = scorex / num
    return nt


init_threshold = find_near_threshold()
print(init_threshold)
threx = round(init_threshold, 2)

# Sweep 40 thresholds in steps of 0.01, starting just above threx - 0.2.
# Each threshold is clustered and scored against `correct`; every
# (threshold, score) pair is logged to results_threshold.txt.
threshold = threx - 0.2
maxx = 0
best_threshold = 0
with open('results_threshold.txt', 'w') as ofiley:
    for h in range(40):
        threshold = threshold + 0.01
        arrx = cluster_file.main_prog(nof, threshold)
        scorexx = comparison_file.find_all(nof, arrx, correct)
        if scorexx > maxx:
            maxx = scorexx
            best_threshold = threshold
        print("%s  /  %s" % (threshold, scorexx))
        ofiley.write("%s %s\n" % (threshold, scorexx))

print("maxx is  %s with threshold %s" % (maxx, best_threshold))

# Re-run the clustering at the best threshold found and show the clusters.
arrx = cluster_file.main_prog(nof, best_threshold)
print("arrx =  %s" % (arrx,))

# File: /dest/6.txt
-------------------------------------------------------------------------------- 1 | babita 2.04772486611 2 | shivdasani 2.04772486611 3 | born 0.630637804805 4 | april 0.346156072681 5 | former 0.346156072681 6 | bollywood 0.64674320208 7 | actress 0.854886526114 8 | relatively 0.510682122239 9 | short 0.601712509177 10 | career 0.729706959058 11 | starred 1.02506372289 12 | nineteen 0.901805273318 13 | films 0.767897036174 14 | including 0.126772627818 15 | boxoffice 1.29206830979 16 | successes 1.29206830979 17 | haseena 2.04772486611 18 | maan 2.04772486611 19 | jayegi 2.04772486611 20 | farz 2.04772486611 21 | kismat 1.62278356229 22 | married 0.767897036174 23 | randhir 2.04772486611 24 | kapoor 2.07944154168 25 | two 0.116511560198 26 | daughters 0.815204340296 27 | actresses 0.69314718056 28 | karisma 1.62278356229 29 | kareena 1.18560177861 30 | backgroundedit 1.02386243305 31 | actor 0.510682122239 32 | hari 0.901805273318 33 | hindu 0.417316185517 34 | sindhi 1.02386243305 35 | family 0.346156072681 36 | migrated 0.901805273318 37 | pakistan 0.484489087801 38 | india 0.179966869745 39 | british 0.288642715587 40 | christian 0.64674320208 41 | mother 0.379638325136 42 | cousin 0.748031438011 43 | sadhana 1.02386243305 44 | careeredit 0.64674320208 45 | appeared 0.417316185517 46 | dus 0.901805273318 47 | lakh 1.02386243305 48 | raaz 1.02386243305 49 | aulad 1.02386243305 50 | kab 1.02386243305 51 | kyon 1.02386243305 52 | aur 1.29206830979 53 | kahan 1.02386243305 54 | future 0.905156267542 55 | husband 0.606546247537 56 | hit 0.460393831858 57 | film 0.417316185517 58 | kal 1.42932754116 59 | aaj 1.02386243305 60 | biggest 0.460393831858 61 | uncleinlaw 1.02386243305 62 | shashi 0.901805273318 63 | jeetendra 0.901805273318 64 | biswajeet 1.02386243305 65 | personal 0.397888154779 66 | lifeedit 0.748031438011 67 | november 0.397888154779 68 | citation 0.69314718056 69 | needed 0.24037476833 70 | marriage 0.460393831858 71 | produced 0.379638325136 72 | 
children 0.362431928066 73 | -------------------------------------------------------------------------------- /1_rempunct.py: -------------------------------------------------------------------------------- 1 | #remove punctuations! basic text processing 2 | # to do : remove the [num] found in wikipedia pages. 3 | 4 | import nltk,re,pprint 5 | import sys,glob,os 6 | from imp import reload 7 | from nltk.corpus import stopwords 8 | 9 | reload(sys) 10 | #sys.setdefaultencoding('latin1') 11 | 12 | class rempunct: 13 | def __init__(self): 14 | from nltk.corpus import stopwords 15 | #sw = open('stopwords.txt','r').read() 16 | #self.swords = sw.split() 17 | self.swords = set(stopwords.words('english')) 18 | print(len(self.swords),"stopwords present!") 19 | 20 | def allfiles(self,foldername): #returns the name of all files inside the source folder. 21 | owd = os.getcwd() 22 | fld = foldername + "/" 23 | os.chdir(fld) #this is the name of the folder from which the file names are returned. 24 | arr = [] #empty array, the names of files are appended to this array, and returned. 
25 | for file in glob.glob("*.txt"): 26 | arr.append(file) 27 | os.chdir(owd) 28 | print("All filenames extracted!") 29 | return arr 30 | 31 | def rem_stop(self,fname,ofilename): 32 | rawlines = open(fname).readlines() 33 | lenl = len(rawlines) 34 | of=open(ofilename,'w') 35 | for r in range(lenl): 36 | linex = rawlines[r] 37 | linex2 = "".join(c for c in linex if c not in ('!','.',':',',','?',';','``','&','-','"','(',')','[',']','0','1','2','3','4','5','6','7','8','9')) 38 | linex3 = linex2.split() 39 | prog=(r+1)/len(rawlines) 40 | for s in range(len(linex3)): 41 | noword = linex3[s].lower() 42 | if noword not in self.swords: 43 | of.write(noword) 44 | of.write(" ") 45 | #self.drawProgressBar(prog) 46 | of.write("\n") 47 | 48 | 49 | def drawProgressBar(self,percent, barLen = 50): #just a progress bar so that you dont lose patience 50 | sys.stdout.write("\r") 51 | progress = "" 52 | for i in range(barLen): 53 | if i