├── .gitignore
├── README.md
├── data
│   ├── PTM
│   │   ├── api-test.pkl
│   │   ├── api-train.pkl
│   │   ├── app-test.pkl
│   │   ├── app-train.pkl
│   │   ├── cr-test.pkl
│   │   ├── cr-train.pkl
│   │   ├── gh-test.pkl
│   │   ├── gh-train.pkl
│   │   ├── jira-test.pkl
│   │   ├── jira-train.pkl
│   │   ├── so-test.pkl
│   │   └── so-train.pkl
│   ├── Senti4SD
│   │   ├── api-test-sd.csv
│   │   ├── app-test-sd.csv
│   │   ├── cr-test-sd.csv
│   │   ├── gh-test-sd.csv
│   │   ├── jira-test-sd.csv
│   │   └── so-test-sd.csv
│   ├── SentiStrength
│   │   ├── api-test-se.csv
│   │   ├── api-test.txt
│   │   ├── app-test-se.csv
│   │   ├── app-test.txt
│   │   ├── cr-test-se.csv
│   │   ├── cr-test.txt
│   │   ├── gh-test-se.csv
│   │   ├── gh-test.txt
│   │   ├── jira-test-se.csv
│   │   ├── jira-test.txt
│   │   ├── so-test-se.csv
│   │   └── so-test.txt
│   └── github-predictions
│       └── xlnet-senticr.csv
└── scripts
    ├── PTM
    │   ├── api.py
    │   ├── app.py
    │   ├── cr.py
    │   ├── early-stopping
    │   │   ├── api.py
    │   │   ├── app.py
    │   │   ├── cr.py
    │   │   ├── github.py
    │   │   ├── jira.py
    │   │   ├── so.py
    │   │   └── utils.py
    │   ├── github.py
    │   ├── jira.py
    │   ├── run_all.sh
    │   ├── so.py
    │   └── utils.py
    ├── SentiCR
    │   ├── SentiCR.py
    │   └── SenticrTest.py
    ├── StanfordCoreNLP.py
    ├── analyze-results
    │   ├── Senti4SD.py
    │   ├── SentiStrength-SE.py
    │   ├── SentiStrength.py
    │   └── gh-xlnet-senticr.py
    └── prepare-data
        ├── convert_senti4sd.py
        └── convert_sentistrength.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# static files generated from Django application using `collectstatic`
media
static
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Introduction
In total, we run (5 + 4) * 6 = 54 experiments: five existing SA4SE tools plus four pre-trained Transformer-based language models, each evaluated on six datasets. With *Stanford CoreNLP*, *SentiStrength*, *SentiStrength-SE*, and *Senti4SD*, we predict the labels directly without re-training, while with *SentiCR* and the pre-trained Transformer-based language models, we perform supervised learning on each specific dataset.

Remember to change the file names and data locations in the scripts to match your setup.
# Datasets
Six datasets have been used. The sources of these datasets are noted in the paper; credit goes to the original authors. You can download the original datasets from the following sources.
- API Reviews (downloaded from https://github.com/giasuddin/OpinionValueTSE/blob/master/ConsolidatedMentionResolutionBenchmark.xls)
- APP Reviews (downloaded from https://sentiment-se.github.io/replication.zip)
- Code Reviews (downloaded from https://github.com/senticr/SentiCR/blob/master/SentiCR/oracle.xlsx)
- GitHub Comments (downloaded from https://doi.org/10.6084/m9.figshare.11604597)
- JIRA Issues (downloaded from https://sentiment-se.github.io/replication.zip)
- StackOverflow (downloaded from https://sentiment-se.github.io/replication.zip)

# Approaches
## SA4SE tools
### Stanford CoreNLP
Usage: https://github.com/smilli/py-corenlp
### SentiStrength
Download from: https://www.softpedia.com/get/Others/Home-Education/SentiStrength.shtml
Download both the exe file and the SentiStrength Data zip file; the extracted data folder contains the word lists the tool relies on.
### SentiStrength-SE
Download from: https://laser.cs.uno.edu/Projects/Projects.html
### SentiCR
Source code: https://github.com/senticr/SentiCR
### Senti4SD
Source code: https://github.com/collab-uniba/pySenti4SD or https://github.com/collab-uniba/Senti4SD

## Pre-trained Transformer-based Language Models
We used the pre-trained BERT, XLNet, RoBERTa, and ALBERT models, via the Hugging Face Transformers library: https://huggingface.co/transformers/

# Scripts
## Pre-trained Transformer-based language models
We use six Python scripts, i.e., [api.py](./scripts/PTM/api.py), [app.py](./scripts/PTM/app.py), [cr.py](./scripts/PTM/cr.py), [github.py](./scripts/PTM/github.py), [jira.py](./scripts/PTM/jira.py), and [so.py](./scripts/PTM/so.py). Each script takes the model as an argument. For example, to run BERT on the API data, invoke [api.py](./scripts/PTM/api.py) as follows: `$ python api.py -m 0`. (Instead of tuning the hyper-parameters, we used fixed hyper-parameters as stated in our paper.)

You can also apply the early-stopping technique; the corresponding code is in [this folder](./scripts/PTM/early-stopping). To train/fine-tune BERT on the API dataset, run `$ python api.py -m 0 -r 1`; to evaluate the fine-tuned BERT on the API dataset, run `$ python api.py -m 0 -r 0`. The argument `-r` indicates whether to re-train.

The data used by this group is located in the [PTM folder](./data/PTM/).
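If you swap in your own data, it helps to first check the structure of the provided pickles. The snippet below is a minimal, hypothetical inspection sketch: it assumes the pickles deserialize into pandas objects, and the printed column names are whatever the files actually contain; see [utils.py](./scripts/PTM/utils.py) for the format the scripts expect.

```python
import pandas as pd

# Minimal inspection sketch (not part of our pipeline). We assume the
# pickle holds a pandas object; consult scripts/PTM/utils.py for the
# schema the training scripts actually rely on.
train = pd.read_pickle('data/PTM/api-train.pkl')
print(type(train))
if isinstance(train, pd.DataFrame):
    print(train.columns.tolist())  # discover the text/label column names
    print(train.head())
```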

## SentiCR
After cloning this [repo](https://github.com/senticr/SentiCR), you have to modify the training oracle and its corresponding test part. We also provide modified scripts in [SentiCR.py](./scripts/SentiCR/SentiCR.py) and [SenticrTest.py](./scripts/SentiCR/SenticrTest.py); you can replace the originals in your cloned SentiCR repo with our scripts to run the test. Note that the training oracle and the test file must come from the same dataset, e.g., the GitHub training oracle with the GitHub test file.

## Senti4SD
After you clone [Senti4SD](https://github.com/collab-uniba/Senti4SD) or [pySenti4SD](https://github.com/collab-uniba/pySenti4SD), just run one of the following commands without re-training:

```bash
sh classification.sh -i test_dataset.csv -o predictions.csv
# or
sh classification.sh test_dataset.csv predictions.csv
```

After getting the predictions, please run [Senti4SD.py](./scripts/analyze-results/Senti4SD.py) to analyze the prediction performance.

## Stanford CoreNLP
After you download and start the Stanford CoreNLP server, you can query it by referring to the example in this [repo](https://github.com/smilli/py-corenlp). Our script is in [StanfordCoreNLP.py](./scripts/StanfordCoreNLP.py).
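For reference, below is a minimal sketch of querying the server with py-corenlp, in the spirit of the example in that repo. The port, sample text, and server start command are illustrative assumptions; our actual code is in [StanfordCoreNLP.py](./scripts/StanfordCoreNLP.py).

```python
from pycorenlp import StanfordCoreNLP  # pip install pycorenlp

# Assumes a CoreNLP server is already running locally, e.g. started with:
#   java -mx5g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000
nlp = StanfordCoreNLP('http://localhost:9000')

text = 'great it is awesome. the game freezes about 6 mins into it everytime.'
output = nlp.annotate(text, properties={
    'annotators': 'sentiment',
    'outputFormat': 'json',
})

for sentence in output['sentences']:
    # 'sentimentValue' ranges from 0 (very negative) to 4 (very positive)
    print(sentence['sentimentValue'], sentence['sentiment'])
```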
## SentiStrength
### Prepare data
Since every input text must fit on a single line, we convert our data into this format in case some sentences span multiple lines. The test data we used can be found in the [SentiStrength folder](./data/SentiStrength/). If you want to run your own data, you can directly use our [script](./scripts/prepare-data/convert_sentistrength.py).
### Prediction
Run SentiStrength2.3Free.exe. The process is as follows:
1. Select reporting options: click 'Reporting Options' -> uncheck 'Report Classification Rationale' and 'Report Translation (From Abbreviations etc.)'. In other words, only keep 'Report Sentiment Strength Classifications [don't uncheck this normally ever]' selected.
2. Select the input file: click 'Sentiment Strength Analysis' -> 'Analyse ALL Texts in File [each line separately]' -> select the test file -> at 'Echo header line to the results?', select 'Yes' -> at 'Which column contains the text? Enter 1 for ...', enter 1 -> choose a folder to save the output file.

The input file is a txt file with one test text per line. The output file contains two values per line, representing the negative and positive scores, respectively. Our strategy is to calculate the sum of these two values and assign the final sentiment based on it.
### Evaluation
After getting the predictions, please run [SentiStrength.py](./scripts/analyze-results/SentiStrength.py) to analyze the prediction performance.

## SentiStrength-SE
The workflow is almost the same as for SentiStrength.
### Prepare data
The same as for SentiStrength.
### Prediction
The application behaves almost the same as SentiStrength: it outputs two integer values, and we assign a sentiment value based on their sum.
```bash
java -jar SentiStrength-SE_V1.5.jar
```
Input -> select the test file -> Detect Sentiments
### Evaluation
After getting the predictions, please run [SentiStrength-SE.py](./scripts/analyze-results/SentiStrength-SE.py) to analyze the predictions.

## Discussion
We compared the predictions made by XLNet and SentiCR in the Discussion section of our paper. The script used is [gh-xlnet-senticr.py](./scripts/analyze-results/gh-xlnet-senticr.py).

# Contact
If you have any problems, feel free to contact Ting Zhang (tingzhang.2019@phdcs.smu.edu.sg).

# Cite
If you find this repo useful, please consider citing our work.
```
@inproceedings{zhang2020sentiment,
  title={Sentiment Analysis for Software Engineering: How Far Can Pre-trained Transformer Models Go?},
  author={Zhang, Ting and Xu, Bowen and Thung, Ferdian and Haryono, Stefanus Agus and Lo, David and Jiang, Lingxiao},
  booktitle={2020 IEEE International Conference on Software Maintenance and Evolution (ICSME)},
  pages={70--80},
  year={2020},
  organization={IEEE}
}
```
-------------------------------------------------------------------------------- /data/PTM/api-test.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/api-test.pkl -------------------------------------------------------------------------------- /data/PTM/api-train.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/api-train.pkl -------------------------------------------------------------------------------- /data/PTM/app-test.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/app-test.pkl -------------------------------------------------------------------------------- /data/PTM/app-train.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/app-train.pkl -------------------------------------------------------------------------------- /data/PTM/cr-test.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/cr-test.pkl -------------------------------------------------------------------------------- /data/PTM/cr-train.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/cr-train.pkl -------------------------------------------------------------------------------- /data/PTM/gh-test.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/gh-test.pkl
-------------------------------------------------------------------------------- /data/PTM/gh-train.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/gh-train.pkl -------------------------------------------------------------------------------- /data/PTM/jira-test.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/jira-test.pkl -------------------------------------------------------------------------------- /data/PTM/jira-train.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/jira-train.pkl -------------------------------------------------------------------------------- /data/PTM/so-test.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/so-test.pkl -------------------------------------------------------------------------------- /data/PTM/so-train.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/so-train.pkl -------------------------------------------------------------------------------- /data/Senti4SD/app-test-sd.csv: -------------------------------------------------------------------------------- 1 | Text,Polarity 2 | ? horrible on moment. my battery dies faster !,negative 3 | ? good features slow deliveries and hit and miss notifications.,negative 4 | fun and picture the game has beautiful pictures.,positive 5 | crap ~ refund requested the game freezes about 6mins into it everytime ~ crap shack game ~ refunds hard now.,negative 6 | just love it! been using for 9 months now and it is perfect app! ?,positive 7 | 5 stars this is exactly what i am looking for. thank you so much.,positive 8 | great it's awesome,positive 9 | i hope there's penguinz this is a really fun game.,positive 10 | slow no point. the viewfinder display is really laggy and slow. doesn't seem to offer any creative black and white features so how does it improve over just desaturating color photos?,negative 11 | "well.. it was awesome .. i do like the new look but now it also now gets ""unexpected errors ""and hitting retry just gets the same. won't work til you clear the data out in settings.. but doing that all the time is annoying. please fix this issue!",negative 12 | awasome it is so cute,positive 13 | i like it because i enjoy it the picture is very small,positive 14 | ? guns by calford is better,positive 15 | excellent app! but add more! this is much better than samsung's equation detection but the only thing missing in this app is the lack of variety. just like in the note 10.1 please add more equations and have the ability to solve complex algebra. i'll be glad to pay for such a feature. makes work so much easier!,positive 16 | it is fun i love it,positive 17 | yampee exellent nice and cool app,positive 18 | great memory game great for the mind keep you on your toes. 
helps with memory.,positive 19 | thanks :) this app is very useful and i keep checking it out every morning.,positive 20 | thanks :-) xperia sp it's very inspiring!,positive 21 | crashed system stopped working. no matter what i did this app kept crashing the system. my phone wouldn't work. the screen kept going black and i couldn't run anything. happened all of a sudden. uninstalled and went over to go launcher ex. same features smoother not crashing even with more widgets running. sorry adw.,negative 22 | nice it is very useful,positive 23 | yala is an app for dowloading and listening to arabic music and radio the idea and concept is amazing but it has so many bugs with lots of songs at times and it messes up my music when it connects to some random wifi. please stop and fix this.,negative 24 | very easy and fun not many programs deserve a 5 star but this is one of them. i do miss some features (or maybe i just don't know how to use the app) like continue reading from my last page ad it sometimes reloads from start with me articles but overall this is a very good app. thanks guys!,positive 25 | great app.. nice app .. keep it up.. it is very useful app for us... *****,positive 26 | everything i wanted takes less space than default clock of moto x and packs so much info. the default settings are pretty impressive. did not have to customize much. won't mind paying for it.,positive 27 | freezes is an awesome game until the past few days it freezes after each mission.,negative 28 | awesome to the developer i just gave another slot machine app very poor ratings your game is 100 the best on android and i've played all the slot machines android has to offer and by far plenty of coins and i'm having fun as hell if it drains all the money now i would not be disappointed i have never paid for a app but i think i will pay for this one stupendous job,positive 29 | very nice. great keyboard app. many great tools and a ton of customization.,positive 30 | game this game is kool!!!!,positive 31 | amazing! awesome app for mathematics. worth thousand starts. thanks devs. though the ui is hard to understand. thanks for the small size,positive 32 | invaluable! i use this app daily and can't imagine not having it.,positive 33 | won't load on s3 like everyone else game freezes when trying to load a mission. so they updated the app and now the new version isn't compatible with my phone. bah puh!,negative 34 | i really like this game!!!!!! this is a good game!!!!,positive 35 | woooow it is gooooooood game:-) :-) :-),positive 36 | freezes after about 2 minutes of gameplay,negative 37 | anoying bug i use both the powertoggles notification bar widget and home screen widget. really good concept as a whole. but... the home screen widget always freezes a couple hours after i configured it. the widgetbis used to launch my favorite apps not for toggling anything if you want to resolve it i can try to give you more info. ofc rating goes up if this is resolved!! :-) ** bug doesn't occur anymore! :d,negative 38 | match cast lines up don't show i mean aside from that the app is pretty great,positive 39 | ? cute but eats up alot of battery.,negative 40 | best twitter client! if you are looking for the best twitter client search no more. this is it! it's similar to tweetbot on ios. you get all the great features from pull to refresh inline images column views white/black holo theme background tap to jump to top and so much more. oh and there are no ads in this version. 
it is simply amazing.,positive 41 | help love this game but now it wont let me sell my fish. it freezes and crashes. please fix!,negative 42 | ? so far so good. ? however freezes up way too much. hope that error gets fixed soon or i will have to give it the boot.,negative 43 | network error! suddenly after downloading an update pack i cannot login and said that loading failed. check your connection network. but my connection is working finely. please fix this. thanks,negative 44 | excellent good app for people abroad,positive 45 | fun widget good selection of hilts and colors.,positive 46 | shame its a good game dont get me wrong but since theupdate with the hunter and new maps it freezes unless you stik to old maps.... cumon update fix please,negative 47 | awesome this app is legendary!,positive 48 | one thing missing i really like this app. simple user interface and works smoothly. espn should add a feature to notify you when your current selection is finished. that is the only thing missing.,positive 49 | excellent you can change your car s it's easy to park it self,positive 50 | pls fix bugs nice apps!,negative 51 | worked good at first... when i first got it it worked well now it freezes up and only displays blank white screen.,negative 52 | simply the best i very rarely write reviews. even more rare is when i like an app enough to buy it. this app deserves both. i have seriously tried most music players for the android and this one has everything i need. audio engine is tops start and stop via headphone jack great album art download. lots of other programs have these things but either lack in these or other features. i especially like the simple issue of bass and treble control. price a little high that was the only thing holding me back from buying it sooner. was thinking of giving it 4 stars because of price but; what the heck- i gave it five.,positive 53 | good app its a very good app very nice and easy an simple ui.,positive 54 | good app i found the app is very good for forecast. i like it.,positive 55 | the game constantly freezes. very frustrated since update. the game freezes and goes to green screen. i have to restart tablet every time to get game to respond. fix?,negative 56 | ? motorola devour. becoming more and more sure this game freezes my phones? very fun game though? it is a challenge and addictive?,neutral 57 | slot game. it was a great game but it has stopped paying out almost completely.,negative 58 | freezes every time only works when you restart your phone,negative 59 | ads great app but the ads come up all the time and it ruins it,negative 60 | very fun!! worth the memory. :-d,positive 61 | horrible installed perfectly and though was great app. uploaded some pics from gallery with no problem. when wanted to take a pic as demo here shows above it was just a black scree. pressed camera button nothing. can upload from gallery but cant use the main feature of this app which is what is advertised. why would i use this to upload from my phone? my phone hasd bluetooth and a usb to hook up to my pc. i can send those pics via text email and facebook. the main purpose of this app does not work!,negative 62 | great apps works well and looks great. lots of available providers. from the 5 or so apps i've tested deliveries seems to support the most. i submitted a bug report when statuses from one of the providers wasn't working as expected. the developer promptly got back to me and fixed the issue. 
only feature i would like is auto-detection of provider based on the tracking number. but as a fellow programmer i understand the challenges presented by such a feature so this doesn't affect my rating.,positive 63 | need to add a feature you guys should add a custom car builder feature where you can just build your own car but every part you buy will just keep adding up your total overall great app its fun and it kills time,positive 64 | great alarm but has new bug making it use lots of battery i like this alarm i definitely like being able to wake up to a random playlist. but since an update its now using massive amounts of battery using the gps. i'm guessing it's something to do with sunrise feature. i don't use the sunrise alarm and i certainly don't move between countries enough for it to constantly be updating. so either a setting to not allow access to gps or to only update once a day would be useful. otherwise i'll have to go back to the standard alarm until it's fixed.,neutral 65 | gs3 best weather and clock widget out there hands down,positive 66 | good for me 112089 come to my friend in the game.,positive 67 | doesn't work fix bugs and maybe i'll give it 4 stars. sorry!,negative 68 | some cant download because when you download a swf an error appears pls fix ill rate 5 star for this,negative 69 | nice but after i update i can't open the game? pls fix this game and i love it.i will give 5 star if this game fixed the bug. username: roenan12 server: s2 libra,negative 70 | freezes when browser loads freezes when browser loads to play game,negative 71 | awesome it's very useful to all people.,positive 72 | can't install i can't install it on my samsung note iii please fix it. error. (941),negative 73 | note 2 it's very useful to use........love it,positive 74 | good app it's customizable which i like. sometimes it's a bit difficult getting items to line up properly and the areas for each part of the display don't change size dynamically so the font sizes will get bigger or smaller to fit in the space alloted for them which i don't like. overall it's a good app though.,positive 75 | "too slow. app is slower than mobile site and/or competing livejournal apps like""eljay"" (search the marketplace for it). what with the lack of features even in the main pc site it's proof that livejournal is no match for even the much maligned facebook...rip livejournal.",negative 76 | very good but can you add download all attachment as zip option ? we miss this very nice feature which is available in web version.,positive 77 | "widget locks up my galaxy s3 this a very beautiful looking app and i like it a lot but it regularly locks up the homescreen widget the clock freezes and the weather status locks up with the status ""locating"". my phone is a samsung galaxy s3 running on 4.1.1 based in the uk. hope this info helps solve the issue as this app is definitely worth a full 5 stars if it were not for it locking up. please fix this and i'll even purchase the add free pro version ;-)",negative 78 | a must have! great app and a perfect companion for many other apps.,positive 79 | like minecraft pe. has good graphics,positive 80 | ? what is the best the best thing to scan?,neutral 81 | bug when i leave the rest room i am always sent back to main menu please fix it and ill give it a better rate,negative 82 | great app love the- my list feature. i can find the best deals and plan my black friday shopping trip,positive 83 | its what i was looking for i love this app,positive 84 | amazing! 
a must have app,positive 85 | galaxy ace india is greatest,positive 86 | awesome it's a good app,positive 87 | stop button crash this app repatly fix that bug and u get 5 stars,negative 88 | time time freezes and will not update since latest version.,negative 89 | far from as cool as smartglass i went with a ps4 rather than the xbox one but i do miss smart glass. they need to make this app enable the voice features and things like the store should be built-in not a browser link.,negative 90 | finally one great game for android.,positive 91 | excellent app no complaints at all but still waiting for urdu localization,positive 92 | good app great launcher. i really like it but it is killing my battery.,neutral 93 | regret buying this app fcs so randomly that it's almost impossible to use it for longer period of time. lack of updates have made it even more impossible to use. haven't seen even a single much requested feature by the community being implemented in this app despite being ok play store for more or less 4 months now. disappointed.,negative 94 | offline browsing please allow us to open local html or swf files. there are few browsers out there that can not play local html but those lack the browsing/playing features of this app.,neutral 95 | amazing this is freaking great,positive 96 | ? not for motorola defy,neutral 97 | thankyou waited for such an app for a long time. jazakallah!,positive 98 | awesome but... its a really good player no doubt. but since last update some of my albums wont show and even songs wont show. not to mention it repeats some albums and its not my memory card cause i only have 1 album for each group of songs. please fix this. its the best player ive come across id hate to have to uninstall it :(,neutral 99 | cutiest emoticons cute emoticons to have fun..,positive 100 | game freezes! i play on the pc and now tried to play over and over on my samsung s3 and it freezes up after 2 min. in the game!,negative 101 | great app! i love this app! it's great to have my book lists with me wherever i go and i can easily keep track of which books i've read to each of my kids. one thing missing is the browse feature like the apple version has; i miss that! add that and i will rate 5 stars!,positive 102 | ? i love it but some times it freezes and i have to do somthing else,negative 103 | a superb app i came to android about a year ago from an iphone and i tried every reddit client out there such as baconreader and reddit news. these were all good apps but none of them compared to my experience with alienblue. then i tried reddit sync and it has become almost exclusively the only way i read redditch anymore. the holo interface and black night mode are easy on my eyes and it has all of the features i need.,positive 104 | improving rapidly. this app used to be crappy while the service was ok but recently the app has started receiving updates on a regular basis adding a lot of basic functionality that should have been there from the beginning. now it features landscape support resume from last position notification bar controls and save to sd card. however i still sorely miss a decent widget easily sortable queues or the extensions found in the desktop app.,positive 105 | -------------------------------------------------------------------------------- /data/SentiStrength/app-test-se.csv: -------------------------------------------------------------------------------- 1 | ? horrible on moment. my battery dies faster !,-1 2 | ? 
good features slow deliveries and hit and miss notifications.,-1 3 | fun and picture the game has beautiful pictures.,1 4 | crap ~ refund requested the game freezes about 6mins into it everytime ~ crap shack game ~ refunds hard now.,-1 5 | just love it! been using for 9 months now and it is perfect app! ?,1 6 | 5 stars this is exactly what i am looking for. thank you so much.,1 7 | great it's awesome,1 8 | i hope there's penguinz this is a really fun game.,1 9 | slow no point. the viewfinder display is really laggy and slow. doesn't seem to offer any creative black and white features so how does it improve over just desaturating color photos?,-1 10 | "well.. it was awesome .. i do like the new look but now it also now gets ""unexpected errors ""and hitting retry just gets the same. won't work til you clear the data out in settings.. but doing that all the time is annoying. please fix this issue!",-1 11 | awasome it is so cute,1 12 | i like it because i enjoy it the picture is very small,1 13 | ? guns by calford is better,1 14 | excellent app! but add more! this is much better than samsung's equation detection but the only thing missing in this app is the lack of variety. just like in the note 10.1 please add more equations and have the ability to solve complex algebra. i'll be glad to pay for such a feature. makes work so much easier!,1 15 | it is fun i love it,1 16 | yampee exellent nice and cool app,1 17 | great memory game great for the mind keep you on your toes. helps with memory.,1 18 | thanks :) this app is very useful and i keep checking it out every morning.,1 19 | thanks :-) xperia sp it's very inspiring!,1 20 | crashed system stopped working. no matter what i did this app kept crashing the system. my phone wouldn't work. the screen kept going black and i couldn't run anything. happened all of a sudden. uninstalled and went over to go launcher ex. same features smoother not crashing even with more widgets running. sorry adw.,-1 21 | nice it is very useful,1 22 | yala is an app for dowloading and listening to arabic music and radio the idea and concept is amazing but it has so many bugs with lots of songs at times and it messes up my music when it connects to some random wifi. please stop and fix this.,-1 23 | very easy and fun not many programs deserve a 5 star but this is one of them. i do miss some features (or maybe i just don't know how to use the app) like continue reading from my last page ad it sometimes reloads from start with me articles but overall this is a very good app. thanks guys!,1 24 | great app.. nice app .. keep it up.. it is very useful app for us... *****,1 25 | everything i wanted takes less space than default clock of moto x and packs so much info. the default settings are pretty impressive. did not have to customize much. won't mind paying for it.,1 26 | freezes is an awesome game until the past few days it freezes after each mission.,-1 27 | awesome to the developer i just gave another slot machine app very poor ratings your game is 100 the best on android and i've played all the slot machines android has to offer and by far plenty of coins and i'm having fun as hell if it drains all the money now i would not be disappointed i have never paid for a app but i think i will pay for this one stupendous job,1 28 | very nice. great keyboard app. many great tools and a ton of customization.,1 29 | game this game is kool!!!!,1 30 | amazing! awesome app for mathematics. worth thousand starts. thanks devs. though the ui is hard to understand. 
thanks for the small size,1 31 | invaluable! i use this app daily and can't imagine not having it.,1 32 | won't load on s3 like everyone else game freezes when trying to load a mission. so they updated the app and now the new version isn't compatible with my phone. bah puh!,-1 33 | i really like this game!!!!!! this is a good game!!!!,1 34 | woooow it is gooooooood game:-) :-) :-),1 35 | freezes after about 2 minutes of gameplay,-1 36 | anoying bug i use both the powertoggles notification bar widget and home screen widget. really good concept as a whole. but... the home screen widget always freezes a couple hours after i configured it. the widgetbis used to launch my favorite apps not for toggling anything if you want to resolve it i can try to give you more info. ofc rating goes up if this is resolved!! :-) ** bug doesn't occur anymore! :d,-1 37 | match cast lines up don't show i mean aside from that the app is pretty great,1 38 | ? cute but eats up alot of battery.,-1 39 | best twitter client! if you are looking for the best twitter client search no more. this is it! it's similar to tweetbot on ios. you get all the great features from pull to refresh inline images column views white/black holo theme background tap to jump to top and so much more. oh and there are no ads in this version. it is simply amazing.,1 40 | help love this game but now it wont let me sell my fish. it freezes and crashes. please fix!,-1 41 | ? so far so good. ? however freezes up way too much. hope that error gets fixed soon or i will have to give it the boot.,-1 42 | network error! suddenly after downloading an update pack i cannot login and said that loading failed. check your connection network. but my connection is working finely. please fix this. thanks,-1 43 | excellent good app for people abroad,1 44 | fun widget good selection of hilts and colors.,1 45 | shame its a good game dont get me wrong but since theupdate with the hunter and new maps it freezes unless you stik to old maps.... cumon update fix please,-1 46 | awesome this app is legendary!,1 47 | one thing missing i really like this app. simple user interface and works smoothly. espn should add a feature to notify you when your current selection is finished. that is the only thing missing.,1 48 | excellent you can change your car s it's easy to park it self,1 49 | pls fix bugs nice apps!,-1 50 | worked good at first... when i first got it it worked well now it freezes up and only displays blank white screen.,-1 51 | simply the best i very rarely write reviews. even more rare is when i like an app enough to buy it. this app deserves both. i have seriously tried most music players for the android and this one has everything i need. audio engine is tops start and stop via headphone jack great album art download. lots of other programs have these things but either lack in these or other features. i especially like the simple issue of bass and treble control. price a little high that was the only thing holding me back from buying it sooner. was thinking of giving it 4 stars because of price but; what the heck- i gave it five.,1 52 | good app its a very good app very nice and easy an simple ui.,1 53 | good app i found the app is very good for forecast. i like it.,1 54 | the game constantly freezes. very frustrated since update. the game freezes and goes to green screen. i have to restart tablet every time to get game to respond. fix?,-1 55 | ? motorola devour. becoming more and more sure this game freezes my phones? very fun game though? 
it is a challenge and addictive?,0 56 | slot game. it was a great game but it has stopped paying out almost completely.,-1 57 | freezes every time only works when you restart your phone,-1 58 | ads great app but the ads come up all the time and it ruins it,-1 59 | very fun!! worth the memory. :-d,1 60 | horrible installed perfectly and though was great app. uploaded some pics from gallery with no problem. when wanted to take a pic as demo here shows above it was just a black scree. pressed camera button nothing. can upload from gallery but cant use the main feature of this app which is what is advertised. why would i use this to upload from my phone? my phone hasd bluetooth and a usb to hook up to my pc. i can send those pics via text email and facebook. the main purpose of this app does not work!,-1 61 | great apps works well and looks great. lots of available providers. from the 5 or so apps i've tested deliveries seems to support the most. i submitted a bug report when statuses from one of the providers wasn't working as expected. the developer promptly got back to me and fixed the issue. only feature i would like is auto-detection of provider based on the tracking number. but as a fellow programmer i understand the challenges presented by such a feature so this doesn't affect my rating.,1 62 | need to add a feature you guys should add a custom car builder feature where you can just build your own car but every part you buy will just keep adding up your total overall great app its fun and it kills time,1 63 | great alarm but has new bug making it use lots of battery i like this alarm i definitely like being able to wake up to a random playlist. but since an update its now using massive amounts of battery using the gps. i'm guessing it's something to do with sunrise feature. i don't use the sunrise alarm and i certainly don't move between countries enough for it to constantly be updating. so either a setting to not allow access to gps or to only update once a day would be useful. otherwise i'll have to go back to the standard alarm until it's fixed.,0 64 | gs3 best weather and clock widget out there hands down,1 65 | good for me 112089 come to my friend in the game.,1 66 | doesn't work fix bugs and maybe i'll give it 4 stars. sorry!,-1 67 | some cant download because when you download a swf an error appears pls fix ill rate 5 star for this,-1 68 | nice but after i update i can't open the game? pls fix this game and i love it.i will give 5 star if this game fixed the bug. username: roenan12 server: s2 libra,-1 69 | freezes when browser loads freezes when browser loads to play game,-1 70 | awesome it's very useful to all people.,1 71 | can't install i can't install it on my samsung note iii please fix it. error. (941),-1 72 | note 2 it's very useful to use........love it,1 73 | good app it's customizable which i like. sometimes it's a bit difficult getting items to line up properly and the areas for each part of the display don't change size dynamically so the font sizes will get bigger or smaller to fit in the space alloted for them which i don't like. overall it's a good app though.,1 74 | "too slow. app is slower than mobile site and/or competing livejournal apps like""eljay"" (search the marketplace for it). what with the lack of features even in the main pc site it's proof that livejournal is no match for even the much maligned facebook...rip livejournal.",-1 75 | very good but can you add download all attachment as zip option ? 
we miss this very nice feature which is available in web version.,1 76 | "widget locks up my galaxy s3 this a very beautiful looking app and i like it a lot but it regularly locks up the homescreen widget the clock freezes and the weather status locks up with the status ""locating"". my phone is a samsung galaxy s3 running on 4.1.1 based in the uk. hope this info helps solve the issue as this app is definitely worth a full 5 stars if it were not for it locking up. please fix this and i'll even purchase the add free pro version ;-)",-1 77 | a must have! great app and a perfect companion for many other apps.,1 78 | like minecraft pe. has good graphics,1 79 | ? what is the best the best thing to scan?,0 80 | bug when i leave the rest room i am always sent back to main menu please fix it and ill give it a better rate,-1 81 | great app love the- my list feature. i can find the best deals and plan my black friday shopping trip,1 82 | its what i was looking for i love this app,1 83 | amazing! a must have app,1 84 | galaxy ace india is greatest,1 85 | awesome it's a good app,1 86 | stop button crash this app repatly fix that bug and u get 5 stars,-1 87 | time time freezes and will not update since latest version.,-1 88 | far from as cool as smartglass i went with a ps4 rather than the xbox one but i do miss smart glass. they need to make this app enable the voice features and things like the store should be built-in not a browser link.,-1 89 | finally one great game for android.,1 90 | excellent app no complaints at all but still waiting for urdu localization,1 91 | good app great launcher. i really like it but it is killing my battery.,0 92 | regret buying this app fcs so randomly that it's almost impossible to use it for longer period of time. lack of updates have made it even more impossible to use. haven't seen even a single much requested feature by the community being implemented in this app despite being ok play store for more or less 4 months now. disappointed.,-1 93 | offline browsing please allow us to open local html or swf files. there are few browsers out there that can not play local html but those lack the browsing/playing features of this app.,0 94 | amazing this is freaking great,1 95 | ? not for motorola defy,0 96 | thankyou waited for such an app for a long time. jazakallah!,1 97 | awesome but... its a really good player no doubt. but since last update some of my albums wont show and even songs wont show. not to mention it repeats some albums and its not my memory card cause i only have 1 album for each group of songs. please fix this. its the best player ive come across id hate to have to uninstall it :(,0 98 | cutiest emoticons cute emoticons to have fun..,1 99 | game freezes! i play on the pc and now tried to play over and over on my samsung s3 and it freezes up after 2 min. in the game!,-1 100 | great app! i love this app! it's great to have my book lists with me wherever i go and i can easily keep track of which books i've read to each of my kids. one thing missing is the browse feature like the apple version has; i miss that! add that and i will rate 5 stars!,1 101 | ? i love it but some times it freezes and i have to do somthing else,-1 102 | a superb app i came to android about a year ago from an iphone and i tried every reddit client out there such as baconreader and reddit news. these were all good apps but none of them compared to my experience with alienblue. then i tried reddit sync and it has become almost exclusively the only way i read redditch anymore. 
the holo interface and black night mode are easy on my eyes and it has all of the features i need.,1 103 | improving rapidly. this app used to be crappy while the service was ok but recently the app has started receiving updates on a regular basis adding a lot of basic functionality that should have been there from the beginning. now it features landscape support resume from last position notification bar controls and save to sd card. however i still sorely miss a decent widget easily sortable queues or the extensions found in the desktop app.,1 104 | -------------------------------------------------------------------------------- /data/SentiStrength/app-test.txt: -------------------------------------------------------------------------------- 1 | sent 2 | ? horrible on moment. my battery dies faster ! 3 | ? good features slow deliveries and hit and miss notifications. 4 | fun and picture the game has beautiful pictures. 5 | crap ~ refund requested the game freezes about 6mins into it everytime ~ crap shack game ~ refunds hard now. 6 | just love it! been using for 9 months now and it is perfect app! ? 7 | 5 stars this is exactly what i am looking for. thank you so much. 8 | great it's awesome 9 | i hope there's penguinz this is a really fun game. 10 | slow no point. the viewfinder display is really laggy and slow. doesn't seem to offer any creative black and white features so how does it improve over just desaturating color photos? 11 | "well.. it was awesome .. i do like the new look but now it also now gets ""unexpected errors ""and hitting retry just gets the same. won't work til you clear the data out in settings.. but doing that all the time is annoying. please fix this issue!" 12 | awasome it is so cute 13 | i like it because i enjoy it the picture is very small 14 | ? guns by calford is better 15 | excellent app! but add more! this is much better than samsung's equation detection but the only thing missing in this app is the lack of variety. just like in the note 10.1 please add more equations and have the ability to solve complex algebra. i'll be glad to pay for such a feature. makes work so much easier! 16 | it is fun i love it 17 | yampee exellent nice and cool app 18 | great memory game great for the mind keep you on your toes. helps with memory. 19 | thanks :) this app is very useful and i keep checking it out every morning. 20 | thanks :-) xperia sp it's very inspiring! 21 | crashed system stopped working. no matter what i did this app kept crashing the system. my phone wouldn't work. the screen kept going black and i couldn't run anything. happened all of a sudden. uninstalled and went over to go launcher ex. same features smoother not crashing even with more widgets running. sorry adw. 22 | nice it is very useful 23 | yala is an app for dowloading and listening to arabic music and radio the idea and concept is amazing but it has so many bugs with lots of songs at times and it messes up my music when it connects to some random wifi. please stop and fix this. 24 | very easy and fun not many programs deserve a 5 star but this is one of them. i do miss some features (or maybe i just don't know how to use the app) like continue reading from my last page ad it sometimes reloads from start with me articles but overall this is a very good app. thanks guys! 25 | great app.. nice app .. keep it up.. it is very useful app for us... ***** 26 | everything i wanted takes less space than default clock of moto x and packs so much info. the default settings are pretty impressive. 
did not have to customize much. won't mind paying for it. 27 | freezes is an awesome game until the past few days it freezes after each mission. 28 | awesome to the developer i just gave another slot machine app very poor ratings your game is 100 the best on android and i've played all the slot machines android has to offer and by far plenty of coins and i'm having fun as hell if it drains all the money now i would not be disappointed i have never paid for a app but i think i will pay for this one stupendous job 29 | very nice. great keyboard app. many great tools and a ton of customization. 30 | game this game is kool!!!! 31 | amazing! awesome app for mathematics. worth thousand starts. thanks devs. though the ui is hard to understand. thanks for the small size 32 | invaluable! i use this app daily and can't imagine not having it. 33 | won't load on s3 like everyone else game freezes when trying to load a mission. so they updated the app and now the new version isn't compatible with my phone. bah puh! 34 | i really like this game!!!!!! this is a good game!!!! 35 | woooow it is gooooooood game:-) :-) :-) 36 | freezes after about 2 minutes of gameplay 37 | anoying bug i use both the powertoggles notification bar widget and home screen widget. really good concept as a whole. but... the home screen widget always freezes a couple hours after i configured it. the widgetbis used to launch my favorite apps not for toggling anything if you want to resolve it i can try to give you more info. ofc rating goes up if this is resolved!! :-) ** bug doesn't occur anymore! :d 38 | match cast lines up don't show i mean aside from that the app is pretty great 39 | ? cute but eats up alot of battery. 40 | best twitter client! if you are looking for the best twitter client search no more. this is it! it's similar to tweetbot on ios. you get all the great features from pull to refresh inline images column views white/black holo theme background tap to jump to top and so much more. oh and there are no ads in this version. it is simply amazing. 41 | help love this game but now it wont let me sell my fish. it freezes and crashes. please fix! 42 | ? so far so good. ? however freezes up way too much. hope that error gets fixed soon or i will have to give it the boot. 43 | network error! suddenly after downloading an update pack i cannot login and said that loading failed. check your connection network. but my connection is working finely. please fix this. thanks 44 | excellent good app for people abroad 45 | fun widget good selection of hilts and colors. 46 | shame its a good game dont get me wrong but since theupdate with the hunter and new maps it freezes unless you stik to old maps.... cumon update fix please 47 | awesome this app is legendary! 48 | one thing missing i really like this app. simple user interface and works smoothly. espn should add a feature to notify you when your current selection is finished. that is the only thing missing. 49 | excellent you can change your car s it's easy to park it self 50 | pls fix bugs nice apps! 51 | worked good at first... when i first got it it worked well now it freezes up and only displays blank white screen. 52 | simply the best i very rarely write reviews. even more rare is when i like an app enough to buy it. this app deserves both. i have seriously tried most music players for the android and this one has everything i need. audio engine is tops start and stop via headphone jack great album art download. 
lots of other programs have these things but either lack in these or other features. i especially like the simple issue of bass and treble control. price a little high that was the only thing holding me back from buying it sooner. was thinking of giving it 4 stars because of price but; what the heck- i gave it five. 53 | good app its a very good app very nice and easy an simple ui. 54 | good app i found the app is very good for forecast. i like it. 55 | the game constantly freezes. very frustrated since update. the game freezes and goes to green screen. i have to restart tablet every time to get game to respond. fix? 56 | ? motorola devour. becoming more and more sure this game freezes my phones? very fun game though? it is a challenge and addictive? 57 | slot game. it was a great game but it has stopped paying out almost completely. 58 | freezes every time only works when you restart your phone 59 | ads great app but the ads come up all the time and it ruins it 60 | very fun!! worth the memory. :-d 61 | horrible installed perfectly and though was great app. uploaded some pics from gallery with no problem. when wanted to take a pic as demo here shows above it was just a black scree. pressed camera button nothing. can upload from gallery but cant use the main feature of this app which is what is advertised. why would i use this to upload from my phone? my phone hasd bluetooth and a usb to hook up to my pc. i can send those pics via text email and facebook. the main purpose of this app does not work! 62 | great apps works well and looks great. lots of available providers. from the 5 or so apps i've tested deliveries seems to support the most. i submitted a bug report when statuses from one of the providers wasn't working as expected. the developer promptly got back to me and fixed the issue. only feature i would like is auto-detection of provider based on the tracking number. but as a fellow programmer i understand the challenges presented by such a feature so this doesn't affect my rating. 63 | need to add a feature you guys should add a custom car builder feature where you can just build your own car but every part you buy will just keep adding up your total overall great app its fun and it kills time 64 | great alarm but has new bug making it use lots of battery i like this alarm i definitely like being able to wake up to a random playlist. but since an update its now using massive amounts of battery using the gps. i'm guessing it's something to do with sunrise feature. i don't use the sunrise alarm and i certainly don't move between countries enough for it to constantly be updating. so either a setting to not allow access to gps or to only update once a day would be useful. otherwise i'll have to go back to the standard alarm until it's fixed. 65 | gs3 best weather and clock widget out there hands down 66 | good for me 112089 come to my friend in the game. 67 | doesn't work fix bugs and maybe i'll give it 4 stars. sorry! 68 | some cant download because when you download a swf an error appears pls fix ill rate 5 star for this 69 | nice but after i update i can't open the game? pls fix this game and i love it.i will give 5 star if this game fixed the bug. username: roenan12 server: s2 libra 70 | freezes when browser loads freezes when browser loads to play game 71 | awesome it's very useful to all people. 72 | can't install i can't install it on my samsung note iii please fix it. error. (941) 73 | note 2 it's very useful to use........love it 74 | good app it's customizable which i like. 
sometimes it's a bit difficult getting items to line up properly and the areas for each part of the display don't change size dynamically so the font sizes will get bigger or smaller to fit in the space alloted for them which i don't like. overall it's a good app though. 75 | "too slow. app is slower than mobile site and/or competing livejournal apps like""eljay"" (search the marketplace for it). what with the lack of features even in the main pc site it's proof that livejournal is no match for even the much maligned facebook...rip livejournal." 76 | very good but can you add download all attachment as zip option ? we miss this very nice feature which is available in web version. 77 | "widget locks up my galaxy s3 this a very beautiful looking app and i like it a lot but it regularly locks up the homescreen widget the clock freezes and the weather status locks up with the status ""locating"". my phone is a samsung galaxy s3 running on 4.1.1 based in the uk. hope this info helps solve the issue as this app is definitely worth a full 5 stars if it were not for it locking up. please fix this and i'll even purchase the add free pro version ;-)" 78 | a must have! great app and a perfect companion for many other apps. 79 | like minecraft pe. has good graphics 80 | ? what is the best the best thing to scan? 81 | bug when i leave the rest room i am always sent back to main menu please fix it and ill give it a better rate 82 | great app love the- my list feature. i can find the best deals and plan my black friday shopping trip 83 | its what i was looking for i love this app 84 | amazing! a must have app 85 | galaxy ace india is greatest 86 | awesome it's a good app 87 | stop button crash this app repatly fix that bug and u get 5 stars 88 | time time freezes and will not update since latest version. 89 | far from as cool as smartglass i went with a ps4 rather than the xbox one but i do miss smart glass. they need to make this app enable the voice features and things like the store should be built-in not a browser link. 90 | finally one great game for android. 91 | excellent app no complaints at all but still waiting for urdu localization 92 | good app great launcher. i really like it but it is killing my battery. 93 | regret buying this app fcs so randomly that it's almost impossible to use it for longer period of time. lack of updates have made it even more impossible to use. haven't seen even a single much requested feature by the community being implemented in this app despite being ok play store for more or less 4 months now. disappointed. 94 | offline browsing please allow us to open local html or swf files. there are few browsers out there that can not play local html but those lack the browsing/playing features of this app. 95 | amazing this is freaking great 96 | ? not for motorola defy 97 | thankyou waited for such an app for a long time. jazakallah! 98 | awesome but... its a really good player no doubt. but since last update some of my albums wont show and even songs wont show. not to mention it repeats some albums and its not my memory card cause i only have 1 album for each group of songs. please fix this. its the best player ive come across id hate to have to uninstall it :( 99 | cutiest emoticons cute emoticons to have fun.. 100 | game freezes! i play on the pc and now tried to play over and over on my samsung s3 and it freezes up after 2 min. in the game! 101 | great app! i love this app! 
it's great to have my book lists with me wherever i go and i can easily keep track of which books i've read to each of my kids. one thing missing is the browse feature like the apple version has; i miss that! add that and i will rate 5 stars! 102 | ? i love it but some times it freezes and i have to do somthing else 103 | a superb app i came to android about a year ago from an iphone and i tried every reddit client out there such as baconreader and reddit news. these were all good apps but none of them compared to my experience with alienblue. then i tried reddit sync and it has become almost exclusively the only way i read redditch anymore. the holo interface and black night mode are easy on my eyes and it has all of the features i need. 104 | improving rapidly. this app used to be crappy while the service was ok but recently the app has started receiving updates on a regular basis adding a lot of basic functionality that should have been there from the beginning. now it features landscape support resume from last position notification bar controls and save to sd card. however i still sorely miss a decent widget easily sortable queues or the extensions found in the desktop app. 105 | -------------------------------------------------------------------------------- /data/SentiStrength/jira-test-se.csv: -------------------------------------------------------------------------------- 1 | "It may cause conflicts, but if your intent is to break system security, you probably don't care.",-1 2 | "Thanks, Amar!",1 3 | "This was a very bad bug, introduced by me being an idiot.",-1 4 | Didn't got the time to try it yet.,-1 5 | Pull it back in if you think different.,-1 6 | Why the hell is this not a bug?,-1 7 | the recommendation in the wiki is bad.,-1 8 | I'm confused.,-1 9 | "I was in too much of a hurry, sorry hold on a sec.",-1 10 | Sorry for the noise.,-1 11 | It really sucks that there is now way currently to include transitive dependencies.,-1 12 | "Thanks,Mayank",1 13 | Thanks Uwe!,1 14 | Thanks!,1 15 | My bad.,-1 16 | "I don't care if everything is pretty or not, but we should at least support basic admin functionality in IE IMO (though I have not used it for years for just about anything).",-1 17 | Is that ok for this file (b/c I have no idea how to do the svn move now ... after I've made all the changes already) :),1 18 | "My bad, I screwed up the assertion -> RuntimeException transition.",-1 19 | Regex is your friend.,1 20 | Pull it back in if you think different.,-1 21 | "In fact, if you happen to know why ObserverHammerQuorumTest is failing with this latest patch, I'd love to hear.",1 22 | My bad.,-1 23 | "sorry old xml here is the used one:?¤?¶?¼ und ??????",-1 24 | You're results are awesome Paul. 
Great work :)Looking forward to see your new JSON parser in trunk whenever you think is ready.,1 25 | This is Awesome Stefan - thanks a million!,1 26 | My bad.,-1 27 | Thanks for fixing it so quickly!,1 28 | "Hi Sagara,Thank you very much for looking into this.",1 29 | "Sorry typo: ""our part"" in place of ""one part"" above.",-1 30 | "Sorry, I meant to have this patch in sooner but got quite busy, I expect to have it soon.",-1 31 | Yes exactly -- sorry to be so unclear.,-1 32 | Weird.,-1 33 | I think you made mistakes in ivy.xml.,-1 34 | Thanks for your patience.,1 35 | Thanks Sijie.,1 36 | "Thanks, Dhruba!",1 37 | "Shit, I missed a cast.",-1 38 | But please don't say that my reasoning is bad - because it is not.,-1 39 | This might be a bug indeed.,-1 40 | This is clearly bad webserver behaviour.,-1 41 | @Ashutosh: thanks a lot for the comments.,1 42 | 897392Thanks Sharan !,1 43 | "Looks good, thanks Laura!",1 44 | That's my bad.,-1 45 | Thanks Andrew - the patch was applied to SQL module at r525019.,1 46 | "Thank you,+ Harit Himanshu",1 47 | This sucks *so much*...,-1 48 | "I've tried something similar (I removed the handlers and kept the readers), but the performance was not visible.",-1 49 | Damn...,-1 50 | Applied patch with thanks to Scott.,1 51 | I think we all know it just sucks.,-1 52 | "Damn, Chuck is scary.",-1 53 | You would need to implement session resume; thats a whole new can of worms.,-1 54 | I don't have strong opinions about it either way.,-1 55 | This is weird.,-1 56 | Thank you for the patch Sergey. ,1 57 | "Yes it is a dup, thanks Mike for taking care of this (I planned to do this yesterday but didn't make it)",-1 58 | "Sorry Avdhesh, I forgot to add these two files to the patch, here is the new patch containing the missing files",-1 59 | "Thanks a lot, Kiran for the patch.",1 60 | This is bad.,-1 61 | "Sorry, I guess I'm against ""never computing this shit""... because you guys think returning NaN is ok.",-1 62 | sorry for delay,-1 63 | "And boy, hell broke loose ;)So... 
the biggest issue I'm facing is indeed with Random sharing across threads.",-1 64 | "Thanks,Arvind",1 65 | Please close as this is just me being stupid.,-1 66 | "Thanks a lot for the reviews, Todd.",1 67 | Thanks Karthik.,1 68 | "Sorry for my poor review, didn't notice try/catch :(",-1 69 | "Seems we didn't enforce an exec for sh, but we did for fs.",-1 70 | "durrrh, that sucks.",-1 71 | "This is pretty trivial, just adds three asserts to TestPath#testNormalize.",-1 72 | Automatic location selection is very cool.,1 73 | All I'm trying to say is that it's pretty easy to end up in propagation failure hell here or change something else that blows things up for use cases that are not foreseen.,-1 74 | I don't know what just happened.,-1 75 | "Thanks for the report, and sorry its taken so long to fix it.",-1 76 | I think the correct resolution is to ensure that the prefix stack mechanism gets reset each time the XMLReader is used.,-1 77 | yup sorry just fixed.,-1 78 | A stupid bug in a patch that is already applied.,-1 79 | i found the class - so there is no bug - sorry,-1 80 | > then we don't save IO by limiting the buffer size to 1 KBI'm confused by this.,-1 81 | Finally closing this bug from hell.,-1 82 | "Thanks Areek, patch looks good!",1 83 | Incidentally if we all nag Joe Walnes enough we might be able to persuade him to release a new qdox which can ignore annotations etc (though it will still struggle with generics I think),-1 84 | Bug in existing testDelegationTokenRestoredOnRMrestart().,-1 85 | "Igor is an idiot, and we *do* need gmake.",-1 86 | Thank you.,1 87 | Sorry about that.,-1 88 | My bad.,-1 89 | Sorry for the trouble Vikram!,-1 90 | "Why the hell do they deliver Duration, if they cannot instantiate it :-/",-1 91 | (sorry if that was confusing),-1 92 | "Sorry Chris,I missed it, done !",-1 93 | This is a great suggestion.,1 94 | "Thanks for reminding me; I agree, I'll do it.",1 95 | {quote}searcher.getAtomicReader().getSortedDocValues(uniqueKey);{quote}This is a performance killer.,-1 96 | Thanks for the reviews..,1 97 | I'm an idiot.,-1 98 | This looks safe also,1 99 | Completely missed issue 614..,-1 100 | "Owen, thanks for the slides.",1 101 | "Thanks,Mayank",1 102 | This looks good to me.,1 103 | Sounds weird to me...Could you package a (totally!),-1 104 | Seems to be failing for a different reason nowtestContainerLaunch(org.apache.hadoop.yarn.server.nodemanager.TestLinuxContainerExecutorWithMocks) Time elapsed: 0.523 sec <<< FAILURE!,-1 105 | "Sorry, I kind of forget about this one.",-1 106 | Sorry for the confusion :),-1 107 | An output connector should also have a say in what URLs it will accept.,-1 108 | "Well, that sucks.",-1 109 | Here is quite bad.,-1 110 | Sounds like a good idea.,1 111 | "Sorry, I was trying to get to the wiki but it's been a busy week.",-1 112 | "Damn, it seemed it didn't work.",-1 113 | It is good to have the test.,1 114 | Interesting that you decided not to detect the error based on finding two similarly-named operations.,1 115 | Sorry that I think I missed some discussion in the mailing list.,-1 116 | Thanks to Mathias Werlitz - sorry for the delay.,-1 117 | "Thanks to both of you, and to Deepesh for the initiative.",1 118 | My error.,-1 119 | Awesome stuff Stephen! ,1 120 | I want to integrate the sweet sweet logo Andrew crafted.,1 121 | "Thanks, Owen!",1 122 | "I am not really sure what does the receive payment do before the shipment, it doesn't sound as if it is doing what we expect it to do.",-1 123 | Sounds like a good idea. 
,1 124 | "Thanks Oliver, that's fixed it. ",1 125 | Thanks Tom!,1 126 | "umm ... call me crazy, but why are we making this public?",-1 127 | "Actually I don't want to specify the encoding, because I don't care how the data is transported to me.",-1 128 | "Hi Guillaume,I did not have an answer right away, so I sent you question to Leonard Rosenthol.",-1 129 | Indeed that would be VERY bad design.,-1 130 | Maurice I don't have such option or maybe I don't know where it is.,-1 131 | Doing it at the hackathon you'd have a few fellas at your shoulder to give you pointers should you get stuck.,-1 132 | Thanks for sweet patch Erik.,1 133 | sorry.,-1 134 | Bad IE.,-1 135 | "I don't have to ensure that the classloader knows groovy classes, *you* must do that.",-1 136 | Cool - good information to have. Thanks Lance!,1 137 | "- there were a hell of a bigger problem, though : as we were blocked in the executor until the SearchRequest was totally processed, all the responses were enqueued.",-1 138 | " , my bad",-1 139 | Sorry for the noise.,-1 140 | "Thanks for spotting this, fixed at r487519",1 141 | I really like what I am seeing so far.,1 142 | The cause is that the call to SpecificResponder.writeError in Responder.respond ultimately calls GenericData.resolveUnion which in turn calls GenericData.getSchemaName before the line where the UnresolvedUnionException gets thrown.,-1 143 | "I suspect it has nothing to do with the file system connector or Infinispan connectors, and is simply a (stupid) mistake in the federated join processor.",-1 144 | PreCommit-HDFS-Build is stuck.,-1 145 | "The eclipse ui was completly stuck, which was similiar to the other experinces.",-1 146 | "Ie, both at are bad.",-1 147 | "Brilliant feedback, thanks! I'm glad you found the issue, and the solution!",1 148 | :)Sorry about that.,-1 149 | > - FOUserAgent.getStream() is cool and very easy to use (now that it's properly> documented).,1 150 | Resolving again....,-1 151 | "I didn't do that because that seems bad in hive, so I returned ""null"" from the operation.",-1 152 | "HADOOP-2949: - If tarball is specified, HOD no longer validates for the pkgs directory in gridservice-hdfs or mapred sections as these are not going to be used anyway.",-1 153 | The samples you gave are different.,-1 154 | Sorry for the delay.,-1 155 | Thanks Brock!,1 156 | My bad.,-1 157 | I like Richards update as well :) What I did was out of pure anger so it may not have been the sexiest.,1 158 | Thanks for the patch Mubarak.,1 159 | "Well it's me that didn't get the whole point, now i got more, sorry for the noise.",-1 160 | "Great! 
Awesome!thanks,Dims",1 161 | "Aaarrggh, how stupid of mine to have a System.out, again!",-1 162 | Imo all this just doesn't make sense from a pure performance aspect.,-1 163 | "I have stupidly deleted the original test dir, but judging from the suite's output files, no output was created after 3 1/2 hours.",-1 164 | "Qianshi is working on the SSL session reuse, but this buggy Bug system does not allow him assign this ticket, sigh",-1 165 | Sorry for the noise..,-1 166 | Thanks Alejandro .,1 167 | "Many thanks, Neeme.",1 168 | that's some freaky shit ...,-1 169 | "It sucks to lose the code readability, but it seems like a reasonable price to pay.",-1 170 | Sorry - the above comes across as terse.,-1 171 | Awesome work: this is a great first cut!,1 172 | "Hell, this is gonna take me a lot of work to raise.",-1 173 | "Thanks Atul,Your patch is in trunk at r933169I just added some ""mod for OFBiz layered lookups"" comments around changes as suggested Sascha",1 174 | "Some file were missing in the last patch, sorry.",-1 175 | Thanks Ashutosh and Gunther for your help!,1 176 | Doh!,-1 177 | I screwed up the encoding of the stopwords file (sorry).,-1 178 | "Ah, damn, I thought it was fixed :/Guillaume ?",-1 179 | "Sorry, yes I believe this has been resolved.",-1 180 | Pull it back in if you think different.,-1 181 | What a stupid name I chose for that object... )-:,-1 182 | sorry for misleading attachment name.,-1 183 | "The guy on our team that was going to do this was swamped, so I re-assigned this to you.",-1 184 | This looks simple and sweet to me.,1 185 | Sorry ;-),-1 186 | "As far as the query shit, i have no idea if solrdispatchfilter or whatever could/should do Thread.currentThread().setName(x) or whatever (and maybe restore after)",-1 187 | My bad.,-1 188 | "Secondly reading this code I can see why this bug is happening, after completing stage 2 above when adding the new item the code does ""ordered - cancelled = quantity"" which equates to ""1 - 1 = 0"".",-1 189 | Thanks Awdesh - Done at r821748.,1 190 | Sorry - the federated build is still working out kinks...,-1 191 | "Super, I'll commit shortly -- thanks Yonik!",1 192 | sorry for your time.,-1 193 | Damn !,-1 194 | This sucks badly.,-1 195 | Sorry but this is just stupid.,-1 196 | "Awesome, you rock, Drew!",1 197 | "If that would be the case, this would be bad design.",-1 198 | I've tested them out and everything is fine.,1 199 | This is only a problem with exceptions from java.,-1 200 | "Aaron, sorry about this.",-1 201 | The rest are *totally* unrelated.,-1 202 | "[~cmccabe]: oh, hell no!",-1 203 | "Turning off hints is basically intended as a ""oh shit, something is broken with hints, let's turn it off"" switch.",-1 204 | Damn it !,-1 205 | I think it's time to just close this issue.,-1 206 | Thanks Ashish!,1 207 | Damn...,-1 208 | "Aha,, thanks for the information. That makes sense. Glad to hear that it helped! :)",1 209 | "Ah, that was my bad.",-1 210 | ah - my bad.,-1 211 | "Thanks, Sanjay!",1 212 | fuck u,-1 213 | Cheers!,1 214 | Thanks for doing that Uma.,1 215 | Thanks Dianne!,1 216 | "My bad, this is already done.",-1 217 | huh ... i thought i did resolve this.,-1 218 | "Hell, UnaryFunction might even be faster than all of these calls in a row.",-1 219 | I'm an idiot.,-1 220 | Thanks senaka for the patch.,1 221 | "Oh, I didn't consider one flow like after the edit log conversion, immediately #store failed.",-1 222 | Forget the patch for the moment.,-1 223 | I like the elegant parser! 
,1 224 | I would love to have it right now for storm too. If you want me to sign up as a use case I am happy to.,1 225 | "Brandon, sorry.",-1 226 | "Ugh, sorry :( Thanks!",-1 227 | bq. OK w/ the latest patch all tests pass for me! Great Awesome! :),1 228 | "It is a nasty bug that I've seen in real life, though.",-1 229 | I am new to Mina and the whole environment.,-1 230 | Awesome - great stuff Maria. Thanks!,1 231 | Ok. Stupid user Error here.,-1 232 | Sorry about that...,-1 233 | "To upgrade to a recent version of Jackrabbit, see http://wiki.apache.org/jackrabbit/BackupAndMigrationI'm afraid I can't say much about the risk.",-1 234 | Sorry GavDONECommitted @revision 2487.,-1 235 | "Mac, looks like the tests are failing (especially TestHarFileSystem).",-1 236 | I'm glad we were able to resolve this issue.,1 237 | My patch wouldn't compile.,-1 238 | Will still be stuck in the loop though if can't actually close regions.,-1 239 | Thanks henry and pat... we'll have to re submit all the PA's so trigger hudson.,1 240 | "Someone, but I'm not sure who, owes me a public apology here. /Larry ",-1 241 | "Thanks, Daryn!",1 242 | cool man it looks good. we need a changes entry but from my side this looks good. we can tackle the todos on trunk,1 243 | Oh man ... it's a fucking precendence problem.,-1 244 | "Yep, my bad.",-1 245 | Patch applied with thanks!,1 246 | "Hi Carlos, This looks awesome! Lots of cool stuff.",1 247 | Excuse me for stolen assignement.,-1 248 | "I opened [HADOOP-3607] to fix a wrong URL, but appart from that I don't there's still references to the old structure.",-1 249 | "Cool, looks good.",1 250 | I meet the same problem on Eclipse recently but haven't figured out how to get through.,-1 251 | "Thanks, Ashish!",1 252 | Thanks!,1 253 | {quote}You are messing down deep below hbase in dfs.,-1 254 | "Thanks Andrew, committed in rev.",1 255 | "@mahadev - I would love to help test a patch :) I'm currently using 3.3.1 + ZOOKEEPER-744 + ZOOKEEPER-790, applied in that order.",1 256 | "Thanks a lot for sharing that, Josh!",1 257 | Weird.,-1 258 | "Thanks, Mike!",1 259 | Sorry.,-1 260 | "Hi Sandy,Thanks so much to give me such comments, that's really helpful, I will update this later.",1 261 | Tried some more stuff and realized I was doing it wrong.,-1 262 | "Thanks for the very clear explanation of the needed change, Dag.",1 263 | This patch really helped.,1 264 | "Daryn, the current patch looks good.",1 265 | The biggest problem is that we've had too many committers over the years and we'd have to get all of their permission to change it.,-1 266 | thanks for the explanation.,1 267 | Weird.,-1 268 | "The patch should be relatively trivial, but like I said, I have no idea if there is other important stuff going on there or not.",-1 269 | Splitting an existing sub-shard gets stuck up.,-1 270 | "And debugging is hell, because the test environment needs to have the exact same loader setup.",-1 271 | "Holy complicated-as-shit-algorithm, Batman!The complexity of our implementation vs the complexity of what we're actually doing is starting to worry me here.",-1 272 | Sorry about that.,-1 273 | Patch looks good to me.,1 274 | Thanks for the patch Erik (and Jon),1 275 | Version 2.2.0RC3 is fine.,1 276 | "Because, we don't care for this in cases where we there is no node that is down.",-1 277 | > I hated that aspect of working for commercial companies.,-1 278 | Please contact me if you need any clarifications.,1 279 | -------------------------------------------------------------------------------- 
/data/SentiStrength/jira-test.txt: -------------------------------------------------------------------------------- 1 | sent 2 | "It may cause conflicts, but if your intent is to break system security, you probably don't care." 3 | "Thanks, Amar!" 4 | "This was a very bad bug, introduced by me being an idiot." 5 | Didn't got the time to try it yet. 6 | Pull it back in if you think different. 7 | Why the hell is this not a bug? 8 | the recommendation in the wiki is bad. 9 | I'm confused. 10 | "I was in too much of a hurry, sorry hold on a sec." 11 | Sorry for the noise. 12 | It really sucks that there is now way currently to include transitive dependencies. 13 | "Thanks,Mayank" 14 | Thanks Uwe! 15 | Thanks! 16 | My bad. 17 | "I don't care if everything is pretty or not, but we should at least support basic admin functionality in IE IMO (though I have not used it for years for just about anything)." 18 | Is that ok for this file (b/c I have no idea how to do the svn move now ... after I've made all the changes already) :) 19 | "My bad, I screwed up the assertion -> RuntimeException transition." 20 | Regex is your friend. 21 | Pull it back in if you think different. 22 | "In fact, if you happen to know why ObserverHammerQuorumTest is failing with this latest patch, I'd love to hear." 23 | My bad. 24 | "sorry old xml here is the used one:?¤?¶?¼ und ??????" 25 | You're results are awesome Paul. Great work :)Looking forward to see your new JSON parser in trunk whenever you think is ready. 26 | This is Awesome Stefan - thanks a million! 27 | My bad. 28 | Thanks for fixing it so quickly! 29 | "Hi Sagara,Thank you very much for looking into this." 30 | "Sorry typo: ""our part"" in place of ""one part"" above." 31 | "Sorry, I meant to have this patch in sooner but got quite busy, I expect to have it soon." 32 | Yes exactly -- sorry to be so unclear. 33 | Weird. 34 | I think you made mistakes in ivy.xml. 35 | Thanks for your patience. 36 | Thanks Sijie. 37 | "Thanks, Dhruba!" 38 | "Shit, I missed a cast." 39 | But please don't say that my reasoning is bad - because it is not. 40 | This might be a bug indeed. 41 | This is clearly bad webserver behaviour. 42 | @Ashutosh: thanks a lot for the comments. 43 | 897392Thanks Sharan ! 44 | "Looks good, thanks Laura!" 45 | That's my bad. 46 | Thanks Andrew - the patch was applied to SQL module at r525019. 47 | "Thank you,+ Harit Himanshu" 48 | This sucks *so much*... 49 | "I've tried something similar (I removed the handlers and kept the readers), but the performance was not visible." 50 | Damn... 51 | Applied patch with thanks to Scott. 52 | I think we all know it just sucks. 53 | "Damn, Chuck is scary." 54 | You would need to implement session resume; thats a whole new can of worms. 55 | I don't have strong opinions about it either way. 56 | This is weird. 57 | Thank you for the patch Sergey. 58 | "Yes it is a dup, thanks Mike for taking care of this (I planned to do this yesterday but didn't make it)" 59 | "Sorry Avdhesh, I forgot to add these two files to the patch, here is the new patch containing the missing files" 60 | "Thanks a lot, Kiran for the patch." 61 | This is bad. 62 | "Sorry, I guess I'm against ""never computing this shit""... because you guys think returning NaN is ok." 63 | sorry for delay 64 | "And boy, hell broke loose ;)So... the biggest issue I'm facing is indeed with Random sharing across threads." 65 | "Thanks,Arvind" 66 | Please close as this is just me being stupid. 67 | "Thanks a lot for the reviews, Todd." 68 | Thanks Karthik. 
69 | "Sorry for my poor review, didn't notice try/catch :(" 70 | "Seems we didn't enforce an exec for sh, but we did for fs." 71 | "durrrh, that sucks." 72 | "This is pretty trivial, just adds three asserts to TestPath#testNormalize." 73 | Automatic location selection is very cool. 74 | All I'm trying to say is that it's pretty easy to end up in propagation failure hell here or change something else that blows things up for use cases that are not foreseen. 75 | I don't know what just happened. 76 | "Thanks for the report, and sorry its taken so long to fix it." 77 | I think the correct resolution is to ensure that the prefix stack mechanism gets reset each time the XMLReader is used. 78 | yup sorry just fixed. 79 | A stupid bug in a patch that is already applied. 80 | i found the class - so there is no bug - sorry 81 | > then we don't save IO by limiting the buffer size to 1 KBI'm confused by this. 82 | Finally closing this bug from hell. 83 | "Thanks Areek, patch looks good!" 84 | Incidentally if we all nag Joe Walnes enough we might be able to persuade him to release a new qdox which can ignore annotations etc (though it will still struggle with generics I think) 85 | Bug in existing testDelegationTokenRestoredOnRMrestart(). 86 | "Igor is an idiot, and we *do* need gmake." 87 | Thank you. 88 | Sorry about that. 89 | My bad. 90 | Sorry for the trouble Vikram! 91 | "Why the hell do they deliver Duration, if they cannot instantiate it :-/" 92 | (sorry if that was confusing) 93 | "Sorry Chris,I missed it, done !" 94 | This is a great suggestion. 95 | "Thanks for reminding me; I agree, I'll do it." 96 | {quote}searcher.getAtomicReader().getSortedDocValues(uniqueKey);{quote}This is a performance killer. 97 | Thanks for the reviews.. 98 | I'm an idiot. 99 | This looks safe also 100 | Completely missed issue 614.. 101 | "Owen, thanks for the slides." 102 | "Thanks,Mayank" 103 | This looks good to me. 104 | Sounds weird to me...Could you package a (totally!) 105 | Seems to be failing for a different reason nowtestContainerLaunch(org.apache.hadoop.yarn.server.nodemanager.TestLinuxContainerExecutorWithMocks) Time elapsed: 0.523 sec <<< FAILURE! 106 | "Sorry, I kind of forget about this one." 107 | Sorry for the confusion :) 108 | An output connector should also have a say in what URLs it will accept. 109 | "Well, that sucks." 110 | Here is quite bad. 111 | Sounds like a good idea. 112 | "Sorry, I was trying to get to the wiki but it's been a busy week." 113 | "Damn, it seemed it didn't work." 114 | It is good to have the test. 115 | Interesting that you decided not to detect the error based on finding two similarly-named operations. 116 | Sorry that I think I missed some discussion in the mailing list. 117 | Thanks to Mathias Werlitz - sorry for the delay. 118 | "Thanks to both of you, and to Deepesh for the initiative." 119 | My error. 120 | Awesome stuff Stephen! 121 | I want to integrate the sweet sweet logo Andrew crafted. 122 | "Thanks, Owen!" 123 | "I am not really sure what does the receive payment do before the shipment, it doesn't sound as if it is doing what we expect it to do." 124 | Sounds like a good idea. 125 | "Thanks Oliver, that's fixed it. " 126 | Thanks Tom! 127 | "umm ... call me crazy, but why are we making this public?" 128 | "Actually I don't want to specify the encoding, because I don't care how the data is transported to me." 129 | "Hi Guillaume,I did not have an answer right away, so I sent you question to Leonard Rosenthol." 130 | Indeed that would be VERY bad design. 
131 | Maurice I don't have such option or maybe I don't know where it is. 132 | Doing it at the hackathon you'd have a few fellas at your shoulder to give you pointers should you get stuck. 133 | Thanks for sweet patch Erik. 134 | sorry. 135 | Bad IE. 136 | "I don't have to ensure that the classloader knows groovy classes, *you* must do that." 137 | Cool - good information to have. Thanks Lance! 138 | "- there were a hell of a bigger problem, though : as we were blocked in the executor until the SearchRequest was totally processed, all the responses were enqueued." 139 | " , my bad" 140 | Sorry for the noise. 141 | "Thanks for spotting this, fixed at r487519" 142 | I really like what I am seeing so far. 143 | The cause is that the call to SpecificResponder.writeError in Responder.respond ultimately calls GenericData.resolveUnion which in turn calls GenericData.getSchemaName before the line where the UnresolvedUnionException gets thrown. 144 | "I suspect it has nothing to do with the file system connector or Infinispan connectors, and is simply a (stupid) mistake in the federated join processor." 145 | PreCommit-HDFS-Build is stuck. 146 | "The eclipse ui was completly stuck, which was similiar to the other experinces." 147 | "Ie, both at are bad." 148 | "Brilliant feedback, thanks! I'm glad you found the issue, and the solution!" 149 | :)Sorry about that. 150 | > - FOUserAgent.getStream() is cool and very easy to use (now that it's properly> documented). 151 | Resolving again.... 152 | "I didn't do that because that seems bad in hive, so I returned ""null"" from the operation." 153 | "HADOOP-2949: - If tarball is specified, HOD no longer validates for the pkgs directory in gridservice-hdfs or mapred sections as these are not going to be used anyway." 154 | The samples you gave are different. 155 | Sorry for the delay. 156 | Thanks Brock! 157 | My bad. 158 | I like Richards update as well :) What I did was out of pure anger so it may not have been the sexiest. 159 | Thanks for the patch Mubarak. 160 | "Well it's me that didn't get the whole point, now i got more, sorry for the noise." 161 | "Great! Awesome!thanks,Dims" 162 | "Aaarrggh, how stupid of mine to have a System.out, again!" 163 | Imo all this just doesn't make sense from a pure performance aspect. 164 | "I have stupidly deleted the original test dir, but judging from the suite's output files, no output was created after 3 1/2 hours." 165 | "Qianshi is working on the SSL session reuse, but this buggy Bug system does not allow him assign this ticket, sigh" 166 | Sorry for the noise.. 167 | Thanks Alejandro . 168 | "Many thanks, Neeme." 169 | that's some freaky shit ... 170 | "It sucks to lose the code readability, but it seems like a reasonable price to pay." 171 | Sorry - the above comes across as terse. 172 | Awesome work: this is a great first cut! 173 | "Hell, this is gonna take me a lot of work to raise." 174 | "Thanks Atul,Your patch is in trunk at r933169I just added some ""mod for OFBiz layered lookups"" comments around changes as suggested Sascha" 175 | "Some file were missing in the last patch, sorry." 176 | Thanks Ashutosh and Gunther for your help! 177 | Doh! 178 | I screwed up the encoding of the stopwords file (sorry). 179 | "Ah, damn, I thought it was fixed :/Guillaume ?" 180 | "Sorry, yes I believe this has been resolved." 181 | Pull it back in if you think different. 182 | What a stupid name I chose for that object... )-: 183 | sorry for misleading attachment name. 
184 | "The guy on our team that was going to do this was swamped, so I re-assigned this to you." 185 | This looks simple and sweet to me. 186 | Sorry ;-) 187 | "As far as the query shit, i have no idea if solrdispatchfilter or whatever could/should do Thread.currentThread().setName(x) or whatever (and maybe restore after)" 188 | My bad. 189 | "Secondly reading this code I can see why this bug is happening, after completing stage 2 above when adding the new item the code does ""ordered - cancelled = quantity"" which equates to ""1 - 1 = 0""." 190 | Thanks Awdesh - Done at r821748. 191 | Sorry - the federated build is still working out kinks... 192 | "Super, I'll commit shortly -- thanks Yonik!" 193 | sorry for your time. 194 | Damn ! 195 | This sucks badly. 196 | Sorry but this is just stupid. 197 | "Awesome, you rock, Drew!" 198 | "If that would be the case, this would be bad design." 199 | I've tested them out and everything is fine. 200 | This is only a problem with exceptions from java. 201 | "Aaron, sorry about this." 202 | The rest are *totally* unrelated. 203 | "[~cmccabe]: oh, hell no!" 204 | "Turning off hints is basically intended as a ""oh shit, something is broken with hints, let's turn it off"" switch." 205 | Damn it ! 206 | I think it's time to just close this issue. 207 | Thanks Ashish! 208 | Damn... 209 | "Aha,, thanks for the information. That makes sense. Glad to hear that it helped! :)" 210 | "Ah, that was my bad." 211 | ah - my bad. 212 | "Thanks, Sanjay!" 213 | fuck u 214 | Cheers! 215 | Thanks for doing that Uma. 216 | Thanks Dianne! 217 | "My bad, this is already done." 218 | huh ... i thought i did resolve this. 219 | "Hell, UnaryFunction might even be faster than all of these calls in a row." 220 | I'm an idiot. 221 | Thanks senaka for the patch. 222 | "Oh, I didn't consider one flow like after the edit log conversion, immediately #store failed." 223 | Forget the patch for the moment. 224 | I like the elegant parser! 225 | I would love to have it right now for storm too. If you want me to sign up as a use case I am happy to. 226 | "Brandon, sorry." 227 | "Ugh, sorry :( Thanks!" 228 | bq. OK w/ the latest patch all tests pass for me! Great Awesome! :) 229 | "It is a nasty bug that I've seen in real life, though." 230 | I am new to Mina and the whole environment. 231 | Awesome - great stuff Maria. Thanks! 232 | Ok. Stupid user Error here. 233 | Sorry about that... 234 | "To upgrade to a recent version of Jackrabbit, see http://wiki.apache.org/jackrabbit/BackupAndMigrationI'm afraid I can't say much about the risk." 235 | Sorry GavDONECommitted @revision 2487. 236 | "Mac, looks like the tests are failing (especially TestHarFileSystem)." 237 | I'm glad we were able to resolve this issue. 238 | My patch wouldn't compile. 239 | Will still be stuck in the loop though if can't actually close regions. 240 | Thanks henry and pat... we'll have to re submit all the PA's so trigger hudson. 241 | "Someone, but I'm not sure who, owes me a public apology here. /Larry " 242 | "Thanks, Daryn!" 243 | cool man it looks good. we need a changes entry but from my side this looks good. we can tackle the todos on trunk 244 | Oh man ... it's a fucking precendence problem. 245 | "Yep, my bad." 246 | Patch applied with thanks! 247 | "Hi Carlos, This looks awesome! Lots of cool stuff." 248 | Excuse me for stolen assignement. 249 | "I opened [HADOOP-3607] to fix a wrong URL, but appart from that I don't there's still references to the old structure." 250 | "Cool, looks good." 
251 | I meet the same problem on Eclipse recently but haven't figured out how to get through. 252 | "Thanks, Ashish!" 253 | Thanks! 254 | {quote}You are messing down deep below hbase in dfs. 255 | "Thanks Andrew, committed in rev." 256 | "@mahadev - I would love to help test a patch :) I'm currently using 3.3.1 + ZOOKEEPER-744 + ZOOKEEPER-790, applied in that order." 257 | "Thanks a lot for sharing that, Josh!" 258 | Weird. 259 | "Thanks, Mike!" 260 | Sorry. 261 | "Hi Sandy,Thanks so much to give me such comments, that's really helpful, I will update this later." 262 | Tried some more stuff and realized I was doing it wrong. 263 | "Thanks for the very clear explanation of the needed change, Dag." 264 | This patch really helped. 265 | "Daryn, the current patch looks good." 266 | The biggest problem is that we've had too many committers over the years and we'd have to get all of their permission to change it. 267 | thanks for the explanation. 268 | Weird. 269 | "The patch should be relatively trivial, but like I said, I have no idea if there is other important stuff going on there or not." 270 | Splitting an existing sub-shard gets stuck up. 271 | "And debugging is hell, because the test environment needs to have the exact same loader setup." 272 | "Holy complicated-as-shit-algorithm, Batman!The complexity of our implementation vs the complexity of what we're actually doing is starting to worry me here." 273 | Sorry about that. 274 | Patch looks good to me. 275 | Thanks for the patch Erik (and Jon) 276 | Version 2.2.0RC3 is fine. 277 | "Because, we don't care for this in cases where we there is no node that is down." 278 | > I hated that aspect of working for commercial companies. 279 | Please contact me if you need any clarifications. 280 | -------------------------------------------------------------------------------- /scripts/PTM/api.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from utils import * 3 | from transformers import BertTokenizer, BertModel, BertForSequenceClassification 4 | from transformers import XLNetTokenizer, XLNetForSequenceClassification 5 | from transformers import RobertaTokenizer, RobertaForSequenceClassification 6 | from transformers import AlbertTokenizer, AlbertForSequenceClassification 7 | import argparse 8 | 9 | # Model | Tokenizer | Pretrained weights shortcut 10 | MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'), 11 | (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'), 12 | (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'), 13 | (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1') 14 | ] 15 | 16 | MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert'] 17 | 18 | seed_torch(42) 19 | ## Read model name 20 | parser = argparse.ArgumentParser(description='Choose the models.') 21 | 22 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 23 | help='Enter an integer... 
0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 24 | 25 | 26 | args = parser.parse_args() 27 | m_num=args.model_num 28 | 29 | cur_model=MODELS[m_num] 30 | m_name=MODEL_NAMES[m_num] 31 | 32 | train_df=pd.read_pickle(api_train) 33 | train_df['label']=train_df['label'].replace(-1, 2) 34 | 35 | tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) 36 | 37 | sentences=train_df.sentence.values 38 | labels=train_df.label.values 39 | 40 | # max_len = 0 41 | # for sent in sentences: 42 | # input_ids=tokenizer.encode(str(sent), add_special_tokens=True) 43 | # max_len=max(max_len, len(input_ids)) 44 | # print('Max sentence length: ', max_len) 45 | 46 | input_ids = [] 47 | attention_masks = [] 48 | 49 | for sent in sentences: 50 | 51 | encoded_dict = tokenizer.encode_plus( 52 | str(sent), 53 | add_special_tokens = True, 54 | max_length = MAX_LEN, 55 | pad_to_max_length = True, 56 | return_attention_mask = True, 57 | return_tensors = 'pt' 58 | ) 59 | 60 | input_ids.append(encoded_dict['input_ids']) 61 | attention_masks.append(encoded_dict['attention_mask']) 62 | 63 | 64 | train_inputs = torch.cat(input_ids, dim=0) 65 | train_masks = torch.cat(attention_masks, dim=0) 66 | train_labels = torch.tensor(labels) 67 | 68 | print('Training data {} {} {}'.format(train_inputs.shape, train_masks.shape, train_labels.shape)) 69 | 70 | train_data = TensorDataset(train_inputs, train_masks, train_labels) 71 | train_sampler = RandomSampler(train_data) 72 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) 73 | 74 | # Train Model 75 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 76 | model.cuda() 77 | 78 | param_optimizer = list(model.named_parameters()) 79 | no_decay = ['bias', 'gamma', 'beta'] 80 | optimizer_grouped_parameters = [ 81 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 82 | 'weight_decay_rate': 0.01}, 83 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 84 | 'weight_decay_rate': 0.0} 85 | ] 86 | 87 | optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE) 88 | 89 | begin=time.time() 90 | train_loss_set = [] 91 | 92 | for _ in trange(EPOCHS, desc="Epoch"): 93 | 94 | model.train() 95 | 96 | tr_loss = 0 97 | nb_tr_examples, nb_tr_steps = 0, 0 98 | 99 | for step, batch in enumerate(train_dataloader): 100 | 101 | batch = tuple(t.to(device) for t in batch) 102 | 103 | b_input_ids, b_input_mask, b_labels = batch 104 | optimizer.zero_grad() 105 | 106 | # Forward pass 107 | outputs = model(b_input_ids, token_type_ids=None, \ 108 | attention_mask=b_input_mask, labels=b_labels) 109 | loss = outputs[0] 110 | logits = outputs[1] 111 | train_loss_set.append(loss.item()) 112 | 113 | # Backward pass 114 | loss.backward() 115 | optimizer.step() 116 | 117 | tr_loss += loss.item() 118 | nb_tr_examples += b_input_ids.size(0) 119 | nb_tr_steps += 1 120 | 121 | print("Train loss: {}".format(tr_loss/nb_tr_steps)) 122 | 123 | end=time.time() 124 | print('Training used {:.2f} second'.format(end-begin)) 125 | 126 | begin=time.time() 127 | #test_df = pd.read_csv(api_test, usecols=['sentence','label']) 128 | test_df=pd.read_pickle(api_test) 129 | test_df['label']=test_df['label'].replace(-1, 2) 130 | 131 | sentences=test_df.sentence.values 132 | labels = test_df.label.values 133 | 134 | input_ids = [] 135 | attention_masks = [] 136 | 137 | for sent in sentences: 138 | encoded_dict = tokenizer.encode_plus( 139 | str(sent), 140 | add_special_tokens = True, 141 | max_length = 
MAX_LEN, 142 | pad_to_max_length = True, 143 | return_attention_mask = True, 144 | return_tensors = 'pt', 145 | ) 146 | 147 | input_ids.append(encoded_dict['input_ids']) 148 | attention_masks.append(encoded_dict['attention_mask']) 149 | 150 | prediction_inputs = torch.cat(input_ids,dim=0) 151 | prediction_masks = torch.cat(attention_masks,dim=0) 152 | prediction_labels = torch.tensor(labels) 153 | 154 | prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels) 155 | prediction_sampler = SequentialSampler(prediction_data) 156 | prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE) 157 | 158 | model.eval() 159 | predictions,true_labels=[],[] 160 | 161 | for batch in prediction_dataloader: 162 | batch = tuple(t.to(device) for t in batch) 163 | b_input_ids, b_input_mask, b_labels = batch 164 | 165 | with torch.no_grad(): 166 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 167 | logits = outputs[0] 168 | 169 | logits = logits.detach().cpu().numpy() 170 | label_ids = b_labels.to('cpu').numpy() 171 | 172 | predictions.append(logits) 173 | true_labels.append(label_ids) 174 | 175 | end=time.time() 176 | print('Prediction used {:.2f} seconds'.format(end-begin)) 177 | 178 | flat_predictions = [item for sublist in predictions for item in sublist] 179 | flat_predictions = np.argmax(flat_predictions, axis=1).flatten() 180 | flat_true_labels = [item for sublist in true_labels for item in sublist] 181 | 182 | print("Accuracy of {} on API Reviews is: {}".format(m_name, accuracy_score(flat_true_labels,flat_predictions))) 183 | 184 | print(classification_report(flat_true_labels,flat_predictions)) -------------------------------------------------------------------------------- /scripts/PTM/app.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from utils import * 3 | from transformers import BertTokenizer, BertModel, BertForSequenceClassification 4 | from transformers import XLNetTokenizer, XLNetForSequenceClassification 5 | from transformers import RobertaTokenizer, RobertaForSequenceClassification 6 | from transformers import AlbertTokenizer, AlbertForSequenceClassification 7 | import argparse 8 | 9 | # Model | Tokenizer | Pretrained weights shortcut 10 | MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'), 11 | (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'), 12 | (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'), 13 | (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1') 14 | ] 15 | 16 | MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert'] 17 | 18 | seed_torch(20200209) 19 | ## Read model name 20 | parser = argparse.ArgumentParser(description='Choose the models.') 21 | 22 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 23 | help='Enter an integer... 
0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 24 | 25 | 26 | args = parser.parse_args() 27 | m_num=args.model_num 28 | 29 | cur_model=MODELS[m_num] 30 | m_name=MODEL_NAMES[m_num] 31 | 32 | train_df=pd.read_pickle(app_train) 33 | train_df['label']=train_df['label'].replace(-1, 2) 34 | # negative: 2 35 | 36 | # tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=True) 37 | tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) 38 | 39 | sentences=train_df.sentence.values 40 | labels=train_df.label.values 41 | 42 | # max_len = 0 43 | # for sent in sentences: 44 | # input_ids=tokenizer.encode(sent, add_special_tokens=True) 45 | # max_len=max(max_len, len(input_ids)) 46 | # print('Max sentence length: ', max_len) 47 | 48 | input_ids = [] 49 | attention_masks = [] 50 | 51 | for sent in sentences: 52 | 53 | encoded_dict = tokenizer.encode_plus( 54 | str(sent), 55 | add_special_tokens = True, 56 | max_length = MAX_LEN, 57 | pad_to_max_length = True, 58 | return_attention_mask = True, 59 | return_tensors = 'pt' 60 | ) 61 | 62 | input_ids.append(encoded_dict['input_ids']) 63 | attention_masks.append(encoded_dict['attention_mask']) 64 | 65 | 66 | train_inputs = torch.cat(input_ids, dim=0) 67 | train_masks = torch.cat(attention_masks, dim=0) 68 | train_labels = torch.tensor(labels) 69 | 70 | print('Training data {} {} {}'.format(train_inputs.shape, train_masks.shape, train_labels.shape)) 71 | 72 | train_data = TensorDataset(train_inputs, train_masks, train_labels) 73 | train_sampler = RandomSampler(train_data) 74 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) 75 | 76 | # Train Model 77 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 78 | model.cuda() 79 | 80 | param_optimizer = list(model.named_parameters()) 81 | no_decay = ['bias', 'gamma', 'beta'] 82 | optimizer_grouped_parameters = [ 83 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 84 | 'weight_decay_rate': 0.01}, 85 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 86 | 'weight_decay_rate': 0.0} 87 | ] 88 | 89 | optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE) 90 | 91 | begin=time.time() 92 | train_loss_set = [] 93 | 94 | for _ in trange(EPOCHS, desc="Epoch"): 95 | 96 | model.train() 97 | 98 | tr_loss = 0 99 | nb_tr_examples, nb_tr_steps = 0, 0 100 | 101 | for step, batch in enumerate(train_dataloader): 102 | 103 | batch = tuple(t.to(device) for t in batch) 104 | 105 | b_input_ids, b_input_mask, b_labels = batch 106 | optimizer.zero_grad() 107 | 108 | # Forward pass 109 | outputs = model(b_input_ids, token_type_ids=None, \ 110 | attention_mask=b_input_mask, labels=b_labels) 111 | loss = outputs[0] 112 | logits = outputs[1] 113 | train_loss_set.append(loss.item()) 114 | 115 | # Backward pass 116 | loss.backward() 117 | optimizer.step() 118 | 119 | tr_loss += loss.item() 120 | nb_tr_examples += b_input_ids.size(0) 121 | nb_tr_steps += 1 122 | 123 | print("Train loss: {}".format(tr_loss/nb_tr_steps)) 124 | 125 | end=time.time() 126 | print('Training used {:.2f} second'.format(end-begin)) 127 | 128 | ### Test 129 | begin=time.time() 130 | test_df=pd.read_pickle(app_test) 131 | test_df['label']=test_df['label'].replace(-1,2) 132 | 133 | sentences=test_df.sentence.values 134 | labels = test_df.label.values 135 | 136 | input_ids = [] 137 | attention_masks = [] 138 | 139 | for sent in sentences: 140 | encoded_dict = tokenizer.encode_plus( 141 | str(sent), 142 | 
add_special_tokens = True, 143 | max_length = MAX_LEN, 144 | pad_to_max_length = True, 145 | return_attention_mask = True, 146 | return_tensors = 'pt', 147 | ) 148 | 149 | input_ids.append(encoded_dict['input_ids']) 150 | attention_masks.append(encoded_dict['attention_mask']) 151 | 152 | prediction_inputs = torch.cat(input_ids,dim=0) 153 | prediction_masks = torch.cat(attention_masks,dim=0) 154 | prediction_labels = torch.tensor(labels) 155 | 156 | prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels) 157 | prediction_sampler = SequentialSampler(prediction_data) 158 | prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE) 159 | 160 | model.eval() 161 | predictions,true_labels=[],[] 162 | 163 | for batch in prediction_dataloader: 164 | batch = tuple(t.to(device) for t in batch) 165 | b_input_ids, b_input_mask, b_labels = batch 166 | 167 | with torch.no_grad(): 168 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 169 | logits = outputs[0] 170 | 171 | logits = logits.detach().cpu().numpy() 172 | label_ids = b_labels.to('cpu').numpy() 173 | 174 | predictions.append(logits) 175 | true_labels.append(label_ids) 176 | 177 | end=time.time() 178 | print('Prediction used {:.2f} seconds'.format(end-begin)) 179 | 180 | flat_predictions = [item for sublist in predictions for item in sublist] 181 | flat_predictions = np.argmax(flat_predictions, axis=1).flatten() 182 | flat_true_labels = [item for sublist in true_labels for item in sublist] 183 | 184 | print("Accuracy of {} on APP Reviews is: {}".format(m_name, accuracy_score(flat_true_labels,flat_predictions))) 185 | print(classification_report(flat_true_labels,flat_predictions)) -------------------------------------------------------------------------------- /scripts/PTM/cr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from utils import * 3 | from transformers import BertTokenizer, BertModel, BertForSequenceClassification 4 | from transformers import XLNetTokenizer, XLNetForSequenceClassification 5 | from transformers import RobertaTokenizer, RobertaForSequenceClassification 6 | from transformers import AlbertTokenizer, AlbertForSequenceClassification 7 | import argparse 8 | 9 | # Model | Tokenizer | Pretrained weights shortcut 10 | MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'), 11 | (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'), 12 | (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'), 13 | (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1') 14 | ] 15 | 16 | MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert'] 17 | 18 | seed_torch(42) 19 | 20 | ## Read model name 21 | parser = argparse.ArgumentParser(description='Choose the models.') 22 | 23 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 24 | help='Enter an integer... 
0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 25 | 26 | 27 | args = parser.parse_args() 28 | m_num=args.model_num 29 | 30 | cur_model=MODELS[m_num] 31 | m_name=MODEL_NAMES[m_num] 32 | 33 | train_df=pd.read_pickle(cr_train) 34 | 35 | # 0: non-negative, 1: negative 36 | train_df['label']=train_df['label'].replace(-1, 1) 37 | 38 | tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) 39 | 40 | sentences=train_df.sentence.values 41 | labels=train_df.label.values 42 | 43 | # max_len = 0 44 | # for sent in sentences: 45 | # input_ids=tokenizer.encode(sent, add_special_tokens=True) 46 | # max_len=max(max_len, len(input_ids)) 47 | # print('Max sentence length: ', max_len) 48 | 49 | input_ids = [] 50 | attention_masks = [] 51 | 52 | for sent in sentences: 53 | 54 | encoded_dict = tokenizer.encode_plus( 55 | str(sent), 56 | add_special_tokens = True, 57 | max_length = MAX_LEN, 58 | pad_to_max_length = True, 59 | return_attention_mask = True, 60 | return_tensors = 'pt' 61 | ) 62 | 63 | input_ids.append(encoded_dict['input_ids']) 64 | attention_masks.append(encoded_dict['attention_mask']) 65 | 66 | 67 | train_inputs = torch.cat(input_ids, dim=0) 68 | train_masks = torch.cat(attention_masks, dim=0) 69 | train_labels = torch.tensor(labels) 70 | 71 | print('Training data {} {} {}'.format(train_inputs.shape, train_masks.shape, train_labels.shape)) 72 | 73 | train_data = TensorDataset(train_inputs, train_masks, train_labels) 74 | train_sampler = RandomSampler(train_data) 75 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) 76 | 77 | # Train Model 78 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 79 | model.cuda() 80 | 81 | param_optimizer = list(model.named_parameters()) 82 | no_decay = ['bias', 'gamma', 'beta'] 83 | optimizer_grouped_parameters = [ 84 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 85 | 'weight_decay_rate': 0.01}, 86 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 87 | 'weight_decay_rate': 0.0} 88 | ] 89 | 90 | optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE) 91 | 92 | begin=time.time() 93 | train_loss_set = [] 94 | 95 | for _ in trange(EPOCHS, desc="Epoch"): 96 | 97 | model.train() 98 | 99 | tr_loss = 0 100 | nb_tr_examples, nb_tr_steps = 0, 0 101 | 102 | for step, batch in enumerate(train_dataloader): 103 | 104 | batch = tuple(t.to(device) for t in batch) 105 | 106 | b_input_ids, b_input_mask, b_labels = batch 107 | optimizer.zero_grad() 108 | 109 | # Forward pass 110 | outputs = model(b_input_ids, token_type_ids=None, \ 111 | attention_mask=b_input_mask, labels=b_labels) 112 | loss = outputs[0] 113 | logits = outputs[1] 114 | train_loss_set.append(loss.item()) 115 | 116 | # Backward pass 117 | loss.backward() 118 | optimizer.step() 119 | 120 | tr_loss += loss.item() 121 | nb_tr_examples += b_input_ids.size(0) 122 | nb_tr_steps += 1 123 | 124 | print("Train loss: {}".format(tr_loss/nb_tr_steps)) 125 | 126 | end=time.time() 127 | print('Training used {} second'.format(end-begin)) 128 | 129 | begin=time.time() 130 | # 0: non-negative, 1: negative 131 | test_df=pd.read_pickle(cr_test) 132 | test_df['label']=test_df['label'].replace(-1, 1) 133 | 134 | sentences=test_df.sentence.values 135 | labels = test_df.label.values 136 | 137 | input_ids = [] 138 | attention_masks = [] 139 | 140 | for sent in sentences: 141 | encoded_dict = tokenizer.encode_plus( 142 | str(sent), 143 | add_special_tokens = True, 144 | max_length = 
MAX_LEN, 145 | pad_to_max_length = True, 146 | return_attention_mask = True, 147 | return_tensors = 'pt' 148 | ) 149 | 150 | input_ids.append(encoded_dict['input_ids']) 151 | attention_masks.append(encoded_dict['attention_mask']) 152 | 153 | prediction_inputs = torch.cat(input_ids,dim=0) 154 | prediction_masks = torch.cat(attention_masks,dim=0) 155 | prediction_labels = torch.tensor(labels) 156 | 157 | prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels) 158 | prediction_sampler = SequentialSampler(prediction_data) 159 | prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE) 160 | 161 | model.eval() 162 | predictions,true_labels=[],[] 163 | 164 | for batch in prediction_dataloader: 165 | batch = tuple(t.to(device) for t in batch) 166 | b_input_ids, b_input_mask, b_labels = batch 167 | 168 | with torch.no_grad(): 169 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 170 | logits = outputs[0] 171 | 172 | logits = logits.detach().cpu().numpy() 173 | label_ids = b_labels.to('cpu').numpy() 174 | 175 | predictions.append(logits) 176 | true_labels.append(label_ids) 177 | 178 | end=time.time() 179 | print('Prediction used {:.2f} seconds'.format(end - begin)) 180 | 181 | flat_predictions = [item for sublist in predictions for item in sublist] 182 | flat_predictions = np.argmax(flat_predictions, axis=1).flatten() 183 | flat_true_labels = [item for sublist in true_labels for item in sublist] 184 | 185 | print("Accuracy of {} on Code Reviews is: {}".format(m_name, accuracy_score(flat_true_labels,flat_predictions))) 186 | print(classification_report(flat_true_labels,flat_predictions)) -------------------------------------------------------------------------------- /scripts/PTM/early-stopping/api.py: -------------------------------------------------------------------------------- 1 | # Created by happygirlzt 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | sys.path.append('/media/DATA/tingzhang-data/sa4se/scripts') 5 | 6 | from utils import * 7 | from sklearn.model_selection import train_test_split 8 | import argparse 9 | import pprint 10 | import math 11 | from transformers import AdamW 12 | from ignite.engine import Engine, Events 13 | from ignite.metrics import Accuracy, Loss, RunningAverage, Precision, Recall 14 | from ignite.handlers import Checkpoint, DiskSaver, EarlyStopping 15 | from ignite.contrib.handlers import ProgressBar 16 | 17 | import logging 18 | logging.basicConfig(level=logging.ERROR) 19 | 20 | ## Read model name and project name 21 | parser = argparse.ArgumentParser(description='Choose the models.') 22 | 23 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 24 | help='Enter an integer... 0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 25 | 26 | parser.add_argument('-r', '--re_run', default=0, type=int, nargs='?', 27 | help='Enter an integer... 
0-re-run the saved model, 1-run new model; default: 0') 28 | 29 | args = parser.parse_args() 30 | #print(args.model_num) 31 | #print(args.project_num) 32 | 33 | m_num=args.model_num 34 | rerun_flag=bool(args.re_run) 35 | 36 | # Generate training, validation and test set 37 | data_folder=Path('../data/') 38 | 39 | cur_model=MODELS[m_num] 40 | m_name=MODEL_NAMES[m_num] 41 | 42 | print('Running model {} in API reviews'.format(m_name)) 43 | 44 | #### Read data 45 | train_data=pd.read_pickle(data_folder/'api-train.pkl') 46 | train_data['label']=train_data['label'].replace(-1, 2) 47 | 48 | X_train=train_data['sentence'] 49 | y_train=train_data['label'] 50 | 51 | test_data=pd.read_pickle(data_folder/'api-test.pkl') 52 | test_data['label']=test_data['label'].replace(-1, 2) 53 | 54 | X_test=test_data['sentence'] 55 | y_test=test_data['label'] 56 | print('Read success!') 57 | 58 | # pred_iterator=get_iterator(X_test, y_test, cur_model, False) 59 | 60 | prediction_dataloader=get_dataloader(X_test, y_test, cur_model, False) 61 | 62 | # print('Training set is {}\nValidation set is {}\nTest set is {}'.format(len(train_dataloader.dataset), len(validation_dataloader.dataset), len(prediction_dataloader.dataset))) 63 | 64 | if rerun_flag: 65 | X_train, X_validation, y_train, y_validation = train_test_split(X_train, 66 | y_train, 67 | test_size=0.05, 68 | random_state=SEED, 69 | stratify=y_train) 70 | 71 | #train_dataloader=get_dataloader(X_train, y_train,cur_model,True) 72 | #validation_dataloader=get_dataloader(X_validation, y_validation,cur_model,False) 73 | 74 | train_iterator=get_iterator(X_train, y_train, cur_model, True) 75 | valid_iterator=get_iterator(X_validation, y_validation, cur_model, False) 76 | 77 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 78 | model.cuda() 79 | 80 | optimizer = AdamW(model.parameters(), 81 | lr=LEARNING_RATE, 82 | eps=EPS, 83 | weight_decay=WEIGHT_DECAY) 84 | 85 | #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.8) # 5e-5 * 0.8 = 4e-5 86 | 87 | def process_function(engine, batch): 88 | model.train() 89 | optimizer.zero_grad() 90 | 91 | b_input_ids = batch.INPUT_IDS 92 | b_input_mask = batch.ATTENTION_MASKS 93 | b_labels = batch.LABEL 94 | 95 | 96 | outputs = model(b_input_ids, 97 | token_type_ids=None, 98 | attention_mask=b_input_mask, 99 | labels=b_labels) 100 | 101 | loss = outputs[0] 102 | logits = outputs[1] 103 | 104 | loss.backward() 105 | optimizer.step() 106 | #scheduler.step() 107 | return loss.item() 108 | 109 | def eval_function(engine, batch): 110 | model.eval() 111 | with torch.no_grad(): 112 | b_input_ids = batch.INPUT_IDS 113 | b_input_mask = batch.ATTENTION_MASKS 114 | b_labels = batch.LABEL 115 | 116 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 117 | #logits = outputs[0] 118 | y_pred=outputs[0] 119 | 120 | return y_pred, b_labels 121 | 122 | trainer = Engine(process_function) 123 | train_evaluator = Engine(eval_function) 124 | validation_evaluator = Engine(eval_function) 125 | 126 | #print('success!') 127 | #### Metrics 128 | RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss') 129 | 130 | def output_transform_fun(output): 131 | y_pred, y = output 132 | y_pred=y_pred.detach().cpu().numpy() 133 | y=y.to('cpu').numpy() 134 | y_pred=np.argmax(y_pred, axis=1).flatten() 135 | return torch.from_numpy(y_pred), torch.from_numpy(y) 136 | 137 | criterion = nn.CrossEntropyLoss() 138 | ### Training 139 | 
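# How the handlers below cooperate (descriptive note only, no new behavior):
# `trainer` wraps process_function, so every iteration performs one optimizer
# step and returns the scalar loss, which RunningAverage exposes as the
# 'loss' metric shown by the ProgressBar. The two evaluator engines re-run
# eval_function under torch.no_grad() and return (logits, labels) pairs, and
# each Loss(criterion) attached below turns those pairs into a
# 'cross-entropy' metric. EarlyStopping watches the validation
# 'cross-entropy' through score_function_loss; the loss is negated there
# because ignite treats a larger score as an improvement. With patience=2,
# training stops after two validation runs without improvement, and the
# Checkpoint handler reuses the same score to keep the best model on disk.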
#Accuracy(output_transform=output_transform_fun).attach(train_evaluator, 'accuracy') 140 | Loss(criterion).attach(train_evaluator, 'cross-entropy') 141 | 142 | #precision = Precision(output_transform=output_transform_fun, average=False) 143 | #.detach().cpu().numpy() 144 | #recall = Recall(output_transform=output_transform_fun, average=False) 145 | #.detach().cpu().numpy() 146 | #F1 = (precision * recall * 2) / (precision + recall) 147 | 148 | #precision.attach(train_evaluator, 'precision') 149 | #recall.attach(train_evaluator, 'recall') 150 | #F1.attach(train_evaluator, 'F1') 151 | 152 | ### Validation 153 | #Accuracy(output_transform=output_transform_fun).attach(validation_evaluator, 'accuracy') 154 | Loss(criterion).attach(validation_evaluator, 'cross-entropy') 155 | 156 | #precision.attach(validation_evaluator, 'precision') 157 | #recall.attach(validation_evaluator, 'recall') 158 | #F1.attach(validation_evaluator, 'F1') 159 | 160 | #### Progress Bar 161 | pbar = ProgressBar(persist=True, bar_format="") 162 | pbar.attach(trainer, ['loss']) 163 | 164 | def score_function_loss(engine): 165 | val_loss = engine.state.metrics['cross-entropy'] 166 | return -val_loss 167 | 168 | def score_function_f1(engine): 169 | val_f1 = engine.state.metrics['F1'] 170 | if math.isnan(val_f1): 171 | return -9999 172 | return val_f1 173 | 174 | handler = EarlyStopping(patience=2, score_function=score_function_loss, trainer=trainer) 175 | 176 | validation_evaluator.add_event_handler(Events.COMPLETED, handler) 177 | 178 | def log_training_results(engine): 179 | train_evaluator.run(train_iterator) 180 | metrics = train_evaluator.state.metrics 181 | pbar.log_message( 182 | "Training Results - Epoch: {} \nMetrics\n{}" 183 | .format(engine.state.epoch, pprint.pformat(metrics))) 184 | 185 | def log_validation_results(engine): 186 | validation_evaluator.run(valid_iterator) 187 | metrics = validation_evaluator.state.metrics 188 | pbar.log_message( 189 | "Validation Results - Epoch: {} \nMetrics\n{}" 190 | .format(engine.state.epoch, pprint.pformat(metrics))) 191 | pbar.n = pbar.last_print_n = 0 192 | 193 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results) 194 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results) 195 | 196 | #### Checkpoint 197 | 198 | # to_save = {'{}_{}'.format(p_name, m_name): model, 199 | # 'optimizer': optimizer, 200 | # 'lr_scheduler': scheduler 201 | # } 202 | 203 | to_save={'api_{}'.format(m_name): model} 204 | 205 | cp_handler = Checkpoint(to_save, 206 | DiskSaver('../models/', 207 | create_dir=True, require_empty=False), 208 | filename_prefix='best', 209 | score_function=score_function_loss, 210 | score_name='val_loss') 211 | 212 | validation_evaluator.add_event_handler(Events.COMPLETED, cp_handler) 213 | #trainer.add_event_handler(Events.ITERATION_COMPLETED(every=1000), cp_handler) 214 | 215 | # checkpointer = ModelCheckpoint('../models/', '{}'.format(p_name), create_dir=True, save_as_state_dict=True, require_empty=False) 216 | 217 | # trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan()) 218 | trainer.run(train_iterator, max_epochs=4) 219 | else: 220 | print('Running saved model...') 221 | #run_on_test(cur_model, p_name, m_name, pred_iterator) 222 | run_saved_model(prediction_dataloader, cur_model, 'api', m_name) -------------------------------------------------------------------------------- /scripts/PTM/early-stopping/app.py: -------------------------------------------------------------------------------- 1 | # Created by
happygirlzt 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | sys.path.append('/media/DATA/tingzhang-data/sa4se/scripts') 5 | 6 | from utils import * 7 | from sklearn.model_selection import train_test_split 8 | import argparse 9 | import pprint 10 | import math 11 | from transformers import AdamW 12 | from ignite.engine import Engine, Events 13 | from ignite.metrics import Accuracy, Loss, RunningAverage, Precision, Recall 14 | from ignite.handlers import Checkpoint, DiskSaver, EarlyStopping 15 | from ignite.contrib.handlers import ProgressBar 16 | 17 | import logging 18 | logging.basicConfig(level=logging.ERROR) 19 | 20 | ## Read model name and project name 21 | parser = argparse.ArgumentParser(description='Choose the models.') 22 | 23 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 24 | help='Enter an integer... 0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 25 | 26 | parser.add_argument('-r', '--re_run', default=0, type=int, nargs='?', 27 | help='Enter an integer... 0-re-run the saved model, 1-run new model; default: 0') 28 | 29 | args = parser.parse_args() 30 | #print(args.model_num) 31 | #print(args.project_num) 32 | 33 | m_num=args.model_num 34 | rerun_flag=bool(args.re_run) 35 | 36 | # Generate training, validation and test set 37 | data_folder=Path('../data/') 38 | 39 | cur_model=MODELS[m_num] 40 | m_name=MODEL_NAMES[m_num] 41 | 42 | print('Running model {} in App reviews'.format(m_name)) 43 | 44 | #### Read data 45 | train_data=pd.read_pickle(data_folder/'app-train.pkl') 46 | train_data['label']=train_data['label'].replace(-1, 2) 47 | 48 | X_train=train_data['sentence'] 49 | y_train=train_data['label'] 50 | 51 | test_data=pd.read_pickle(data_folder/'app-test.pkl') 52 | test_data['label']=test_data['label'].replace(-1, 2) 53 | 54 | X_test=test_data['sentence'] 55 | y_test=test_data['label'] 56 | print('Read success!') 57 | 58 | # pred_iterator=get_iterator(X_test, y_test, cur_model, False) 59 | 60 | prediction_dataloader=get_dataloader(X_test, y_test, cur_model, False) 61 | 62 | # print('Training set is {}\nValidation set is {}\nTest set is {}'.format(len(train_dataloader.dataset), len(validation_dataloader.dataset), len(prediction_dataloader.dataset))) 63 | 64 | if rerun_flag: 65 | X_train, X_validation, y_train, y_validation = train_test_split(X_train, 66 | y_train, 67 | test_size=0.05, 68 | random_state=SEED, 69 | stratify=y_train) 70 | 71 | #train_dataloader=get_dataloader(X_train, y_train,cur_model,True) 72 | #validation_dataloader=get_dataloader(X_validation, y_validation,cur_model,False) 73 | 74 | train_iterator=get_iterator(X_train, y_train, cur_model, True) 75 | valid_iterator=get_iterator(X_validation, y_validation, cur_model, False) 76 | 77 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 78 | model.cuda() 79 | 80 | optimizer = AdamW(model.parameters(), 81 | lr=LEARNING_RATE, 82 | eps=EPS, 83 | weight_decay=WEIGHT_DECAY) 84 | 85 | #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.8) # 5e-5 * 0.8 = 4e-5 86 | 87 | def process_function(engine, batch): 88 | model.train() 89 | optimizer.zero_grad() 90 | 91 | b_input_ids = batch.INPUT_IDS 92 | b_input_mask = batch.ATTENTION_MASKS 93 | b_labels = batch.LABEL 94 | 95 | 96 | outputs = model(b_input_ids, 97 | token_type_ids=None, 98 | attention_mask=b_input_mask, 99 | labels=b_labels) 100 | 101 | loss = outputs[0] 102 | logits = outputs[1] 103 | 104 | loss.backward() 105 | optimizer.step() 106 | #scheduler.step() 107 | return loss.item() 108 | 109 | def 
eval_function(engine, batch): 110 | model.eval() 111 | with torch.no_grad(): 112 | b_input_ids = batch.INPUT_IDS 113 | b_input_mask = batch.ATTENTION_MASKS 114 | b_labels = batch.LABEL 115 | 116 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 117 | #logits = outputs[0] 118 | y_pred=outputs[0] 119 | 120 | return y_pred, b_labels 121 | 122 | trainer = Engine(process_function) 123 | train_evaluator = Engine(eval_function) 124 | validation_evaluator = Engine(eval_function) 125 | 126 | #print('success!') 127 | #### Metrics 128 | RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss') 129 | 130 | def output_transform_fun(output): 131 | y_pred, y = output 132 | y_pred=y_pred.detach().cpu().numpy() 133 | y=y.to('cpu').numpy() 134 | y_pred=np.argmax(y_pred, axis=1).flatten() 135 | return torch.from_numpy(y_pred), torch.from_numpy(y) 136 | 137 | criterion = nn.CrossEntropyLoss() 138 | ### Training 139 | #Accuracy(output_transform=output_transform_fun).attach(train_evaluator, 'accuracy') 140 | Loss(criterion).attach(train_evaluator, 'cross-entropy') 141 | 142 | #precision = Precision(output_transform=output_transform_fun, average=False) 143 | #.detach().cpu().numpy() 144 | #recall = Recall(output_transform=output_transform_fun, average=False) 145 | #.detach().cpu().numpy() 146 | #F1 = (precision * recall * 2) / (precision + recall) 147 | 148 | #precision.attach(train_evaluator, 'precision') 149 | #recall.attach(train_evaluator, 'recall') 150 | #F1.attach(train_evaluator, 'F1') 151 | 152 | ### Validation 153 | #Accuracy(output_transform=output_transform_fun).attach(validation_evaluator, 'accuracy') 154 | Loss(criterion).attach(validation_evaluator, 'cross-entropy') 155 | 156 | #precision.attach(validation_evaluator, 'precision') 157 | #recall.attach(validation_evaluator, 'recall') 158 | #F1.attach(validation_evaluator, 'F1') 159 | 160 | #### Progress Bar 161 | pbar = ProgressBar(persist=True, bar_format="") 162 | pbar.attach(trainer, ['loss']) 163 | 164 | def score_function_loss(engine): 165 | val_loss = engine.state.metrics['cross-entropy'] 166 | return -val_loss 167 | 168 | def score_function_f1(engine): 169 | val_f1 = engine.state.metrics['F1'] 170 | if math.isnan(val_f1): 171 | return -9999 172 | return val_f1 173 | 174 | handler = EarlyStopping(patience=2, score_function=score_function_loss, trainer=trainer) 175 | 176 | validation_evaluator.add_event_handler(Events.COMPLETED, handler) 177 | 178 | def log_training_results(engine): 179 | train_evaluator.run(train_iterator) 180 | metrics = train_evaluator.state.metrics 181 | pbar.log_message( 182 | "Training Results - Epoch: {} \nMetrics\n{}" 183 | .format(engine.state.epoch, pprint.pformat(metrics))) 184 | 185 | def log_validation_results(engine): 186 | validation_evaluator.run(valid_iterator) 187 | metrics = validation_evaluator.state.metrics 188 | pbar.log_message( 189 | "Validation Results - Epoch: {} \nMetrics\n{}" 190 | .format(engine.state.epoch, pprint.pformat(metrics))) 191 | pbar.n = pbar.last_print_n = 0 192 | 193 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results) 194 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results) 195 | 196 | #### Checkpoint 197 | 198 | # to_save = {'{}_{}'.format(p_name, m_name): model, 199 | # 'optimizer': optimizer, 200 | # 'lr_scheduler': scheduler 201 | # } 202 | 203 | to_save={'app_{}'.format(m_name): model} 204 | 205 | cp_handler = Checkpoint(to_save, 206 | DiskSaver('../models/', 207 | create_dir=True, 
require_empty=False), 208 | filename_prefix='best', 209 | score_function=score_function_loss, 210 | score_name='val_loss') 211 | 212 | validation_evaluator.add_event_handler(Events.COMPLETED, cp_handler) 213 | #trainer.add_event_handler(Events.ITERATION_COMPLETED(every=1000), cp_handler) 214 | 215 | # checkpointer = ModelCheckpoint('../models/', '{}'.format(p_name), create_dir=True, save_as_state_dict=True, require_empty=False) 216 | 217 | # trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan()) 218 | trainer.run(train_iterator, max_epochs=4) 219 | else: 220 | print('Running saved model...') 221 | #run_on_test(cur_model, p_name, m_name, pred_iterator) 222 | run_saved_model(prediction_dataloader, cur_model, 'app', m_name) -------------------------------------------------------------------------------- /scripts/PTM/early-stopping/cr.py: -------------------------------------------------------------------------------- 1 | # Created by happygirlzt 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | sys.path.append('/media/DATA/tingzhang-data/sa4se/scripts') 5 | 6 | from utils import * 7 | from sklearn.model_selection import train_test_split 8 | import argparse 9 | import pprint 10 | import math 11 | from transformers import AdamW 12 | from ignite.engine import Engine, Events 13 | from ignite.metrics import Accuracy, Loss, RunningAverage, Precision, Recall 14 | from ignite.handlers import Checkpoint, DiskSaver, EarlyStopping 15 | from ignite.contrib.handlers import ProgressBar 16 | 17 | import logging 18 | logging.basicConfig(level=logging.ERROR) 19 | 20 | ## Read model name and project name 21 | parser = argparse.ArgumentParser(description='Choose the models.') 22 | 23 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 24 | help='Enter an integer... 0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 25 | 26 | parser.add_argument('-r', '--re_run', default=0, type=int, nargs='?', 27 | help='Enter an integer... 
0-re-run the saved model, 1-run new model; default: 0') 28 | 29 | args = parser.parse_args() 30 | #print(args.model_num) 31 | #print(args.project_num) 32 | 33 | m_num=args.model_num 34 | rerun_flag=bool(args.re_run) 35 | 36 | # Generate training, validation and test set 37 | data_folder=Path('../data/') 38 | 39 | cur_model=MODELS[m_num] 40 | m_name=MODEL_NAMES[m_num] 41 | 42 | print('Running model {} in code reviews'.format(m_name)) 43 | 44 | #### Read data 45 | train_data=pd.read_pickle(data_folder/'cr-train.pkl') 46 | train_data['label']=train_data['label'].replace(-1, 1) 47 | 48 | X_train=train_data['sentence'] 49 | y_train=train_data['label'] 50 | 51 | test_data=pd.read_pickle(data_folder/'cr-test.pkl') 52 | test_data['label']=test_data['label'].replace(-1, 1) 53 | 54 | X_test=test_data['sentence'] 55 | y_test=test_data['label'] 56 | print('Read success!') 57 | 58 | # pred_iterator=get_iterator(X_test, y_test, cur_model, False) 59 | 60 | prediction_dataloader=get_dataloader(X_test, y_test, cur_model, False) 61 | 62 | # print('Training set is {}\nValidation set is {}\nTest set is {}'.format(len(train_dataloader.dataset), len(validation_dataloader.dataset), len(prediction_dataloader.dataset))) 63 | 64 | if rerun_flag: 65 | X_train, X_validation, y_train, y_validation = train_test_split(X_train, 66 | y_train, 67 | test_size=0.05, 68 | random_state=SEED, 69 | stratify=y_train) 70 | 71 | #train_dataloader=get_dataloader(X_train, y_train,cur_model,True) 72 | #validation_dataloader=get_dataloader(X_validation, y_validation,cur_model,False) 73 | 74 | train_iterator=get_iterator(X_train, y_train, cur_model, True) 75 | valid_iterator=get_iterator(X_validation, y_validation, cur_model, False) 76 | 77 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 78 | model.cuda() 79 | 80 | optimizer = AdamW(model.parameters(), 81 | lr=LEARNING_RATE, 82 | eps=EPS, 83 | weight_decay=WEIGHT_DECAY) 84 | 85 | #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.8) # 5e-5 * 0.8 = 4e-5 86 | 87 | def process_function(engine, batch): 88 | model.train() 89 | optimizer.zero_grad() 90 | 91 | b_input_ids = batch.INPUT_IDS 92 | b_input_mask = batch.ATTENTION_MASKS 93 | b_labels = batch.LABEL 94 | 95 | 96 | outputs = model(b_input_ids, 97 | token_type_ids=None, 98 | attention_mask=b_input_mask, 99 | labels=b_labels) 100 | 101 | loss = outputs[0] 102 | logits = outputs[1] 103 | 104 | loss.backward() 105 | optimizer.step() 106 | #scheduler.step() 107 | return loss.item() 108 | 109 | def eval_function(engine, batch): 110 | model.eval() 111 | with torch.no_grad(): 112 | b_input_ids = batch.INPUT_IDS 113 | b_input_mask = batch.ATTENTION_MASKS 114 | b_labels = batch.LABEL 115 | 116 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 117 | #logits = outputs[0] 118 | y_pred=outputs[0] 119 | 120 | return y_pred, b_labels 121 | 122 | trainer = Engine(process_function) 123 | train_evaluator = Engine(eval_function) 124 | validation_evaluator = Engine(eval_function) 125 | 126 | #print('success!') 127 | #### Metrics 128 | RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss') 129 | 130 | def output_transform_fun(output): 131 | y_pred, y = output 132 | y_pred=y_pred.detach().cpu().numpy() 133 | y=y.to('cpu').numpy() 134 | y_pred=np.argmax(y_pred, axis=1).flatten() 135 | return torch.from_numpy(y_pred), torch.from_numpy(y) 136 | 137 | criterion = nn.CrossEntropyLoss() 138 | ### Training 139 | 
#Accuracy(output_transform=output_transform_fun).attach(train_evaluator, 'accuracy') 140 | Loss(criterion).attach(train_evaluator, 'cross-entropy') 141 | 142 | #precision = Precision(output_transform=output_transform_fun, average=False) 143 | #.detach().cpu().numpy() 144 | #recall = Recall(output_transform=output_transform_fun, average=False) 145 | #.detach().cpu().numpy() 146 | #F1 = (precision * recall * 2) / (precision + recall) 147 | 148 | #precision.attach(train_evaluator, 'precision') 149 | #recall.attach(train_evaluator, 'recall') 150 | #F1.attach(train_evaluator, 'F1') 151 | 152 | ### Validation 153 | #Accuracy(output_transform=output_transform_fun).attach(validation_evaluator, 'accuracy') 154 | Loss(criterion).attach(validation_evaluator, 'cross-entropy') 155 | 156 | #precision.attach(validation_evaluator, 'precision') 157 | #recall.attach(validation_evaluator, 'recall') 158 | #F1.attach(validation_evaluator, 'F1') 159 | 160 | #### Progress Bar 161 | pbar = ProgressBar(persist=True, bar_format="") 162 | pbar.attach(trainer, ['loss']) 163 | 164 | def score_function_loss(engine): 165 | val_loss = engine.state.metrics['cross-entropy'] 166 | return -val_loss 167 | 168 | def score_function_f1(engine): 169 | val_f1 = engine.state.metrics['F1'] 170 | if math.isnan(val_f1): 171 | return -9999 172 | return val_f1 173 | 174 | handler = EarlyStopping(patience=2, score_function=score_function_loss, trainer=trainer) 175 | 176 | validation_evaluator.add_event_handler(Events.COMPLETED, handler) 177 | 178 | def log_training_results(engine): 179 | train_evaluator.run(train_iterator) 180 | metrics = train_evaluator.state.metrics 181 | pbar.log_message( 182 | "Training Results - Epoch: {} \nMetrics\n{}" 183 | .format(engine.state.epoch, pprint.pformat(metrics))) 184 | 185 | def log_validation_results(engine): 186 | validation_evaluator.run(valid_iterator) 187 | metrics = validation_evaluator.state.metrics 188 | pbar.log_message( 189 | "Validation Results - Epoch: {} \nMetrics\n{}" 190 | .format(engine.state.epoch, pprint.pformat(metrics))) 191 | pbar.n = pbar.last_print_n = 0 192 | 193 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results) 194 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results) 195 | 196 | #### Checkpoint 197 | 198 | # to_save = {'{}_{}'.format(p_name, m_name): model, 199 | # 'optimizer': optimizer, 200 | # 'lr_scheduler': scheduler 201 | # } 202 | 203 | to_save={'cr_{}'.format(m_name): model} 204 | 205 | cp_handler = Checkpoint(to_save, 206 | DiskSaver('../models/', 207 | create_dir=True, require_empty=False), 208 | filename_prefix='best', 209 | score_function=score_function_loss, 210 | score_name='val_loss') 211 | 212 | validation_evaluator.add_event_handler(Events.COMPLETED, cp_handler) 213 | #trainer.add_event_handler(Events.ITERATION_COMPLETED(every=1000), cp_handler) 214 | 215 | # checkpointer = ModelCheckpoint('../models/', '{}'.format(p_name), create_dir=True, save_as_state_dict=True, require_empty=False) 216 | 217 | # trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan()) 218 | trainer.run(train_iterator, max_epochs=4) 219 | else: 220 | print('Running saved model...') 221 | #run_on_test(cur_model, p_name, m_name, pred_iterator) 222 | run_saved_model(prediction_dataloader, cur_model, 'cr', m_name) -------------------------------------------------------------------------------- /scripts/PTM/early-stopping/github.py: -------------------------------------------------------------------------------- 1 | # Created by
happygirlzt 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | sys.path.append('/media/DATA/tingzhang-data/sa4se/scripts') 5 | 6 | from utils import * 7 | from sklearn.model_selection import train_test_split 8 | import argparse 9 | import pprint 10 | import math 11 | from transformers import AdamW 12 | from ignite.engine import Engine, Events 13 | from ignite.metrics import Accuracy, Loss, RunningAverage, Precision, Recall 14 | from ignite.handlers import Checkpoint, DiskSaver, EarlyStopping 15 | from ignite.contrib.handlers import ProgressBar 16 | 17 | import logging 18 | logging.basicConfig(level=logging.ERROR) 19 | 20 | ## Read model name and project name 21 | parser = argparse.ArgumentParser(description='Choose the models.') 22 | 23 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 24 | help='Enter an integer... 0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 25 | 26 | parser.add_argument('-r', '--re_run', default=0, type=int, nargs='?', 27 | help='Enter an integer... 0-re-run the saved model, 1-run new model; default: 0') 28 | 29 | args = parser.parse_args() 30 | #print(args.model_num) 31 | #print(args.project_num) 32 | 33 | m_num=args.model_num 34 | rerun_flag=bool(args.re_run) 35 | 36 | # Generate training, validation and test set 37 | data_folder=Path('../data/') 38 | 39 | cur_model=MODELS[m_num] 40 | m_name=MODEL_NAMES[m_num] 41 | 42 | print('Running model {} in Github'.format(m_name)) 43 | 44 | #### Read data 45 | train_data=pd.read_pickle(data_folder/'gh-train.pkl') 46 | train_data['label']=train_data['label'].replace({'positive':1, 'negative':2, 'neutral':0}) 47 | 48 | X_train=train_data['sentence'] 49 | y_train=train_data['label'] 50 | 51 | test_data=pd.read_pickle(data_folder/'gh-test.pkl') 52 | test_data['label']=test_data['label'].replace({'positive':1, 'negative':2, 'neutral':0}) 53 | 54 | X_test=test_data['sentence'] 55 | y_test=test_data['label'] 56 | print('Read success!') 57 | 58 | # pred_iterator=get_iterator(X_test, y_test, cur_model, False) 59 | 60 | prediction_dataloader=get_dataloader(X_test, y_test, cur_model, False) 61 | 62 | # print('Training set is {}\nValidation set is {}\nTest set is {}'.format(len(train_dataloader.dataset), len(validation_dataloader.dataset), len(prediction_dataloader.dataset))) 63 | 64 | if rerun_flag: 65 | X_train, X_validation, y_train, y_validation = train_test_split(X_train, 66 | y_train, 67 | test_size=0.05, 68 | random_state=SEED, 69 | stratify=y_train) 70 | 71 | #train_dataloader=get_dataloader(X_train, y_train,cur_model,True) 72 | #validation_dataloader=get_dataloader(X_validation, y_validation,cur_model,False) 73 | 74 | train_iterator=get_iterator(X_train, y_train, cur_model, True) 75 | valid_iterator=get_iterator(X_validation, y_validation, cur_model, False) 76 | 77 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 78 | model.cuda() 79 | 80 | optimizer = AdamW(model.parameters(), 81 | lr=LEARNING_RATE, 82 | eps=EPS, 83 | weight_decay=WEIGHT_DECAY) 84 | 85 | #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.8) # 5e-5 * 0.8 = 4e-5 86 | 87 | def process_function(engine, batch): 88 | model.train() 89 | optimizer.zero_grad() 90 | 91 | b_input_ids = batch.INPUT_IDS 92 | b_input_mask = batch.ATTENTION_MASKS 93 | b_labels = batch.LABEL 94 | 95 | 96 | outputs = model(b_input_ids, 97 | token_type_ids=None, 98 | attention_mask=b_input_mask, 99 | labels=b_labels) 100 | 101 | loss = outputs[0] 102 | logits = outputs[1] 103 | 104 | loss.backward() 105 | optimizer.step() 106 | 
#scheduler.step() 107 | return loss.item() 108 | 109 | def eval_function(engine, batch): 110 | model.eval() 111 | with torch.no_grad(): 112 | b_input_ids = batch.INPUT_IDS 113 | b_input_mask = batch.ATTENTION_MASKS 114 | b_labels = batch.LABEL 115 | 116 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 117 | #logits = outputs[0] 118 | y_pred=outputs[0] 119 | 120 | return y_pred, b_labels 121 | 122 | trainer = Engine(process_function) 123 | train_evaluator = Engine(eval_function) 124 | validation_evaluator = Engine(eval_function) 125 | 126 | #print('success!') 127 | #### Metrics 128 | RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss') 129 | 130 | def output_transform_fun(output): 131 | y_pred, y = output 132 | y_pred=y_pred.detach().cpu().numpy() 133 | y=y.to('cpu').numpy() 134 | y_pred=np.argmax(y_pred, axis=1).flatten() 135 | return torch.from_numpy(y_pred), torch.from_numpy(y) 136 | 137 | criterion = nn.CrossEntropyLoss() 138 | ### Training 139 | #Accuracy(output_transform=output_transform_fun).attach(train_evaluator, 'accuracy') 140 | Loss(criterion).attach(train_evaluator, 'cross-entropy') 141 | 142 | #precision = Precision(output_transform=output_transform_fun, average=False) 143 | #.detach().cpu().numpy() 144 | #recall = Recall(output_transform=output_transform_fun, average=False) 145 | #.detach().cpu().numpy() 146 | #F1 = (precision * recall * 2) / (precision + recall) 147 | 148 | #precision.attach(train_evaluator, 'precision') 149 | #recall.attach(train_evaluator, 'recall') 150 | #F1.attach(train_evaluator, 'F1') 151 | 152 | ### Validation 153 | #Accuracy(output_transform=output_transform_fun).attach(validation_evaluator, 'accuracy') 154 | Loss(criterion).attach(validation_evaluator, 'cross-entropy') 155 | 156 | #precision.attach(validation_evaluator, 'precision') 157 | #recall.attach(validation_evaluator, 'recall') 158 | #F1.attach(validation_evaluator, 'F1') 159 | 160 | #### Progress Bar 161 | pbar = ProgressBar(persist=True, bar_format="") 162 | pbar.attach(trainer, ['loss']) 163 | 164 | def score_function_loss(engine): 165 | val_loss = engine.state.metrics['cross-entropy'] 166 | return -val_loss 167 | 168 | def score_function_f1(engine): 169 | val_f1 = engine.state.metrics['F1'] 170 | if math.isnan(val_f1): 171 | return -9999 172 | return val_f1 173 | 174 | handler = EarlyStopping(patience=2, score_function=score_function_loss, trainer=trainer) 175 | 176 | validation_evaluator.add_event_handler(Events.COMPLETED, handler) 177 | 178 | def log_training_results(engine): 179 | train_evaluator.run(train_iterator) 180 | metrics = train_evaluator.state.metrics 181 | pbar.log_message( 182 | "Training Results - Epoch: {} \nMetrics\n{}" 183 | .format(engine.state.epoch, pprint.pformat(metrics))) 184 | 185 | def log_validation_results(engine): 186 | validation_evaluator.run(valid_iterator) 187 | metrics = validation_evaluator.state.metrics 188 | pbar.log_message( 189 | "Validation Results - Epoch: {} \nMetrics\n{}" 190 | .format(engine.state.epoch, pprint.pformat(metrics))) 191 | pbar.n = pbar.last_print_n = 0 192 | 193 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results) 194 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results) 195 | 196 | #### Checkpoint 197 | 198 | # to_save = {'{}_{}'.format(p_name, m_name): model, 199 | # 'optimizer': optimizer, 200 | # 'lr_scheduler': scheduler 201 | # } 202 | 203 | to_save={'gh_{}'.format(m_name): model} 204 | 205 | cp_handler = Checkpoint(to_save, 206 | 
DiskSaver('../models/', 207 | create_dir=True, require_empty=False), 208 | filename_prefix='best', 209 | score_function=score_function_loss, 210 | score_name='val_loss') 211 | 212 | validation_evaluator.add_event_handler(Events.COMPLETED, cp_handler) 213 | #trainer.add_event_handler(Events.ITERATION_COMPLETED(every=1000), cp_handler) 214 | 215 | # checkpointer = ModelCheckpoint('../models/', '{}'.format(p_name), create_dir=True, save_as_state_dict=True, require_empty=False) 216 | 217 | # trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan()) 218 | trainer.run(train_iterator, max_epochs=4) 219 | else: 220 | print('Running saved model...') 221 | #run_on_test(cur_model, p_name, m_name, pred_iterator) 222 | run_saved_model(prediction_dataloader, cur_model, 'gh', m_name) -------------------------------------------------------------------------------- /scripts/PTM/early-stopping/jira.py: -------------------------------------------------------------------------------- 1 | # Created by happygirlzt 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | sys.path.append('/media/DATA/tingzhang-data/sa4se/scripts') 5 | 6 | from utils import * 7 | from sklearn.model_selection import train_test_split 8 | import argparse 9 | import pprint 10 | import math 11 | from transformers import AdamW 12 | from ignite.engine import Engine, Events 13 | from ignite.metrics import Accuracy, Loss, RunningAverage, Precision, Recall 14 | from ignite.handlers import Checkpoint, DiskSaver, EarlyStopping 15 | from ignite.contrib.handlers import ProgressBar 16 | 17 | import logging 18 | logging.basicConfig(level=logging.ERROR) 19 | 20 | ## Read model name and project name 21 | parser = argparse.ArgumentParser(description='Choose the models.') 22 | 23 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 24 | help='Enter an integer... 0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 25 | 26 | parser.add_argument('-r', '--re_run', default=0, type=int, nargs='?', 27 | help='Enter an integer... 
0-re-run the saved model, 1-run new model; default: 0') 28 | 29 | args = parser.parse_args() 30 | #print(args.model_num) 31 | #print(args.project_num) 32 | 33 | m_num=args.model_num 34 | rerun_flag=bool(args.re_run) 35 | 36 | # Generate training, validation and test set 37 | data_folder=Path('../data/') 38 | 39 | cur_model=MODELS[m_num] 40 | m_name=MODEL_NAMES[m_num] 41 | 42 | print('Running model {} in Jira'.format(m_name)) 43 | 44 | #### Read data 45 | train_data=pd.read_pickle(data_folder/'jira-train.pkl') 46 | train_data['label']=train_data['label'].replace(-1, 0) 47 | 48 | X_train=train_data['sentence'] 49 | y_train=train_data['label'] 50 | 51 | test_data=pd.read_pickle(data_folder/'jira-test.pkl') 52 | test_data['label']=test_data['label'].replace(-1, 0) 53 | 54 | X_test=test_data['sentence'] 55 | y_test=test_data['label'] 56 | print('Read success!') 57 | 58 | # pred_iterator=get_iterator(X_test, y_test, cur_model, False) 59 | 60 | prediction_dataloader=get_dataloader(X_test, y_test, cur_model, False) 61 | 62 | # print('Training set is {}\nValidation set is {}\nTest set is {}'.format(len(train_dataloader.dataset), len(validation_dataloader.dataset), len(prediction_dataloader.dataset))) 63 | 64 | if rerun_flag: 65 | X_train, X_validation, y_train, y_validation = train_test_split(X_train, 66 | y_train, 67 | test_size=0.05, 68 | random_state=SEED, 69 | stratify=y_train) 70 | 71 | #train_dataloader=get_dataloader(X_train, y_train,cur_model,True) 72 | #validation_dataloader=get_dataloader(X_validation, y_validation,cur_model,False) 73 | 74 | train_iterator=get_iterator(X_train, y_train, cur_model, True) 75 | valid_iterator=get_iterator(X_validation, y_validation, cur_model, False) 76 | 77 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 78 | model.cuda() 79 | 80 | optimizer = AdamW(model.parameters(), 81 | lr=LEARNING_RATE, 82 | eps=EPS, 83 | weight_decay=WEIGHT_DECAY) 84 | 85 | #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.8) # 5e-5 * 0.8 = 4e-5 86 | 87 | def process_function(engine, batch): 88 | model.train() 89 | optimizer.zero_grad() 90 | 91 | b_input_ids = batch.INPUT_IDS 92 | b_input_mask = batch.ATTENTION_MASKS 93 | b_labels = batch.LABEL 94 | 95 | 96 | outputs = model(b_input_ids, 97 | token_type_ids=None, 98 | attention_mask=b_input_mask, 99 | labels=b_labels) 100 | 101 | loss = outputs[0] 102 | logits = outputs[1] 103 | 104 | loss.backward() 105 | optimizer.step() 106 | #scheduler.step() 107 | return loss.item() 108 | 109 | def eval_function(engine, batch): 110 | model.eval() 111 | with torch.no_grad(): 112 | b_input_ids = batch.INPUT_IDS 113 | b_input_mask = batch.ATTENTION_MASKS 114 | b_labels = batch.LABEL 115 | 116 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 117 | #logits = outputs[0] 118 | y_pred=outputs[0] 119 | 120 | return y_pred, b_labels 121 | 122 | trainer = Engine(process_function) 123 | train_evaluator = Engine(eval_function) 124 | validation_evaluator = Engine(eval_function) 125 | 126 | #print('success!') 127 | #### Metrics 128 | RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss') 129 | 130 | def output_transform_fun(output): 131 | y_pred, y = output 132 | y_pred=y_pred.detach().cpu().numpy() 133 | y=y.to('cpu').numpy() 134 | y_pred=np.argmax(y_pred, axis=1).flatten() 135 | return torch.from_numpy(y_pred), torch.from_numpy(y) 136 | 137 | criterion = nn.CrossEntropyLoss() 138 | ### Training 139 | 
#Accuracy(output_transform=output_transform_fun).attach(train_evaluator, 'accuracy') 140 | Loss(criterion).attach(train_evaluator, 'cross-entropy') 141 | 142 | #precision = Precision(output_transform=output_transform_fun, average=False) 143 | #.detach().cpu().numpy() 144 | #recall = Recall(output_transform=output_transform_fun, average=False) 145 | #.detach().cpu().numpy() 146 | #F1 = (precision * recall * 2) / (precision + recall) 147 | 148 | #precision.attach(train_evaluator, 'precision') 149 | #recall.attach(train_evaluator, 'recall') 150 | #F1.attach(train_evaluator, 'F1') 151 | 152 | ### Validation 153 | #Accuracy(output_transform=output_transform_fun).attach(validation_evaluator, 'accuracy') 154 | Loss(criterion).attach(validation_evaluator, 'cross-entropy') 155 | 156 | #precision.attach(validation_evaluator, 'precision') 157 | #recall.attach(validation_evaluator, 'recall') 158 | #F1.attach(validation_evaluator, 'F1') 159 | 160 | #### Progress Bar 161 | pbar = ProgressBar(persist=True, bar_format="") 162 | pbar.attach(trainer, ['loss']) 163 | 164 | def score_function_loss(engine): 165 | val_loss = engine.state.metrics['cross-entropy'] 166 | return -val_loss 167 | 168 | def score_function_f1(engine): 169 | val_f1 = engine.state.metrics['F1'] 170 | if math.isnan(val_f1): 171 | return -9999 172 | return val_f1 173 | 174 | handler = EarlyStopping(patience=2, score_function=score_function_loss, trainer=trainer) 175 | 176 | validation_evaluator.add_event_handler(Events.COMPLETED, handler) 177 | 178 | def log_training_results(engine): 179 | train_evaluator.run(train_iterator) 180 | metrics = train_evaluator.state.metrics 181 | pbar.log_message( 182 | "Training Results - Epoch: {} \nMetrics\n{}" 183 | .format(engine.state.epoch, pprint.pformat(metrics))) 184 | 185 | def log_validation_results(engine): 186 | validation_evaluator.run(valid_iterator) 187 | metrics = validation_evaluator.state.metrics 188 | pbar.log_message( 189 | "Validation Results - Epoch: {} \nMetrics\n{}" 190 | .format(engine.state.epoch, pprint.pformat(metrics))) 191 | pbar.n = pbar.last_print_n = 0 192 | 193 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results) 194 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results) 195 | 196 | #### Checkpoint 197 | 198 | # to_save = {'{}_{}'.format(p_name, m_name): model, 199 | # 'optimizer': optimizer, 200 | # 'lr_scheduler': scheduler 201 | # } 202 | 203 | to_save={'jira_{}'.format(m_name): model} 204 | 205 | cp_handler = Checkpoint(to_save, 206 | DiskSaver('../models/', 207 | create_dir=True, require_empty=False), 208 | filename_prefix='best', 209 | score_function=score_function_loss, 210 | score_name='val_loss') 211 | 212 | validation_evaluator.add_event_handler(Events.COMPLETED, cp_handler) 213 | #trainer.add_event_handler(Events.ITERATION_COMPLETED(every=1000), cp_handler) 214 | 215 | # checkpointer = ModelCheckpoint('../models/', '{}'.format(p_name), create_dir=True, save_as_state_dict=True, require_empty=False) 216 | 217 | # trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan()) 218 | trainer.run(train_iterator, max_epochs=4) 219 | else: 220 | print('Running saved model...') 221 | #run_on_test(cur_model, p_name, m_name, pred_iterator) 222 | run_saved_model(prediction_dataloader, cur_model, 'jira', m_name) -------------------------------------------------------------------------------- /scripts/PTM/early-stopping/so.py: -------------------------------------------------------------------------------- 1 | # Created by
happygirlzt 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | sys.path.append('/media/DATA/tingzhang-data/sa4se/scripts') 5 | 6 | from utils import * 7 | from sklearn.model_selection import train_test_split 8 | import argparse 9 | import pprint 10 | import math 11 | from transformers import AdamW 12 | from ignite.engine import Engine, Events 13 | from ignite.metrics import Accuracy, Loss, RunningAverage, Precision, Recall 14 | from ignite.handlers import Checkpoint, DiskSaver, EarlyStopping 15 | from ignite.contrib.handlers import ProgressBar 16 | 17 | import logging 18 | logging.basicConfig(level=logging.ERROR) 19 | 20 | ## Read model name and project name 21 | parser = argparse.ArgumentParser(description='Choose the models.') 22 | 23 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 24 | help='Enter an integer... 0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 25 | 26 | parser.add_argument('-r', '--re_run', default=0, type=int, nargs='?', 27 | help='Enter an integer... 0-re-run the saved model, 1-run new model; default: 0') 28 | 29 | args = parser.parse_args() 30 | #print(args.model_num) 31 | #print(args.project_num) 32 | 33 | m_num=args.model_num 34 | rerun_flag=bool(args.re_run) 35 | 36 | # Generate training, validation and test set 37 | data_folder=Path('../data/') 38 | 39 | cur_model=MODELS[m_num] 40 | m_name=MODEL_NAMES[m_num] 41 | 42 | print('Running model {} in Stack Overflow'.format(m_name)) 43 | 44 | #### Read data 45 | train_data=pd.read_pickle(data_folder/'so-train.pkl') 46 | train_data['label']=train_data['label'].replace(-1, 2) 47 | 48 | X_train=train_data['sentence'] 49 | y_train=train_data['label'] 50 | 51 | test_data=pd.read_pickle(data_folder/'so-test.pkl') 52 | test_data['label']=test_data['label'].replace(-1, 2) 53 | 54 | X_test=test_data['sentence'] 55 | y_test=test_data['label'] 56 | print('Read success!') 57 | 58 | # pred_iterator=get_iterator(X_test, y_test, cur_model, False) 59 | 60 | prediction_dataloader=get_dataloader(X_test, y_test, cur_model, False) 61 | 62 | # print('Training set is {}\nValidation set is {}\nTest set is {}'.format(len(train_dataloader.dataset), len(validation_dataloader.dataset), len(prediction_dataloader.dataset))) 63 | 64 | if rerun_flag: 65 | X_train, X_validation, y_train, y_validation = train_test_split(X_train, 66 | y_train, 67 | test_size=0.05, 68 | random_state=SEED, 69 | stratify=y_train) 70 | 71 | #train_dataloader=get_dataloader(X_train, y_train,cur_model,True) 72 | #validation_dataloader=get_dataloader(X_validation, y_validation,cur_model,False) 73 | 74 | train_iterator=get_iterator(X_train, y_train, cur_model, True) 75 | valid_iterator=get_iterator(X_validation, y_validation, cur_model, False) 76 | 77 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 78 | model.cuda() 79 | 80 | optimizer = AdamW(model.parameters(), 81 | lr=LEARNING_RATE, 82 | eps=EPS, 83 | weight_decay=WEIGHT_DECAY) 84 | 85 | #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.8) # 5e-5 * 0.8 = 4e-5 86 | 87 | def process_function(engine, batch): 88 | model.train() 89 | optimizer.zero_grad() 90 | 91 | b_input_ids = batch.INPUT_IDS 92 | b_input_mask = batch.ATTENTION_MASKS 93 | b_labels = batch.LABEL 94 | 95 | 96 | outputs = model(b_input_ids, 97 | token_type_ids=None, 98 | attention_mask=b_input_mask, 99 | labels=b_labels) 100 | 101 | loss = outputs[0] 102 | logits = outputs[1] 103 | 104 | loss.backward() 105 | optimizer.step() 106 | #scheduler.step() 107 | return loss.item() 108 | 109 | def 
eval_function(engine, batch): 110 | model.eval() 111 | with torch.no_grad(): 112 | b_input_ids = batch.INPUT_IDS 113 | b_input_mask = batch.ATTENTION_MASKS 114 | b_labels = batch.LABEL 115 | 116 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 117 | #logits = outputs[0] 118 | y_pred=outputs[0] 119 | 120 | return y_pred, b_labels 121 | 122 | trainer = Engine(process_function) 123 | train_evaluator = Engine(eval_function) 124 | validation_evaluator = Engine(eval_function) 125 | 126 | #print('success!') 127 | #### Metrics 128 | RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss') 129 | 130 | def output_transform_fun(output): 131 | y_pred, y = output 132 | y_pred=y_pred.detach().cpu().numpy() 133 | y=y.to('cpu').numpy() 134 | y_pred=np.argmax(y_pred, axis=1).flatten() 135 | return torch.from_numpy(y_pred), torch.from_numpy(y) 136 | 137 | criterion = nn.CrossEntropyLoss() 138 | ### Training 139 | #Accuracy(output_transform=output_transform_fun).attach(train_evaluator, 'accuracy') 140 | Loss(criterion).attach(train_evaluator, 'cross-entropy') 141 | 142 | #precision = Precision(output_transform=output_transform_fun, average=False) 143 | #.detach().cpu().numpy() 144 | #recall = Recall(output_transform=output_transform_fun, average=False) 145 | #.detach().cpu().numpy() 146 | #F1 = (precision * recall * 2) / (precision + recall) 147 | 148 | #precision.attach(train_evaluator, 'precision') 149 | #recall.attach(train_evaluator, 'recall') 150 | #F1.attach(train_evaluator, 'F1') 151 | 152 | ### Validation 153 | #Accuracy(output_transform=output_transform_fun).attach(validation_evaluator, 'accuracy') 154 | Loss(criterion).attach(validation_evaluator, 'cross-entropy') 155 | 156 | #precision.attach(validation_evaluator, 'precision') 157 | #recall.attach(validation_evaluator, 'recall') 158 | #F1.attach(validation_evaluator, 'F1') 159 | 160 | #### Progress Bar 161 | pbar = ProgressBar(persist=True, bar_format="") 162 | pbar.attach(trainer, ['loss']) 163 | 164 | def score_function_loss(engine): 165 | val_loss = engine.state.metrics['cross-entropy'] 166 | return -val_loss 167 | 168 | def score_function_f1(engine): 169 | val_f1 = engine.state.metrics['F1'] 170 | if math.isnan(val_f1): 171 | return -9999 172 | return val_f1 173 | 174 | handler = EarlyStopping(patience=2, score_function=score_function_loss, trainer=trainer) 175 | 176 | validation_evaluator.add_event_handler(Events.COMPLETED, handler) 177 | 178 | def log_training_results(engine): 179 | train_evaluator.run(train_iterator) 180 | metrics = train_evaluator.state.metrics 181 | pbar.log_message( 182 | "Training Results - Epoch: {} \nMetrics\n{}" 183 | .format(engine.state.epoch, pprint.pformat(metrics))) 184 | 185 | def log_validation_results(engine): 186 | validation_evaluator.run(valid_iterator) 187 | metrics = validation_evaluator.state.metrics 188 | pbar.log_message( 189 | "Validation Results - Epoch: {} \nMetrics\n{}" 190 | .format(engine.state.epoch, pprint.pformat(metrics))) 191 | pbar.n = pbar.last_print_n = 0 192 | 193 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results) 194 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results) 195 | 196 | #### Checkpoint 197 | 198 | # to_save = {'{}_{}'.format(p_name, m_name): model, 199 | # 'optimizer': optimizer, 200 | # 'lr_scheduler': scheduler 201 | # } 202 | 203 | to_save={'so_{}'.format(m_name): model} 204 | 205 | cp_handler = Checkpoint(to_save, 206 | DiskSaver('../models/', 207 | create_dir=True, 
require_empty=False), 208 | filename_prefix='best', 209 | score_function=score_function_loss, 210 | score_name='val_loss') 211 | 212 | validation_evaluator.add_event_handler(Events.COMPLETED, cp_handler) 213 | #trainer.add_event_handler(Events.ITERATION_COMPLETED(every=1000), cp_handler) 214 | 215 | # checkpointer = ModelCheckpoint('../models/', '{}'.format(p_name), create_dir=True, save_as_state_dict=True, require_empty=False) 216 | 217 | # trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan()) 218 | trainer.run(train_iterator, max_epochs=4) 219 | else: 220 | print('Running saved model...') 221 | #run_on_test(cur_model, p_name, m_name, pred_iterator) 222 | run_saved_model(prediction_dataloader, cur_model, 'so', m_name) -------------------------------------------------------------------------------- /scripts/PTM/early-stopping/utils.py: -------------------------------------------------------------------------------- 1 | # Created by happygirlzt 2 | import torch 3 | import torch.nn as nn 4 | from transformers import BertTokenizer, BertForSequenceClassification 5 | from transformers import DistilBertTokenizer, DistilBertForSequenceClassification 6 | from transformers import XLNetTokenizer, XLNetForSequenceClassification 7 | from transformers import RobertaTokenizer, RobertaForSequenceClassification 8 | from transformers import AlbertTokenizer, AlbertForSequenceClassification 9 | 10 | from sklearn.model_selection import train_test_split 11 | from torch.utils.data import TensorDataset, DataLoader 12 | import random 13 | import numpy as np 14 | import pandas as pd 15 | from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score 16 | from pathlib import Path 17 | import re 18 | import torchtext 19 | import glob 20 | from torchtext import data 21 | from torchtext.data import Field 22 | 23 | 24 | if torch.cuda.is_available(): 25 | device = torch.device("cuda") 26 | #print(f'There are {torch.cuda.device_count()} GPU(s) available.') 27 | #print('Device name:', torch.cuda.get_device_name(0)) 28 | 29 | else: 30 | print('No GPU available, using the CPU instead.') 31 | device = torch.device("cpu") 32 | 33 | data_folder=Path('../data/') 34 | model_folder=Path('../models/') 35 | result_folder=Path('../result/') 36 | 37 | # Model | Tokenizer | Pretrained weights shortcut 38 | MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'), 39 | (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'), 40 | (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'), 41 | (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1') 42 | ] 43 | 44 | MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert'] 45 | 46 | ## Parameter settings 47 | BATCH_SIZE=16 48 | LEARNING_RATE=2e-5 49 | MAX_SEQ_LENGTH=256 50 | SEED=42 51 | EPOCHS=4 52 | EPS=1e-8 53 | WEIGHT_DECAY=1e-5 54 | 55 | def seed_torch(seed): 56 | random.seed(seed) 57 | np.random.seed(seed) 58 | torch.manual_seed(seed) 59 | torch.cuda.manual_seed(seed) 60 | torch.backends.cudnn.deterministic=True 61 | 62 | seed_torch(SEED) 63 | 64 | 65 | def get_dataloader(X_cur, y_cur, cur_model, is_train): 66 | input_ids, attention_masks = preprocessing_for_classifier_tensor(X_cur.values, cur_model) 67 | 68 | labels = torch.from_numpy(np.array(y_cur, dtype='int64')) 69 | 70 | cur_dataset = TensorDataset(input_ids, attention_masks, labels) 71 | 72 | cur_dataloader = DataLoader( 73 | cur_dataset, 74 | batch_size = BATCH_SIZE, 75 | shuffle=is_train) 76 | 77 | return
cur_dataloader 78 | 79 | def get_iterator(X_cur, y_cur, cur_model, is_train): 80 | input_ids, attention_masks = preprocessing_for_classifier_list(X_cur.values, cur_model) 81 | #print(f'type of input_ids: {type(input_ids)}') 82 | #print(f'type of input_ids[0]: {type(input_ids[0])}') 83 | #print(f'type of attention_masks: {type(attention_masks)}') 84 | #print(f'type of attention_masks[0]: {type(attention_masks[0])}') 85 | labels = torch.from_numpy(np.array(y_cur, dtype='int64')) 86 | 87 | INPUT_IDS=Field(sequential=False, use_vocab=False, batch_first=True) 88 | ATTENTION_MASKS=Field(sequential=False, use_vocab=False, batch_first=True) 89 | LABEL=Field(sequential=False, use_vocab=False, batch_first=True) 90 | 91 | fields=[ 92 | ('INPUT_IDS', INPUT_IDS), 93 | ('ATTENTION_MASKS', ATTENTION_MASKS), 94 | ('LABEL', LABEL) 95 | ] 96 | examples=[] 97 | for i in range(len(labels)): 98 | examples.append(data.Example.fromlist([input_ids[i], 99 | attention_masks[i], 100 | labels[i]], 101 | fields)) 102 | 103 | 104 | cur_dataset = torchtext.data.Dataset(examples, fields) 105 | cur_iterator = data.BucketIterator(cur_dataset, batch_size=BATCH_SIZE, device='cuda', shuffle=is_train) 106 | return cur_iterator 107 | 108 | def preprocessing_for_classifier_tensor(sentences, cur_model): 109 | tokenizer=cur_model[1].from_pretrained(cur_model[2]) 110 | input_ids=[] 111 | attention_masks=[] 112 | 113 | for sent in sentences: 114 | encoded_sent = tokenizer.encode_plus( 115 | str(sent), 116 | add_special_tokens=True, 117 | max_length=MAX_SEQ_LENGTH, 118 | pad_to_max_length=True, 119 | return_tensors='pt', # Return PyTorch tensor 120 | return_attention_mask=True 121 | ) 122 | 123 | input_ids.append(encoded_sent.get('input_ids')) 124 | attention_masks.append(encoded_sent.get('attention_mask')) 125 | 126 | input_ids = torch.cat(input_ids, dim=0) 127 | attention_masks = torch.cat(attention_masks, dim=0) 128 | 129 | return input_ids, attention_masks 130 | 131 | def preprocessing_for_classifier_list(sentences, cur_model): 132 | tokenizer=cur_model[1].from_pretrained(cur_model[2]) 133 | input_ids = [] 134 | attention_masks = [] 135 | 136 | for sent in sentences: 137 | encoded_sent = tokenizer.encode_plus( 138 | str(sent), 139 | add_special_tokens=True, 140 | max_length=MAX_SEQ_LENGTH, 141 | pad_to_max_length=True, 142 | return_attention_mask=True 143 | ) 144 | 145 | input_ids.append(encoded_sent.get('input_ids')) 146 | attention_masks.append(encoded_sent.get('attention_mask')) 147 | 148 | return input_ids, attention_masks 149 | 150 | 151 | def run_saved_model(prediction_dataloader, cur_model, p_name, m_name): 152 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 153 | model.cuda() 154 | # satd_classifier.load_state_dict(torch.load(data_folder/'{}-{}.bin'.format(p_name, m_name))) 155 | # print('{}-{}.bin loaded'.format(p_name, m_name)) 156 | 157 | name_pattern='/sa4se/models/best_{}_{}_*'.format(p_name, m_name) 158 | # print(type(glob.glob(name_pattern))) 159 | candidates=glob.glob(name_pattern) 160 | candidates.sort(reverse=True) 161 | file_name=candidates[0] 162 | 163 | model.load_state_dict(torch.load(file_name)) 164 | print('{} loaded'.format(file_name)) 165 | 166 | model.eval() 167 | predictions, true_labels = [], [] 168 | 169 | for batch in prediction_dataloader: 170 | batch = tuple(t.to(device) for t in batch) 171 | 172 | b_input_ids, b_input_mask, b_labels = batch 173 | 174 | with torch.no_grad(): 175 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 176 | 177 | logits = 
outputs[0] 178 | 179 | # will create a synchronization point 180 | logits = logits.detach().cpu().numpy() 181 | label_ids = b_labels.to('cpu').numpy() 182 | 183 | predictions.append(logits) 184 | true_labels.append(label_ids) 185 | 186 | print(' DONE.') 187 | 188 | flat_predictions = [item for sublist in predictions for item in sublist] 189 | flat_predictions = np.argmax(flat_predictions, axis=1).flatten() 190 | flat_true_labels = [item for sublist in true_labels for item in sublist] 191 | 192 | #print('Precision is {:.3f}'.format(precision_score(flat_true_labels, flat_predictions))) 193 | #print('Recall is {:.3f}'.format(recall_score(flat_true_labels, flat_predictions))) 194 | #print('F1-score is {:.3f}'.format(f1_score(flat_true_labels, flat_predictions))) 195 | print(classification_report(flat_true_labels, flat_predictions)) -------------------------------------------------------------------------------- /scripts/PTM/github.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from utils import * 3 | from transformers import BertTokenizer, BertModel, BertForSequenceClassification 4 | from transformers import XLNetTokenizer, XLNetForSequenceClassification 5 | from transformers import RobertaTokenizer, RobertaForSequenceClassification 6 | from transformers import AlbertTokenizer, AlbertForSequenceClassification 7 | import argparse 8 | 9 | # Model | Tokenizer | Pretrained weights shortcut 10 | MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'), 11 | (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'), 12 | (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'), 13 | (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1') 14 | ] 15 | 16 | MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert'] 17 | 18 | seed_torch(42) 19 | ## Read model name 20 | parser = argparse.ArgumentParser(description='Choose the models.') 21 | 22 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 23 | help='Enter an integer... 
0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 24 | 25 | 26 | args = parser.parse_args() 27 | m_num=args.model_num 28 | 29 | cur_model=MODELS[m_num] 30 | m_name=MODEL_NAMES[m_num] 31 | 32 | train_df=pd.read_pickle(gh_train) 33 | train_df['label']=train_df['label'].replace({'positive':1, 'negative':2, 'neutral':0}) 34 | 35 | tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True)  # note: do_lower_case=True lowercases input even though cased checkpoints are used 36 | 37 | sentences=train_df.sentence.values 38 | labels=train_df.label.values 39 | 40 | # max_len = 0 41 | # for sent in sentences: 42 | # input_ids=tokenizer.encode(sent, add_special_tokens=True) 43 | # max_len=max(max_len, len(input_ids)) 44 | # print('Max sentence length: ', max_len) 45 | 46 | input_ids = [] 47 | attention_masks = [] 48 | 49 | for sent in sentences: 50 | 51 | encoded_dict = tokenizer.encode_plus( 52 | str(sent), 53 | add_special_tokens = True, 54 | max_length = MAX_LEN, 55 | pad_to_max_length = True, 56 | return_attention_mask = True, 57 | return_tensors = 'pt' 58 | ) 59 | 60 | input_ids.append(encoded_dict['input_ids']) 61 | attention_masks.append(encoded_dict['attention_mask']) 62 | 63 | 64 | train_inputs = torch.cat(input_ids, dim=0) 65 | train_masks = torch.cat(attention_masks, dim=0) 66 | train_labels = torch.tensor(labels) 67 | 68 | print('Training data {} {} {}'.format(train_inputs.shape, train_masks.shape, train_labels.shape)) 69 | 70 | train_data = TensorDataset(train_inputs, train_masks, train_labels) 71 | train_sampler = RandomSampler(train_data) 72 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) 73 | 74 | # Train Model 75 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 76 | model.cuda() 77 | 78 | param_optimizer = list(model.named_parameters()) 79 | no_decay = ['bias', 'gamma', 'beta'] 80 | optimizer_grouped_parameters = [ 81 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 82 | 'weight_decay_rate': 0.01}, 83 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 84 | 'weight_decay_rate': 0.0} 85 | ] 86 | 87 | optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE) 88 | 89 | begin=time.time() 90 | train_loss_set = [] 91 | 92 | for _ in trange(EPOCHS, desc="Epoch"): 93 | 94 | model.train() 95 | 96 | tr_loss = 0 97 | nb_tr_examples, nb_tr_steps = 0, 0 98 | 99 | for step, batch in enumerate(train_dataloader): 100 | 101 | batch = tuple(t.to(device) for t in batch) 102 | 103 | b_input_ids, b_input_mask, b_labels = batch 104 | optimizer.zero_grad() 105 | 106 | # Forward pass 107 | outputs = model(b_input_ids, token_type_ids=None, \ 108 | attention_mask=b_input_mask, labels=b_labels) 109 | loss = outputs[0] 110 | logits = outputs[1] 111 | train_loss_set.append(loss.item()) 112 | 113 | # Backward pass 114 | loss.backward() 115 | optimizer.step() 116 | 117 | tr_loss += loss.item() 118 | nb_tr_examples += b_input_ids.size(0) 119 | nb_tr_steps += 1 120 | 121 | print("Train loss: {}".format(tr_loss/nb_tr_steps)) 122 | 123 | end=time.time() 124 | print('Training used {:.2f} seconds'.format(end-begin)) 125 | 126 | ### Test 127 | begin=time.time() 128 | test_df=pd.read_pickle(gh_test) 129 | 130 | test_df['label']=test_df['label'].replace({ 131 | 'positive':1, 132 | 'negative':2, 133 | 'neutral':0}) 134 | 135 | sentences=test_df.sentence.values 136 | labels = test_df.label.values 137 | 138 | input_ids = [] 139 | attention_masks = [] 140 | 141 | for sent in sentences: 142 | encoded_dict = tokenizer.encode_plus( 143 | str(sent), 144 | add_special_tokens = True, 
145 | max_length = MAX_LEN, 146 | pad_to_max_length = True, 147 | return_attention_mask = True, 148 | return_tensors = 'pt', 149 | ) 150 | 151 | input_ids.append(encoded_dict['input_ids']) 152 | attention_masks.append(encoded_dict['attention_mask']) 153 | 154 | prediction_inputs = torch.cat(input_ids,dim=0) 155 | prediction_masks = torch.cat(attention_masks,dim=0) 156 | prediction_labels = torch.tensor(labels) 157 | 158 | prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels) 159 | prediction_sampler = SequentialSampler(prediction_data) 160 | prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE) 161 | 162 | model.eval() 163 | predictions,true_labels=[],[] 164 | 165 | for batch in prediction_dataloader: 166 | batch = tuple(t.to(device) for t in batch) 167 | b_input_ids, b_input_mask, b_labels = batch 168 | 169 | with torch.no_grad(): 170 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 171 | logits = outputs[0] 172 | 173 | logits = logits.detach().cpu().numpy() 174 | label_ids = b_labels.to('cpu').numpy() 175 | 176 | predictions.append(logits) 177 | true_labels.append(label_ids) 178 | 179 | end=time.time() 180 | print('Prediction used {:.2f} seconds'.format(end - begin)) 181 | 182 | flat_predictions = [item for sublist in predictions for item in sublist] 183 | flat_predictions = np.argmax(flat_predictions, axis=1).flatten() 184 | flat_true_labels = [item for sublist in true_labels for item in sublist] 185 | 186 | ### Get predictions on XLNet 187 | # new_df=pd.DataFrame(columns=['Text', 'True_label', 'XLNet_predicted']) 188 | 189 | # new_df['Text'] = pd.Series(sentences) 190 | # new_df['True_label'] = pd.Series(flat_true_labels) 191 | # new_df['True_label']=new_df['True_label'].replace({0: 'neutral', 1: 'positive', 2:'negative'}) 192 | # new_df['XLNet_predicted'] = pd.Series(flat_predictions) 193 | # new_df['XLNet_predicted']=new_df['XLNet_predicted'].replace( 194 | # {0: 'neutral', 1: 'positive', 2:'negative'}) 195 | # new_df.to_csv(data_folder/'XLNet_github_predictions.csv', header=True) 196 | 197 | # Evaluation of the selected model on the GitHub dataset 198 | print("Accuracy of {} on GitHub is: {}".format(m_name, accuracy_score(flat_true_labels,flat_predictions))) 199 | 200 | print(classification_report(flat_true_labels,flat_predictions)) -------------------------------------------------------------------------------- /scripts/PTM/jira.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from utils import * 3 | from transformers import BertTokenizer, BertModel, BertForSequenceClassification 4 | from transformers import XLNetTokenizer, XLNetForSequenceClassification 5 | from transformers import RobertaTokenizer, RobertaForSequenceClassification 6 | from transformers import AlbertTokenizer, AlbertForSequenceClassification 7 | import argparse 8 | 9 | # Model | Tokenizer | Pretrained weights shortcut 10 | MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'), 11 | (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'), 12 | (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'), 13 | (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1') 14 | ] 15 | 16 | MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert'] 17 | 18 | seed_torch(42) 19 | 20 | ## Read model name 21 | parser = argparse.ArgumentParser(description='Choose the models.') 22 | 23 | parser.add_argument('-m', 
'--model_num', default=0, type=int, nargs='?', 24 | help='Enter an integer... 0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 25 | 26 | 27 | args = parser.parse_args() 28 | m_num=args.model_num 29 | 30 | cur_model=MODELS[m_num] 31 | m_name=MODEL_NAMES[m_num] 32 | 33 | train_df = pd.read_pickle(jira_train) 34 | train_df['label']=train_df['label'].replace(-1, 0) 35 | # Jira labels are binary - Negative: 0, Positive: 1 36 | 37 | tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) 38 | 39 | sentences=train_df.sentence.values 40 | labels=train_df.label.values 41 | 42 | # max_len = 0 43 | # for sent in sentences: 44 | # input_ids=tokenizer.encode(sent, add_special_tokens=True) 45 | # max_len=max(max_len, len(input_ids)) 46 | # print('Max sentence length: ', max_len) 47 | 48 | input_ids = [] 49 | attention_masks = [] 50 | 51 | for sent in sentences: 52 | 53 | encoded_dict = tokenizer.encode_plus( 54 | str(sent), 55 | add_special_tokens = True, 56 | max_length = MAX_LEN, 57 | pad_to_max_length = True, 58 | return_attention_mask = True, 59 | return_tensors = 'pt' 60 | ) 61 | 62 | input_ids.append(encoded_dict['input_ids']) 63 | attention_masks.append(encoded_dict['attention_mask']) 64 | 65 | 66 | train_inputs = torch.cat(input_ids, dim=0) 67 | train_masks = torch.cat(attention_masks, dim=0) 68 | train_labels = torch.tensor(labels) 69 | 70 | print('Training data {} {} {}'.format(train_inputs.shape, train_masks.shape, train_labels.shape)) 71 | 72 | train_data = TensorDataset(train_inputs, train_masks, train_labels) 73 | train_sampler = RandomSampler(train_data) 74 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) 75 | 76 | # Train Model 77 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) # kept at 3 for consistency with the other scripts, although Jira only has two classes 78 | model.cuda() 79 | 80 | param_optimizer = list(model.named_parameters()) 81 | no_decay = ['bias', 'gamma', 'beta'] 82 | optimizer_grouped_parameters = [ 83 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 84 | 'weight_decay_rate': 0.01}, 85 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 86 | 'weight_decay_rate': 0.0} 87 | ] 88 | 89 | optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE) 90 | 91 | begin=time.time() 92 | train_loss_set = [] 93 | 94 | for _ in trange(EPOCHS, desc="Epoch"): 95 | 96 | model.train() 97 | 98 | tr_loss = 0 99 | nb_tr_examples, nb_tr_steps = 0, 0 100 | 101 | for step, batch in enumerate(train_dataloader): 102 | 103 | batch = tuple(t.to(device) for t in batch) 104 | 105 | b_input_ids, b_input_mask, b_labels = batch 106 | optimizer.zero_grad() 107 | 108 | # Forward pass 109 | outputs = model(b_input_ids, token_type_ids=None, \ 110 | attention_mask=b_input_mask, labels=b_labels) 111 | loss = outputs[0] 112 | logits = outputs[1] 113 | train_loss_set.append(loss.item()) 114 | 115 | # Backward pass 116 | loss.backward() 117 | optimizer.step() 118 | 119 | tr_loss += loss.item() 120 | nb_tr_examples += b_input_ids.size(0) 121 | nb_tr_steps += 1 122 | 123 | print("Train loss: {}".format(tr_loss/nb_tr_steps)) 124 | 125 | end=time.time() 126 | print('Training used {:.2f} seconds'.format(end-begin)) 127 | 128 | begin=time.time() 129 | test_df = pd.read_pickle(jira_test) 130 | test_df['label']=test_df['label'].replace(-1, 0) 131 | 132 | sentences=test_df.sentence.values 133 | labels = test_df.label.values 134 | 135 | input_ids = [] 136 | attention_masks = [] 137 | 138 | for sent in sentences: 139 | encoded_dict = tokenizer.encode_plus( 140 | str(sent), 141 | 
add_special_tokens = True, 142 | max_length = MAX_LEN, 143 | pad_to_max_length = True, 144 | return_attention_mask = True, 145 | return_tensors = 'pt' 146 | ) 147 | 148 | input_ids.append(encoded_dict['input_ids']) 149 | attention_masks.append(encoded_dict['attention_mask']) 150 | 151 | prediction_inputs = torch.cat(input_ids,dim=0) 152 | prediction_masks = torch.cat(attention_masks,dim=0) 153 | prediction_labels = torch.tensor(labels) 154 | 155 | prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels) 156 | prediction_sampler = SequentialSampler(prediction_data) 157 | prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE) 158 | 159 | model.eval() 160 | predictions,true_labels=[],[] 161 | 162 | for batch in prediction_dataloader: 163 | batch = tuple(t.to(device) for t in batch) 164 | b_input_ids, b_input_mask, b_labels = batch 165 | 166 | with torch.no_grad(): 167 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 168 | logits = outputs[0] 169 | 170 | logits = logits.detach().cpu().numpy() 171 | label_ids = b_labels.to('cpu').numpy() 172 | 173 | predictions.append(logits) 174 | true_labels.append(label_ids) 175 | 176 | end=time.time() 177 | print('Prediction used {:.2f} seconds'.format(end-begin)) 178 | 179 | flat_predictions = [item for sublist in predictions for item in sublist] 180 | flat_predictions = np.argmax(flat_predictions, axis=1).flatten() 181 | flat_true_labels = [item for sublist in true_labels for item in sublist] 182 | 183 | print("Accuracy of {} on Jira is: {}".format(m_name, accuracy_score(flat_true_labels,flat_predictions))) 184 | print(classification_report(flat_true_labels, flat_predictions)) -------------------------------------------------------------------------------- /scripts/PTM/run_all.sh: -------------------------------------------------------------------------------- 1 | for i in 0 1 2 3 2 | do 3 | python github.py -m $i 4 | python api.py -m $i 5 | python app.py -m $i 6 | python so.py -m $i 7 | python jira.py -m $i 8 | python cr.py -m $i 9 | done -------------------------------------------------------------------------------- /scripts/PTM/so.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from utils import * 3 | from transformers import BertTokenizer, BertModel, BertForSequenceClassification 4 | from transformers import XLNetTokenizer, XLNetForSequenceClassification 5 | from transformers import RobertaTokenizer, RobertaForSequenceClassification 6 | from transformers import AlbertTokenizer, AlbertForSequenceClassification 7 | import argparse 8 | 9 | # Model | Tokenizer | Pretrained weights shortcut 10 | MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'), 11 | (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'), 12 | (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'), 13 | (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1') 14 | ] 15 | 16 | MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert'] 17 | 18 | seed_torch(42) 19 | 20 | ## Read model name 21 | parser = argparse.ArgumentParser(description='Choose the models.') 22 | 23 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 24 | help='Enter an integer... 
0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 25 | 26 | 27 | args = parser.parse_args() 28 | m_num=args.model_num 29 | 30 | cur_model=MODELS[m_num] 31 | m_name=MODEL_NAMES[m_num] 32 | 33 | train_df=pd.read_pickle(so_train) 34 | 35 | train_df['label']=train_df['label'].replace(-1, 2) # map negative (-1) to class 2; neutral stays 0, positive stays 1 36 | 37 | tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) 38 | 39 | sentences=train_df.sentence.values 40 | labels=train_df.label.values 41 | 42 | # Find the max length of the sentence 43 | # max_len = 0 44 | # for sent in sentences: 45 | # input_ids=tokenizer.encode(sent, add_special_tokens=True) 46 | # max_len=max(max_len, len(input_ids)) 47 | # print('Max sentence length: ', max_len) 48 | 49 | input_ids = [] 50 | attention_masks = [] 51 | 52 | for sent in sentences: 53 | 54 | encoded_dict = tokenizer.encode_plus( 55 | str(sent), 56 | add_special_tokens = True, 57 | max_length = MAX_LEN, 58 | pad_to_max_length = True, 59 | return_attention_mask = True, 60 | return_tensors = 'pt' 61 | ) 62 | 63 | input_ids.append(encoded_dict['input_ids']) 64 | attention_masks.append(encoded_dict['attention_mask']) 65 | 66 | 67 | train_inputs = torch.cat(input_ids, dim=0) 68 | train_masks = torch.cat(attention_masks, dim=0) 69 | train_labels = torch.tensor(labels) 70 | 71 | print('Training data {} {} {}'.format(train_inputs.shape, train_masks.shape, train_labels.shape)) 72 | 73 | train_data = TensorDataset(train_inputs, train_masks, train_labels) 74 | train_sampler = RandomSampler(train_data) 75 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) 76 | 77 | # Train Model 78 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 79 | model.cuda() 80 | 81 | param_optimizer = list(model.named_parameters()) 82 | no_decay = ['bias', 'gamma', 'beta'] 83 | optimizer_grouped_parameters = [ 84 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 85 | 'weight_decay_rate': 0.01}, 86 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 87 | 'weight_decay_rate': 0.0} 88 | ] 89 | 90 | optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE) 91 | 92 | begin=time.time() 93 | train_loss_set = [] 94 | 95 | for _ in trange(EPOCHS, desc="Epoch"): 96 | 97 | model.train() 98 | 99 | tr_loss = 0 100 | nb_tr_examples, nb_tr_steps = 0, 0 101 | 102 | for step, batch in enumerate(train_dataloader): 103 | 104 | batch = tuple(t.to(device) for t in batch) 105 | 106 | b_input_ids, b_input_mask, b_labels = batch 107 | optimizer.zero_grad() 108 | 109 | # Forward pass 110 | outputs = model(b_input_ids, token_type_ids=None, \ 111 | attention_mask=b_input_mask, labels=b_labels) 112 | loss = outputs[0] 113 | logits = outputs[1] 114 | train_loss_set.append(loss.item()) 115 | 116 | # Backward pass 117 | loss.backward() 118 | optimizer.step() 119 | 120 | tr_loss += loss.item() 121 | nb_tr_examples += b_input_ids.size(0) 122 | nb_tr_steps += 1 123 | 124 | print("Train loss: {}".format(tr_loss/nb_tr_steps)) 125 | 126 | end=time.time() 127 | print('Training used {:.2f} seconds'.format(end-begin)) 128 | 129 | test_begin=time.time() 130 | test_df=pd.read_pickle(so_test) 131 | test_df['label']=test_df['label'].replace(-1, 2) 132 | 133 | sentences=test_df.sentence.values 134 | labels = test_df.label.values 135 | 136 | input_ids = [] 137 | attention_masks = [] 138 | 139 | for sent in sentences: 140 | encoded_dict = tokenizer.encode_plus( 141 | str(sent), 142 | add_special_tokens = True, 143 | max_length = MAX_LEN, 144 | 
pad_to_max_length = True, 145 | return_attention_mask = True, 146 | return_tensors = 'pt', 147 | ) 148 | 149 | input_ids.append(encoded_dict['input_ids']) 150 | attention_masks.append(encoded_dict['attention_mask']) 151 | 152 | prediction_inputs = torch.cat(input_ids,dim=0) 153 | prediction_masks = torch.cat(attention_masks,dim=0) 154 | prediction_labels = torch.tensor(labels) 155 | 156 | prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels) 157 | prediction_sampler = SequentialSampler(prediction_data) 158 | prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE) 159 | 160 | model.eval() 161 | predictions,true_labels=[],[] 162 | 163 | for batch in prediction_dataloader: 164 | batch = tuple(t.to(device) for t in batch) 165 | b_input_ids, b_input_mask, b_labels = batch 166 | 167 | with torch.no_grad(): 168 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 169 | logits = outputs[0] 170 | 171 | logits = logits.detach().cpu().numpy() 172 | label_ids = b_labels.to('cpu').numpy() 173 | 174 | predictions.append(logits) 175 | true_labels.append(label_ids) 176 | 177 | test_end=time.time() 178 | print('Prediction used {:.2f} seconds'.format(test_end-test_begin)) 179 | 180 | flat_predictions = [item for sublist in predictions for item in sublist] 181 | flat_predictions = np.argmax(flat_predictions, axis=1).flatten() 182 | flat_true_labels = [item for sublist in true_labels for item in sublist] 183 | 184 | print("Accuracy of {} on Stack Overflow is: {}".format(m_name, accuracy_score(flat_true_labels,flat_predictions))) 185 | 186 | print(classification_report(flat_true_labels,flat_predictions)) -------------------------------------------------------------------------------- /scripts/PTM/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler 3 | 4 | from transformers import AdamW 5 | 6 | from tqdm import tqdm, trange 7 | import pandas as pd 8 | import numpy as np 9 | import random 10 | import time 11 | 12 | from sklearn.metrics import accuracy_score, classification_report 13 | 14 | def seed_torch(seed): 15 | random.seed(seed) 16 | np.random.seed(seed) 17 | torch.manual_seed(seed) 18 | torch.cuda.manual_seed(seed) 19 | torch.backends.cudnn.deterministic=True 20 | 21 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 22 | n_gpu = torch.cuda.device_count() 23 | if torch.cuda.is_available(): print(torch.cuda.get_device_name(0)) # guard so the import also works on CPU-only machines 24 | 25 | # Datasets 26 | from pathlib import Path 27 | data_folder=Path('/sa4se/data/') 28 | 29 | api_train=data_folder/'api-train.pkl' 30 | api_test=data_folder/'api-test.pkl' 31 | 32 | gh_train=data_folder/'gh-train.pkl' 33 | gh_test=data_folder/'gh-test.pkl' 34 | 35 | jira_train=data_folder/'jira-train.pkl' 36 | jira_test=data_folder/'jira-test.pkl' 37 | 38 | so_train=data_folder/'so-train.pkl' 39 | so_test=data_folder/'so-test.pkl' 40 | 41 | app_train=data_folder/'app-train.pkl' 42 | app_test=data_folder/'app-test.pkl' 43 | 44 | cr_train=data_folder/'cr-train.pkl' 45 | cr_test=data_folder/'cr-test.pkl' 46 | 47 | # Hyperparameters 48 | MAX_LEN=256 49 | BATCH_SIZE=16 50 | EPOCHS=4 51 | LEARNING_RATE=2e-5 -------------------------------------------------------------------------------- /scripts/SentiCR/SenticrTest.py: -------------------------------------------------------------------------------- 1 | # Created by happygirlzt 2 | 3 | from SentiCR 
import SentiCR 4 | 5 | import pandas as pd 6 | import numpy as np 7 | from sklearn.metrics import classification_report 8 | import time 9 | 10 | sentiment_analyzer=SentiCR() 11 | 12 | from pathlib import Path 13 | data_folder=Path('/sa4se/data') # your data folder 14 | 15 | api_train=data_folder/'api-train.pkl' 16 | api_test=data_folder/'api-test.pkl' 17 | 18 | gh_train=data_folder/'gh-train.pkl' 19 | gh_test=data_folder/'gh-test.pkl' 20 | 21 | jira_train=data_folder/'jira-train.pkl' 22 | jira_test=data_folder/'jira-test.pkl' 23 | 24 | so_train=data_folder/'so-train.pkl' 25 | so_test=data_folder/'so-test.pkl' 26 | 27 | app_train=data_folder/'app-train.pkl' 28 | app_test=data_folder/'app-test.pkl' 29 | 30 | cr_train=data_folder/'cr-train.pkl' 31 | cr_test=data_folder/'cr-test.pkl' 32 | 33 | def predict_jira(): 34 | begin=time.time() 35 | df=pd.read_pickle(jira_test) 36 | 37 | df['label']=df['label'].replace(-1, 0) 38 | 39 | sentences=df['sentence'] 40 | y_test=df['label'] 41 | 42 | pred=[] 43 | for sent in sentences: 44 | score=sentiment_analyzer.get_sentiment_polarity(sent) 45 | pred.append(score) 46 | end=time.time() 47 | print('Prediction used {:.2f} seconds'.format(end-begin)) 48 | 49 | y_pred=pd.DataFrame(pred, columns=['pred_label']) 50 | print(classification_report(y_test, y_pred)) 51 | # report = classification_report(y_test, y_pred, output_dict=True) 52 | # df = pd.DataFrame(report).transpose() 53 | # df.to_csv('./SentiCR_jira.csv') 54 | 55 | def predict_so(): 56 | begin=time.time() 57 | df=pd.read_pickle(so_test) 58 | 59 | df['label']=df['label'].replace(-1, 2) 60 | 61 | sentences=df['sentence'] 62 | y_test=df['label'] 63 | 64 | pred=[] 65 | for sent in sentences: 66 | score=sentiment_analyzer.get_sentiment_polarity(sent) 67 | pred.append(score) 68 | 69 | end=time.time() 70 | print('Prediction used {:.2f} seconds'.format(end-begin)) 71 | y_pred=pd.DataFrame(pred, columns=['pred_label']) 72 | 73 | print(classification_report(y_test, y_pred)) 74 | #results = confusion_matrix(y_test, y_pred, labels=[1,0,2]) 75 | #print(results) 76 | #report = classification_report(y_test, y_pred, output_dict=True) 77 | #df = pd.DataFrame(report).transpose() 78 | #df.to_csv('./SentiCR_so.csv') 79 | 80 | def predict_gh(): 81 | begin=time.time() 82 | df=pd.read_pickle(gh_test) 83 | 84 | sentences=df['sentence'] 85 | y_test=df['label'] 86 | 87 | pred=[] 88 | for sent in sentences: 89 | score=sentiment_analyzer.get_sentiment_polarity(sent) 90 | pred.append(score) 91 | 92 | end=time.time() 93 | print('Prediction used {:.2f} seconds'.format(end-begin)) 94 | y_pred=pd.DataFrame(pred, columns=['pred_label']) 95 | 96 | # new_df=pd.DataFrame(columns=['Text', 'SentiCR_predicted']) 97 | # new_df['Text'] = sentences.copy() 98 | # new_df['SentiCR_predicted'] = y_pred.copy() 99 | 100 | # new_df.to_csv('./senticr_predicted.csv', header=True) 101 | 102 | print(classification_report(y_test, y_pred)) 103 | # report = classification_report(y_test, y_pred, output_dict=True) 104 | # df = pd.DataFrame(report).transpose() 105 | # df.to_csv('./SentiCR_gh.csv') 106 | 107 | def predict_app(): 108 | begin=time.time() 109 | df=pd.read_pickle(app_test) 110 | 111 | df['label']=df['label'].replace(-1,2) 112 | 113 | sentences=df['sentence'] 114 | y_test=df['label'] 115 | 116 | print(sentences.shape[0]==y_test.shape[0]) # sanity check: one label per sentence 117 | pred=[] 118 | for sent in sentences: 119 | score=sentiment_analyzer.get_sentiment_polarity(sent) 120 | pred.append(score) 121 | 122 | end=time.time() 123 | print('Prediction used {:.2f} seconds'.format(end-begin)) 
124 | y_pred=pd.DataFrame(pred, columns=['pred_label']) 125 | print(classification_report(y_test, y_pred)) 126 | # report = classification_report(y_test, y_pred, output_dict=True) 127 | # df = pd.DataFrame(report).transpose() 128 | # df.to_csv('./SentiCR_app.csv') 129 | 130 | def predict_cr(): 131 | begin=time.time() 132 | df=pd.read_pickle(cr_test) 133 | df['label']=df['label'].replace(-1,1) # CR oracle is binary: negative (-1) becomes 1, non-negative stays 0 134 | 135 | sentences=df['sentence'] 136 | y_test=df['label'] 137 | 138 | pred=[] 139 | for sent in sentences: 140 | score=sentiment_analyzer.get_sentiment_polarity(sent) 141 | pred.append(score) 142 | 143 | end=time.time() 144 | print('Prediction used {:.2f} seconds'.format(end-begin)) 145 | y_pred=pd.DataFrame(pred, columns=['pred_label']) 146 | print(classification_report(y_test, y_pred)) 147 | # report = classification_report(y_test, y_pred, output_dict=True) 148 | # df = pd.DataFrame(report).transpose() 149 | # df.to_csv('./SentiCR_cr1.csv') 150 | 151 | def predict_api(): 152 | begin=time.time() 153 | df=pd.read_pickle(api_test) 154 | df['label']=df['label'].replace(-1,2) 155 | 156 | sentences=df['sentence'] 157 | y_test=df['label'] 158 | 159 | pred=[] 160 | for sent in sentences: 161 | score=sentiment_analyzer.get_sentiment_polarity(sent) 162 | pred.append(score) 163 | 164 | end=time.time() 165 | print('Prediction used {:.2f} seconds'.format(end-begin)) 166 | y_pred=pd.DataFrame(pred, columns=['pred_label']) 167 | print(classification_report(y_test, y_pred)) 168 | #report = classification_report(y_test, y_pred, output_dict=True) 169 | #df = pd.DataFrame(report).transpose() 170 | #df.to_csv('./SentiCR_api.csv') 171 | 172 | #predict_jira() 173 | #predict_api() 174 | #predict_gh() 175 | predict_so() 176 | #predict_cr() 177 | #predict_app() -------------------------------------------------------------------------------- /scripts/StanfordCoreNLP.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Test data on Stanford CoreNLP 3 | # Author: happygirlzt 4 | # coding: utf-8 5 | from sklearn.metrics import classification_report 6 | from pycorenlp import StanfordCoreNLP 7 | import pandas as pd 8 | import time 9 | 10 | from pathlib import Path 11 | data_folder=Path('/sa4se/data') # your data folder 12 | 13 | api_train=data_folder/'api-train.pkl' 14 | api_test=data_folder/'api-test.pkl' 15 | 16 | gh_train=data_folder/'gh-train.pkl' 17 | gh_test=data_folder/'gh-test.pkl' 18 | 19 | jira_train=data_folder/'jira-train.pkl' 20 | jira_test=data_folder/'jira-test.pkl' 21 | 22 | so_train=data_folder/'so-train.pkl' 23 | so_test=data_folder/'so-test.pkl' 24 | 25 | app_train=data_folder/'app-train.pkl' 26 | app_test=data_folder/'app-test.pkl' 27 | 28 | cr_train=data_folder/'cr-train.pkl' 29 | cr_test=data_folder/'cr-test.pkl' 30 | 31 | nlp = StanfordCoreNLP('http://localhost:9000') 32 | 33 | def get_predictions(test_df): 34 | print('total length is {}'.format(test_df.shape[0])) 35 | predictions=[] 36 | 37 | for index, row in test_df.iterrows(): 38 | sent=row['sentence'] 39 | #print(sent) 40 | try: 41 | res = nlp.annotate(sent, 42 | properties={ 43 | 'annotators': 'sentiment', 44 | 'outputFormat': 'json', 45 | 'timeout': 5000000000000, 46 | }) 47 | except Exception: 48 | # print(sent) 49 | predictions.append('Neutral') # fall back to Neutral when annotation fails 50 | continue 51 | 52 | #print(type(res['sentences'])) 53 | #return predictions 54 | 55 | try: 56 | # one row has many sentences 57 | if len(res['sentences']) > 1: 58 | total=0 59 | num=len(res['sentences']) 60 | 61 | for s in res['sentences']: 62 | 
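# Note on the mapping below: CoreNLP's sentimentValue is an integer from
# 0 (very negative) to 4 (very positive), with 2 meaning neutral, so the
# loop sums the per-sentence values and the average is mapped back to a
# trinary label; e.g. a two-sentence row scored [3, 1] averages to 2.0
# and is therefore labelled Neutral.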
# print(s['sentiment']) 63 | # predictions.append(s['sentiment']) 64 | total+=int(s['sentimentValue']) 65 | 66 | if total / num == 2: 67 | predictions.append('Neutral') 68 | elif total / num < 2: 69 | predictions.append('Negative') 70 | else: 71 | predictions.append('Positive') 72 | else: 73 | # one row only has one sentence 74 | for s in res['sentences']: 75 | #print(s['sentiment']) 76 | predictions.append(s['sentiment']) 77 | except Exception: 78 | predictions.append('Neutral') 79 | continue 80 | return predictions 81 | 82 | def get_pred_df(cur_pred): 83 | pred_df=pd.DataFrame(cur_pred, columns=['Polarity']) 84 | pred_df['Polarity']=pred_df['Polarity'].replace({ 85 | 'Neutral':0, 86 | 'Negative':-1, 87 | 'Positive':1, 88 | 'Verynegative':-1, 89 | 'Verypositive':1}) 90 | 91 | pred_df['Polarity']=pred_df['Polarity'].astype(int) 92 | return pred_df 93 | 94 | def test_api(): 95 | begin=time.time() 96 | # API reviews 97 | test_df=pd.read_pickle(api_test) 98 | cur_pred=get_predictions(test_df) 99 | 100 | end=time.time() 101 | print('Predict API used {:.2f} seconds'.format(end-begin)) 102 | 103 | pred_df=get_pred_df(cur_pred) 104 | print(classification_report(test_df['label'], pred_df['Polarity'])) 105 | 106 | def test_gh(): 107 | begin=time.time() 108 | # GitHub 109 | test_df=pd.read_pickle(gh_test) 110 | cur_pred=get_predictions(test_df) 111 | end=time.time() 112 | print('Predict GitHub used {:.2f} seconds'.format(end-begin)) 113 | #len(predictions) 114 | pred_df=get_pred_df(cur_pred) 115 | 116 | test_df['label']=test_df['label'].replace({ 117 | 'neutral':0, 118 | 'positive':1, 119 | 'negative':-1}) 120 | 121 | print(classification_report(test_df['label'], pred_df['Polarity'])) 122 | 123 | # APP reviews 124 | def test_app(): 125 | begin=time.time() 126 | test_df=pd.read_pickle(app_test) 127 | 128 | cur_pred=get_predictions(test_df) 129 | 130 | end=time.time() 131 | print('Predict APP used {:.2f} seconds'.format(end-begin)) 132 | pred_df=get_pred_df(cur_pred) 133 | 134 | print(classification_report(test_df['label'], pred_df['Polarity'])) 135 | 136 | # SO 137 | def test_so(): 138 | begin=time.time() 139 | test_df=pd.read_pickle(so_test) 140 | 141 | cur_pred=get_predictions(test_df) 142 | end=time.time() 143 | print('Predict StackOverflow used {:.2f} seconds'.format(end-begin)) 144 | 145 | pred_df=get_pred_df(cur_pred) 146 | print(classification_report(test_df['label'], pred_df['Polarity'])) 147 | 148 | 149 | # Jira 150 | def test_jira(): 151 | begin=time.time() 152 | test_df=pd.read_pickle(jira_test) 153 | cur_pred=get_predictions(test_df) 154 | 155 | end=time.time() 156 | print('Predict Jira used {:.2f} seconds'.format(end-begin)) 157 | 158 | pred_df=get_pred_df(cur_pred) 159 | 160 | print(classification_report(test_df['label'], pred_df['Polarity'])) 161 | 162 | 163 | # CR 164 | def test_cr(): 165 | begin=time.time() 166 | test_df=pd.read_pickle(cr_test) 167 | cur_pred=get_predictions(test_df) 168 | 169 | end=time.time() 170 | print('Predict Code Reviews used {:.2f} seconds'.format(end-begin)) 171 | 172 | pred_df=get_pred_df(cur_pred) 173 | print(classification_report(test_df['label'], pred_df['Polarity'])) 174 | 175 | #test_gh() 176 | #test_api() 177 | #test_app() 178 | #test_so() 179 | #test_jira() 180 | #test_cr() -------------------------------------------------------------------------------- /scripts/analyze-results/Senti4SD.py: -------------------------------------------------------------------------------- 1 | # This file is used to evaluate the predictions of Senti4SD 2 | # Author: happygirlzt 
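# Expected inputs (an assumption based on the paths used below): each
# './predictions/*-predictions.csv' file is the output of (py)Senti4SD and
# carries a PREDICTED column, while the '*-test-sd.csv' gold files
# (columns: Text, Polarity) are produced by
# scripts/prepare-data/convert_senti4sd.py.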
3 | import pandas as pd 4 | import numpy as np 5 | 6 | import re 7 | from sklearn.metrics import classification_report,confusion_matrix 8 | 9 | def get_confusion_matrix(): 10 | pred=pd.read_csv('./predictions/so-predictions.csv',usecols=['PREDICTED']) 11 | 12 | #print(pred.shape) 13 | pred.columns=['res'] 14 | res_pd=pred['res'] 15 | 16 | test_df=pd.read_csv('so-test-sd.csv',usecols=['Text','Polarity']) 17 | 18 | true_df=pd.Series(test_df['Polarity']) # Polarity holds string labels, so no integer cast 19 | results=confusion_matrix(true_df,res_pd, labels=['positive','neutral','negative']) 20 | print(results) 21 | 22 | #get_confusion_matrix() 23 | 24 | def analyze_cr(): 25 | # Replace './predictions/cr-predictions.csv' with your predicted file name 26 | pred=pd.read_csv('./predictions/cr-predictions.csv',usecols=['PREDICTED']) 27 | pred['PREDICTED']=pred['PREDICTED'].replace({'positive':'neutral'}) # the CR oracle is binary, so fold 'positive' into 'neutral' 28 | 29 | print(pred.shape) 30 | pred.columns=['res'] 31 | res_pd=pred['res'] 32 | 33 | # read in true labels 34 | test_df=pd.read_csv('cr-test-sd.csv', usecols=['Text','Polarity']) 35 | 36 | true_df=pd.Series(test_df['Polarity']) 37 | print(classification_report(true_df, res_pd)) 38 | #analyze_cr() 39 | 40 | def analyze_app(): 41 | pred=pd.read_csv('./predictions/app-predictions.csv',usecols=['PREDICTED']) 42 | 43 | print(pred.shape) 44 | pred.columns=['res'] 45 | res_pd=pred['res'] 46 | 47 | # read in true labels 48 | test_df=pd.read_csv('app-test-sd.csv',usecols=['Text','Polarity']) 49 | 50 | true_df=pd.Series(test_df['Polarity']) 51 | print(classification_report(true_df, res_pd)) 52 | #analyze_app() 53 | 54 | def analyze_gh(): 55 | pred=pd.read_csv('./predictions/gh-predictions.csv',usecols=['PREDICTED']) 56 | 57 | print(pred.shape) 58 | pred.columns=['res'] 59 | res_pd=pred['res'] 60 | 61 | # read in true labels 62 | test_df=pd.read_csv('gh-test-sd.csv',usecols=['Text','Polarity']) 63 | 64 | true_df=pd.Series(test_df['Polarity']) 65 | print(classification_report(true_df, res_pd)) 66 | #analyze_gh() 67 | 68 | def analyze_jira(): 69 | pred=pd.read_csv('./predictions/jira-predictions.csv',usecols=['PREDICTED']) 70 | 71 | print(pred.shape) 72 | pred.columns=['res'] 73 | res_pd=pred['res'] 74 | 75 | # read in true labels 76 | test_df=pd.read_csv('jira-test-sd.csv',usecols=['Text','Polarity']) 77 | 78 | true_df=pd.Series(test_df['Polarity']) 79 | print(classification_report(true_df, res_pd)) 80 | #analyze_jira() 81 | 82 | def analyze_api(): 83 | pred=pd.read_csv('./predictions/api-predictions.csv',usecols=['PREDICTED']) 84 | 85 | print(pred.shape) 86 | pred.columns=['res'] 87 | res_pd=pred['res'] 88 | 89 | test_df=pd.read_csv('api-test-sd.csv',usecols=['Text','Polarity']) 90 | 91 | true_df=pd.Series(test_df['Polarity']) 92 | print(classification_report(true_df, res_pd)) 93 | #analyze_api() 94 | 95 | def analyze_so(): 96 | pred=pd.read_csv('./predictions/so-predictions.csv',usecols=['PREDICTED']) 97 | 98 | print(pred.shape) 99 | pred.columns=['res'] 100 | res_pd=pred['res'] 101 | 102 | test_df=pd.read_csv('so-test-sd.csv',usecols=['Text','Polarity']) 103 | 104 | true_df=pd.Series(test_df['Polarity']) 105 | print(classification_report(true_df, res_pd)) 106 | #analyze_so() -------------------------------------------------------------------------------- /scripts/analyze-results/SentiStrength-SE.py: -------------------------------------------------------------------------------- 1 | # This file is used to analyze the prediction performance of SentiStrength-SE 2 | # 
Author: happygirlzt 3 | 4 | import pandas as pd 5 | import numpy as np 6 | 7 | import re 8 | from sklearn.metrics import classification_report, confusion_matrix 9 | 10 | def get_confusion_matrix(): 11 | pred=pd.read_csv('so-ss.csv', header=None) 12 | #print(pred.shape) 13 | pred.columns=['res'] 14 | res_pd=pred['res'] 15 | 16 | pos_list=[] 17 | neg_list=[] 18 | 19 | for sent in res_pd: 20 | cur_list=re.split(r'\t+', sent.rstrip('\t'))[1:] 21 | 22 | new_list=cur_list[0].split() 23 | 24 | pos_list.append(int(new_list[0])) 25 | neg_list.append(int(new_list[1])) 26 | 27 | total = [p + n for p, n in zip(pos_list, neg_list)] 28 | label=[] 29 | for score in total: 30 | if score>0: 31 | label.append(1) 32 | elif score==0: 33 | label.append(0) 34 | else: 35 | label.append(-1) 36 | 37 | pred_df=pd.Series(label,dtype='int32') 38 | 39 | test_df=pd.read_csv('so-test-se.csv',header=None) 40 | test_df.columns=['sentence','label'] 41 | 42 | true_df=pd.Series(test_df['label'],dtype='int32') 43 | 44 | results = confusion_matrix(true_df,pred_df,labels=[1,0,-1]) 45 | print(results) 46 | 47 | #get_confusion_matrix() 48 | 49 | def analyze_cr(): 50 | # replace 'cr-ss.csv' with your prediction file name 51 | pred=pd.read_csv('cr-ss.csv',header=None) 52 | print(pred.shape) 53 | pred.columns=['res'] 54 | res_pd=pred['res'] 55 | 56 | pos_list=[] 57 | neg_list=[] 58 | 59 | for sent in res_pd: 60 | cur_list=re.split(r'\t+', sent.rstrip('\t'))[1:] 61 | #print(cur_list) 62 | 63 | new_list=cur_list[0].split() 64 | 65 | pos_list.append(int(new_list[0])) 66 | neg_list.append(int(new_list[1])) 67 | 68 | total = [p + n for p, n in zip(pos_list, neg_list)] 69 | label=[] 70 | for score in total: # CR is scored binary: negative vs. non-negative 71 | if score<0: 72 | label.append(-1) 73 | else: 74 | label.append(0) 75 | 76 | pred_df=pd.Series(label,dtype='int32') 77 | 78 | #print(pred_df) 79 | 80 | # read in true labels 81 | test_df=pd.read_csv('cr-test-se.csv',header=None) 82 | test_df.columns=['sentence','label'] 83 | 84 | true_df=pd.Series(test_df['label'],dtype='int32') 85 | print(classification_report(true_df, pred_df)) 86 | 87 | report=classification_report(true_df, pred_df, output_dict=True) 88 | df = pd.DataFrame(report).transpose() 89 | df.to_csv('./SentiStrength-SE-cr1.csv') 90 | 91 | analyze_cr() 92 | 93 | def analyze_app(): 94 | pred=pd.read_csv('app-ss.csv',header=None) 95 | print(pred.shape) 96 | pred.columns=['res'] 97 | res_pd=pred['res'] 98 | 99 | pos_list=[] 100 | neg_list=[] 101 | 102 | for sent in res_pd: 103 | cur_list=re.split(r'\t+', sent.rstrip('\t'))[1:] 104 | #print(cur_list) 105 | new_list=cur_list[0].split() 106 | 107 | pos_list.append(int(new_list[0])) 108 | neg_list.append(int(new_list[1])) 109 | 110 | total = [p + n for p, n in zip(pos_list, neg_list)] 111 | label=[] 112 | for score in total: 113 | if score>0: 114 | label.append(1) 115 | elif score==0: 116 | label.append(0) 117 | else: 118 | label.append(-1) 119 | 120 | pred_df=pd.Series(label,dtype='int32') 121 | 122 | #print(pred_df) 123 | 124 | # read in true labels 125 | test_df=pd.read_csv('app-test-se.csv',header=None) 126 | test_df.columns=['sentence','label'] 127 | 128 | true_df=pd.Series(test_df['label'],dtype='int32') 129 | print(classification_report(true_df, pred_df)) 130 | #analyze_app() 131 | 132 | def analyze_gh(): 133 | pred=pd.read_csv('gh-ss.csv',header=None) 134 | print(pred.shape) 135 | pred.columns=['res'] 136 | res_pd=pred['res'] 137 | 138 | pos_list=[] 139 | neg_list=[] 140 | 141 | for sent in res_pd: 142 | cur_list=re.split(r'\t+', sent.rstrip('\t'))[1:] 143 | 
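# Each line of the SentiStrength-SE output has the shape '<text>\t<pos> <neg>',
# so cur_list[0] holds the pair of scores; e.g. a hypothetical line
# 'great work\t3 -1' yields pos=3 and neg=-1, hence total=2 and, further
# below, label 1 (positive).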
#print(cur_list) 144 | 145 | new_list=cur_list[0].split() 146 | 147 | pos_list.append(int(new_list[0])) 148 | neg_list.append(int(new_list[1])) 149 | 150 | total = [p + n for p, n in zip(pos_list, neg_list)] 151 | label=[] 152 | for score in total: 153 | if score>0: 154 | label.append(1) 155 | elif score==0: 156 | label.append(0) 157 | else: 158 | label.append(-1) 159 | 160 | pred_df=pd.Series(label,dtype='int32') 161 | 162 | #print(pred_df) 163 | 164 | # read in true labels 165 | test_df=pd.read_csv('gh-test-se.csv',header=None) 166 | test_df.columns=['sentence','label'] 167 | 168 | test_df['label']=test_df['label'].replace({'positive':1, 'negative':-1, 'neutral':0}) 169 | 170 | true_df=pd.Series(test_df['label'],dtype='int32') 171 | print(classification_report(true_df, pred_df)) 172 | 173 | #analyze_gh() 174 | 175 | def analyze_jira(): 176 | pred=pd.read_csv('jira-ss.csv',header=None) 177 | print(pred.shape) 178 | pred.columns=['res'] 179 | res_pd=pred['res'] 180 | 181 | pos_list=[] 182 | neg_list=[] 183 | 184 | for sent in res_pd: 185 | cur_list=re.split(r'\t+', sent.rstrip('\t'))[1:] 186 | #print(cur_list) 187 | 188 | new_list=cur_list[0].split() 189 | 190 | pos_list.append(int(new_list[0])) 191 | neg_list.append(int(new_list[1])) 192 | 193 | total = [p + n for p, n in zip(pos_list, neg_list)] 194 | label=[] 195 | for score in total: 196 | if score>0: 197 | label.append(1) 198 | elif score==0: 199 | label.append(0) 200 | else: 201 | label.append(-1) 202 | 203 | pred_df=pd.Series(label,dtype='int32') 204 | 205 | #print(pred_df) 206 | 207 | # read in true labels 208 | test_df=pd.read_csv('jira-test-se.csv',header=None) 209 | test_df.columns=['sentence','label'] 210 | 211 | true_df=pd.Series(test_df['label'],dtype='int32') 212 | print(classification_report(true_df, pred_df)) 213 | 214 | #analyze_jira() 215 | def analyze_api(): 216 | pred=pd.read_csv('api-ss.csv',header=None) 217 | print(pred.shape) 218 | pred.columns=['res'] 219 | res_pd=pred['res'] 220 | 221 | pos_list=[] 222 | neg_list=[] 223 | 224 | for sent in res_pd: 225 | cur_list=re.split(r'\t+', sent.rstrip('\t'))[1:] 226 | #print(cur_list) 227 | 228 | new_list=cur_list[0].split() 229 | 230 | pos_list.append(int(new_list[0])) 231 | neg_list.append(int(new_list[1])) 232 | 233 | total = [p + n for p, n in zip(pos_list, neg_list)] 234 | label=[] 235 | for score in total: 236 | if score>0: 237 | label.append(1) 238 | elif score==0: 239 | label.append(0) 240 | else: 241 | label.append(-1) 242 | 243 | pred_df=pd.Series(label,dtype='int32') 244 | 245 | #print(pred_df) 246 | 247 | # read in true labels 248 | test_df=pd.read_csv('api-test-se.csv',header=None) 249 | test_df.columns=['sentence','label'] 250 | 251 | true_df=pd.Series(test_df['label'],dtype='int32') 252 | print(classification_report(true_df, pred_df)) 253 | 254 | #analyze_api() 255 | 256 | def analyze_so(): 257 | pred=pd.read_csv('so-ss.csv',header=None) 258 | print(pred.shape) 259 | pred.columns=['res'] 260 | res_pd=pred['res'] 261 | 262 | pos_list=[] 263 | neg_list=[] 264 | 265 | for sent in res_pd: 266 | cur_list=re.split(r'\t+', sent.rstrip('\t'))[1:] 267 | #print(cur_list) 268 | 269 | new_list=cur_list[0].split() 270 | 271 | pos_list.append(int(new_list[0])) 272 | neg_list.append(int(new_list[1])) 273 | 274 | total = [p + n for p, n in zip(pos_list, neg_list)] 275 | label=[] 276 | for score in total: 277 | if score>0: 278 | label.append(1) 279 | elif score==0: 280 | label.append(0) 281 | else: 282 | label.append(-1) 283 | 284 | pred_df=pd.Series(label,dtype='int32') 285 | 
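# analyze_so repeats the same parse-and-map recipe as the analyze_* functions
# above; only the prediction and gold file names differ between datasets.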
286 | #print(pred_df) 287 | 288 | # read in true labels 289 | test_df=pd.read_csv('so-test-se.csv',header=None) 290 | test_df.columns=['sentence','label'] 291 | 292 | true_df=pd.Series(test_df['label'],dtype='int32') 293 | print(classification_report(true_df, pred_df)) 294 | #analyze_so() -------------------------------------------------------------------------------- /scripts/analyze-results/SentiStrength.py: -------------------------------------------------------------------------------- 1 | # This file is used to analyze the performance of SentiStrength 2 | # Author: happygirlzt 3 | import pandas as pd 4 | import numpy as np 5 | from sklearn.metrics import classification_report 6 | from sklearn.metrics import confusion_matrix 7 | 8 | #lol = list(csv.reader(open('text.txt', 'rb'), delimiter='\t')) 9 | 10 | def get_confusion_matrix(): 11 | df=pd.read_csv('so-test+results.txt', sep='\t', index_col=False, header=None) 12 | #print(df.head()) 13 | df.columns=['sent','pos','neg'] 14 | #print(df.shape) 15 | 16 | result=[] 17 | total_lines=df.shape[0] 18 | for i in range(total_lines): 19 | cur_sum=int(df.iloc[i].pos)+int(df.iloc[i].neg) 20 | if cur_sum > 0: 21 | result.append(1) 22 | elif cur_sum == 0: 23 | result.append(0) 24 | else: 25 | result.append(-1) 26 | 27 | y_pred=pd.DataFrame(result) 28 | 29 | y_true=pd.read_csv('so-test-se.csv', header=None,usecols=[1]) 30 | 31 | results = confusion_matrix(y_true,y_pred,labels=[1,0,-1]) 32 | print(results) 33 | 34 | #get_confusion_matrix() 35 | 36 | def analyze(file_name): 37 | #replace '{}-test+results.txt' with your prediction file name 38 | df=pd.read_csv('{}-test+results.txt'.format(file_name), sep='\t', index_col=False, header=None) 39 | print(df.head()) 40 | df.columns=['sent','pos','neg'] 41 | print(df.shape) 42 | 43 | result=[] 44 | total_lines=df.shape[0] 45 | for i in range(total_lines): 46 | cur_sum=int(df.iloc[i].pos)+int(df.iloc[i].neg) 47 | if cur_sum > 0: 48 | result.append(1) 49 | elif cur_sum == 0: 50 | result.append(0) 51 | else: 52 | result.append(-1) 53 | 54 | y_pred=pd.DataFrame(result) 55 | 56 | y_true=pd.read_csv('{}-test-se.csv'.format(file_name), header=None,usecols=[1]) 57 | 58 | print(classification_report(y_true, y_pred)) 59 | 60 | def analyze_gh(): 61 | df=pd.read_csv('gh-test+results.txt', sep='\t', index_col=False, header=None) 62 | print(df.head()) 63 | df.columns=['sent','pos','neg'] 64 | print(df.shape) 65 | 66 | result=[] 67 | total_lines=df.shape[0] 68 | for i in range(total_lines): 69 | cur_sum=int(df.iloc[i].pos)+int(df.iloc[i].neg) 70 | if cur_sum > 0: 71 | result.append(1) 72 | elif cur_sum == 0: 73 | result.append(0) 74 | else: 75 | result.append(-1) 76 | 77 | y_pred=pd.DataFrame(result) 78 | 79 | y_true=pd.read_csv('gh-test-se.csv', header=None, usecols=[1]) 80 | y_true=y_true.replace({'positive':1, 'negative':-1, 'neutral':0}) 81 | 82 | print(classification_report(y_true,y_pred)) 83 | report = classification_report(y_true, y_pred, output_dict=True) 84 | df = pd.DataFrame(report).transpose() 85 | df.to_csv('./SentiStrength-gh.csv') 86 | #analyze('api') 87 | #analyze('cr') 88 | #analyze('app') 89 | #analyze('gh') 90 | #analyze('jira') 91 | #analyze_gh() 92 | #analyze('so') -------------------------------------------------------------------------------- /scripts/analyze-results/gh-xlnet-senticr.py: -------------------------------------------------------------------------------- 1 | # This file is used to compare the predictions from XLNet and SentiCR 2 | # on the GitHub dataset 3 | # Created by happygirlzt 4 | 5 | 
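# Assumes data/github-predictions/xlnet-senticr.csv with the columns Text,
# True_label, XLNet_predicted and SentiCR_predicted; the row filters below
# count where each tool, alone or together, agrees with the gold label.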
import pandas as pd 6 | import numpy as np 7 | from sklearn.metrics import classification_report, confusion_matrix 8 | from pathlib import Path 9 | 10 | data_folder=Path('your_github_predictions_folder') 11 | 12 | ### Concatenate predictions from XLNet and SentiCR into one dataframe 13 | #xlnet = pd.read_csv(data_folder/'XLNet_github_predictions.csv') 14 | #print(xlnet.shape) 15 | 16 | #cr = pd.read_csv(data_folder/'senticr_github_predictions.csv') 17 | #print(cr.head()) 18 | #print(cr.shape) 19 | 20 | 21 | # final_df=pd.DataFrame(columns=['Text', 'True_label', 'XLNet_predicted', 'SentiCR_predicted']) 22 | # final_df['Text'] = pd.Series(xlnet['Text']) 23 | # final_df['True_label']=pd.Series(xlnet['True_label']) 24 | # final_df['XLNet_predicted']=pd.Series(xlnet['XLNet_predicted']) 25 | # final_df['SentiCR_predicted']=pd.Series(cr['SentiCR_predicted']) 26 | #print(final_df.head()) 27 | 28 | #final_df.to_csv(data_folder/'xlnet-senticr.csv', header=True) 29 | 30 | ### Read the df 31 | final_df=pd.read_csv(data_folder/'xlnet-senticr.csv') 32 | # xlnet true predictions 33 | xlnet_true = final_df.loc[final_df['True_label'] == final_df['XLNet_predicted']] 34 | print(xlnet_true.head()) 35 | 36 | # xlnet true, while cr false 37 | # i.e., rows where XLNet is correct and SentiCR is wrong 38 | xlnet_true_cr_false = final_df.loc[ 39 | (final_df['True_label'] == final_df['XLNet_predicted']) & 40 | (final_df['True_label'] != final_df['SentiCR_predicted']) 41 | ] 42 | 43 | #print(xlnet_true_cr_false.head()) 44 | print(xlnet_true_cr_false.shape) 45 | print(xlnet_true.shape) 46 | 47 | # cr true predictions 48 | cr_true = final_df.loc[ 49 | final_df['True_label'] == final_df['SentiCR_predicted'] 50 | ] 51 | print(cr_true.head()) 52 | 53 | # xlnet false, while cr true 54 | cr_true_xlnet_false=final_df.loc[ 55 | (final_df['True_label'] != final_df['XLNet_predicted']) & 56 | (final_df['True_label'] == final_df['SentiCR_predicted']) 57 | ] 58 | 59 | print(cr_true_xlnet_false.shape[0]) 60 | print(cr_true.shape[0]) 61 | 62 | # both true 63 | both_true=final_df.loc[ 64 | (final_df['True_label'] == final_df['SentiCR_predicted']) & 65 | (final_df['True_label'] == final_df['XLNet_predicted']) 66 | ] 67 | #both_true.head() 68 | print('both true {}'.format(both_true.shape[0])) -------------------------------------------------------------------------------- /scripts/prepare-data/convert_senti4sd.py: -------------------------------------------------------------------------------- 1 | # This file is for converting data to the format of Senti4SD 2 | # Created by happygirlzt 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import re 7 | 8 | def convert_cr(): 9 | df=pd.read_csv('../data/cr-test-se.csv',header=None,encoding='utf_8') 10 | df.columns=['Text','Polarity'] 11 | df['Polarity']=df['Polarity'].replace({-1: 'negative', 1: 'positive', 0: 'neutral'}) 12 | df.to_csv('../data/cr-test-sd.csv', index=False,encoding='utf_8') 13 | convert_cr() 14 | 15 | def convert_jira(): 16 | df=pd.read_csv('../data/jira-test-se.csv',header=None) 17 | df.columns=['Text','Polarity'] 18 | df['Polarity']=df['Polarity'].replace({-1: 'negative', 1: 'positive'}) 19 | df.to_csv('../data/jira-test-sd.csv', index=False) 20 | #convert_jira() 21 | 22 | def convert_so(): 23 | for file_name in ['train','test']: 24 | df=pd.read_csv('../data/so-{}.csv'.format(file_name),usecols=['text','oracle']) 25 | df.columns=['Text','Polarity'] 26 | df['Polarity']=df['Polarity'].replace({-1: 'negative', 1: 'positive', 0: 'neutral'}) 27 | 28 | 
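# Senti4SD consumes string polarity labels, hence the integer-to-string
# replace above before each *-sd.csv file is written.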
df.to_csv('../data/so-{}-sd.csv'.format(file_name),index=False) 29 | #convert_so() 30 | 31 | def convert_api(): 32 | for file_name in ['train','test']: 33 | df=pd.read_csv('../data/api-{}.csv'.format(file_name), usecols=['sentence','label']) 34 | df.columns=['Text','Polarity'] 35 | df['Polarity']=df['Polarity'].replace({-1: 'negative', 1: 'positive', 0: 'neutral'}) 36 | df.to_csv('../data/api-{}-sd.csv'.format(file_name),index=False) 37 | #convert_api() 38 | 39 | def convert_app(): 40 | for file_name in ['train','test']: 41 | df=pd.read_csv('../data/app-{}.csv'.format(file_name), usecols=['sentence','oracle']) 42 | df.columns=['Text','Polarity'] 43 | df['Polarity']=df['Polarity'].replace({-1: 'negative', 1: 'positive', 0: 'neutral'}) 44 | 45 | df.to_csv('../data/app-{}-sd.csv'.format(file_name), index=False) 46 | #convert_app() 47 | 48 | def convert_gh(): 49 | df=pd.read_csv('../data/gh-test.csv', usecols=['Text','Polarity']) 50 | df.to_csv('../data/gh-test-sd.csv', index=False) 51 | #convert_gh() -------------------------------------------------------------------------------- /scripts/prepare-data/convert_sentistrength.py: -------------------------------------------------------------------------------- 1 | # This file converts data to the SentiStrength / SentiStrength-SE input format 2 | # Created by happygirlzt 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import re 7 | from pathlib import Path 8 | data_folder=Path('YOUR_DATA_FOLDER') 9 | 10 | api_train=data_folder/'api-train.pkl' 11 | api_test=data_folder/'api-test.pkl' 12 | 13 | gh_train=data_folder/'gh-train.pkl' 14 | gh_test=data_folder/'gh-test.pkl' 15 | 16 | jira_train=data_folder/'jira-train.pkl' 17 | jira_test=data_folder/'jira-test.pkl' 18 | 19 | so_train=data_folder/'so-train.pkl' 20 | so_test=data_folder/'so-test.pkl' 21 | 22 | app_train=data_folder/'app-train.pkl' 23 | app_test=data_folder/'app-test.pkl' 24 | 25 | cr_train=data_folder/'cr-train.pkl' 26 | cr_test=data_folder/'cr-test.pkl' 27 | 28 | def convert_jira_test(): 29 | df=pd.read_pickle(jira_test) 30 | 31 | sents=[] 32 | 33 | for index, row in df.iterrows(): 34 | text=row['sentence'] 35 | text=''.join(text.split('\n')) 36 | sents.append(text) 37 | 38 | #print(len(sents)) 39 | new_df=pd.DataFrame(sents,columns=['sentence']) 40 | 41 | df.update(new_df) 42 | df.to_csv('../data/jira-test-se.csv',header=None,index=False) 43 | 44 | def convert_so_test(): 45 | df=pd.read_pickle(so_test) 46 | sents=[] 47 | 48 | for index, row in df.iterrows(): 49 | text=row['sentence'] 50 | text=''.join(text.split('\n')) 51 | sents.append(text) 52 | 53 | #print(len(sents)) 54 | new_df=pd.DataFrame(sents,columns=['sentence']) 55 | 56 | df.update(new_df) 57 | df.to_csv('../data/so-test-se.csv',header=None,index=False) 58 | 59 | def convert_api_test(): 60 | df=pd.read_pickle(api_test) 61 | sents=[] 62 | 63 | for index, row in df.iterrows(): 64 | text=row['sentence'] 65 | text=''.join(str(text).split('\n')) 66 | sents.append(text) 67 | 68 | #print(len(sents)) 69 | new_df=pd.DataFrame(sents,columns=['sentence']) 70 | 71 | df.update(new_df) 72 | df.to_csv('../data/api-test-se.csv',header=None,index=False) 73 | 74 | def convert_app_test(): 75 | df=pd.read_pickle(app_test) 76 | sents=[] 77 | 78 | for index, row in df.iterrows(): 79 | text=row['sentence'] 80 | text=''.join(str(text).split('\n')) 81 | sents.append(text) 82 | 83 | #print(len(sents)) 84 | new_df=pd.DataFrame(sents,columns=['sentence']) 85 | 86 | df.update(new_df) 87 | df.to_csv('../data/app-test-se.csv',header=None,index=False) 88 | 
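# SentiStrength and SentiStrength-SE score one document per input line, so the
# converters above strip embedded newlines; otherwise a multi-line sentence
# would be split into several texts and the scores would no longer line up
# with the labels.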
#convert_api_test() 89 | 90 | def convert_cr_test(): 91 | df=pd.read_pickle(cr_test) 92 | 93 | sents=[] 94 | labels=[] 95 | for index, row in df.iterrows(): 96 | text=row['sentence'] 97 | text=''.join(text.split('\n')) 98 | sents.append(text) 99 | labels.append(row['label']) 100 | 101 | #print(len(sents)) 102 | new_df=pd.DataFrame({'sentence': sents,'label': labels}) 103 | 104 | # new_df is already newline-stripped, so write it out directly 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | new_df.to_csv('../data/cr-test-se.csv',header=None,index=False) 113 | 114 | #convert_so_test() 115 | convert_cr_test() 116 | #convert_app_test() 117 | 118 | def convert_gh_test(): 119 | df=pd.read_pickle(gh_test) 120 | sents=[] 121 | 122 | for index, row in df.iterrows(): 123 | text=row['sentence'] 124 | text=''.join(text.split('\n')) 125 | sents.append(text) 126 | 127 | #print(len(sents)) 128 | new_df=pd.DataFrame(sents,columns=['sentence']) 129 | 130 | df.update(new_df) 131 | df.to_csv('../data/gh-test-se.csv',header=None,index=False) 132 | #convert_gh_test() --------------------------------------------------------------------------------