├── .gitignore
├── README.md
├── data
│   ├── PTM
│   │   ├── api-test.pkl
│   │   ├── api-train.pkl
│   │   ├── app-test.pkl
│   │   ├── app-train.pkl
│   │   ├── cr-test.pkl
│   │   ├── cr-train.pkl
│   │   ├── gh-test.pkl
│   │   ├── gh-train.pkl
│   │   ├── jira-test.pkl
│   │   ├── jira-train.pkl
│   │   ├── so-test.pkl
│   │   └── so-train.pkl
│   ├── Senti4SD
│   │   ├── api-test-sd.csv
│   │   ├── app-test-sd.csv
│   │   ├── cr-test-sd.csv
│   │   ├── gh-test-sd.csv
│   │   ├── jira-test-sd.csv
│   │   └── so-test-sd.csv
│   ├── SentiStrength
│   │   ├── api-test-se.csv
│   │   ├── api-test.txt
│   │   ├── app-test-se.csv
│   │   ├── app-test.txt
│   │   ├── cr-test-se.csv
│   │   ├── cr-test.txt
│   │   ├── gh-test-se.csv
│   │   ├── gh-test.txt
│   │   ├── jira-test-se.csv
│   │   ├── jira-test.txt
│   │   ├── so-test-se.csv
│   │   └── so-test.txt
│   └── github-predictions
│       └── xlnet-senticr.csv
└── scripts
    ├── PTM
    │   ├── api.py
    │   ├── app.py
    │   ├── cr.py
    │   ├── early-stopping
    │   │   ├── api.py
    │   │   ├── app.py
    │   │   ├── cr.py
    │   │   ├── github.py
    │   │   ├── jira.py
    │   │   ├── so.py
    │   │   └── utils.py
    │   ├── github.py
    │   ├── jira.py
    │   ├── run_all.sh
    │   ├── so.py
    │   └── utils.py
    ├── SentiCR
    │   ├── SentiCR.py
    │   └── SenticrTest.py
    ├── StanfordCoreNLP.py
    ├── analyze-results
    │   ├── Senti4SD.py
    │   ├── SentiStrength-SE.py
    │   ├── SentiStrength.py
    │   └── gh-xlnet-senticr.py
    └── prepare-data
        ├── convert_senti4sd.py
        └── convert_sentistrength.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# static files generated from Django application using `collectstatic`
media
static
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Introduction
In total, we run (5 + 4) * 6 = 54 experiments: five existing SA4SE tools plus four pre-trained Transformer-based language models, each evaluated on six datasets. With *Stanford CoreNLP*, *SentiStrength*, *SentiStrength-SE*, and *Senti4SD*, we predict the labels directly without re-training, while with *SentiCR* and the pre-trained Transformer-based language models, we perform supervised learning on each specific dataset.

Remember to change the file names and data locations in the scripts to match your setup.
# Datasets
Six datasets have been used. The sources of these datasets are noted in the paper; credit goes to the original authors. You can download the original datasets from the following sources.
- API Reviews (downloaded from https://github.com/giasuddin/OpinionValueTSE/blob/master/ConsolidatedMentionResolutionBenchmark.xls)
- APP Reviews (downloaded from https://sentiment-se.github.io/replication.zip)
- Code Reviews (downloaded from https://github.com/senticr/SentiCR/blob/master/SentiCR/oracle.xlsx)
- GitHub Comments (downloaded from https://doi.org/10.6084/m9.figshare.11604597)
- JIRA Issues (downloaded from https://sentiment-se.github.io/replication.zip)
- StackOverflow (downloaded from https://sentiment-se.github.io/replication.zip)

# Approaches
## SA4SE tools
### Stanford CoreNLP
Usage: https://github.com/smilli/py-corenlp
### SentiStrength
Download from: https://www.softpedia.com/get/Others/Home-Education/SentiStrength.shtml
Download both the exe file and the SentiStrength Data zip file; the extracted data folder contains the word lists the tool relies on.
### SentiStrength-SE
Download from: https://laser.cs.uno.edu/Projects/Projects.html
### SentiCR
Source code: https://github.com/senticr/SentiCR
### Senti4SD
Source code: https://github.com/collab-uniba/pySenti4SD or https://github.com/collab-uniba/Senti4SD

## Pre-trained Transformer-based Language Models
We used the pre-trained BERT, XLNet, RoBERTa, and ALBERT models, via the Hugging Face Transformers library: https://huggingface.co/transformers/

# Scripts
## Pre-trained Transformer-based language models
We use six Python scripts, i.e., [api.py](./scripts/PTM/api.py), [app.py](./scripts/PTM/app.py), [cr.py](./scripts/PTM/cr.py), [github.py](./scripts/PTM/github.py), [jira.py](./scripts/PTM/jira.py), and [so.py](./scripts/PTM/so.py). Each script takes the model as an argument. For example, to run BERT on the API data, invoke [api.py](./scripts/PTM/api.py) as follows: `$ python api.py -m 0`. (Instead of tuning the hyper-parameters, we used fixed hyper-parameters as stated in our paper.)

You can also apply the early-stopping technique; the corresponding code is in [this folder](./scripts/PTM/early-stopping). To train/fine-tune BERT on the API dataset, run `$ python api.py -m 0 -r 1`; to evaluate the fine-tuned BERT on the API dataset, run `$ python api.py -m 0 -r 0`. The argument `-r` indicates whether to re-train.

The data used by this group is located in the [PTM folder](./data/PTM/).
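If you swap in your own data, it helps to first check the structure of the provided pickles. The snippet below is a minimal, hypothetical inspection sketch: it assumes the pickles deserialize into pandas objects, and the printed column names are whatever the files actually contain; see [utils.py](./scripts/PTM/utils.py) for the format the scripts expect.

```python
import pandas as pd

# Minimal inspection sketch (not part of our pipeline). We assume the
# pickle holds a pandas object; consult scripts/PTM/utils.py for the
# schema the training scripts actually rely on.
train = pd.read_pickle('data/PTM/api-train.pkl')
print(type(train))
if isinstance(train, pd.DataFrame):
    print(train.columns.tolist())  # discover the text/label column names
    print(train.head())
```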

## SentiCR
After cloning this [repo](https://github.com/senticr/SentiCR), you have to modify the training oracle and its corresponding test part. We also provide modified scripts in [SentiCR.py](./scripts/SentiCR/SentiCR.py) and [SenticrTest.py](./scripts/SentiCR/SenticrTest.py); you can replace the originals in your cloned SentiCR repo with our scripts to run the test. Note that the training oracle and the test file must come from the same dataset, e.g., the GitHub training oracle with the GitHub test file.

## Senti4SD
After you clone [Senti4SD](https://github.com/collab-uniba/Senti4SD) or [pySenti4SD](https://github.com/collab-uniba/pySenti4SD), just run one of the following commands without re-training:

```bash
sh classification.sh -i test_dataset.csv -o predictions.csv
# or
sh classification.sh test_dataset.csv predictions.csv
```

After getting the predictions, please run [Senti4SD.py](./scripts/analyze-results/Senti4SD.py) to analyze the prediction performance.

## Stanford CoreNLP
After you download and start the Stanford CoreNLP server, you can query it by referring to the example in this [repo](https://github.com/smilli/py-corenlp). Our script is in [StanfordCoreNLP.py](./scripts/StanfordCoreNLP.py).
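For reference, below is a minimal sketch of querying the server with py-corenlp, in the spirit of the example in that repo. The port, sample text, and server start command are illustrative assumptions; our actual code is in [StanfordCoreNLP.py](./scripts/StanfordCoreNLP.py).

```python
from pycorenlp import StanfordCoreNLP  # pip install pycorenlp

# Assumes a CoreNLP server is already running locally, e.g. started with:
#   java -mx5g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000
nlp = StanfordCoreNLP('http://localhost:9000')

text = 'great it is awesome. the game freezes about 6 mins into it everytime.'
output = nlp.annotate(text, properties={
    'annotators': 'sentiment',
    'outputFormat': 'json',
})

for sentence in output['sentences']:
    # 'sentimentValue' ranges from 0 (very negative) to 4 (very positive)
    print(sentence['sentimentValue'], sentence['sentiment'])
```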
## SentiStrength
### Prepare data
Since every input text must fit on a single line, we convert our data into this format in case some sentences span multiple lines. The test data we used can be found in the [SentiStrength folder](./data/SentiStrength/). If you want to run your own data, you can directly use our [script](./scripts/prepare-data/convert_sentistrength.py).
### Prediction
Run SentiStrength2.3Free.exe. The process is as follows:
1. Select reporting options: click 'Reporting Options' -> uncheck 'Report Classification Rationale' and 'Report Translation (From Abbreviations etc.)'. In other words, only keep 'Report Sentiment Strength Classifications [don't uncheck this normally ever]' selected.
2. Select the input file: click 'Sentiment Strength Analysis' -> 'Analyse ALL Texts in File [each line separately]' -> select the test file -> at 'Echo header line to the results?', select 'Yes' -> at 'Which column contains the text? Enter 1 for ...', enter 1 -> choose a folder to save the output file.

The input file is a txt file with one test text per line. The output file contains two values per line, representing the negative and positive scores, respectively. Our strategy is to calculate the sum of these two values and assign the final sentiment based on it.
### Evaluation
After getting the predictions, please run [SentiStrength.py](./scripts/analyze-results/SentiStrength.py) to analyze the prediction performance.

## SentiStrength-SE
The workflow is almost the same as for SentiStrength.
### Prepare data
The same as for SentiStrength.
### Prediction
The application behaves almost the same as SentiStrength: it outputs two integer values, and we assign a sentiment value based on their sum.
```bash
java -jar SentiStrength-SE_V1.5.jar
```
Input -> select the test file -> Detect Sentiments
### Evaluation
After getting the predictions, please run [SentiStrength-SE.py](./scripts/analyze-results/SentiStrength-SE.py) to analyze the predictions.

## Discussion
We compared the predictions made by XLNet and SentiCR in the Discussion section of our paper. The script used is [gh-xlnet-senticr.py](./scripts/analyze-results/gh-xlnet-senticr.py).

# Contact
If you have any problems, feel free to contact Ting Zhang (tingzhang.2019@phdcs.smu.edu.sg).

# Cite
If you find this repo useful, please consider citing our work.
```
@inproceedings{zhang2020sentiment,
  title={Sentiment Analysis for Software Engineering: How Far Can Pre-trained Transformer Models Go?},
  author={Zhang, Ting and Xu, Bowen and Thung, Ferdian and Haryono, Stefanus Agus and Lo, David and Jiang, Lingxiao},
  booktitle={2020 IEEE International Conference on Software Maintenance and Evolution (ICSME)},
  pages={70--80},
  year={2020},
  organization={IEEE}
}
```
-------------------------------------------------------------------------------- /data/PTM/api-test.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/api-test.pkl -------------------------------------------------------------------------------- /data/PTM/api-train.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/api-train.pkl -------------------------------------------------------------------------------- /data/PTM/app-test.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/app-test.pkl -------------------------------------------------------------------------------- /data/PTM/app-train.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/app-train.pkl -------------------------------------------------------------------------------- /data/PTM/cr-test.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/cr-test.pkl -------------------------------------------------------------------------------- /data/PTM/cr-train.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/cr-train.pkl -------------------------------------------------------------------------------- /data/PTM/gh-test.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/gh-test.pkl
-------------------------------------------------------------------------------- /data/PTM/gh-train.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/gh-train.pkl -------------------------------------------------------------------------------- /data/PTM/jira-test.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/jira-test.pkl -------------------------------------------------------------------------------- /data/PTM/jira-train.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/jira-train.pkl -------------------------------------------------------------------------------- /data/PTM/so-test.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/so-test.pkl -------------------------------------------------------------------------------- /data/PTM/so-train.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soarsmu/SA4SE/31d7dc5bcbda072bb79f682aaad39f2406813dce/data/PTM/so-train.pkl -------------------------------------------------------------------------------- /data/Senti4SD/app-test-sd.csv: -------------------------------------------------------------------------------- 1 | Text,Polarity 2 | ? horrible on moment. my battery dies faster !,negative 3 | ? good features slow deliveries and hit and miss notifications.,negative 4 | fun and picture the game has beautiful pictures.,positive 5 | crap ~ refund requested the game freezes about 6mins into it everytime ~ crap shack game ~ refunds hard now.,negative 6 | just love it! been using for 9 months now and it is perfect app! ?,positive 7 | 5 stars this is exactly what i am looking for. thank you so much.,positive 8 | great it's awesome,positive 9 | i hope there's penguinz this is a really fun game.,positive 10 | slow no point. the viewfinder display is really laggy and slow. doesn't seem to offer any creative black and white features so how does it improve over just desaturating color photos?,negative 11 | "well.. it was awesome .. i do like the new look but now it also now gets ""unexpected errors ""and hitting retry just gets the same. won't work til you clear the data out in settings.. but doing that all the time is annoying. please fix this issue!",negative 12 | awasome it is so cute,positive 13 | i like it because i enjoy it the picture is very small,positive 14 | ? guns by calford is better,positive 15 | excellent app! but add more! this is much better than samsung's equation detection but the only thing missing in this app is the lack of variety. just like in the note 10.1 please add more equations and have the ability to solve complex algebra. i'll be glad to pay for such a feature. makes work so much easier!,positive 16 | it is fun i love it,positive 17 | yampee exellent nice and cool app,positive 18 | great memory game great for the mind keep you on your toes. 
helps with memory.,positive 19 | thanks :) this app is very useful and i keep checking it out every morning.,positive 20 | thanks :-) xperia sp it's very inspiring!,positive 21 | crashed system stopped working. no matter what i did this app kept crashing the system. my phone wouldn't work. the screen kept going black and i couldn't run anything. happened all of a sudden. uninstalled and went over to go launcher ex. same features smoother not crashing even with more widgets running. sorry adw.,negative 22 | nice it is very useful,positive 23 | yala is an app for dowloading and listening to arabic music and radio the idea and concept is amazing but it has so many bugs with lots of songs at times and it messes up my music when it connects to some random wifi. please stop and fix this.,negative 24 | very easy and fun not many programs deserve a 5 star but this is one of them. i do miss some features (or maybe i just don't know how to use the app) like continue reading from my last page ad it sometimes reloads from start with me articles but overall this is a very good app. thanks guys!,positive 25 | great app.. nice app .. keep it up.. it is very useful app for us... *****,positive 26 | everything i wanted takes less space than default clock of moto x and packs so much info. the default settings are pretty impressive. did not have to customize much. won't mind paying for it.,positive 27 | freezes is an awesome game until the past few days it freezes after each mission.,negative 28 | awesome to the developer i just gave another slot machine app very poor ratings your game is 100 the best on android and i've played all the slot machines android has to offer and by far plenty of coins and i'm having fun as hell if it drains all the money now i would not be disappointed i have never paid for a app but i think i will pay for this one stupendous job,positive 29 | very nice. great keyboard app. many great tools and a ton of customization.,positive 30 | game this game is kool!!!!,positive 31 | amazing! awesome app for mathematics. worth thousand starts. thanks devs. though the ui is hard to understand. thanks for the small size,positive 32 | invaluable! i use this app daily and can't imagine not having it.,positive 33 | won't load on s3 like everyone else game freezes when trying to load a mission. so they updated the app and now the new version isn't compatible with my phone. bah puh!,negative 34 | i really like this game!!!!!! this is a good game!!!!,positive 35 | woooow it is gooooooood game:-) :-) :-),positive 36 | freezes after about 2 minutes of gameplay,negative 37 | anoying bug i use both the powertoggles notification bar widget and home screen widget. really good concept as a whole. but... the home screen widget always freezes a couple hours after i configured it. the widgetbis used to launch my favorite apps not for toggling anything if you want to resolve it i can try to give you more info. ofc rating goes up if this is resolved!! :-) ** bug doesn't occur anymore! :d,negative 38 | match cast lines up don't show i mean aside from that the app is pretty great,positive 39 | ? cute but eats up alot of battery.,negative 40 | best twitter client! if you are looking for the best twitter client search no more. this is it! it's similar to tweetbot on ios. you get all the great features from pull to refresh inline images column views white/black holo theme background tap to jump to top and so much more. oh and there are no ads in this version. 
it is simply amazing.,positive 41 | help love this game but now it wont let me sell my fish. it freezes and crashes. please fix!,negative 42 | ? so far so good. ? however freezes up way too much. hope that error gets fixed soon or i will have to give it the boot.,negative 43 | network error! suddenly after downloading an update pack i cannot login and said that loading failed. check your connection network. but my connection is working finely. please fix this. thanks,negative 44 | excellent good app for people abroad,positive 45 | fun widget good selection of hilts and colors.,positive 46 | shame its a good game dont get me wrong but since theupdate with the hunter and new maps it freezes unless you stik to old maps.... cumon update fix please,negative 47 | awesome this app is legendary!,positive 48 | one thing missing i really like this app. simple user interface and works smoothly. espn should add a feature to notify you when your current selection is finished. that is the only thing missing.,positive 49 | excellent you can change your car s it's easy to park it self,positive 50 | pls fix bugs nice apps!,negative 51 | worked good at first... when i first got it it worked well now it freezes up and only displays blank white screen.,negative 52 | simply the best i very rarely write reviews. even more rare is when i like an app enough to buy it. this app deserves both. i have seriously tried most music players for the android and this one has everything i need. audio engine is tops start and stop via headphone jack great album art download. lots of other programs have these things but either lack in these or other features. i especially like the simple issue of bass and treble control. price a little high that was the only thing holding me back from buying it sooner. was thinking of giving it 4 stars because of price but; what the heck- i gave it five.,positive 53 | good app its a very good app very nice and easy an simple ui.,positive 54 | good app i found the app is very good for forecast. i like it.,positive 55 | the game constantly freezes. very frustrated since update. the game freezes and goes to green screen. i have to restart tablet every time to get game to respond. fix?,negative 56 | ? motorola devour. becoming more and more sure this game freezes my phones? very fun game though? it is a challenge and addictive?,neutral 57 | slot game. it was a great game but it has stopped paying out almost completely.,negative 58 | freezes every time only works when you restart your phone,negative 59 | ads great app but the ads come up all the time and it ruins it,negative 60 | very fun!! worth the memory. :-d,positive 61 | horrible installed perfectly and though was great app. uploaded some pics from gallery with no problem. when wanted to take a pic as demo here shows above it was just a black scree. pressed camera button nothing. can upload from gallery but cant use the main feature of this app which is what is advertised. why would i use this to upload from my phone? my phone hasd bluetooth and a usb to hook up to my pc. i can send those pics via text email and facebook. the main purpose of this app does not work!,negative 62 | great apps works well and looks great. lots of available providers. from the 5 or so apps i've tested deliveries seems to support the most. i submitted a bug report when statuses from one of the providers wasn't working as expected. the developer promptly got back to me and fixed the issue. 
only feature i would like is auto-detection of provider based on the tracking number. but as a fellow programmer i understand the challenges presented by such a feature so this doesn't affect my rating.,positive 63 | need to add a feature you guys should add a custom car builder feature where you can just build your own car but every part you buy will just keep adding up your total overall great app its fun and it kills time,positive 64 | great alarm but has new bug making it use lots of battery i like this alarm i definitely like being able to wake up to a random playlist. but since an update its now using massive amounts of battery using the gps. i'm guessing it's something to do with sunrise feature. i don't use the sunrise alarm and i certainly don't move between countries enough for it to constantly be updating. so either a setting to not allow access to gps or to only update once a day would be useful. otherwise i'll have to go back to the standard alarm until it's fixed.,neutral 65 | gs3 best weather and clock widget out there hands down,positive 66 | good for me 112089 come to my friend in the game.,positive 67 | doesn't work fix bugs and maybe i'll give it 4 stars. sorry!,negative 68 | some cant download because when you download a swf an error appears pls fix ill rate 5 star for this,negative 69 | nice but after i update i can't open the game? pls fix this game and i love it.i will give 5 star if this game fixed the bug. username: roenan12 server: s2 libra,negative 70 | freezes when browser loads freezes when browser loads to play game,negative 71 | awesome it's very useful to all people.,positive 72 | can't install i can't install it on my samsung note iii please fix it. error. (941),negative 73 | note 2 it's very useful to use........love it,positive 74 | good app it's customizable which i like. sometimes it's a bit difficult getting items to line up properly and the areas for each part of the display don't change size dynamically so the font sizes will get bigger or smaller to fit in the space alloted for them which i don't like. overall it's a good app though.,positive 75 | "too slow. app is slower than mobile site and/or competing livejournal apps like""eljay"" (search the marketplace for it). what with the lack of features even in the main pc site it's proof that livejournal is no match for even the much maligned facebook...rip livejournal.",negative 76 | very good but can you add download all attachment as zip option ? we miss this very nice feature which is available in web version.,positive 77 | "widget locks up my galaxy s3 this a very beautiful looking app and i like it a lot but it regularly locks up the homescreen widget the clock freezes and the weather status locks up with the status ""locating"". my phone is a samsung galaxy s3 running on 4.1.1 based in the uk. hope this info helps solve the issue as this app is definitely worth a full 5 stars if it were not for it locking up. please fix this and i'll even purchase the add free pro version ;-)",negative 78 | a must have! great app and a perfect companion for many other apps.,positive 79 | like minecraft pe. has good graphics,positive 80 | ? what is the best the best thing to scan?,neutral 81 | bug when i leave the rest room i am always sent back to main menu please fix it and ill give it a better rate,negative 82 | great app love the- my list feature. i can find the best deals and plan my black friday shopping trip,positive 83 | its what i was looking for i love this app,positive 84 | amazing! 
a must have app,positive 85 | galaxy ace india is greatest,positive 86 | awesome it's a good app,positive 87 | stop button crash this app repatly fix that bug and u get 5 stars,negative 88 | time time freezes and will not update since latest version.,negative 89 | far from as cool as smartglass i went with a ps4 rather than the xbox one but i do miss smart glass. they need to make this app enable the voice features and things like the store should be built-in not a browser link.,negative 90 | finally one great game for android.,positive 91 | excellent app no complaints at all but still waiting for urdu localization,positive 92 | good app great launcher. i really like it but it is killing my battery.,neutral 93 | regret buying this app fcs so randomly that it's almost impossible to use it for longer period of time. lack of updates have made it even more impossible to use. haven't seen even a single much requested feature by the community being implemented in this app despite being ok play store for more or less 4 months now. disappointed.,negative 94 | offline browsing please allow us to open local html or swf files. there are few browsers out there that can not play local html but those lack the browsing/playing features of this app.,neutral 95 | amazing this is freaking great,positive 96 | ? not for motorola defy,neutral 97 | thankyou waited for such an app for a long time. jazakallah!,positive 98 | awesome but... its a really good player no doubt. but since last update some of my albums wont show and even songs wont show. not to mention it repeats some albums and its not my memory card cause i only have 1 album for each group of songs. please fix this. its the best player ive come across id hate to have to uninstall it :(,neutral 99 | cutiest emoticons cute emoticons to have fun..,positive 100 | game freezes! i play on the pc and now tried to play over and over on my samsung s3 and it freezes up after 2 min. in the game!,negative 101 | great app! i love this app! it's great to have my book lists with me wherever i go and i can easily keep track of which books i've read to each of my kids. one thing missing is the browse feature like the apple version has; i miss that! add that and i will rate 5 stars!,positive 102 | ? i love it but some times it freezes and i have to do somthing else,negative 103 | a superb app i came to android about a year ago from an iphone and i tried every reddit client out there such as baconreader and reddit news. these were all good apps but none of them compared to my experience with alienblue. then i tried reddit sync and it has become almost exclusively the only way i read redditch anymore. the holo interface and black night mode are easy on my eyes and it has all of the features i need.,positive 104 | improving rapidly. this app used to be crappy while the service was ok but recently the app has started receiving updates on a regular basis adding a lot of basic functionality that should have been there from the beginning. now it features landscape support resume from last position notification bar controls and save to sd card. however i still sorely miss a decent widget easily sortable queues or the extensions found in the desktop app.,positive 105 | -------------------------------------------------------------------------------- /data/SentiStrength/app-test-se.csv: -------------------------------------------------------------------------------- 1 | ? horrible on moment. my battery dies faster !,-1 2 | ? 
good features slow deliveries and hit and miss notifications.,-1 3 | fun and picture the game has beautiful pictures.,1 4 | crap ~ refund requested the game freezes about 6mins into it everytime ~ crap shack game ~ refunds hard now.,-1 5 | just love it! been using for 9 months now and it is perfect app! ?,1 6 | 5 stars this is exactly what i am looking for. thank you so much.,1 7 | great it's awesome,1 8 | i hope there's penguinz this is a really fun game.,1 9 | slow no point. the viewfinder display is really laggy and slow. doesn't seem to offer any creative black and white features so how does it improve over just desaturating color photos?,-1 10 | "well.. it was awesome .. i do like the new look but now it also now gets ""unexpected errors ""and hitting retry just gets the same. won't work til you clear the data out in settings.. but doing that all the time is annoying. please fix this issue!",-1 11 | awasome it is so cute,1 12 | i like it because i enjoy it the picture is very small,1 13 | ? guns by calford is better,1 14 | excellent app! but add more! this is much better than samsung's equation detection but the only thing missing in this app is the lack of variety. just like in the note 10.1 please add more equations and have the ability to solve complex algebra. i'll be glad to pay for such a feature. makes work so much easier!,1 15 | it is fun i love it,1 16 | yampee exellent nice and cool app,1 17 | great memory game great for the mind keep you on your toes. helps with memory.,1 18 | thanks :) this app is very useful and i keep checking it out every morning.,1 19 | thanks :-) xperia sp it's very inspiring!,1 20 | crashed system stopped working. no matter what i did this app kept crashing the system. my phone wouldn't work. the screen kept going black and i couldn't run anything. happened all of a sudden. uninstalled and went over to go launcher ex. same features smoother not crashing even with more widgets running. sorry adw.,-1 21 | nice it is very useful,1 22 | yala is an app for dowloading and listening to arabic music and radio the idea and concept is amazing but it has so many bugs with lots of songs at times and it messes up my music when it connects to some random wifi. please stop and fix this.,-1 23 | very easy and fun not many programs deserve a 5 star but this is one of them. i do miss some features (or maybe i just don't know how to use the app) like continue reading from my last page ad it sometimes reloads from start with me articles but overall this is a very good app. thanks guys!,1 24 | great app.. nice app .. keep it up.. it is very useful app for us... *****,1 25 | everything i wanted takes less space than default clock of moto x and packs so much info. the default settings are pretty impressive. did not have to customize much. won't mind paying for it.,1 26 | freezes is an awesome game until the past few days it freezes after each mission.,-1 27 | awesome to the developer i just gave another slot machine app very poor ratings your game is 100 the best on android and i've played all the slot machines android has to offer and by far plenty of coins and i'm having fun as hell if it drains all the money now i would not be disappointed i have never paid for a app but i think i will pay for this one stupendous job,1 28 | very nice. great keyboard app. many great tools and a ton of customization.,1 29 | game this game is kool!!!!,1 30 | amazing! awesome app for mathematics. worth thousand starts. thanks devs. though the ui is hard to understand. 
thanks for the small size,1 31 | invaluable! i use this app daily and can't imagine not having it.,1 32 | won't load on s3 like everyone else game freezes when trying to load a mission. so they updated the app and now the new version isn't compatible with my phone. bah puh!,-1 33 | i really like this game!!!!!! this is a good game!!!!,1 34 | woooow it is gooooooood game:-) :-) :-),1 35 | freezes after about 2 minutes of gameplay,-1 36 | anoying bug i use both the powertoggles notification bar widget and home screen widget. really good concept as a whole. but... the home screen widget always freezes a couple hours after i configured it. the widgetbis used to launch my favorite apps not for toggling anything if you want to resolve it i can try to give you more info. ofc rating goes up if this is resolved!! :-) ** bug doesn't occur anymore! :d,-1 37 | match cast lines up don't show i mean aside from that the app is pretty great,1 38 | ? cute but eats up alot of battery.,-1 39 | best twitter client! if you are looking for the best twitter client search no more. this is it! it's similar to tweetbot on ios. you get all the great features from pull to refresh inline images column views white/black holo theme background tap to jump to top and so much more. oh and there are no ads in this version. it is simply amazing.,1 40 | help love this game but now it wont let me sell my fish. it freezes and crashes. please fix!,-1 41 | ? so far so good. ? however freezes up way too much. hope that error gets fixed soon or i will have to give it the boot.,-1 42 | network error! suddenly after downloading an update pack i cannot login and said that loading failed. check your connection network. but my connection is working finely. please fix this. thanks,-1 43 | excellent good app for people abroad,1 44 | fun widget good selection of hilts and colors.,1 45 | shame its a good game dont get me wrong but since theupdate with the hunter and new maps it freezes unless you stik to old maps.... cumon update fix please,-1 46 | awesome this app is legendary!,1 47 | one thing missing i really like this app. simple user interface and works smoothly. espn should add a feature to notify you when your current selection is finished. that is the only thing missing.,1 48 | excellent you can change your car s it's easy to park it self,1 49 | pls fix bugs nice apps!,-1 50 | worked good at first... when i first got it it worked well now it freezes up and only displays blank white screen.,-1 51 | simply the best i very rarely write reviews. even more rare is when i like an app enough to buy it. this app deserves both. i have seriously tried most music players for the android and this one has everything i need. audio engine is tops start and stop via headphone jack great album art download. lots of other programs have these things but either lack in these or other features. i especially like the simple issue of bass and treble control. price a little high that was the only thing holding me back from buying it sooner. was thinking of giving it 4 stars because of price but; what the heck- i gave it five.,1 52 | good app its a very good app very nice and easy an simple ui.,1 53 | good app i found the app is very good for forecast. i like it.,1 54 | the game constantly freezes. very frustrated since update. the game freezes and goes to green screen. i have to restart tablet every time to get game to respond. fix?,-1 55 | ? motorola devour. becoming more and more sure this game freezes my phones? very fun game though? 
it is a challenge and addictive?,0 56 | slot game. it was a great game but it has stopped paying out almost completely.,-1 57 | freezes every time only works when you restart your phone,-1 58 | ads great app but the ads come up all the time and it ruins it,-1 59 | very fun!! worth the memory. :-d,1 60 | horrible installed perfectly and though was great app. uploaded some pics from gallery with no problem. when wanted to take a pic as demo here shows above it was just a black scree. pressed camera button nothing. can upload from gallery but cant use the main feature of this app which is what is advertised. why would i use this to upload from my phone? my phone hasd bluetooth and a usb to hook up to my pc. i can send those pics via text email and facebook. the main purpose of this app does not work!,-1 61 | great apps works well and looks great. lots of available providers. from the 5 or so apps i've tested deliveries seems to support the most. i submitted a bug report when statuses from one of the providers wasn't working as expected. the developer promptly got back to me and fixed the issue. only feature i would like is auto-detection of provider based on the tracking number. but as a fellow programmer i understand the challenges presented by such a feature so this doesn't affect my rating.,1 62 | need to add a feature you guys should add a custom car builder feature where you can just build your own car but every part you buy will just keep adding up your total overall great app its fun and it kills time,1 63 | great alarm but has new bug making it use lots of battery i like this alarm i definitely like being able to wake up to a random playlist. but since an update its now using massive amounts of battery using the gps. i'm guessing it's something to do with sunrise feature. i don't use the sunrise alarm and i certainly don't move between countries enough for it to constantly be updating. so either a setting to not allow access to gps or to only update once a day would be useful. otherwise i'll have to go back to the standard alarm until it's fixed.,0 64 | gs3 best weather and clock widget out there hands down,1 65 | good for me 112089 come to my friend in the game.,1 66 | doesn't work fix bugs and maybe i'll give it 4 stars. sorry!,-1 67 | some cant download because when you download a swf an error appears pls fix ill rate 5 star for this,-1 68 | nice but after i update i can't open the game? pls fix this game and i love it.i will give 5 star if this game fixed the bug. username: roenan12 server: s2 libra,-1 69 | freezes when browser loads freezes when browser loads to play game,-1 70 | awesome it's very useful to all people.,1 71 | can't install i can't install it on my samsung note iii please fix it. error. (941),-1 72 | note 2 it's very useful to use........love it,1 73 | good app it's customizable which i like. sometimes it's a bit difficult getting items to line up properly and the areas for each part of the display don't change size dynamically so the font sizes will get bigger or smaller to fit in the space alloted for them which i don't like. overall it's a good app though.,1 74 | "too slow. app is slower than mobile site and/or competing livejournal apps like""eljay"" (search the marketplace for it). what with the lack of features even in the main pc site it's proof that livejournal is no match for even the much maligned facebook...rip livejournal.",-1 75 | very good but can you add download all attachment as zip option ? 
we miss this very nice feature which is available in web version.,1 76 | "widget locks up my galaxy s3 this a very beautiful looking app and i like it a lot but it regularly locks up the homescreen widget the clock freezes and the weather status locks up with the status ""locating"". my phone is a samsung galaxy s3 running on 4.1.1 based in the uk. hope this info helps solve the issue as this app is definitely worth a full 5 stars if it were not for it locking up. please fix this and i'll even purchase the add free pro version ;-)",-1 77 | a must have! great app and a perfect companion for many other apps.,1 78 | like minecraft pe. has good graphics,1 79 | ? what is the best the best thing to scan?,0 80 | bug when i leave the rest room i am always sent back to main menu please fix it and ill give it a better rate,-1 81 | great app love the- my list feature. i can find the best deals and plan my black friday shopping trip,1 82 | its what i was looking for i love this app,1 83 | amazing! a must have app,1 84 | galaxy ace india is greatest,1 85 | awesome it's a good app,1 86 | stop button crash this app repatly fix that bug and u get 5 stars,-1 87 | time time freezes and will not update since latest version.,-1 88 | far from as cool as smartglass i went with a ps4 rather than the xbox one but i do miss smart glass. they need to make this app enable the voice features and things like the store should be built-in not a browser link.,-1 89 | finally one great game for android.,1 90 | excellent app no complaints at all but still waiting for urdu localization,1 91 | good app great launcher. i really like it but it is killing my battery.,0 92 | regret buying this app fcs so randomly that it's almost impossible to use it for longer period of time. lack of updates have made it even more impossible to use. haven't seen even a single much requested feature by the community being implemented in this app despite being ok play store for more or less 4 months now. disappointed.,-1 93 | offline browsing please allow us to open local html or swf files. there are few browsers out there that can not play local html but those lack the browsing/playing features of this app.,0 94 | amazing this is freaking great,1 95 | ? not for motorola defy,0 96 | thankyou waited for such an app for a long time. jazakallah!,1 97 | awesome but... its a really good player no doubt. but since last update some of my albums wont show and even songs wont show. not to mention it repeats some albums and its not my memory card cause i only have 1 album for each group of songs. please fix this. its the best player ive come across id hate to have to uninstall it :(,0 98 | cutiest emoticons cute emoticons to have fun..,1 99 | game freezes! i play on the pc and now tried to play over and over on my samsung s3 and it freezes up after 2 min. in the game!,-1 100 | great app! i love this app! it's great to have my book lists with me wherever i go and i can easily keep track of which books i've read to each of my kids. one thing missing is the browse feature like the apple version has; i miss that! add that and i will rate 5 stars!,1 101 | ? i love it but some times it freezes and i have to do somthing else,-1 102 | a superb app i came to android about a year ago from an iphone and i tried every reddit client out there such as baconreader and reddit news. these were all good apps but none of them compared to my experience with alienblue. then i tried reddit sync and it has become almost exclusively the only way i read redditch anymore. 
the holo interface and black night mode are easy on my eyes and it has all of the features i need.,1 103 | improving rapidly. this app used to be crappy while the service was ok but recently the app has started receiving updates on a regular basis adding a lot of basic functionality that should have been there from the beginning. now it features landscape support resume from last position notification bar controls and save to sd card. however i still sorely miss a decent widget easily sortable queues or the extensions found in the desktop app.,1 104 | -------------------------------------------------------------------------------- /data/SentiStrength/app-test.txt: -------------------------------------------------------------------------------- 1 | sent 2 | ? horrible on moment. my battery dies faster ! 3 | ? good features slow deliveries and hit and miss notifications. 4 | fun and picture the game has beautiful pictures. 5 | crap ~ refund requested the game freezes about 6mins into it everytime ~ crap shack game ~ refunds hard now. 6 | just love it! been using for 9 months now and it is perfect app! ? 7 | 5 stars this is exactly what i am looking for. thank you so much. 8 | great it's awesome 9 | i hope there's penguinz this is a really fun game. 10 | slow no point. the viewfinder display is really laggy and slow. doesn't seem to offer any creative black and white features so how does it improve over just desaturating color photos? 11 | "well.. it was awesome .. i do like the new look but now it also now gets ""unexpected errors ""and hitting retry just gets the same. won't work til you clear the data out in settings.. but doing that all the time is annoying. please fix this issue!" 12 | awasome it is so cute 13 | i like it because i enjoy it the picture is very small 14 | ? guns by calford is better 15 | excellent app! but add more! this is much better than samsung's equation detection but the only thing missing in this app is the lack of variety. just like in the note 10.1 please add more equations and have the ability to solve complex algebra. i'll be glad to pay for such a feature. makes work so much easier! 16 | it is fun i love it 17 | yampee exellent nice and cool app 18 | great memory game great for the mind keep you on your toes. helps with memory. 19 | thanks :) this app is very useful and i keep checking it out every morning. 20 | thanks :-) xperia sp it's very inspiring! 21 | crashed system stopped working. no matter what i did this app kept crashing the system. my phone wouldn't work. the screen kept going black and i couldn't run anything. happened all of a sudden. uninstalled and went over to go launcher ex. same features smoother not crashing even with more widgets running. sorry adw. 22 | nice it is very useful 23 | yala is an app for dowloading and listening to arabic music and radio the idea and concept is amazing but it has so many bugs with lots of songs at times and it messes up my music when it connects to some random wifi. please stop and fix this. 24 | very easy and fun not many programs deserve a 5 star but this is one of them. i do miss some features (or maybe i just don't know how to use the app) like continue reading from my last page ad it sometimes reloads from start with me articles but overall this is a very good app. thanks guys! 25 | great app.. nice app .. keep it up.. it is very useful app for us... ***** 26 | everything i wanted takes less space than default clock of moto x and packs so much info. the default settings are pretty impressive. 
did not have to customize much. won't mind paying for it. 27 | freezes is an awesome game until the past few days it freezes after each mission. 28 | awesome to the developer i just gave another slot machine app very poor ratings your game is 100 the best on android and i've played all the slot machines android has to offer and by far plenty of coins and i'm having fun as hell if it drains all the money now i would not be disappointed i have never paid for a app but i think i will pay for this one stupendous job 29 | very nice. great keyboard app. many great tools and a ton of customization. 30 | game this game is kool!!!! 31 | amazing! awesome app for mathematics. worth thousand starts. thanks devs. though the ui is hard to understand. thanks for the small size 32 | invaluable! i use this app daily and can't imagine not having it. 33 | won't load on s3 like everyone else game freezes when trying to load a mission. so they updated the app and now the new version isn't compatible with my phone. bah puh! 34 | i really like this game!!!!!! this is a good game!!!! 35 | woooow it is gooooooood game:-) :-) :-) 36 | freezes after about 2 minutes of gameplay 37 | anoying bug i use both the powertoggles notification bar widget and home screen widget. really good concept as a whole. but... the home screen widget always freezes a couple hours after i configured it. the widgetbis used to launch my favorite apps not for toggling anything if you want to resolve it i can try to give you more info. ofc rating goes up if this is resolved!! :-) ** bug doesn't occur anymore! :d 38 | match cast lines up don't show i mean aside from that the app is pretty great 39 | ? cute but eats up alot of battery. 40 | best twitter client! if you are looking for the best twitter client search no more. this is it! it's similar to tweetbot on ios. you get all the great features from pull to refresh inline images column views white/black holo theme background tap to jump to top and so much more. oh and there are no ads in this version. it is simply amazing. 41 | help love this game but now it wont let me sell my fish. it freezes and crashes. please fix! 42 | ? so far so good. ? however freezes up way too much. hope that error gets fixed soon or i will have to give it the boot. 43 | network error! suddenly after downloading an update pack i cannot login and said that loading failed. check your connection network. but my connection is working finely. please fix this. thanks 44 | excellent good app for people abroad 45 | fun widget good selection of hilts and colors. 46 | shame its a good game dont get me wrong but since theupdate with the hunter and new maps it freezes unless you stik to old maps.... cumon update fix please 47 | awesome this app is legendary! 48 | one thing missing i really like this app. simple user interface and works smoothly. espn should add a feature to notify you when your current selection is finished. that is the only thing missing. 49 | excellent you can change your car s it's easy to park it self 50 | pls fix bugs nice apps! 51 | worked good at first... when i first got it it worked well now it freezes up and only displays blank white screen. 52 | simply the best i very rarely write reviews. even more rare is when i like an app enough to buy it. this app deserves both. i have seriously tried most music players for the android and this one has everything i need. audio engine is tops start and stop via headphone jack great album art download. 
lots of other programs have these things but either lack in these or other features. i especially like the simple issue of bass and treble control. price a little high that was the only thing holding me back from buying it sooner. was thinking of giving it 4 stars because of price but; what the heck- i gave it five. 53 | good app its a very good app very nice and easy an simple ui. 54 | good app i found the app is very good for forecast. i like it. 55 | the game constantly freezes. very frustrated since update. the game freezes and goes to green screen. i have to restart tablet every time to get game to respond. fix? 56 | ? motorola devour. becoming more and more sure this game freezes my phones? very fun game though? it is a challenge and addictive? 57 | slot game. it was a great game but it has stopped paying out almost completely. 58 | freezes every time only works when you restart your phone 59 | ads great app but the ads come up all the time and it ruins it 60 | very fun!! worth the memory. :-d 61 | horrible installed perfectly and though was great app. uploaded some pics from gallery with no problem. when wanted to take a pic as demo here shows above it was just a black scree. pressed camera button nothing. can upload from gallery but cant use the main feature of this app which is what is advertised. why would i use this to upload from my phone? my phone hasd bluetooth and a usb to hook up to my pc. i can send those pics via text email and facebook. the main purpose of this app does not work! 62 | great apps works well and looks great. lots of available providers. from the 5 or so apps i've tested deliveries seems to support the most. i submitted a bug report when statuses from one of the providers wasn't working as expected. the developer promptly got back to me and fixed the issue. only feature i would like is auto-detection of provider based on the tracking number. but as a fellow programmer i understand the challenges presented by such a feature so this doesn't affect my rating. 63 | need to add a feature you guys should add a custom car builder feature where you can just build your own car but every part you buy will just keep adding up your total overall great app its fun and it kills time 64 | great alarm but has new bug making it use lots of battery i like this alarm i definitely like being able to wake up to a random playlist. but since an update its now using massive amounts of battery using the gps. i'm guessing it's something to do with sunrise feature. i don't use the sunrise alarm and i certainly don't move between countries enough for it to constantly be updating. so either a setting to not allow access to gps or to only update once a day would be useful. otherwise i'll have to go back to the standard alarm until it's fixed. 65 | gs3 best weather and clock widget out there hands down 66 | good for me 112089 come to my friend in the game. 67 | doesn't work fix bugs and maybe i'll give it 4 stars. sorry! 68 | some cant download because when you download a swf an error appears pls fix ill rate 5 star for this 69 | nice but after i update i can't open the game? pls fix this game and i love it.i will give 5 star if this game fixed the bug. username: roenan12 server: s2 libra 70 | freezes when browser loads freezes when browser loads to play game 71 | awesome it's very useful to all people. 72 | can't install i can't install it on my samsung note iii please fix it. error. (941) 73 | note 2 it's very useful to use........love it 74 | good app it's customizable which i like. 
sometimes it's a bit difficult getting items to line up properly and the areas for each part of the display don't change size dynamically so the font sizes will get bigger or smaller to fit in the space alloted for them which i don't like. overall it's a good app though. 75 | "too slow. app is slower than mobile site and/or competing livejournal apps like""eljay"" (search the marketplace for it). what with the lack of features even in the main pc site it's proof that livejournal is no match for even the much maligned facebook...rip livejournal." 76 | very good but can you add download all attachment as zip option ? we miss this very nice feature which is available in web version. 77 | "widget locks up my galaxy s3 this a very beautiful looking app and i like it a lot but it regularly locks up the homescreen widget the clock freezes and the weather status locks up with the status ""locating"". my phone is a samsung galaxy s3 running on 4.1.1 based in the uk. hope this info helps solve the issue as this app is definitely worth a full 5 stars if it were not for it locking up. please fix this and i'll even purchase the add free pro version ;-)" 78 | a must have! great app and a perfect companion for many other apps. 79 | like minecraft pe. has good graphics 80 | ? what is the best the best thing to scan? 81 | bug when i leave the rest room i am always sent back to main menu please fix it and ill give it a better rate 82 | great app love the- my list feature. i can find the best deals and plan my black friday shopping trip 83 | its what i was looking for i love this app 84 | amazing! a must have app 85 | galaxy ace india is greatest 86 | awesome it's a good app 87 | stop button crash this app repatly fix that bug and u get 5 stars 88 | time time freezes and will not update since latest version. 89 | far from as cool as smartglass i went with a ps4 rather than the xbox one but i do miss smart glass. they need to make this app enable the voice features and things like the store should be built-in not a browser link. 90 | finally one great game for android. 91 | excellent app no complaints at all but still waiting for urdu localization 92 | good app great launcher. i really like it but it is killing my battery. 93 | regret buying this app fcs so randomly that it's almost impossible to use it for longer period of time. lack of updates have made it even more impossible to use. haven't seen even a single much requested feature by the community being implemented in this app despite being ok play store for more or less 4 months now. disappointed. 94 | offline browsing please allow us to open local html or swf files. there are few browsers out there that can not play local html but those lack the browsing/playing features of this app. 95 | amazing this is freaking great 96 | ? not for motorola defy 97 | thankyou waited for such an app for a long time. jazakallah! 98 | awesome but... its a really good player no doubt. but since last update some of my albums wont show and even songs wont show. not to mention it repeats some albums and its not my memory card cause i only have 1 album for each group of songs. please fix this. its the best player ive come across id hate to have to uninstall it :( 99 | cutiest emoticons cute emoticons to have fun.. 100 | game freezes! i play on the pc and now tried to play over and over on my samsung s3 and it freezes up after 2 min. in the game! 101 | great app! i love this app! 
it's great to have my book lists with me wherever i go and i can easily keep track of which books i've read to each of my kids. one thing missing is the browse feature like the apple version has; i miss that! add that and i will rate 5 stars! 102 | ? i love it but some times it freezes and i have to do somthing else 103 | a superb app i came to android about a year ago from an iphone and i tried every reddit client out there such as baconreader and reddit news. these were all good apps but none of them compared to my experience with alienblue. then i tried reddit sync and it has become almost exclusively the only way i read redditch anymore. the holo interface and black night mode are easy on my eyes and it has all of the features i need. 104 | improving rapidly. this app used to be crappy while the service was ok but recently the app has started receiving updates on a regular basis adding a lot of basic functionality that should have been there from the beginning. now it features landscape support resume from last position notification bar controls and save to sd card. however i still sorely miss a decent widget easily sortable queues or the extensions found in the desktop app. 105 | -------------------------------------------------------------------------------- /data/SentiStrength/jira-test-se.csv: -------------------------------------------------------------------------------- 1 | "It may cause conflicts, but if your intent is to break system security, you probably don't care.",-1 2 | "Thanks, Amar!",1 3 | "This was a very bad bug, introduced by me being an idiot.",-1 4 | Didn't got the time to try it yet.,-1 5 | Pull it back in if you think different.,-1 6 | Why the hell is this not a bug?,-1 7 | the recommendation in the wiki is bad.,-1 8 | I'm confused.,-1 9 | "I was in too much of a hurry, sorry hold on a sec.",-1 10 | Sorry for the noise.,-1 11 | It really sucks that there is now way currently to include transitive dependencies.,-1 12 | "Thanks,Mayank",1 13 | Thanks Uwe!,1 14 | Thanks!,1 15 | My bad.,-1 16 | "I don't care if everything is pretty or not, but we should at least support basic admin functionality in IE IMO (though I have not used it for years for just about anything).",-1 17 | Is that ok for this file (b/c I have no idea how to do the svn move now ... after I've made all the changes already) :),1 18 | "My bad, I screwed up the assertion -> RuntimeException transition.",-1 19 | Regex is your friend.,1 20 | Pull it back in if you think different.,-1 21 | "In fact, if you happen to know why ObserverHammerQuorumTest is failing with this latest patch, I'd love to hear.",1 22 | My bad.,-1 23 | "sorry old xml here is the used one:?¤?¶?¼ und ??????",-1 24 | You're results are awesome Paul. 
Great work :)Looking forward to see your new JSON parser in trunk whenever you think is ready.,1 25 | This is Awesome Stefan - thanks a million!,1 26 | My bad.,-1 27 | Thanks for fixing it so quickly!,1 28 | "Hi Sagara,Thank you very much for looking into this.",1 29 | "Sorry typo: ""our part"" in place of ""one part"" above.",-1 30 | "Sorry, I meant to have this patch in sooner but got quite busy, I expect to have it soon.",-1 31 | Yes exactly -- sorry to be so unclear.,-1 32 | Weird.,-1 33 | I think you made mistakes in ivy.xml.,-1 34 | Thanks for your patience.,1 35 | Thanks Sijie.,1 36 | "Thanks, Dhruba!",1 37 | "Shit, I missed a cast.",-1 38 | But please don't say that my reasoning is bad - because it is not.,-1 39 | This might be a bug indeed.,-1 40 | This is clearly bad webserver behaviour.,-1 41 | @Ashutosh: thanks a lot for the comments.,1 42 | 897392Thanks Sharan !,1 43 | "Looks good, thanks Laura!",1 44 | That's my bad.,-1 45 | Thanks Andrew - the patch was applied to SQL module at r525019.,1 46 | "Thank you,+ Harit Himanshu",1 47 | This sucks *so much*...,-1 48 | "I've tried something similar (I removed the handlers and kept the readers), but the performance was not visible.",-1 49 | Damn...,-1 50 | Applied patch with thanks to Scott.,1 51 | I think we all know it just sucks.,-1 52 | "Damn, Chuck is scary.",-1 53 | You would need to implement session resume; thats a whole new can of worms.,-1 54 | I don't have strong opinions about it either way.,-1 55 | This is weird.,-1 56 | Thank you for the patch Sergey. ,1 57 | "Yes it is a dup, thanks Mike for taking care of this (I planned to do this yesterday but didn't make it)",-1 58 | "Sorry Avdhesh, I forgot to add these two files to the patch, here is the new patch containing the missing files",-1 59 | "Thanks a lot, Kiran for the patch.",1 60 | This is bad.,-1 61 | "Sorry, I guess I'm against ""never computing this shit""... because you guys think returning NaN is ok.",-1 62 | sorry for delay,-1 63 | "And boy, hell broke loose ;)So... 
the biggest issue I'm facing is indeed with Random sharing across threads.",-1 64 | "Thanks,Arvind",1 65 | Please close as this is just me being stupid.,-1 66 | "Thanks a lot for the reviews, Todd.",1 67 | Thanks Karthik.,1 68 | "Sorry for my poor review, didn't notice try/catch :(",-1 69 | "Seems we didn't enforce an exec for sh, but we did for fs.",-1 70 | "durrrh, that sucks.",-1 71 | "This is pretty trivial, just adds three asserts to TestPath#testNormalize.",-1 72 | Automatic location selection is very cool.,1 73 | All I'm trying to say is that it's pretty easy to end up in propagation failure hell here or change something else that blows things up for use cases that are not foreseen.,-1 74 | I don't know what just happened.,-1 75 | "Thanks for the report, and sorry its taken so long to fix it.",-1 76 | I think the correct resolution is to ensure that the prefix stack mechanism gets reset each time the XMLReader is used.,-1 77 | yup sorry just fixed.,-1 78 | A stupid bug in a patch that is already applied.,-1 79 | i found the class - so there is no bug - sorry,-1 80 | > then we don't save IO by limiting the buffer size to 1 KBI'm confused by this.,-1 81 | Finally closing this bug from hell.,-1 82 | "Thanks Areek, patch looks good!",1 83 | Incidentally if we all nag Joe Walnes enough we might be able to persuade him to release a new qdox which can ignore annotations etc (though it will still struggle with generics I think),-1 84 | Bug in existing testDelegationTokenRestoredOnRMrestart().,-1 85 | "Igor is an idiot, and we *do* need gmake.",-1 86 | Thank you.,1 87 | Sorry about that.,-1 88 | My bad.,-1 89 | Sorry for the trouble Vikram!,-1 90 | "Why the hell do they deliver Duration, if they cannot instantiate it :-/",-1 91 | (sorry if that was confusing),-1 92 | "Sorry Chris,I missed it, done !",-1 93 | This is a great suggestion.,1 94 | "Thanks for reminding me; I agree, I'll do it.",1 95 | {quote}searcher.getAtomicReader().getSortedDocValues(uniqueKey);{quote}This is a performance killer.,-1 96 | Thanks for the reviews..,1 97 | I'm an idiot.,-1 98 | This looks safe also,1 99 | Completely missed issue 614..,-1 100 | "Owen, thanks for the slides.",1 101 | "Thanks,Mayank",1 102 | This looks good to me.,1 103 | Sounds weird to me...Could you package a (totally!),-1 104 | Seems to be failing for a different reason nowtestContainerLaunch(org.apache.hadoop.yarn.server.nodemanager.TestLinuxContainerExecutorWithMocks) Time elapsed: 0.523 sec <<< FAILURE!,-1 105 | "Sorry, I kind of forget about this one.",-1 106 | Sorry for the confusion :),-1 107 | An output connector should also have a say in what URLs it will accept.,-1 108 | "Well, that sucks.",-1 109 | Here is quite bad.,-1 110 | Sounds like a good idea.,1 111 | "Sorry, I was trying to get to the wiki but it's been a busy week.",-1 112 | "Damn, it seemed it didn't work.",-1 113 | It is good to have the test.,1 114 | Interesting that you decided not to detect the error based on finding two similarly-named operations.,1 115 | Sorry that I think I missed some discussion in the mailing list.,-1 116 | Thanks to Mathias Werlitz - sorry for the delay.,-1 117 | "Thanks to both of you, and to Deepesh for the initiative.",1 118 | My error.,-1 119 | Awesome stuff Stephen! ,1 120 | I want to integrate the sweet sweet logo Andrew crafted.,1 121 | "Thanks, Owen!",1 122 | "I am not really sure what does the receive payment do before the shipment, it doesn't sound as if it is doing what we expect it to do.",-1 123 | Sounds like a good idea. 
,1 124 | "Thanks Oliver, that's fixed it. ",1 125 | Thanks Tom!,1 126 | "umm ... call me crazy, but why are we making this public?",-1 127 | "Actually I don't want to specify the encoding, because I don't care how the data is transported to me.",-1 128 | "Hi Guillaume,I did not have an answer right away, so I sent you question to Leonard Rosenthol.",-1 129 | Indeed that would be VERY bad design.,-1 130 | Maurice I don't have such option or maybe I don't know where it is.,-1 131 | Doing it at the hackathon you'd have a few fellas at your shoulder to give you pointers should you get stuck.,-1 132 | Thanks for sweet patch Erik.,1 133 | sorry.,-1 134 | Bad IE.,-1 135 | "I don't have to ensure that the classloader knows groovy classes, *you* must do that.",-1 136 | Cool - good information to have. Thanks Lance!,1 137 | "- there were a hell of a bigger problem, though : as we were blocked in the executor until the SearchRequest was totally processed, all the responses were enqueued.",-1 138 | " , my bad",-1 139 | Sorry for the noise.,-1 140 | "Thanks for spotting this, fixed at r487519",1 141 | I really like what I am seeing so far.,1 142 | The cause is that the call to SpecificResponder.writeError in Responder.respond ultimately calls GenericData.resolveUnion which in turn calls GenericData.getSchemaName before the line where the UnresolvedUnionException gets thrown.,-1 143 | "I suspect it has nothing to do with the file system connector or Infinispan connectors, and is simply a (stupid) mistake in the federated join processor.",-1 144 | PreCommit-HDFS-Build is stuck.,-1 145 | "The eclipse ui was completly stuck, which was similiar to the other experinces.",-1 146 | "Ie, both at are bad.",-1 147 | "Brilliant feedback, thanks! I'm glad you found the issue, and the solution!",1 148 | :)Sorry about that.,-1 149 | > - FOUserAgent.getStream() is cool and very easy to use (now that it's properly> documented).,1 150 | Resolving again....,-1 151 | "I didn't do that because that seems bad in hive, so I returned ""null"" from the operation.",-1 152 | "HADOOP-2949: - If tarball is specified, HOD no longer validates for the pkgs directory in gridservice-hdfs or mapred sections as these are not going to be used anyway.",-1 153 | The samples you gave are different.,-1 154 | Sorry for the delay.,-1 155 | Thanks Brock!,1 156 | My bad.,-1 157 | I like Richards update as well :) What I did was out of pure anger so it may not have been the sexiest.,1 158 | Thanks for the patch Mubarak.,1 159 | "Well it's me that didn't get the whole point, now i got more, sorry for the noise.",-1 160 | "Great! 
Awesome!thanks,Dims",1 161 | "Aaarrggh, how stupid of mine to have a System.out, again!",-1 162 | Imo all this just doesn't make sense from a pure performance aspect.,-1 163 | "I have stupidly deleted the original test dir, but judging from the suite's output files, no output was created after 3 1/2 hours.",-1 164 | "Qianshi is working on the SSL session reuse, but this buggy Bug system does not allow him assign this ticket, sigh",-1 165 | Sorry for the noise..,-1 166 | Thanks Alejandro .,1 167 | "Many thanks, Neeme.",1 168 | that's some freaky shit ...,-1 169 | "It sucks to lose the code readability, but it seems like a reasonable price to pay.",-1 170 | Sorry - the above comes across as terse.,-1 171 | Awesome work: this is a great first cut!,1 172 | "Hell, this is gonna take me a lot of work to raise.",-1 173 | "Thanks Atul,Your patch is in trunk at r933169I just added some ""mod for OFBiz layered lookups"" comments around changes as suggested Sascha",1 174 | "Some file were missing in the last patch, sorry.",-1 175 | Thanks Ashutosh and Gunther for your help!,1 176 | Doh!,-1 177 | I screwed up the encoding of the stopwords file (sorry).,-1 178 | "Ah, damn, I thought it was fixed :/Guillaume ?",-1 179 | "Sorry, yes I believe this has been resolved.",-1 180 | Pull it back in if you think different.,-1 181 | What a stupid name I chose for that object... )-:,-1 182 | sorry for misleading attachment name.,-1 183 | "The guy on our team that was going to do this was swamped, so I re-assigned this to you.",-1 184 | This looks simple and sweet to me.,1 185 | Sorry ;-),-1 186 | "As far as the query shit, i have no idea if solrdispatchfilter or whatever could/should do Thread.currentThread().setName(x) or whatever (and maybe restore after)",-1 187 | My bad.,-1 188 | "Secondly reading this code I can see why this bug is happening, after completing stage 2 above when adding the new item the code does ""ordered - cancelled = quantity"" which equates to ""1 - 1 = 0"".",-1 189 | Thanks Awdesh - Done at r821748.,1 190 | Sorry - the federated build is still working out kinks...,-1 191 | "Super, I'll commit shortly -- thanks Yonik!",1 192 | sorry for your time.,-1 193 | Damn !,-1 194 | This sucks badly.,-1 195 | Sorry but this is just stupid.,-1 196 | "Awesome, you rock, Drew!",1 197 | "If that would be the case, this would be bad design.",-1 198 | I've tested them out and everything is fine.,1 199 | This is only a problem with exceptions from java.,-1 200 | "Aaron, sorry about this.",-1 201 | The rest are *totally* unrelated.,-1 202 | "[~cmccabe]: oh, hell no!",-1 203 | "Turning off hints is basically intended as a ""oh shit, something is broken with hints, let's turn it off"" switch.",-1 204 | Damn it !,-1 205 | I think it's time to just close this issue.,-1 206 | Thanks Ashish!,1 207 | Damn...,-1 208 | "Aha,, thanks for the information. That makes sense. Glad to hear that it helped! :)",1 209 | "Ah, that was my bad.",-1 210 | ah - my bad.,-1 211 | "Thanks, Sanjay!",1 212 | fuck u,-1 213 | Cheers!,1 214 | Thanks for doing that Uma.,1 215 | Thanks Dianne!,1 216 | "My bad, this is already done.",-1 217 | huh ... i thought i did resolve this.,-1 218 | "Hell, UnaryFunction might even be faster than all of these calls in a row.",-1 219 | I'm an idiot.,-1 220 | Thanks senaka for the patch.,1 221 | "Oh, I didn't consider one flow like after the edit log conversion, immediately #store failed.",-1 222 | Forget the patch for the moment.,-1 223 | I like the elegant parser! 
,1 224 | I would love to have it right now for storm too. If you want me to sign up as a use case I am happy to.,1 225 | "Brandon, sorry.",-1 226 | "Ugh, sorry :( Thanks!",-1 227 | bq. OK w/ the latest patch all tests pass for me! Great Awesome! :),1 228 | "It is a nasty bug that I've seen in real life, though.",-1 229 | I am new to Mina and the whole environment.,-1 230 | Awesome - great stuff Maria. Thanks!,1 231 | Ok. Stupid user Error here.,-1 232 | Sorry about that...,-1 233 | "To upgrade to a recent version of Jackrabbit, see http://wiki.apache.org/jackrabbit/BackupAndMigrationI'm afraid I can't say much about the risk.",-1 234 | Sorry GavDONECommitted @revision 2487.,-1 235 | "Mac, looks like the tests are failing (especially TestHarFileSystem).",-1 236 | I'm glad we were able to resolve this issue.,1 237 | My patch wouldn't compile.,-1 238 | Will still be stuck in the loop though if can't actually close regions.,-1 239 | Thanks henry and pat... we'll have to re submit all the PA's so trigger hudson.,1 240 | "Someone, but I'm not sure who, owes me a public apology here. /Larry ",-1 241 | "Thanks, Daryn!",1 242 | cool man it looks good. we need a changes entry but from my side this looks good. we can tackle the todos on trunk,1 243 | Oh man ... it's a fucking precendence problem.,-1 244 | "Yep, my bad.",-1 245 | Patch applied with thanks!,1 246 | "Hi Carlos, This looks awesome! Lots of cool stuff.",1 247 | Excuse me for stolen assignement.,-1 248 | "I opened [HADOOP-3607] to fix a wrong URL, but appart from that I don't there's still references to the old structure.",-1 249 | "Cool, looks good.",1 250 | I meet the same problem on Eclipse recently but haven't figured out how to get through.,-1 251 | "Thanks, Ashish!",1 252 | Thanks!,1 253 | {quote}You are messing down deep below hbase in dfs.,-1 254 | "Thanks Andrew, committed in rev.",1 255 | "@mahadev - I would love to help test a patch :) I'm currently using 3.3.1 + ZOOKEEPER-744 + ZOOKEEPER-790, applied in that order.",1 256 | "Thanks a lot for sharing that, Josh!",1 257 | Weird.,-1 258 | "Thanks, Mike!",1 259 | Sorry.,-1 260 | "Hi Sandy,Thanks so much to give me such comments, that's really helpful, I will update this later.",1 261 | Tried some more stuff and realized I was doing it wrong.,-1 262 | "Thanks for the very clear explanation of the needed change, Dag.",1 263 | This patch really helped.,1 264 | "Daryn, the current patch looks good.",1 265 | The biggest problem is that we've had too many committers over the years and we'd have to get all of their permission to change it.,-1 266 | thanks for the explanation.,1 267 | Weird.,-1 268 | "The patch should be relatively trivial, but like I said, I have no idea if there is other important stuff going on there or not.",-1 269 | Splitting an existing sub-shard gets stuck up.,-1 270 | "And debugging is hell, because the test environment needs to have the exact same loader setup.",-1 271 | "Holy complicated-as-shit-algorithm, Batman!The complexity of our implementation vs the complexity of what we're actually doing is starting to worry me here.",-1 272 | Sorry about that.,-1 273 | Patch looks good to me.,1 274 | Thanks for the patch Erik (and Jon),1 275 | Version 2.2.0RC3 is fine.,1 276 | "Because, we don't care for this in cases where we there is no node that is down.",-1 277 | > I hated that aspect of working for commercial companies.,-1 278 | Please contact me if you need any clarifications.,1 279 | -------------------------------------------------------------------------------- 
/data/SentiStrength/jira-test.txt: -------------------------------------------------------------------------------- 1 | sent 2 | "It may cause conflicts, but if your intent is to break system security, you probably don't care." 3 | "Thanks, Amar!" 4 | "This was a very bad bug, introduced by me being an idiot." 5 | Didn't got the time to try it yet. 6 | Pull it back in if you think different. 7 | Why the hell is this not a bug? 8 | the recommendation in the wiki is bad. 9 | I'm confused. 10 | "I was in too much of a hurry, sorry hold on a sec." 11 | Sorry for the noise. 12 | It really sucks that there is now way currently to include transitive dependencies. 13 | "Thanks,Mayank" 14 | Thanks Uwe! 15 | Thanks! 16 | My bad. 17 | "I don't care if everything is pretty or not, but we should at least support basic admin functionality in IE IMO (though I have not used it for years for just about anything)." 18 | Is that ok for this file (b/c I have no idea how to do the svn move now ... after I've made all the changes already) :) 19 | "My bad, I screwed up the assertion -> RuntimeException transition." 20 | Regex is your friend. 21 | Pull it back in if you think different. 22 | "In fact, if you happen to know why ObserverHammerQuorumTest is failing with this latest patch, I'd love to hear." 23 | My bad. 24 | "sorry old xml here is the used one:?¤?¶?¼ und ??????" 25 | You're results are awesome Paul. Great work :)Looking forward to see your new JSON parser in trunk whenever you think is ready. 26 | This is Awesome Stefan - thanks a million! 27 | My bad. 28 | Thanks for fixing it so quickly! 29 | "Hi Sagara,Thank you very much for looking into this." 30 | "Sorry typo: ""our part"" in place of ""one part"" above." 31 | "Sorry, I meant to have this patch in sooner but got quite busy, I expect to have it soon." 32 | Yes exactly -- sorry to be so unclear. 33 | Weird. 34 | I think you made mistakes in ivy.xml. 35 | Thanks for your patience. 36 | Thanks Sijie. 37 | "Thanks, Dhruba!" 38 | "Shit, I missed a cast." 39 | But please don't say that my reasoning is bad - because it is not. 40 | This might be a bug indeed. 41 | This is clearly bad webserver behaviour. 42 | @Ashutosh: thanks a lot for the comments. 43 | 897392Thanks Sharan ! 44 | "Looks good, thanks Laura!" 45 | That's my bad. 46 | Thanks Andrew - the patch was applied to SQL module at r525019. 47 | "Thank you,+ Harit Himanshu" 48 | This sucks *so much*... 49 | "I've tried something similar (I removed the handlers and kept the readers), but the performance was not visible." 50 | Damn... 51 | Applied patch with thanks to Scott. 52 | I think we all know it just sucks. 53 | "Damn, Chuck is scary." 54 | You would need to implement session resume; thats a whole new can of worms. 55 | I don't have strong opinions about it either way. 56 | This is weird. 57 | Thank you for the patch Sergey. 58 | "Yes it is a dup, thanks Mike for taking care of this (I planned to do this yesterday but didn't make it)" 59 | "Sorry Avdhesh, I forgot to add these two files to the patch, here is the new patch containing the missing files" 60 | "Thanks a lot, Kiran for the patch." 61 | This is bad. 62 | "Sorry, I guess I'm against ""never computing this shit""... because you guys think returning NaN is ok." 63 | sorry for delay 64 | "And boy, hell broke loose ;)So... the biggest issue I'm facing is indeed with Random sharing across threads." 65 | "Thanks,Arvind" 66 | Please close as this is just me being stupid. 67 | "Thanks a lot for the reviews, Todd." 68 | Thanks Karthik. 
69 | "Sorry for my poor review, didn't notice try/catch :(" 70 | "Seems we didn't enforce an exec for sh, but we did for fs." 71 | "durrrh, that sucks." 72 | "This is pretty trivial, just adds three asserts to TestPath#testNormalize." 73 | Automatic location selection is very cool. 74 | All I'm trying to say is that it's pretty easy to end up in propagation failure hell here or change something else that blows things up for use cases that are not foreseen. 75 | I don't know what just happened. 76 | "Thanks for the report, and sorry its taken so long to fix it." 77 | I think the correct resolution is to ensure that the prefix stack mechanism gets reset each time the XMLReader is used. 78 | yup sorry just fixed. 79 | A stupid bug in a patch that is already applied. 80 | i found the class - so there is no bug - sorry 81 | > then we don't save IO by limiting the buffer size to 1 KBI'm confused by this. 82 | Finally closing this bug from hell. 83 | "Thanks Areek, patch looks good!" 84 | Incidentally if we all nag Joe Walnes enough we might be able to persuade him to release a new qdox which can ignore annotations etc (though it will still struggle with generics I think) 85 | Bug in existing testDelegationTokenRestoredOnRMrestart(). 86 | "Igor is an idiot, and we *do* need gmake." 87 | Thank you. 88 | Sorry about that. 89 | My bad. 90 | Sorry for the trouble Vikram! 91 | "Why the hell do they deliver Duration, if they cannot instantiate it :-/" 92 | (sorry if that was confusing) 93 | "Sorry Chris,I missed it, done !" 94 | This is a great suggestion. 95 | "Thanks for reminding me; I agree, I'll do it." 96 | {quote}searcher.getAtomicReader().getSortedDocValues(uniqueKey);{quote}This is a performance killer. 97 | Thanks for the reviews.. 98 | I'm an idiot. 99 | This looks safe also 100 | Completely missed issue 614.. 101 | "Owen, thanks for the slides." 102 | "Thanks,Mayank" 103 | This looks good to me. 104 | Sounds weird to me...Could you package a (totally!) 105 | Seems to be failing for a different reason nowtestContainerLaunch(org.apache.hadoop.yarn.server.nodemanager.TestLinuxContainerExecutorWithMocks) Time elapsed: 0.523 sec <<< FAILURE! 106 | "Sorry, I kind of forget about this one." 107 | Sorry for the confusion :) 108 | An output connector should also have a say in what URLs it will accept. 109 | "Well, that sucks." 110 | Here is quite bad. 111 | Sounds like a good idea. 112 | "Sorry, I was trying to get to the wiki but it's been a busy week." 113 | "Damn, it seemed it didn't work." 114 | It is good to have the test. 115 | Interesting that you decided not to detect the error based on finding two similarly-named operations. 116 | Sorry that I think I missed some discussion in the mailing list. 117 | Thanks to Mathias Werlitz - sorry for the delay. 118 | "Thanks to both of you, and to Deepesh for the initiative." 119 | My error. 120 | Awesome stuff Stephen! 121 | I want to integrate the sweet sweet logo Andrew crafted. 122 | "Thanks, Owen!" 123 | "I am not really sure what does the receive payment do before the shipment, it doesn't sound as if it is doing what we expect it to do." 124 | Sounds like a good idea. 125 | "Thanks Oliver, that's fixed it. " 126 | Thanks Tom! 127 | "umm ... call me crazy, but why are we making this public?" 128 | "Actually I don't want to specify the encoding, because I don't care how the data is transported to me." 129 | "Hi Guillaume,I did not have an answer right away, so I sent you question to Leonard Rosenthol." 130 | Indeed that would be VERY bad design. 
131 | Maurice I don't have such option or maybe I don't know where it is. 132 | Doing it at the hackathon you'd have a few fellas at your shoulder to give you pointers should you get stuck. 133 | Thanks for sweet patch Erik. 134 | sorry. 135 | Bad IE. 136 | "I don't have to ensure that the classloader knows groovy classes, *you* must do that." 137 | Cool - good information to have. Thanks Lance! 138 | "- there were a hell of a bigger problem, though : as we were blocked in the executor until the SearchRequest was totally processed, all the responses were enqueued." 139 | " , my bad" 140 | Sorry for the noise. 141 | "Thanks for spotting this, fixed at r487519" 142 | I really like what I am seeing so far. 143 | The cause is that the call to SpecificResponder.writeError in Responder.respond ultimately calls GenericData.resolveUnion which in turn calls GenericData.getSchemaName before the line where the UnresolvedUnionException gets thrown. 144 | "I suspect it has nothing to do with the file system connector or Infinispan connectors, and is simply a (stupid) mistake in the federated join processor." 145 | PreCommit-HDFS-Build is stuck. 146 | "The eclipse ui was completly stuck, which was similiar to the other experinces." 147 | "Ie, both at are bad." 148 | "Brilliant feedback, thanks! I'm glad you found the issue, and the solution!" 149 | :)Sorry about that. 150 | > - FOUserAgent.getStream() is cool and very easy to use (now that it's properly> documented). 151 | Resolving again.... 152 | "I didn't do that because that seems bad in hive, so I returned ""null"" from the operation." 153 | "HADOOP-2949: - If tarball is specified, HOD no longer validates for the pkgs directory in gridservice-hdfs or mapred sections as these are not going to be used anyway." 154 | The samples you gave are different. 155 | Sorry for the delay. 156 | Thanks Brock! 157 | My bad. 158 | I like Richards update as well :) What I did was out of pure anger so it may not have been the sexiest. 159 | Thanks for the patch Mubarak. 160 | "Well it's me that didn't get the whole point, now i got more, sorry for the noise." 161 | "Great! Awesome!thanks,Dims" 162 | "Aaarrggh, how stupid of mine to have a System.out, again!" 163 | Imo all this just doesn't make sense from a pure performance aspect. 164 | "I have stupidly deleted the original test dir, but judging from the suite's output files, no output was created after 3 1/2 hours." 165 | "Qianshi is working on the SSL session reuse, but this buggy Bug system does not allow him assign this ticket, sigh" 166 | Sorry for the noise.. 167 | Thanks Alejandro . 168 | "Many thanks, Neeme." 169 | that's some freaky shit ... 170 | "It sucks to lose the code readability, but it seems like a reasonable price to pay." 171 | Sorry - the above comes across as terse. 172 | Awesome work: this is a great first cut! 173 | "Hell, this is gonna take me a lot of work to raise." 174 | "Thanks Atul,Your patch is in trunk at r933169I just added some ""mod for OFBiz layered lookups"" comments around changes as suggested Sascha" 175 | "Some file were missing in the last patch, sorry." 176 | Thanks Ashutosh and Gunther for your help! 177 | Doh! 178 | I screwed up the encoding of the stopwords file (sorry). 179 | "Ah, damn, I thought it was fixed :/Guillaume ?" 180 | "Sorry, yes I believe this has been resolved." 181 | Pull it back in if you think different. 182 | What a stupid name I chose for that object... )-: 183 | sorry for misleading attachment name. 
184 | "The guy on our team that was going to do this was swamped, so I re-assigned this to you." 185 | This looks simple and sweet to me. 186 | Sorry ;-) 187 | "As far as the query shit, i have no idea if solrdispatchfilter or whatever could/should do Thread.currentThread().setName(x) or whatever (and maybe restore after)" 188 | My bad. 189 | "Secondly reading this code I can see why this bug is happening, after completing stage 2 above when adding the new item the code does ""ordered - cancelled = quantity"" which equates to ""1 - 1 = 0""." 190 | Thanks Awdesh - Done at r821748. 191 | Sorry - the federated build is still working out kinks... 192 | "Super, I'll commit shortly -- thanks Yonik!" 193 | sorry for your time. 194 | Damn ! 195 | This sucks badly. 196 | Sorry but this is just stupid. 197 | "Awesome, you rock, Drew!" 198 | "If that would be the case, this would be bad design." 199 | I've tested them out and everything is fine. 200 | This is only a problem with exceptions from java. 201 | "Aaron, sorry about this." 202 | The rest are *totally* unrelated. 203 | "[~cmccabe]: oh, hell no!" 204 | "Turning off hints is basically intended as a ""oh shit, something is broken with hints, let's turn it off"" switch." 205 | Damn it ! 206 | I think it's time to just close this issue. 207 | Thanks Ashish! 208 | Damn... 209 | "Aha,, thanks for the information. That makes sense. Glad to hear that it helped! :)" 210 | "Ah, that was my bad." 211 | ah - my bad. 212 | "Thanks, Sanjay!" 213 | fuck u 214 | Cheers! 215 | Thanks for doing that Uma. 216 | Thanks Dianne! 217 | "My bad, this is already done." 218 | huh ... i thought i did resolve this. 219 | "Hell, UnaryFunction might even be faster than all of these calls in a row." 220 | I'm an idiot. 221 | Thanks senaka for the patch. 222 | "Oh, I didn't consider one flow like after the edit log conversion, immediately #store failed." 223 | Forget the patch for the moment. 224 | I like the elegant parser! 225 | I would love to have it right now for storm too. If you want me to sign up as a use case I am happy to. 226 | "Brandon, sorry." 227 | "Ugh, sorry :( Thanks!" 228 | bq. OK w/ the latest patch all tests pass for me! Great Awesome! :) 229 | "It is a nasty bug that I've seen in real life, though." 230 | I am new to Mina and the whole environment. 231 | Awesome - great stuff Maria. Thanks! 232 | Ok. Stupid user Error here. 233 | Sorry about that... 234 | "To upgrade to a recent version of Jackrabbit, see http://wiki.apache.org/jackrabbit/BackupAndMigrationI'm afraid I can't say much about the risk." 235 | Sorry GavDONECommitted @revision 2487. 236 | "Mac, looks like the tests are failing (especially TestHarFileSystem)." 237 | I'm glad we were able to resolve this issue. 238 | My patch wouldn't compile. 239 | Will still be stuck in the loop though if can't actually close regions. 240 | Thanks henry and pat... we'll have to re submit all the PA's so trigger hudson. 241 | "Someone, but I'm not sure who, owes me a public apology here. /Larry " 242 | "Thanks, Daryn!" 243 | cool man it looks good. we need a changes entry but from my side this looks good. we can tackle the todos on trunk 244 | Oh man ... it's a fucking precendence problem. 245 | "Yep, my bad." 246 | Patch applied with thanks! 247 | "Hi Carlos, This looks awesome! Lots of cool stuff." 248 | Excuse me for stolen assignement. 249 | "I opened [HADOOP-3607] to fix a wrong URL, but appart from that I don't there's still references to the old structure." 250 | "Cool, looks good." 
251 | I meet the same problem on Eclipse recently but haven't figured out how to get through. 252 | "Thanks, Ashish!" 253 | Thanks! 254 | {quote}You are messing down deep below hbase in dfs. 255 | "Thanks Andrew, committed in rev." 256 | "@mahadev - I would love to help test a patch :) I'm currently using 3.3.1 + ZOOKEEPER-744 + ZOOKEEPER-790, applied in that order." 257 | "Thanks a lot for sharing that, Josh!" 258 | Weird. 259 | "Thanks, Mike!" 260 | Sorry. 261 | "Hi Sandy,Thanks so much to give me such comments, that's really helpful, I will update this later." 262 | Tried some more stuff and realized I was doing it wrong. 263 | "Thanks for the very clear explanation of the needed change, Dag." 264 | This patch really helped. 265 | "Daryn, the current patch looks good." 266 | The biggest problem is that we've had too many committers over the years and we'd have to get all of their permission to change it. 267 | thanks for the explanation. 268 | Weird. 269 | "The patch should be relatively trivial, but like I said, I have no idea if there is other important stuff going on there or not." 270 | Splitting an existing sub-shard gets stuck up. 271 | "And debugging is hell, because the test environment needs to have the exact same loader setup." 272 | "Holy complicated-as-shit-algorithm, Batman!The complexity of our implementation vs the complexity of what we're actually doing is starting to worry me here." 273 | Sorry about that. 274 | Patch looks good to me. 275 | Thanks for the patch Erik (and Jon) 276 | Version 2.2.0RC3 is fine. 277 | "Because, we don't care for this in cases where we there is no node that is down." 278 | > I hated that aspect of working for commercial companies. 279 | Please contact me if you need any clarifications. 280 | -------------------------------------------------------------------------------- /scripts/PTM/api.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from utils import * 3 | from transformers import BertTokenizer, BertModel, BertForSequenceClassification 4 | from transformers import XLNetTokenizer, XLNetForSequenceClassification 5 | from transformers import RobertaTokenizer, RobertaForSequenceClassification 6 | from transformers import AlbertTokenizer, AlbertForSequenceClassification 7 | import argparse 8 | 9 | # Model | Tokenizer | Pretrained weights shortcut 10 | MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'), 11 | (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'), 12 | (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'), 13 | (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1') 14 | ] 15 | 16 | MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert'] 17 | 18 | seed_torch(42) 19 | ## Read model name 20 | parser = argparse.ArgumentParser(description='Choose the models.') 21 | 22 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 23 | help='Enter an integer... 
0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 24 | 25 | 26 | args = parser.parse_args() 27 | m_num=args.model_num 28 | 29 | cur_model=MODELS[m_num] 30 | m_name=MODEL_NAMES[m_num] 31 | 32 | train_df=pd.read_pickle(api_train) 33 | train_df['label']=train_df['label'].replace(-1, 2) 34 | 35 | tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) 36 | 37 | sentences=train_df.sentence.values 38 | labels=train_df.label.values 39 | 40 | # max_len = 0 41 | # for sent in sentences: 42 | # input_ids=tokenizer.encode(str(sent), add_special_tokens=True) 43 | # max_len=max(max_len, len(input_ids)) 44 | # print('Max sentence length: ', max_len) 45 | 46 | input_ids = [] 47 | attention_masks = [] 48 | 49 | for sent in sentences: 50 | 51 | encoded_dict = tokenizer.encode_plus( 52 | str(sent), 53 | add_special_tokens = True, 54 | max_length = MAX_LEN, 55 | pad_to_max_length = True, 56 | return_attention_mask = True, 57 | return_tensors = 'pt' 58 | ) 59 | 60 | input_ids.append(encoded_dict['input_ids']) 61 | attention_masks.append(encoded_dict['attention_mask']) 62 | 63 | 64 | train_inputs = torch.cat(input_ids, dim=0) 65 | train_masks = torch.cat(attention_masks, dim=0) 66 | train_labels = torch.tensor(labels) 67 | 68 | print('Training data {} {} {}'.format(train_inputs.shape, train_masks.shape, train_labels.shape)) 69 | 70 | train_data = TensorDataset(train_inputs, train_masks, train_labels) 71 | train_sampler = RandomSampler(train_data) 72 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) 73 | 74 | # Train Model 75 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 76 | model.cuda() 77 | 78 | param_optimizer = list(model.named_parameters()) 79 | no_decay = ['bias', 'gamma', 'beta'] 80 | optimizer_grouped_parameters = [ 81 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 82 | 'weight_decay_rate': 0.01}, 83 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 84 | 'weight_decay_rate': 0.0} 85 | ] 86 | 87 | optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE) 88 | 89 | begin=time.time() 90 | train_loss_set = [] 91 | 92 | for _ in trange(EPOCHS, desc="Epoch"): 93 | 94 | model.train() 95 | 96 | tr_loss = 0 97 | nb_tr_examples, nb_tr_steps = 0, 0 98 | 99 | for step, batch in enumerate(train_dataloader): 100 | 101 | batch = tuple(t.to(device) for t in batch) 102 | 103 | b_input_ids, b_input_mask, b_labels = batch 104 | optimizer.zero_grad() 105 | 106 | # Forward pass 107 | outputs = model(b_input_ids, token_type_ids=None, \ 108 | attention_mask=b_input_mask, labels=b_labels) 109 | loss = outputs[0] 110 | logits = outputs[1] 111 | train_loss_set.append(loss.item()) 112 | 113 | # Backward pass 114 | loss.backward() 115 | optimizer.step() 116 | 117 | tr_loss += loss.item() 118 | nb_tr_examples += b_input_ids.size(0) 119 | nb_tr_steps += 1 120 | 121 | print("Train loss: {}".format(tr_loss/nb_tr_steps)) 122 | 123 | end=time.time() 124 | print('Training used {:.2f} second'.format(end-begin)) 125 | 126 | begin=time.time() 127 | #test_df = pd.read_csv(api_test, usecols=['sentence','label']) 128 | test_df=pd.read_pickle(api_test) 129 | test_df['label']=test_df['label'].replace(-1, 2) 130 | 131 | sentences=test_df.sentence.values 132 | labels = test_df.label.values 133 | 134 | input_ids = [] 135 | attention_masks = [] 136 | 137 | for sent in sentences: 138 | encoded_dict = tokenizer.encode_plus( 139 | str(sent), 140 | add_special_tokens = True, 141 | max_length = 
MAX_LEN, 142 | pad_to_max_length = True, 143 | return_attention_mask = True, 144 | return_tensors = 'pt', 145 | ) 146 | 147 | input_ids.append(encoded_dict['input_ids']) 148 | attention_masks.append(encoded_dict['attention_mask']) 149 | 150 | prediction_inputs = torch.cat(input_ids,dim=0) 151 | prediction_masks = torch.cat(attention_masks,dim=0) 152 | prediction_labels = torch.tensor(labels) 153 | 154 | prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels) 155 | prediction_sampler = SequentialSampler(prediction_data) 156 | prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE) 157 | 158 | model.eval() 159 | predictions,true_labels=[],[] 160 | 161 | for batch in prediction_dataloader: 162 | batch = tuple(t.to(device) for t in batch) 163 | b_input_ids, b_input_mask, b_labels = batch 164 | 165 | with torch.no_grad(): 166 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 167 | logits = outputs[0] 168 | 169 | logits = logits.detach().cpu().numpy() 170 | label_ids = b_labels.to('cpu').numpy() 171 | 172 | predictions.append(logits) 173 | true_labels.append(label_ids) 174 | 175 | end=time.time() 176 | print('Prediction used {:.2f} seconds'.format(end-begin)) 177 | 178 | flat_predictions = [item for sublist in predictions for item in sublist] 179 | flat_predictions = np.argmax(flat_predictions, axis=1).flatten() 180 | flat_true_labels = [item for sublist in true_labels for item in sublist] 181 | 182 | print("Accuracy of {} on API Reviews is: {}".format(m_name, accuracy_score(flat_true_labels,flat_predictions))) 183 | 184 | print(classification_report(flat_true_labels,flat_predictions)) -------------------------------------------------------------------------------- /scripts/PTM/app.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from utils import * 3 | from transformers import BertTokenizer, BertModel, BertForSequenceClassification 4 | from transformers import XLNetTokenizer, XLNetForSequenceClassification 5 | from transformers import RobertaTokenizer, RobertaForSequenceClassification 6 | from transformers import AlbertTokenizer, AlbertForSequenceClassification 7 | import argparse 8 | 9 | # Model | Tokenizer | Pretrained weights shortcut 10 | MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'), 11 | (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'), 12 | (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'), 13 | (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1') 14 | ] 15 | 16 | MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert'] 17 | 18 | seed_torch(20200209) 19 | ## Read model name 20 | parser = argparse.ArgumentParser(description='Choose the models.') 21 | 22 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 23 | help='Enter an integer... 
0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 24 | 25 | 26 | args = parser.parse_args() 27 | m_num=args.model_num 28 | 29 | cur_model=MODELS[m_num] 30 | m_name=MODEL_NAMES[m_num] 31 | 32 | train_df=pd.read_pickle(app_train) 33 | train_df['label']=train_df['label'].replace(-1, 2) 34 | # negative: 2 35 | 36 | # tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=True) 37 | tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) 38 | 39 | sentences=train_df.sentence.values 40 | labels=train_df.label.values 41 | 42 | # max_len = 0 43 | # for sent in sentences: 44 | # input_ids=tokenizer.encode(sent, add_special_tokens=True) 45 | # max_len=max(max_len, len(input_ids)) 46 | # print('Max sentence length: ', max_len) 47 | 48 | input_ids = [] 49 | attention_masks = [] 50 | 51 | for sent in sentences: 52 | 53 | encoded_dict = tokenizer.encode_plus( 54 | str(sent), 55 | add_special_tokens = True, 56 | max_length = MAX_LEN, 57 | pad_to_max_length = True, 58 | return_attention_mask = True, 59 | return_tensors = 'pt' 60 | ) 61 | 62 | input_ids.append(encoded_dict['input_ids']) 63 | attention_masks.append(encoded_dict['attention_mask']) 64 | 65 | 66 | train_inputs = torch.cat(input_ids, dim=0) 67 | train_masks = torch.cat(attention_masks, dim=0) 68 | train_labels = torch.tensor(labels) 69 | 70 | print('Training data {} {} {}'.format(train_inputs.shape, train_masks.shape, train_labels.shape)) 71 | 72 | train_data = TensorDataset(train_inputs, train_masks, train_labels) 73 | train_sampler = RandomSampler(train_data) 74 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) 75 | 76 | # Train Model 77 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 78 | model.cuda() 79 | 80 | param_optimizer = list(model.named_parameters()) 81 | no_decay = ['bias', 'gamma', 'beta'] 82 | optimizer_grouped_parameters = [ 83 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 84 | 'weight_decay_rate': 0.01}, 85 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 86 | 'weight_decay_rate': 0.0} 87 | ] 88 | 89 | optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE) 90 | 91 | begin=time.time() 92 | train_loss_set = [] 93 | 94 | for _ in trange(EPOCHS, desc="Epoch"): 95 | 96 | model.train() 97 | 98 | tr_loss = 0 99 | nb_tr_examples, nb_tr_steps = 0, 0 100 | 101 | for step, batch in enumerate(train_dataloader): 102 | 103 | batch = tuple(t.to(device) for t in batch) 104 | 105 | b_input_ids, b_input_mask, b_labels = batch 106 | optimizer.zero_grad() 107 | 108 | # Forward pass 109 | outputs = model(b_input_ids, token_type_ids=None, \ 110 | attention_mask=b_input_mask, labels=b_labels) 111 | loss = outputs[0] 112 | logits = outputs[1] 113 | train_loss_set.append(loss.item()) 114 | 115 | # Backward pass 116 | loss.backward() 117 | optimizer.step() 118 | 119 | tr_loss += loss.item() 120 | nb_tr_examples += b_input_ids.size(0) 121 | nb_tr_steps += 1 122 | 123 | print("Train loss: {}".format(tr_loss/nb_tr_steps)) 124 | 125 | end=time.time() 126 | print('Training used {:.2f} second'.format(end-begin)) 127 | 128 | ### Test 129 | begin=time.time() 130 | test_df=pd.read_pickle(app_test) 131 | test_df['label']=test_df['label'].replace(-1,2) 132 | 133 | sentences=test_df.sentence.values 134 | labels = test_df.label.values 135 | 136 | input_ids = [] 137 | attention_masks = [] 138 | 139 | for sent in sentences: 140 | encoded_dict = tokenizer.encode_plus( 141 | str(sent), 142 | 
add_special_tokens = True, 143 | max_length = MAX_LEN, 144 | pad_to_max_length = True, 145 | return_attention_mask = True, 146 | return_tensors = 'pt', 147 | ) 148 | 149 | input_ids.append(encoded_dict['input_ids']) 150 | attention_masks.append(encoded_dict['attention_mask']) 151 | 152 | prediction_inputs = torch.cat(input_ids,dim=0) 153 | prediction_masks = torch.cat(attention_masks,dim=0) 154 | prediction_labels = torch.tensor(labels) 155 | 156 | prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels) 157 | prediction_sampler = SequentialSampler(prediction_data) 158 | prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE) 159 | 160 | model.eval() 161 | predictions,true_labels=[],[] 162 | 163 | for batch in prediction_dataloader: 164 | batch = tuple(t.to(device) for t in batch) 165 | b_input_ids, b_input_mask, b_labels = batch 166 | 167 | with torch.no_grad(): 168 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 169 | logits = outputs[0] 170 | 171 | logits = logits.detach().cpu().numpy() 172 | label_ids = b_labels.to('cpu').numpy() 173 | 174 | predictions.append(logits) 175 | true_labels.append(label_ids) 176 | 177 | end=time.time() 178 | print('Prediction used {:.2f} seconds'.format(end-begin)) 179 | 180 | flat_predictions = [item for sublist in predictions for item in sublist] 181 | flat_predictions = np.argmax(flat_predictions, axis=1).flatten() 182 | flat_true_labels = [item for sublist in true_labels for item in sublist] 183 | 184 | print("Accuracy of {} on APP Reviews is: {}".format(m_name, accuracy_score(flat_true_labels,flat_predictions))) 185 | print(classification_report(flat_true_labels,flat_predictions)) -------------------------------------------------------------------------------- /scripts/PTM/cr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from utils import * 3 | from transformers import BertTokenizer, BertModel, BertForSequenceClassification 4 | from transformers import XLNetTokenizer, XLNetForSequenceClassification 5 | from transformers import RobertaTokenizer, RobertaForSequenceClassification 6 | from transformers import AlbertTokenizer, AlbertForSequenceClassification 7 | import argparse 8 | 9 | # Model | Tokenizer | Pretrained weights shortcut 10 | MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'), 11 | (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'), 12 | (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'), 13 | (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1') 14 | ] 15 | 16 | MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert'] 17 | 18 | seed_torch(42) 19 | 20 | ## Read model name 21 | parser = argparse.ArgumentParser(description='Choose the models.') 22 | 23 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 24 | help='Enter an integer... 
0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 25 | 26 | 27 | args = parser.parse_args() 28 | m_num=args.model_num 29 | 30 | cur_model=MODELS[m_num] 31 | m_name=MODEL_NAMES[m_num] 32 | 33 | train_df=pd.read_pickle(cr_train) 34 | 35 | # 0: non-negative, 1: negative 36 | train_df['label']=train_df['label'].replace(-1, 1) 37 | 38 | tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) 39 | 40 | sentences=train_df.sentence.values 41 | labels=train_df.label.values 42 | 43 | # max_len = 0 44 | # for sent in sentences: 45 | # input_ids=tokenizer.encode(sent, add_special_tokens=True) 46 | # max_len=max(max_len, len(input_ids)) 47 | # print('Max sentence length: ', max_len) 48 | 49 | input_ids = [] 50 | attention_masks = [] 51 | 52 | for sent in sentences: 53 | 54 | encoded_dict = tokenizer.encode_plus( 55 | str(sent), 56 | add_special_tokens = True, 57 | max_length = MAX_LEN, 58 | pad_to_max_length = True, 59 | return_attention_mask = True, 60 | return_tensors = 'pt' 61 | ) 62 | 63 | input_ids.append(encoded_dict['input_ids']) 64 | attention_masks.append(encoded_dict['attention_mask']) 65 | 66 | 67 | train_inputs = torch.cat(input_ids, dim=0) 68 | train_masks = torch.cat(attention_masks, dim=0) 69 | train_labels = torch.tensor(labels) 70 | 71 | print('Training data {} {} {}'.format(train_inputs.shape, train_masks.shape, train_labels.shape)) 72 | 73 | train_data = TensorDataset(train_inputs, train_masks, train_labels) 74 | train_sampler = RandomSampler(train_data) 75 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) 76 | 77 | # Train Model 78 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 79 | model.cuda() 80 | 81 | param_optimizer = list(model.named_parameters()) 82 | no_decay = ['bias', 'gamma', 'beta'] 83 | optimizer_grouped_parameters = [ 84 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 85 | 'weight_decay_rate': 0.01}, 86 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 87 | 'weight_decay_rate': 0.0} 88 | ] 89 | 90 | optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE) 91 | 92 | begin=time.time() 93 | train_loss_set = [] 94 | 95 | for _ in trange(EPOCHS, desc="Epoch"): 96 | 97 | model.train() 98 | 99 | tr_loss = 0 100 | nb_tr_examples, nb_tr_steps = 0, 0 101 | 102 | for step, batch in enumerate(train_dataloader): 103 | 104 | batch = tuple(t.to(device) for t in batch) 105 | 106 | b_input_ids, b_input_mask, b_labels = batch 107 | optimizer.zero_grad() 108 | 109 | # Forward pass 110 | outputs = model(b_input_ids, token_type_ids=None, \ 111 | attention_mask=b_input_mask, labels=b_labels) 112 | loss = outputs[0] 113 | logits = outputs[1] 114 | train_loss_set.append(loss.item()) 115 | 116 | # Backward pass 117 | loss.backward() 118 | optimizer.step() 119 | 120 | tr_loss += loss.item() 121 | nb_tr_examples += b_input_ids.size(0) 122 | nb_tr_steps += 1 123 | 124 | print("Train loss: {}".format(tr_loss/nb_tr_steps)) 125 | 126 | end=time.time() 127 | print('Training used {} second'.format(end-begin)) 128 | 129 | begin=time.time() 130 | # 0: non-negative, 1: negative 131 | test_df=pd.read_pickle(cr_test) 132 | test_df['label']=test_df['label'].replace(-1, 1) 133 | 134 | sentences=test_df.sentence.values 135 | labels = test_df.label.values 136 | 137 | input_ids = [] 138 | attention_masks = [] 139 | 140 | for sent in sentences: 141 | encoded_dict = tokenizer.encode_plus( 142 | str(sent), 143 | add_special_tokens = True, 144 | max_length = 
MAX_LEN, 145 | pad_to_max_length = True, 146 | return_attention_mask = True, 147 | return_tensors = 'pt' 148 | ) 149 | 150 | input_ids.append(encoded_dict['input_ids']) 151 | attention_masks.append(encoded_dict['attention_mask']) 152 | 153 | prediction_inputs = torch.cat(input_ids,dim=0) 154 | prediction_masks = torch.cat(attention_masks,dim=0) 155 | prediction_labels = torch.tensor(labels) 156 | 157 | prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels) 158 | prediction_sampler = SequentialSampler(prediction_data) 159 | prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE) 160 | 161 | model.eval() 162 | predictions,true_labels=[],[] 163 | 164 | for batch in prediction_dataloader: 165 | batch = tuple(t.to(device) for t in batch) 166 | b_input_ids, b_input_mask, b_labels = batch 167 | 168 | with torch.no_grad(): 169 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 170 | logits = outputs[0] 171 | 172 | logits = logits.detach().cpu().numpy() 173 | label_ids = b_labels.to('cpu').numpy() 174 | 175 | predictions.append(logits) 176 | true_labels.append(label_ids) 177 | 178 | end=time.time() 179 | print('Prediction used {:.2f} seconds'.format(end - begin)) 180 | 181 | flat_predictions = [item for sublist in predictions for item in sublist] 182 | flat_predictions = np.argmax(flat_predictions, axis=1).flatten() 183 | flat_true_labels = [item for sublist in true_labels for item in sublist] 184 | 185 | print("Accuracy of {} on Code Reviews is: {}".format(m_name, accuracy_score(flat_true_labels,flat_predictions))) 186 | print(classification_report(flat_true_labels,flat_predictions)) -------------------------------------------------------------------------------- /scripts/PTM/early-stopping/api.py: -------------------------------------------------------------------------------- 1 | # Created by happygirlzt 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | sys.path.append('/media/DATA/tingzhang-data/sa4se/scripts') 5 | 6 | from utils import * 7 | from sklearn.model_selection import train_test_split 8 | import argparse 9 | import pprint 10 | import math 11 | from transformers import AdamW 12 | from ignite.engine import Engine, Events 13 | from ignite.metrics import Accuracy, Loss, RunningAverage, Precision, Recall 14 | from ignite.handlers import Checkpoint, DiskSaver, EarlyStopping 15 | from ignite.contrib.handlers import ProgressBar 16 | 17 | import logging 18 | logging.basicConfig(level=logging.ERROR) 19 | 20 | ## Read model name and project name 21 | parser = argparse.ArgumentParser(description='Choose the models.') 22 | 23 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 24 | help='Enter an integer... 0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 25 | 26 | parser.add_argument('-r', '--re_run', default=0, type=int, nargs='?', 27 | help='Enter an integer... 
0-re-run the saved model, 1-run new model; default: 0') 28 | 29 | args = parser.parse_args() 30 | #print(args.model_num) 31 | #print(args.project_num) 32 | 33 | m_num=args.model_num 34 | rerun_flag=bool(args.re_run) 35 | 36 | # Generate training, validation and test set 37 | data_folder=Path('../data/') 38 | 39 | cur_model=MODELS[m_num] 40 | m_name=MODEL_NAMES[m_num] 41 | 42 | print('Running model {} in API reviews'.format(m_name)) 43 | 44 | #### Read data 45 | train_data=pd.read_pickle(data_folder/'api-train.pkl') 46 | train_data['label']=train_data['label'].replace(-1, 2) 47 | 48 | X_train=train_data['sentence'] 49 | y_train=train_data['label'] 50 | 51 | test_data=pd.read_pickle(data_folder/'api-test.pkl') 52 | test_data['label']=test_data['label'].replace(-1, 2) 53 | 54 | X_test=test_data['sentence'] 55 | y_test=test_data['label'] 56 | print('Read success!') 57 | 58 | # pred_iterator=get_iterator(X_test, y_test, cur_model, False) 59 | 60 | prediction_dataloader=get_dataloader(X_test, y_test, cur_model, False) 61 | 62 | # print('Training set is {}\nValidation set is {}\nTest set is {}'.format(len(train_dataloader.dataset), len(validation_dataloader.dataset), len(prediction_dataloader.dataset))) 63 | 64 | if rerun_flag: 65 | X_train, X_validation, y_train, y_validation = train_test_split(X_train, 66 | y_train, 67 | test_size=0.05, 68 | random_state=SEED, 69 | stratify=y_train) 70 | 71 | #train_dataloader=get_dataloader(X_train, y_train,cur_model,True) 72 | #validation_dataloader=get_dataloader(X_validation, y_validation,cur_model,False) 73 | 74 | train_iterator=get_iterator(X_train, y_train, cur_model, True) 75 | valid_iterator=get_iterator(X_validation, y_validation, cur_model, False) 76 | 77 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 78 | model.cuda() 79 | 80 | optimizer = AdamW(model.parameters(), 81 | lr=LEARNING_RATE, 82 | eps=EPS, 83 | weight_decay=WEIGHT_DECAY) 84 | 85 | #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.8) # 5e-5 * 0.8 = 4e-5 86 | 87 | def process_function(engine, batch): 88 | model.train() 89 | optimizer.zero_grad() 90 | 91 | b_input_ids = batch.INPUT_IDS 92 | b_input_mask = batch.ATTENTION_MASKS 93 | b_labels = batch.LABEL 94 | 95 | 96 | outputs = model(b_input_ids, 97 | token_type_ids=None, 98 | attention_mask=b_input_mask, 99 | labels=b_labels) 100 | 101 | loss = outputs[0] 102 | logits = outputs[1] 103 | 104 | loss.backward() 105 | optimizer.step() 106 | #scheduler.step() 107 | return loss.item() 108 | 109 | def eval_function(engine, batch): 110 | model.eval() 111 | with torch.no_grad(): 112 | b_input_ids = batch.INPUT_IDS 113 | b_input_mask = batch.ATTENTION_MASKS 114 | b_labels = batch.LABEL 115 | 116 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 117 | #logits = outputs[0] 118 | y_pred=outputs[0] 119 | 120 | return y_pred, b_labels 121 | 122 | trainer = Engine(process_function) 123 | train_evaluator = Engine(eval_function) 124 | validation_evaluator = Engine(eval_function) 125 | 126 | #print('success!') 127 | #### Metrics 128 | RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss') 129 | 130 | def output_transform_fun(output): 131 | y_pred, y = output 132 | y_pred=y_pred.detach().cpu().numpy() 133 | y=y.to('cpu').numpy() 134 | y_pred=np.argmax(y_pred, axis=1).flatten() 135 | return torch.from_numpy(y_pred), torch.from_numpy(y) 136 | 137 | criterion = nn.CrossEntropyLoss() 138 | ### Training 139 | 
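# How the handlers below cooperate (descriptive note only, no new behavior):
# `trainer` wraps process_function, so every iteration performs one optimizer
# step and returns the scalar loss, which RunningAverage exposes as the
# 'loss' metric shown by the ProgressBar. The two evaluator engines re-run
# eval_function under torch.no_grad() and return (logits, labels) pairs, and
# each Loss(criterion) attached below turns those pairs into a
# 'cross-entropy' metric. EarlyStopping watches the validation
# 'cross-entropy' through score_function_loss; the loss is negated there
# because ignite treats a larger score as an improvement. With patience=2,
# training stops after two validation runs without improvement, and the
# Checkpoint handler reuses the same score to keep the best model on disk.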
#Accuracy(output_transform=output_transform_fun).attach(train_evaluator, 'accuracy') 140 | Loss(criterion).attach(train_evaluator, 'cross-entropy') 141 | 142 | #precision = Precision(output_transform=output_transform_fun, average=False) 143 | #.detach().cpu().numpy() 144 | #recall = Recall(output_transform=output_transform_fun, average=False) 145 | #.detach().cpu().numpy() 146 | #F1 = (precision * recall * 2) / (precision + recall) 147 | 148 | #precision.attach(train_evaluator, 'precision') 149 | #recall.attach(train_evaluator, 'recall') 150 | #F1.attach(train_evaluator, 'F1') 151 | 152 | ### Validation 153 | #Accuracy(output_transform=output_transform_fun).attach(validation_evaluator, 'accuracy') 154 | Loss(criterion).attach(validation_evaluator, 'cross-entropy') 155 | 156 | #precision.attach(validation_evaluator, 'precision') 157 | #recall.attach(validation_evaluator, 'recall') 158 | #F1.attach(validation_evaluator, 'F1') 159 | 160 | #### Progress Bar 161 | pbar = ProgressBar(persist=True, bar_format="") 162 | pbar.attach(trainer, ['loss']) 163 | 164 | def score_function_loss(engine): 165 | val_loss = engine.state.metrics['cross-entropy'] 166 | return -val_loss 167 | 168 | def score_function_f1(engine): 169 | val_f1 = engine.state.metrics['F1'] 170 | if math.isnan(val_f1): 171 | return -9999 172 | return val_f1 173 | 174 | handler = EarlyStopping(patience=2, score_function=score_function_loss, trainer=trainer) 175 | 176 | validation_evaluator.add_event_handler(Events.COMPLETED, handler) 177 | 178 | def log_training_results(engine): 179 | train_evaluator.run(train_iterator) 180 | metrics = train_evaluator.state.metrics 181 | pbar.log_message( 182 | "Training Results - Epoch: {} \nMetrics\n{}" 183 | .format(engine.state.epoch, pprint.pformat(metrics))) 184 | 185 | def log_validation_results(engine): 186 | validation_evaluator.run(valid_iterator) 187 | metrics = validation_evaluator.state.metrics 188 | pbar.log_message( 189 | "Validation Results - Epoch: {} \nMetrics\n{}" 190 | .format(engine.state.epoch, pprint.pformat(metrics))) 191 | pbar.n = pbar.last_print_n = 0 192 | 193 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results) 194 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results) 195 | 196 | #### Checkpoint 197 | 198 | # to_save = {'{}_{}'.format(p_name, m_name): model, 199 | # 'optimizer': optimizer, 200 | # 'lr_scheduler': scheduler 201 | # } 202 | 203 | to_save={'api_{}'.format(m_name): model} 204 | 205 | cp_handler = Checkpoint(to_save, 206 | DiskSaver('../models/', 207 | create_dir=True, require_empty=False), 208 | filename_prefix='best', 209 | score_function=score_function_loss, 210 | score_name='val_loss') 211 | 212 | validation_evaluator.add_event_handler(Events.COMPLETED, cp_handler) 213 | #trainer.add_event_handler(Events.ITERATION_COMPLETED(every=1000), cp_handler) 214 | 215 | # checkpointer = ModelCheckpoint('../models/', '{}'.format(p_name), create_dir=True, save_as_state_dict=True, require_empty=False) 216 | 217 | # trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan()) 218 | trainer.run(train_iterator, max_epochs=4) 219 | else: 220 | print('Running saved model...') 221 | #run_on_test(cur_model, p_name, m_name, pred_iterator) 222 | run_saved_model(prediction_dataloader, cur_model, 'api', m_name) -------------------------------------------------------------------------------- /scripts/PTM/early-stopping/app.py: -------------------------------------------------------------------------------- 1 | # Created by
happygirlzt 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | sys.path.append('/media/DATA/tingzhang-data/sa4se/scripts') 5 | 6 | from utils import * 7 | from sklearn.model_selection import train_test_split 8 | import argparse 9 | import pprint 10 | import math 11 | from transformers import AdamW 12 | from ignite.engine import Engine, Events 13 | from ignite.metrics import Accuracy, Loss, RunningAverage, Precision, Recall 14 | from ignite.handlers import Checkpoint, DiskSaver, EarlyStopping 15 | from ignite.contrib.handlers import ProgressBar 16 | 17 | import logging 18 | logging.basicConfig(level=logging.ERROR) 19 | 20 | ## Read model name and project name 21 | parser = argparse.ArgumentParser(description='Choose the models.') 22 | 23 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 24 | help='Enter an integer... 0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 25 | 26 | parser.add_argument('-r', '--re_run', default=0, type=int, nargs='?', 27 | help='Enter an integer... 0-re-run the saved model, 1-run new model; default: 0') 28 | 29 | args = parser.parse_args() 30 | #print(args.model_num) 31 | #print(args.project_num) 32 | 33 | m_num=args.model_num 34 | rerun_flag=bool(args.re_run) 35 | 36 | # Generate training, validation and test set 37 | data_folder=Path('../data/') 38 | 39 | cur_model=MODELS[m_num] 40 | m_name=MODEL_NAMES[m_num] 41 | 42 | print('Running model {} in App reviews'.format(m_name)) 43 | 44 | #### Read data 45 | train_data=pd.read_pickle(data_folder/'app-train.pkl') 46 | train_data['label']=train_data['label'].replace(-1, 2) 47 | 48 | X_train=train_data['sentence'] 49 | y_train=train_data['label'] 50 | 51 | test_data=pd.read_pickle(data_folder/'app-test.pkl') 52 | test_data['label']=test_data['label'].replace(-1, 2) 53 | 54 | X_test=test_data['sentence'] 55 | y_test=test_data['label'] 56 | print('Read success!') 57 | 58 | # pred_iterator=get_iterator(X_test, y_test, cur_model, False) 59 | 60 | prediction_dataloader=get_dataloader(X_test, y_test, cur_model, False) 61 | 62 | # print('Training set is {}\nValidation set is {}\nTest set is {}'.format(len(train_dataloader.dataset), len(validation_dataloader.dataset), len(prediction_dataloader.dataset))) 63 | 64 | if rerun_flag: 65 | X_train, X_validation, y_train, y_validation = train_test_split(X_train, 66 | y_train, 67 | test_size=0.05, 68 | random_state=SEED, 69 | stratify=y_train) 70 | 71 | #train_dataloader=get_dataloader(X_train, y_train,cur_model,True) 72 | #validation_dataloader=get_dataloader(X_validation, y_validation,cur_model,False) 73 | 74 | train_iterator=get_iterator(X_train, y_train, cur_model, True) 75 | valid_iterator=get_iterator(X_validation, y_validation, cur_model, False) 76 | 77 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 78 | model.cuda() 79 | 80 | optimizer = AdamW(model.parameters(), 81 | lr=LEARNING_RATE, 82 | eps=EPS, 83 | weight_decay=WEIGHT_DECAY) 84 | 85 | #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.8) # 5e-5 * 0.8 = 4e-5 86 | 87 | def process_function(engine, batch): 88 | model.train() 89 | optimizer.zero_grad() 90 | 91 | b_input_ids = batch.INPUT_IDS 92 | b_input_mask = batch.ATTENTION_MASKS 93 | b_labels = batch.LABEL 94 | 95 | 96 | outputs = model(b_input_ids, 97 | token_type_ids=None, 98 | attention_mask=b_input_mask, 99 | labels=b_labels) 100 | 101 | loss = outputs[0] 102 | logits = outputs[1] 103 | 104 | loss.backward() 105 | optimizer.step() 106 | #scheduler.step() 107 | return loss.item() 108 | 109 | def 
eval_function(engine, batch): 110 | model.eval() 111 | with torch.no_grad(): 112 | b_input_ids = batch.INPUT_IDS 113 | b_input_mask = batch.ATTENTION_MASKS 114 | b_labels = batch.LABEL 115 | 116 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 117 | #logits = outputs[0] 118 | y_pred=outputs[0] 119 | 120 | return y_pred, b_labels 121 | 122 | trainer = Engine(process_function) 123 | train_evaluator = Engine(eval_function) 124 | validation_evaluator = Engine(eval_function) 125 | 126 | #print('success!') 127 | #### Metrics 128 | RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss') 129 | 130 | def output_transform_fun(output): 131 | y_pred, y = output 132 | y_pred=y_pred.detach().cpu().numpy() 133 | y=y.to('cpu').numpy() 134 | y_pred=np.argmax(y_pred, axis=1).flatten() 135 | return torch.from_numpy(y_pred), torch.from_numpy(y) 136 | 137 | criterion = nn.CrossEntropyLoss() 138 | ### Training 139 | #Accuracy(output_transform=output_transform_fun).attach(train_evaluator, 'accuracy') 140 | Loss(criterion).attach(train_evaluator, 'cross-entropy') 141 | 142 | #precision = Precision(output_transform=output_transform_fun, average=False) 143 | #.detach().cpu().numpy() 144 | #recall = Recall(output_transform=output_transform_fun, average=False) 145 | #.detach().cpu().numpy() 146 | #F1 = (precision * recall * 2) / (precision + recall) 147 | 148 | #precision.attach(train_evaluator, 'precision') 149 | #recall.attach(train_evaluator, 'recall') 150 | #F1.attach(train_evaluator, 'F1') 151 | 152 | ### Validation 153 | #Accuracy(output_transform=output_transform_fun).attach(validation_evaluator, 'accuracy') 154 | Loss(criterion).attach(validation_evaluator, 'cross-entropy') 155 | 156 | #precision.attach(validation_evaluator, 'precision') 157 | #recall.attach(validation_evaluator, 'recall') 158 | #F1.attach(validation_evaluator, 'F1') 159 | 160 | #### Progress Bar 161 | pbar = ProgressBar(persist=True, bar_format="") 162 | pbar.attach(trainer, ['loss']) 163 | 164 | def score_function_loss(engine): 165 | val_loss = engine.state.metrics['cross-entropy'] 166 | return -val_loss 167 | 168 | def score_function_f1(engine): 169 | val_f1 = engine.state.metrics['F1'] 170 | if math.isnan(val_f1): 171 | return -9999 172 | return val_f1 173 | 174 | handler = EarlyStopping(patience=2, score_function=score_function_loss, trainer=trainer) 175 | 176 | validation_evaluator.add_event_handler(Events.COMPLETED, handler) 177 | 178 | def log_training_results(engine): 179 | train_evaluator.run(train_iterator) 180 | metrics = train_evaluator.state.metrics 181 | pbar.log_message( 182 | "Training Results - Epoch: {} \nMetrics\n{}" 183 | .format(engine.state.epoch, pprint.pformat(metrics))) 184 | 185 | def log_validation_results(engine): 186 | validation_evaluator.run(valid_iterator) 187 | metrics = validation_evaluator.state.metrics 188 | pbar.log_message( 189 | "Validation Results - Epoch: {} \nMetrics\n{}" 190 | .format(engine.state.epoch, pprint.pformat(metrics))) 191 | pbar.n = pbar.last_print_n = 0 192 | 193 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results) 194 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results) 195 | 196 | #### Checkpoint 197 | 198 | # to_save = {'{}_{}'.format(p_name, m_name): model, 199 | # 'optimizer': optimizer, 200 | # 'lr_scheduler': scheduler 201 | # } 202 | 203 | to_save={'app_{}'.format(m_name): model} 204 | 205 | cp_handler = Checkpoint(to_save, 206 | DiskSaver('../models/', 207 | create_dir=True, 
require_empty=False), 208 | filename_prefix='best', 209 | score_function=score_function_loss, 210 | score_name='val_loss') 211 | 212 | validation_evaluator.add_event_handler(Events.COMPLETED, cp_handler) 213 | #trainer.add_event_handler(Events.ITERATION_COMPLETED(every=1000), cp_handler) 214 | 215 | # checkpointer = ModelCheckpoint('../models/', '{}'.format(p_name), create_dir=True, save_as_state_dict=True, require_empty=False) 216 | 217 | # trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan()) 218 | trainer.run(train_iterator, max_epochs=4) 219 | else: 220 | print('Running saved model...') 221 | #run_on_test(cur_model, p_name, m_name, pred_iterator) 222 | run_saved_model(prediction_dataloader, cur_model, 'app', m_name) -------------------------------------------------------------------------------- /scripts/PTM/early-stopping/cr.py: -------------------------------------------------------------------------------- 1 | # Created by happygirlzt 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | sys.path.append('/media/DATA/tingzhang-data/sa4se/scripts') 5 | 6 | from utils import * 7 | from sklearn.model_selection import train_test_split 8 | import argparse 9 | import pprint 10 | import math 11 | from transformers import AdamW 12 | from ignite.engine import Engine, Events 13 | from ignite.metrics import Accuracy, Loss, RunningAverage, Precision, Recall 14 | from ignite.handlers import Checkpoint, DiskSaver, EarlyStopping 15 | from ignite.contrib.handlers import ProgressBar 16 | 17 | import logging 18 | logging.basicConfig(level=logging.ERROR) 19 | 20 | ## Read model name and project name 21 | parser = argparse.ArgumentParser(description='Choose the models.') 22 | 23 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 24 | help='Enter an integer... 0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 25 | 26 | parser.add_argument('-r', '--re_run', default=0, type=int, nargs='?', 27 | help='Enter an integer... 
0-re-run the saved model, 1-run new model; default: 0') 28 | 29 | args = parser.parse_args() 30 | #print(args.model_num) 31 | #print(args.project_num) 32 | 33 | m_num=args.model_num 34 | rerun_flag=bool(args.re_run) 35 | 36 | # Generate training, validation and test set 37 | data_folder=Path('../data/') 38 | 39 | cur_model=MODELS[m_num] 40 | m_name=MODEL_NAMES[m_num] 41 | 42 | print('Running model {} in code reviews'.format(m_name)) 43 | 44 | #### Read data 45 | train_data=pd.read_pickle(data_folder/'cr-train.pkl') 46 | train_data['label']=train_data['label'].replace(-1, 1) 47 | 48 | X_train=train_data['sentence'] 49 | y_train=train_data['label'] 50 | 51 | test_data=pd.read_pickle(data_folder/'cr-test.pkl') 52 | test_data['label']=test_data['label'].replace(-1, 1) 53 | 54 | X_test=test_data['sentence'] 55 | y_test=test_data['label'] 56 | print('Read success!') 57 | 58 | # pred_iterator=get_iterator(X_test, y_test, cur_model, False) 59 | 60 | prediction_dataloader=get_dataloader(X_test, y_test, cur_model, False) 61 | 62 | # print('Training set is {}\nValidation set is {}\nTest set is {}'.format(len(train_dataloader.dataset), len(validation_dataloader.dataset), len(prediction_dataloader.dataset))) 63 | 64 | if rerun_flag: 65 | X_train, X_validation, y_train, y_validation = train_test_split(X_train, 66 | y_train, 67 | test_size=0.05, 68 | random_state=SEED, 69 | stratify=y_train) 70 | 71 | #train_dataloader=get_dataloader(X_train, y_train,cur_model,True) 72 | #validation_dataloader=get_dataloader(X_validation, y_validation,cur_model,False) 73 | 74 | train_iterator=get_iterator(X_train, y_train, cur_model, True) 75 | valid_iterator=get_iterator(X_validation, y_validation, cur_model, False) 76 | 77 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 78 | model.cuda() 79 | 80 | optimizer = AdamW(model.parameters(), 81 | lr=LEARNING_RATE, 82 | eps=EPS, 83 | weight_decay=WEIGHT_DECAY) 84 | 85 | #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.8) # 5e-5 * 0.8 = 4e-5 86 | 87 | def process_function(engine, batch): 88 | model.train() 89 | optimizer.zero_grad() 90 | 91 | b_input_ids = batch.INPUT_IDS 92 | b_input_mask = batch.ATTENTION_MASKS 93 | b_labels = batch.LABEL 94 | 95 | 96 | outputs = model(b_input_ids, 97 | token_type_ids=None, 98 | attention_mask=b_input_mask, 99 | labels=b_labels) 100 | 101 | loss = outputs[0] 102 | logits = outputs[1] 103 | 104 | loss.backward() 105 | optimizer.step() 106 | #scheduler.step() 107 | return loss.item() 108 | 109 | def eval_function(engine, batch): 110 | model.eval() 111 | with torch.no_grad(): 112 | b_input_ids = batch.INPUT_IDS 113 | b_input_mask = batch.ATTENTION_MASKS 114 | b_labels = batch.LABEL 115 | 116 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 117 | #logits = outputs[0] 118 | y_pred=outputs[0] 119 | 120 | return y_pred, b_labels 121 | 122 | trainer = Engine(process_function) 123 | train_evaluator = Engine(eval_function) 124 | validation_evaluator = Engine(eval_function) 125 | 126 | #print('success!') 127 | #### Metrics 128 | RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss') 129 | 130 | def output_transform_fun(output): 131 | y_pred, y = output 132 | y_pred=y_pred.detach().cpu().numpy() 133 | y=y.to('cpu').numpy() 134 | y_pred=np.argmax(y_pred, axis=1).flatten() 135 | return torch.from_numpy(y_pred), torch.from_numpy(y) 136 | 137 | criterion = nn.CrossEntropyLoss() 138 | ### Training 139 | 
#Accuracy(output_transform=output_transform_fun).attach(train_evaluator, 'accuracy') 140 | Loss(criterion).attach(train_evaluator, 'cross-entropy') 141 | 142 | #precision = Precision(output_transform=output_transform_fun, average=False) 143 | #.detach().cpu().numpy() 144 | #recall = Recall(output_transform=output_transform_fun, average=False) 145 | #.detach().cpu().numpy() 146 | #F1 = (precision * recall * 2) / (precision + recall) 147 | 148 | #precision.attach(train_evaluator, 'precision') 149 | #recall.attach(train_evaluator, 'recall') 150 | #F1.attach(train_evaluator, 'F1') 151 | 152 | ### Validation 153 | #Accuracy(output_transform=output_transform_fun).attach(validation_evaluator, 'accuracy') 154 | Loss(criterion).attach(validation_evaluator, 'cross-entropy') 155 | 156 | #precision.attach(validation_evaluator, 'precision') 157 | #recall.attach(validation_evaluator, 'recall') 158 | #F1.attach(validation_evaluator, 'F1') 159 | 160 | #### Progress Bar 161 | pbar = ProgressBar(persist=True, bar_format="") 162 | pbar.attach(trainer, ['loss']) 163 | 164 | def score_function_loss(engine): 165 | val_loss = engine.state.metrics['cross-entropy'] 166 | return -val_loss 167 | 168 | def score_function_f1(engine): 169 | val_f1 = engine.state.metrics['F1'] 170 | if math.isnan(val_f1): 171 | return -9999 172 | return val_f1 173 | 174 | handler = EarlyStopping(patience=2, score_function=score_function_loss, trainer=trainer) 175 | 176 | validation_evaluator.add_event_handler(Events.COMPLETED, handler) 177 | 178 | def log_training_results(engine): 179 | train_evaluator.run(train_iterator) 180 | metrics = train_evaluator.state.metrics 181 | pbar.log_message( 182 | "Training Results - Epoch: {} \nMetrics\n{}" 183 | .format(engine.state.epoch, pprint.pformat(metrics))) 184 | 185 | def log_validation_results(engine): 186 | validation_evaluator.run(valid_iterator) 187 | metrics = validation_evaluator.state.metrics 188 | pbar.log_message( 189 | "Validation Results - Epoch: {} \nMetrics\n{}" 190 | .format(engine.state.epoch, pprint.pformat(metrics))) 191 | pbar.n = pbar.last_print_n = 0 192 | 193 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results) 194 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results) 195 | 196 | #### Checkpoint 197 | 198 | # to_save = {'{}_{}'.format(p_name, m_name): model, 199 | # 'optimizer': optimizer, 200 | # 'lr_scheduler': scheduler 201 | # } 202 | 203 | to_save={'cr_{}'.format(m_name): model} 204 | 205 | cp_handler = Checkpoint(to_save, 206 | DiskSaver('../models/', 207 | create_dir=True, require_empty=False), 208 | filename_prefix='best', 209 | score_function=score_function_loss, 210 | score_name='val_loss') 211 | 212 | validation_evaluator.add_event_handler(Events.COMPLETED, cp_handler) 213 | #trainer.add_event_handler(Events.ITERATION_COMPLETED(every=1000), cp_handler) 214 | 215 | # checkpointer = ModelCheckpoint('../models/', '{}'.format(p_name), create_dir=True, save_as_state_dict=True, require_empty=False) 216 | 217 | # trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan()) 218 | trainer.run(train_iterator, max_epochs=4) 219 | else: 220 | print('Running saved model...') 221 | #run_on_test(cur_model, p_name, m_name, pred_iterator) 222 | run_saved_model(prediction_dataloader, cur_model, 'cr', m_name) -------------------------------------------------------------------------------- /scripts/PTM/early-stopping/github.py: -------------------------------------------------------------------------------- 1 | # Created by
happygirlzt 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | sys.path.append('/media/DATA/tingzhang-data/sa4se/scripts') 5 | 6 | from utils import * 7 | from sklearn.model_selection import train_test_split 8 | import argparse 9 | import pprint 10 | import math 11 | from transformers import AdamW 12 | from ignite.engine import Engine, Events 13 | from ignite.metrics import Accuracy, Loss, RunningAverage, Precision, Recall 14 | from ignite.handlers import Checkpoint, DiskSaver, EarlyStopping 15 | from ignite.contrib.handlers import ProgressBar 16 | 17 | import logging 18 | logging.basicConfig(level=logging.ERROR) 19 | 20 | ## Read model name and project name 21 | parser = argparse.ArgumentParser(description='Choose the models.') 22 | 23 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 24 | help='Enter an integer... 0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 25 | 26 | parser.add_argument('-r', '--re_run', default=0, type=int, nargs='?', 27 | help='Enter an integer... 0-re-run the saved model, 1-run new model; default: 0') 28 | 29 | args = parser.parse_args() 30 | #print(args.model_num) 31 | #print(args.project_num) 32 | 33 | m_num=args.model_num 34 | rerun_flag=bool(args.re_run) 35 | 36 | # Generate training, validation and test set 37 | data_folder=Path('../data/') 38 | 39 | cur_model=MODELS[m_num] 40 | m_name=MODEL_NAMES[m_num] 41 | 42 | print('Running model {} in Github'.format(m_name)) 43 | 44 | #### Read data 45 | train_data=pd.read_pickle(data_folder/'gh-train.pkl') 46 | train_data['label']=train_data['label'].replace({'positive':1, 'negative':2, 'neutral':0}) 47 | 48 | X_train=train_data['sentence'] 49 | y_train=train_data['label'] 50 | 51 | test_data=pd.read_pickle(data_folder/'gh-test.pkl') 52 | test_data['label']=test_data['label'].replace({'positive':1, 'negative':2, 'neutral':0}) 53 | 54 | X_test=test_data['sentence'] 55 | y_test=test_data['label'] 56 | print('Read success!') 57 | 58 | # pred_iterator=get_iterator(X_test, y_test, cur_model, False) 59 | 60 | prediction_dataloader=get_dataloader(X_test, y_test, cur_model, False) 61 | 62 | # print('Training set is {}\nValidation set is {}\nTest set is {}'.format(len(train_dataloader.dataset), len(validation_dataloader.dataset), len(prediction_dataloader.dataset))) 63 | 64 | if rerun_flag: 65 | X_train, X_validation, y_train, y_validation = train_test_split(X_train, 66 | y_train, 67 | test_size=0.05, 68 | random_state=SEED, 69 | stratify=y_train) 70 | 71 | #train_dataloader=get_dataloader(X_train, y_train,cur_model,True) 72 | #validation_dataloader=get_dataloader(X_validation, y_validation,cur_model,False) 73 | 74 | train_iterator=get_iterator(X_train, y_train, cur_model, True) 75 | valid_iterator=get_iterator(X_validation, y_validation, cur_model, False) 76 | 77 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 78 | model.cuda() 79 | 80 | optimizer = AdamW(model.parameters(), 81 | lr=LEARNING_RATE, 82 | eps=EPS, 83 | weight_decay=WEIGHT_DECAY) 84 | 85 | #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.8) # 5e-5 * 0.8 = 4e-5 86 | 87 | def process_function(engine, batch): 88 | model.train() 89 | optimizer.zero_grad() 90 | 91 | b_input_ids = batch.INPUT_IDS 92 | b_input_mask = batch.ATTENTION_MASKS 93 | b_labels = batch.LABEL 94 | 95 | 96 | outputs = model(b_input_ids, 97 | token_type_ids=None, 98 | attention_mask=b_input_mask, 99 | labels=b_labels) 100 | 101 | loss = outputs[0] 102 | logits = outputs[1] 103 | 104 | loss.backward() 105 | optimizer.step() 106 | 
#scheduler.step() 107 | return loss.item() 108 | 109 | def eval_function(engine, batch): 110 | model.eval() 111 | with torch.no_grad(): 112 | b_input_ids = batch.INPUT_IDS 113 | b_input_mask = batch.ATTENTION_MASKS 114 | b_labels = batch.LABEL 115 | 116 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 117 | #logits = outputs[0] 118 | y_pred=outputs[0] 119 | 120 | return y_pred, b_labels 121 | 122 | trainer = Engine(process_function) 123 | train_evaluator = Engine(eval_function) 124 | validation_evaluator = Engine(eval_function) 125 | 126 | #print('success!') 127 | #### Metrics 128 | RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss') 129 | 130 | def output_transform_fun(output): 131 | y_pred, y = output 132 | y_pred=y_pred.detach().cpu().numpy() 133 | y=y.to('cpu').numpy() 134 | y_pred=np.argmax(y_pred, axis=1).flatten() 135 | return torch.from_numpy(y_pred), torch.from_numpy(y) 136 | 137 | criterion = nn.CrossEntropyLoss() 138 | ### Training 139 | #Accuracy(output_transform=output_transform_fun).attach(train_evaluator, 'accuracy') 140 | Loss(criterion).attach(train_evaluator, 'cross-entropy') 141 | 142 | #precision = Precision(output_transform=output_transform_fun, average=False) 143 | #.detach().cpu().numpy() 144 | #recall = Recall(output_transform=output_transform_fun, average=False) 145 | #.detach().cpu().numpy() 146 | #F1 = (precision * recall * 2) / (precision + recall) 147 | 148 | #precision.attach(train_evaluator, 'precision') 149 | #recall.attach(train_evaluator, 'recall') 150 | #F1.attach(train_evaluator, 'F1') 151 | 152 | ### Validation 153 | #Accuracy(output_transform=output_transform_fun).attach(validation_evaluator, 'accuracy') 154 | Loss(criterion).attach(validation_evaluator, 'cross-entropy') 155 | 156 | #precision.attach(validation_evaluator, 'precision') 157 | #recall.attach(validation_evaluator, 'recall') 158 | #F1.attach(validation_evaluator, 'F1') 159 | 160 | #### Progress Bar 161 | pbar = ProgressBar(persist=True, bar_format="") 162 | pbar.attach(trainer, ['loss']) 163 | 164 | def score_function_loss(engine): 165 | val_loss = engine.state.metrics['cross-entropy'] 166 | return -val_loss 167 | 168 | def score_function_f1(engine): 169 | val_f1 = engine.state.metrics['F1'] 170 | if math.isnan(val_f1): 171 | return -9999 172 | return val_f1 173 | 174 | handler = EarlyStopping(patience=2, score_function=score_function_loss, trainer=trainer) 175 | 176 | validation_evaluator.add_event_handler(Events.COMPLETED, handler) 177 | 178 | def log_training_results(engine): 179 | train_evaluator.run(train_iterator) 180 | metrics = train_evaluator.state.metrics 181 | pbar.log_message( 182 | "Training Results - Epoch: {} \nMetrics\n{}" 183 | .format(engine.state.epoch, pprint.pformat(metrics))) 184 | 185 | def log_validation_results(engine): 186 | validation_evaluator.run(valid_iterator) 187 | metrics = validation_evaluator.state.metrics 188 | pbar.log_message( 189 | "Validation Results - Epoch: {} \nMetrics\n{}" 190 | .format(engine.state.epoch, pprint.pformat(metrics))) 191 | pbar.n = pbar.last_print_n = 0 192 | 193 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results) 194 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results) 195 | 196 | #### Checkpoint 197 | 198 | # to_save = {'{}_{}'.format(p_name, m_name): model, 199 | # 'optimizer': optimizer, 200 | # 'lr_scheduler': scheduler 201 | # } 202 | 203 | to_save={'gh_{}'.format(m_name): model} 204 | 205 | cp_handler = Checkpoint(to_save, 206 | 
DiskSaver('../models/', 207 | create_dir=True, require_empty=False), 208 | filename_prefix='best', 209 | score_function=score_function_loss, 210 | score_name='val_loss') 211 | 212 | validation_evaluator.add_event_handler(Events.COMPLETED, cp_handler) 213 | #trainer.add_event_handler(Events.ITERATION_COMPLETED(every=1000), cp_handler) 214 | 215 | # checkpointer = ModelCheckpoint('../models/', '{}'.format(p_name), create_dir=True, save_as_state_dict=True, require_empty=False) 216 | 217 | # trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan()) 218 | trainer.run(train_iterator, max_epochs=4) 219 | else: 220 | print('Running saved model...') 221 | #run_on_test(cur_model, p_name, m_name, pred_iterator) 222 | run_saved_model(prediction_dataloader, cur_model, 'gh', m_name) -------------------------------------------------------------------------------- /scripts/PTM/early-stopping/jira.py: -------------------------------------------------------------------------------- 1 | # Created by happygirlzt 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | sys.path.append('/media/DATA/tingzhang-data/sa4se/scripts') 5 | 6 | from utils import * 7 | from sklearn.model_selection import train_test_split 8 | import argparse 9 | import pprint 10 | import math 11 | from transformers import AdamW 12 | from ignite.engine import Engine, Events 13 | from ignite.metrics import Accuracy, Loss, RunningAverage, Precision, Recall 14 | from ignite.handlers import Checkpoint, DiskSaver, EarlyStopping 15 | from ignite.contrib.handlers import ProgressBar 16 | 17 | import logging 18 | logging.basicConfig(level=logging.ERROR) 19 | 20 | ## Read model name and project name 21 | parser = argparse.ArgumentParser(description='Choose the models.') 22 | 23 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 24 | help='Enter an integer... 0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 25 | 26 | parser.add_argument('-r', '--re_run', default=0, type=int, nargs='?', 27 | help='Enter an integer... 
0-re-run the saved model, 1-run new model; default: 0') 28 | 29 | args = parser.parse_args() 30 | #print(args.model_num) 31 | #print(args.project_num) 32 | 33 | m_num=args.model_num 34 | rerun_flag=bool(args.re_run) 35 | 36 | # Generate training, validation and test set 37 | data_folder=Path('../data/') 38 | 39 | cur_model=MODELS[m_num] 40 | m_name=MODEL_NAMES[m_num] 41 | 42 | print('Running model {} in Jira'.format(m_name)) 43 | 44 | #### Read data 45 | train_data=pd.read_pickle(data_folder/'jira-train.pkl') 46 | train_data['label']=train_data['label'].replace(-1, 0) 47 | 48 | X_train=train_data['sentence'] 49 | y_train=train_data['label'] 50 | 51 | test_data=pd.read_pickle(data_folder/'jira-test.pkl') 52 | test_data['label']=test_data['label'].replace(-1, 0) 53 | 54 | X_test=test_data['sentence'] 55 | y_test=test_data['label'] 56 | print('Read success!') 57 | 58 | # pred_iterator=get_iterator(X_test, y_test, cur_model, False) 59 | 60 | prediction_dataloader=get_dataloader(X_test, y_test, cur_model, False) 61 | 62 | # print('Training set is {}\nValidation set is {}\nTest set is {}'.format(len(train_dataloader.dataset), len(validation_dataloader.dataset), len(prediction_dataloader.dataset))) 63 | 64 | if rerun_flag: 65 | X_train, X_validation, y_train, y_validation = train_test_split(X_train, 66 | y_train, 67 | test_size=0.05, 68 | random_state=SEED, 69 | stratify=y_train) 70 | 71 | #train_dataloader=get_dataloader(X_train, y_train,cur_model,True) 72 | #validation_dataloader=get_dataloader(X_validation, y_validation,cur_model,False) 73 | 74 | train_iterator=get_iterator(X_train, y_train, cur_model, True) 75 | valid_iterator=get_iterator(X_validation, y_validation, cur_model, False) 76 | 77 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 78 | model.cuda() 79 | 80 | optimizer = AdamW(model.parameters(), 81 | lr=LEARNING_RATE, 82 | eps=EPS, 83 | weight_decay=WEIGHT_DECAY) 84 | 85 | #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.8) # 5e-5 * 0.8 = 4e-5 86 | 87 | def process_function(engine, batch): 88 | model.train() 89 | optimizer.zero_grad() 90 | 91 | b_input_ids = batch.INPUT_IDS 92 | b_input_mask = batch.ATTENTION_MASKS 93 | b_labels = batch.LABEL 94 | 95 | 96 | outputs = model(b_input_ids, 97 | token_type_ids=None, 98 | attention_mask=b_input_mask, 99 | labels=b_labels) 100 | 101 | loss = outputs[0] 102 | logits = outputs[1] 103 | 104 | loss.backward() 105 | optimizer.step() 106 | #scheduler.step() 107 | return loss.item() 108 | 109 | def eval_function(engine, batch): 110 | model.eval() 111 | with torch.no_grad(): 112 | b_input_ids = batch.INPUT_IDS 113 | b_input_mask = batch.ATTENTION_MASKS 114 | b_labels = batch.LABEL 115 | 116 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 117 | #logits = outputs[0] 118 | y_pred=outputs[0] 119 | 120 | return y_pred, b_labels 121 | 122 | trainer = Engine(process_function) 123 | train_evaluator = Engine(eval_function) 124 | validation_evaluator = Engine(eval_function) 125 | 126 | #print('success!') 127 | #### Metrics 128 | RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss') 129 | 130 | def output_transform_fun(output): 131 | y_pred, y = output 132 | y_pred=y_pred.detach().cpu().numpy() 133 | y=y.to('cpu').numpy() 134 | y_pred=np.argmax(y_pred, axis=1).flatten() 135 | return torch.from_numpy(y_pred), torch.from_numpy(y) 136 | 137 | criterion = nn.CrossEntropyLoss() 138 | ### Training 139 | 
#Accuracy(output_transform=output_transform_fun).attach(train_evaluator, 'accuracy') 140 | Loss(criterion).attach(train_evaluator, 'cross-entropy') 141 | 142 | #precision = Precision(output_transform=output_transform_fun, average=False) 143 | #.detach().cpu().numpy() 144 | #recall = Recall(output_transform=output_transform_fun, average=False) 145 | #.detach().cpu().numpy() 146 | #F1 = (precision * recall * 2) / (precision + recall) 147 | 148 | #precision.attach(train_evaluator, 'precision') 149 | #recall.attach(train_evaluator, 'recall') 150 | #F1.attach(train_evaluator, 'F1') 151 | 152 | ### Validation 153 | #Accuracy(output_transform=output_transform_fun).attach(validation_evaluator, 'accuracy') 154 | Loss(criterion).attach(validation_evaluator, 'cross-entropy') 155 | 156 | #precision.attach(validation_evaluator, 'precision') 157 | #recall.attach(validation_evaluator, 'recall') 158 | #F1.attach(validation_evaluator, 'F1') 159 | 160 | #### Progress Bar 161 | pbar = ProgressBar(persist=True, bar_format="") 162 | pbar.attach(trainer, ['loss']) 163 | 164 | def score_function_loss(engine): 165 | val_loss = engine.state.metrics['cross-entropy'] 166 | return -val_loss 167 | 168 | def score_function_f1(engine): 169 | val_f1 = engine.state.metrics['F1'] 170 | if math.isnan(val_f1): 171 | return -9999 172 | return val_f1 173 | 174 | handler = EarlyStopping(patience=2, score_function=score_function_loss, trainer=trainer) 175 | 176 | validation_evaluator.add_event_handler(Events.COMPLETED, handler) 177 | 178 | def log_training_results(engine): 179 | train_evaluator.run(train_iterator) 180 | metrics = train_evaluator.state.metrics 181 | pbar.log_message( 182 | "Training Results - Epoch: {} \nMetrics\n{}" 183 | .format(engine.state.epoch, pprint.pformat(metrics))) 184 | 185 | def log_validation_results(engine): 186 | validation_evaluator.run(valid_iterator) 187 | metrics = validation_evaluator.state.metrics 188 | pbar.log_message( 189 | "Validation Results - Epoch: {} \nMetrics\n{}" 190 | .format(engine.state.epoch, pprint.pformat(metrics))) 191 | pbar.n = pbar.last_print_n = 0 192 | 193 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results) 194 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results) 195 | 196 | #### Checkpoint 197 | 198 | # to_save = {'{}_{}'.format(p_name, m_name): model, 199 | # 'optimizer': optimizer, 200 | # 'lr_scheduler': scheduler 201 | # } 202 | 203 | to_save={'jira_{}'.format(m_name): model} 204 | 205 | cp_handler = Checkpoint(to_save, 206 | DiskSaver('../models/', 207 | create_dir=True, require_empty=False), 208 | filename_prefix='best', 209 | score_function=score_function_loss, 210 | score_name='val_loss') 211 | 212 | validation_evaluator.add_event_handler(Events.COMPLETED, cp_handler) 213 | #trainer.add_event_handler(Events.ITERATION_COMPLETED(every=1000), cp_handler) 214 | 215 | # checkpointer = ModelCheckpoint('../models/', '{}'.format(p_name), create_dir=True, save_as_state_dict=True, require_empty=False) 216 | 217 | # trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan()) 218 | trainer.run(train_iterator, max_epochs=4) 219 | else: 220 | print('Running saved model...') 221 | #run_on_test(cur_model, p_name, m_name, pred_iterator) 222 | run_saved_model(prediction_dataloader, cur_model, 'jira', m_name) -------------------------------------------------------------------------------- /scripts/PTM/early-stopping/so.py: -------------------------------------------------------------------------------- 1 | # Created by
happygirlzt 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | sys.path.append('/media/DATA/tingzhang-data/sa4se/scripts') 5 | 6 | from utils import * 7 | from sklearn.model_selection import train_test_split 8 | import argparse 9 | import pprint 10 | import math 11 | from transformers import AdamW 12 | from ignite.engine import Engine, Events 13 | from ignite.metrics import Accuracy, Loss, RunningAverage, Precision, Recall 14 | from ignite.handlers import Checkpoint, DiskSaver, EarlyStopping 15 | from ignite.contrib.handlers import ProgressBar 16 | 17 | import logging 18 | logging.basicConfig(level=logging.ERROR) 19 | 20 | ## Read model name and project name 21 | parser = argparse.ArgumentParser(description='Choose the models.') 22 | 23 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 24 | help='Enter an integer... 0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 25 | 26 | parser.add_argument('-r', '--re_run', default=0, type=int, nargs='?', 27 | help='Enter an integer... 0-re-run the saved model, 1-run new model; default: 0') 28 | 29 | args = parser.parse_args() 30 | #print(args.model_num) 31 | #print(args.project_num) 32 | 33 | m_num=args.model_num 34 | rerun_flag=bool(args.re_run) 35 | 36 | # Generate training, validation and test set 37 | data_folder=Path('../data/') 38 | 39 | cur_model=MODELS[m_num] 40 | m_name=MODEL_NAMES[m_num] 41 | 42 | print('Running model {} in Stack Overflow'.format(m_name)) 43 | 44 | #### Read data 45 | train_data=pd.read_pickle(data_folder/'so-train.pkl') 46 | train_data['label']=train_data['label'].replace(-1, 2) 47 | 48 | X_train=train_data['sentence'] 49 | y_train=train_data['label'] 50 | 51 | test_data=pd.read_pickle(data_folder/'so-test.pkl') 52 | test_data['label']=test_data['label'].replace(-1, 2) 53 | 54 | X_test=test_data['sentence'] 55 | y_test=test_data['label'] 56 | print('Read success!') 57 | 58 | # pred_iterator=get_iterator(X_test, y_test, cur_model, False) 59 | 60 | prediction_dataloader=get_dataloader(X_test, y_test, cur_model, False) 61 | 62 | # print('Training set is {}\nValidation set is {}\nTest set is {}'.format(len(train_dataloader.dataset), len(validation_dataloader.dataset), len(prediction_dataloader.dataset))) 63 | 64 | if rerun_flag: 65 | X_train, X_validation, y_train, y_validation = train_test_split(X_train, 66 | y_train, 67 | test_size=0.05, 68 | random_state=SEED, 69 | stratify=y_train) 70 | 71 | #train_dataloader=get_dataloader(X_train, y_train,cur_model,True) 72 | #validation_dataloader=get_dataloader(X_validation, y_validation,cur_model,False) 73 | 74 | train_iterator=get_iterator(X_train, y_train, cur_model, True) 75 | valid_iterator=get_iterator(X_validation, y_validation, cur_model, False) 76 | 77 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 78 | model.cuda() 79 | 80 | optimizer = AdamW(model.parameters(), 81 | lr=LEARNING_RATE, 82 | eps=EPS, 83 | weight_decay=WEIGHT_DECAY) 84 | 85 | #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.8) # 5e-5 * 0.8 = 4e-5 86 | 87 | def process_function(engine, batch): 88 | model.train() 89 | optimizer.zero_grad() 90 | 91 | b_input_ids = batch.INPUT_IDS 92 | b_input_mask = batch.ATTENTION_MASKS 93 | b_labels = batch.LABEL 94 | 95 | 96 | outputs = model(b_input_ids, 97 | token_type_ids=None, 98 | attention_mask=b_input_mask, 99 | labels=b_labels) 100 | 101 | loss = outputs[0] 102 | logits = outputs[1] 103 | 104 | loss.backward() 105 | optimizer.step() 106 | #scheduler.step() 107 | return loss.item() 108 | 109 | def 
eval_function(engine, batch): 110 | model.eval() 111 | with torch.no_grad(): 112 | b_input_ids = batch.INPUT_IDS 113 | b_input_mask = batch.ATTENTION_MASKS 114 | b_labels = batch.LABEL 115 | 116 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 117 | #logits = outputs[0] 118 | y_pred=outputs[0] 119 | 120 | return y_pred, b_labels 121 | 122 | trainer = Engine(process_function) 123 | train_evaluator = Engine(eval_function) 124 | validation_evaluator = Engine(eval_function) 125 | 126 | #print('success!') 127 | #### Metrics 128 | RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss') 129 | 130 | def output_transform_fun(output): 131 | y_pred, y = output 132 | y_pred=y_pred.detach().cpu().numpy() 133 | y=y.to('cpu').numpy() 134 | y_pred=np.argmax(y_pred, axis=1).flatten() 135 | return torch.from_numpy(y_pred), torch.from_numpy(y) 136 | 137 | criterion = nn.CrossEntropyLoss() 138 | ### Training 139 | #Accuracy(output_transform=output_transform_fun).attach(train_evaluator, 'accuracy') 140 | Loss(criterion).attach(train_evaluator, 'cross-entropy') 141 | 142 | #precision = Precision(output_transform=output_transform_fun, average=False) 143 | #.detach().cpu().numpy() 144 | #recall = Recall(output_transform=output_transform_fun, average=False) 145 | #.detach().cpu().numpy() 146 | #F1 = (precision * recall * 2) / (precision + recall) 147 | 148 | #precision.attach(train_evaluator, 'precision') 149 | #recall.attach(train_evaluator, 'recall') 150 | #F1.attach(train_evaluator, 'F1') 151 | 152 | ### Validation 153 | #Accuracy(output_transform=output_transform_fun).attach(validation_evaluator, 'accuracy') 154 | Loss(criterion).attach(validation_evaluator, 'cross-entropy') 155 | 156 | #precision.attach(validation_evaluator, 'precision') 157 | #recall.attach(validation_evaluator, 'recall') 158 | #F1.attach(validation_evaluator, 'F1') 159 | 160 | #### Progress Bar 161 | pbar = ProgressBar(persist=True, bar_format="") 162 | pbar.attach(trainer, ['loss']) 163 | 164 | def score_function_loss(engine): 165 | val_loss = engine.state.metrics['cross-entropy'] 166 | return -val_loss 167 | 168 | def score_function_f1(engine): 169 | val_f1 = engine.state.metrics['F1'] 170 | if math.isnan(val_f1): 171 | return -9999 172 | return val_f1 173 | 174 | handler = EarlyStopping(patience=2, score_function=score_function_loss, trainer=trainer) 175 | 176 | validation_evaluator.add_event_handler(Events.COMPLETED, handler) 177 | 178 | def log_training_results(engine): 179 | train_evaluator.run(train_iterator) 180 | metrics = train_evaluator.state.metrics 181 | pbar.log_message( 182 | "Training Results - Epoch: {} \nMetrics\n{}" 183 | .format(engine.state.epoch, pprint.pformat(metrics))) 184 | 185 | def log_validation_results(engine): 186 | validation_evaluator.run(valid_iterator) 187 | metrics = validation_evaluator.state.metrics 188 | pbar.log_message( 189 | "Validation Results - Epoch: {} \nMetrics\n{}" 190 | .format(engine.state.epoch, pprint.pformat(metrics))) 191 | pbar.n = pbar.last_print_n = 0 192 | 193 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results) 194 | trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results) 195 | 196 | #### Checkpoint 197 | 198 | # to_save = {'{}_{}'.format(p_name, m_name): model, 199 | # 'optimizer': optimizer, 200 | # 'lr_scheduler': scheduler 201 | # } 202 | 203 | to_save={'so_{}'.format(m_name): model} 204 | 205 | cp_handler = Checkpoint(to_save, 206 | DiskSaver('../models/', 207 | create_dir=True, 
require_empty=False), 208 | filename_prefix='best', 209 | score_function=score_function_loss, 210 | score_name='val_loss') 211 | 212 | validation_evaluator.add_event_handler(Events.COMPLETED, cp_handler) 213 | #trainer.add_event_handler(Events.ITERATION_COMPLETED(every=1000), cp_handler) 214 | 215 | # checkpointer = ModelCheckpoint('../models/', '{}'.format(p_name), create_dir=True, save_as_state_dict=True, require_empty=False) 216 | 217 | # trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan()) 218 | trainer.run(train_iterator, max_epochs=4) 219 | else: 220 | print('Running saved model...') 221 | #run_on_test(cur_model, p_name, m_name, pred_iterator) 222 | run_saved_model(prediction_dataloader, cur_model, 'so', m_name) -------------------------------------------------------------------------------- /scripts/PTM/early-stopping/utils.py: -------------------------------------------------------------------------------- 1 | # Created by happygirlzt 2 | import torch 3 | import torch.nn as nn 4 | from transformers import BertTokenizer, BertForSequenceClassification 5 | from transformers import DistilBertTokenizer, DistilBertForSequenceClassification 6 | from transformers import XLNetTokenizer, XLNetForSequenceClassification 7 | from transformers import RobertaTokenizer, RobertaForSequenceClassification 8 | from transformers import AlbertTokenizer, AlbertForSequenceClassification 9 | 10 | from sklearn.model_selection import train_test_split 11 | from torch.utils.data import TensorDataset, DataLoader 12 | import random 13 | import numpy as np 14 | import pandas as pd 15 | from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score 16 | from pathlib import Path 17 | import re 18 | import torchtext 19 | import glob 20 | from torchtext import data 21 | from torchtext.data import Field 22 | 23 | 24 | if torch.cuda.is_available(): 25 | device = torch.device("cuda") 26 | #print(f'There are {torch.cuda.device_count()} GPU(s) available.') 27 | #print('Device name:', torch.cuda.get_device_name(0)) 28 | 29 | else: 30 | print('No GPU available, using the CPU instead.') 31 | device = torch.device("cpu") 32 | 33 | data_folder=Path('../data/') 34 | model_folder=Path('../models/') 35 | result_folder=Path('../result/') 36 | 37 | # Model | Tokenizer | Pretrained weights shortcut 38 | MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'), 39 | (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'), 40 | (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'), 41 | (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1') 42 | ] 43 | 44 | MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert'] 45 | 46 | ## Parameter settings 47 | BATCH_SIZE=16 48 | LEARNING_RATE=2e-5 49 | MAX_SEQ_LENGTH=256 50 | SEED=42 51 | EPOCHS=4 52 | EPS=1e-8 53 | WEIGHT_DECAY=1e-5 54 | 55 | def seed_torch(seed): 56 | random.seed(seed) 57 | np.random.seed(seed) 58 | torch.manual_seed(seed) 59 | torch.cuda.manual_seed(seed) 60 | torch.backends.cudnn.deterministic=True 61 | 62 | seed_torch(SEED) 63 | 64 | 65 | def get_dataloader(X_cur, y_cur, cur_model, is_train): 66 | input_ids, attention_masks = preprocessing_for_classifier_tensor(X_cur.values, cur_model) 67 | 68 | labels = torch.from_numpy(np.array(y_cur, dtype='int64')) 69 | 70 | cur_dataset = TensorDataset(input_ids, attention_masks, labels) 71 | 72 | cur_dataloader = DataLoader( 73 | cur_dataset, 74 | batch_size = BATCH_SIZE, 75 | shuffle=is_train) 76 | 77 | return
cur_dataloader 78 | 79 | def get_iterator(X_cur, y_cur, cur_model, is_train): 80 | input_ids, attention_masks = preprocessing_for_classifier_list(X_cur.values, cur_model) 81 | #print(f'type of input_ids: {type(input_ids)}') 82 | #print(f'type of input_ids[0]: {type(input_ids[0])}') 83 | #print(f'type of attention_masks: {type(attention_masks)}') 84 | #print(f'type of attention_masks[0]: {type(attention_masks[0])}') 85 | labels = torch.from_numpy(np.array(y_cur, dtype='int64')) 86 | 87 | INPUT_IDS=Field(sequential=False, use_vocab=False, batch_first=True) 88 | ATTENTION_MASKS=Field(sequential=False, use_vocab=False, batch_first=True) 89 | LABEL=Field(sequential=False, use_vocab=False, batch_first=True) 90 | 91 | fields=[ 92 | ('INPUT_IDS', INPUT_IDS), 93 | ('ATTENTION_MASKS', ATTENTION_MASKS), 94 | ('LABEL', LABEL) 95 | ] 96 | examples=[] 97 | for i in range(len(labels)): 98 | examples.append(data.Example.fromlist([input_ids[i], 99 | attention_masks[i], 100 | labels[i]], 101 | fields)) 102 | 103 | 104 | cur_dataset = torchtext.data.Dataset(examples, fields) 105 | cur_iterator = data.BucketIterator(cur_dataset, batch_size=BATCH_SIZE, device='cuda', shuffle=is_train) 106 | return cur_iterator 107 | 108 | def preprocessing_for_classifier_tensor(sentences, cur_model): 109 | tokenizer=cur_model[1].from_pretrained(cur_model[2]) 110 | input_ids=[] 111 | attention_masks=[] 112 | 113 | for sent in sentences: 114 | encoded_sent = tokenizer.encode_plus( 115 | str(sent), 116 | add_special_tokens=True, 117 | max_length=MAX_SEQ_LENGTH, 118 | pad_to_max_length=True, 119 | return_tensors='pt', # Return PyTorch tensor 120 | return_attention_mask=True 121 | ) 122 | 123 | input_ids.append(encoded_sent.get('input_ids')) 124 | attention_masks.append(encoded_sent.get('attention_mask')) 125 | 126 | input_ids = torch.cat(input_ids, dim=0) 127 | attention_masks = torch.cat(attention_masks, dim=0) 128 | 129 | return input_ids, attention_masks 130 | 131 | def preprocessing_for_classifier_list(sentences, cur_model): 132 | tokenizer=cur_model[1].from_pretrained(cur_model[2]) 133 | input_ids = [] 134 | attention_masks = [] 135 | 136 | for sent in sentences: 137 | encoded_sent = tokenizer.encode_plus( 138 | str(sent), 139 | add_special_tokens=True, 140 | max_length=MAX_SEQ_LENGTH, 141 | pad_to_max_length=True, 142 | return_attention_mask=True 143 | ) 144 | 145 | input_ids.append(encoded_sent.get('input_ids')) 146 | attention_masks.append(encoded_sent.get('attention_mask')) 147 | 148 | return input_ids, attention_masks 149 | 150 | 151 | def run_saved_model(prediction_dataloader, cur_model, p_name, m_name): 152 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 153 | model.cuda() 154 | # satd_classifier.load_state_dict(torch.load(data_folder/'{}-{}.bin'.format(p_name, m_name))) 155 | # print('{}-{}.bin loaded'.format(p_name, m_name)) 156 | 157 | name_pattern='/sa4se/models/best_{}_{}_*'.format(p_name, m_name) 158 | # print(type(glob.glob(name_pattern))) 159 | candidates=glob.glob(name_pattern) 160 | candidates.sort(reverse=True) 161 | file_name=candidates[0] 162 | 163 | model.load_state_dict(torch.load(file_name)) 164 | print('{} loaded'.format(file_name)) 165 | 166 | model.eval() 167 | predictions, true_labels = [], [] 168 | 169 | for batch in prediction_dataloader: 170 | batch = tuple(t.to(device) for t in batch) 171 | 172 | b_input_ids, b_input_mask, b_labels = batch 173 | 174 | with torch.no_grad(): 175 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 176 | 177 | logits = 
outputs[0] 178 | 179 | # will create a synchronization point 180 | logits = logits.detach().cpu().numpy() 181 | label_ids = b_labels.to('cpu').numpy() 182 | 183 | predictions.append(logits) 184 | true_labels.append(label_ids) 185 | 186 | print(' DONE.') 187 | 188 | flat_predictions = [item for sublist in predictions for item in sublist] 189 | flat_predictions = np.argmax(flat_predictions, axis=1).flatten() 190 | flat_true_labels = [item for sublist in true_labels for item in sublist] 191 | 192 | #print('Precision is {:.3f}'.format(precision_score(flat_true_labels, flat_predictions))) 193 | #print('Recall is {:.3f}'.format(recall_score(flat_true_labels, flat_predictions))) 194 | #print('F1-score is {:.3f}'.format(f1_score(flat_true_labels, flat_predictions))) 195 | print(classification_report(flat_true_labels, flat_predictions)) -------------------------------------------------------------------------------- /scripts/PTM/github.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from utils import * 3 | from transformers import BertTokenizer, BertModel, BertForSequenceClassification 4 | from transformers import XLNetTokenizer, XLNetForSequenceClassification 5 | from transformers import RobertaTokenizer, RobertaForSequenceClassification 6 | from transformers import AlbertTokenizer, AlbertForSequenceClassification 7 | import argparse 8 | 9 | # Model | Tokenizer | Pretrained weights shortcut 10 | MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'), 11 | (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'), 12 | (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'), 13 | (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1') 14 | ] 15 | 16 | MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert'] 17 | 18 | seed_torch(42) 19 | ## Read model name 20 | parser = argparse.ArgumentParser(description='Choose the models.') 21 | 22 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 23 | help='Enter an integer... 
0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 24 | 25 | 26 | args = parser.parse_args() 27 | m_num=args.model_num 28 | 29 | cur_model=MODELS[m_num] 30 | m_name=MODEL_NAMES[m_num] 31 | 32 | train_df=pd.read_pickle(gh_train) 33 | train_df['label']=train_df['label'].replace({'positive':1, 'negative':2, 'neutral':0}) 34 | 35 | tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True)  # note: do_lower_case=True lowercases input even though cased checkpoints are used 36 | 37 | sentences=train_df.sentence.values 38 | labels=train_df.label.values 39 | 40 | # max_len = 0 41 | # for sent in sentences: 42 | # input_ids=tokenizer.encode(sent, add_special_tokens=True) 43 | # max_len=max(max_len, len(input_ids)) 44 | # print('Max sentence length: ', max_len) 45 | 46 | input_ids = [] 47 | attention_masks = [] 48 | 49 | for sent in sentences: 50 | 51 | encoded_dict = tokenizer.encode_plus( 52 | str(sent), 53 | add_special_tokens = True, 54 | max_length = MAX_LEN, 55 | pad_to_max_length = True, 56 | return_attention_mask = True, 57 | return_tensors = 'pt' 58 | ) 59 | 60 | input_ids.append(encoded_dict['input_ids']) 61 | attention_masks.append(encoded_dict['attention_mask']) 62 | 63 | 64 | train_inputs = torch.cat(input_ids, dim=0) 65 | train_masks = torch.cat(attention_masks, dim=0) 66 | train_labels = torch.tensor(labels) 67 | 68 | print('Training data {} {} {}'.format(train_inputs.shape, train_masks.shape, train_labels.shape)) 69 | 70 | train_data = TensorDataset(train_inputs, train_masks, train_labels) 71 | train_sampler = RandomSampler(train_data) 72 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) 73 | 74 | # Train Model 75 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 76 | model.cuda() 77 | 78 | param_optimizer = list(model.named_parameters()) 79 | no_decay = ['bias', 'gamma', 'beta'] 80 | optimizer_grouped_parameters = [ 81 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 82 | 'weight_decay_rate': 0.01}, 83 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 84 | 'weight_decay_rate': 0.0} 85 | ] 86 | 87 | optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE) 88 | 89 | begin=time.time() 90 | train_loss_set = [] 91 | 92 | for _ in trange(EPOCHS, desc="Epoch"): 93 | 94 | model.train() 95 | 96 | tr_loss = 0 97 | nb_tr_examples, nb_tr_steps = 0, 0 98 | 99 | for step, batch in enumerate(train_dataloader): 100 | 101 | batch = tuple(t.to(device) for t in batch) 102 | 103 | b_input_ids, b_input_mask, b_labels = batch 104 | optimizer.zero_grad() 105 | 106 | # Forward pass 107 | outputs = model(b_input_ids, token_type_ids=None, \ 108 | attention_mask=b_input_mask, labels=b_labels) 109 | loss = outputs[0] 110 | logits = outputs[1] 111 | train_loss_set.append(loss.item()) 112 | 113 | # Backward pass 114 | loss.backward() 115 | optimizer.step() 116 | 117 | tr_loss += loss.item() 118 | nb_tr_examples += b_input_ids.size(0) 119 | nb_tr_steps += 1 120 | 121 | print("Train loss: {}".format(tr_loss/nb_tr_steps)) 122 | 123 | end=time.time() 124 | print('Training used {:.2f} seconds'.format(end-begin)) 125 | 126 | ### Test 127 | begin=time.time() 128 | test_df=pd.read_pickle(gh_test) 129 | 130 | test_df['label']=test_df['label'].replace({ 131 | 'positive':1, 132 | 'negative':2, 133 | 'neutral':0}) 134 | 135 | sentences=test_df.sentence.values 136 | labels = test_df.label.values 137 | 138 | input_ids = [] 139 | attention_masks = [] 140 | 141 | for sent in sentences: 142 | encoded_dict = tokenizer.encode_plus( 143 | str(sent), 144 | add_special_tokens = True, 
145 | max_length = MAX_LEN, 146 | pad_to_max_length = True, 147 | return_attention_mask = True, 148 | return_tensors = 'pt', 149 | ) 150 | 151 | input_ids.append(encoded_dict['input_ids']) 152 | attention_masks.append(encoded_dict['attention_mask']) 153 | 154 | prediction_inputs = torch.cat(input_ids,dim=0) 155 | prediction_masks = torch.cat(attention_masks,dim=0) 156 | prediction_labels = torch.tensor(labels) 157 | 158 | prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels) 159 | prediction_sampler = SequentialSampler(prediction_data) 160 | prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE) 161 | 162 | model.eval() 163 | predictions,true_labels=[],[] 164 | 165 | for batch in prediction_dataloader: 166 | batch = tuple(t.to(device) for t in batch) 167 | b_input_ids, b_input_mask, b_labels = batch 168 | 169 | with torch.no_grad(): 170 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 171 | logits = outputs[0] 172 | 173 | logits = logits.detach().cpu().numpy() 174 | label_ids = b_labels.to('cpu').numpy() 175 | 176 | predictions.append(logits) 177 | true_labels.append(label_ids) 178 | 179 | end=time.time() 180 | print('Prediction used {:.2f} seconds'.format(end - begin)) 181 | 182 | flat_predictions = [item for sublist in predictions for item in sublist] 183 | flat_predictions = np.argmax(flat_predictions, axis=1).flatten() 184 | flat_true_labels = [item for sublist in true_labels for item in sublist] 185 | 186 | ### Get predictions on XLNet 187 | # new_df=pd.DataFrame(columns=['Text', 'True_label', 'XLNet_predicted']) 188 | 189 | # new_df['Text'] = pd.Series(sentences) 190 | # new_df['True_label'] = pd.Series(flat_true_labels) 191 | # new_df['True_label']=new_df['True_label'].replace({0: 'neutral', 1: 'positive', 2:'negative'}) 192 | # new_df['XLNet_predicted'] = pd.Series(flat_predictions) 193 | # new_df['XLNet_predicted']=new_df['XLNet_predicted'].replace( 194 | # {0: 'neutral', 1: 'positive', 2:'negative'}) 195 | # new_df.to_csv(data_folder/'XLNet_github_predictions.csv', header=True) 196 | 197 | # Evaluation of the selected model on the GitHub dataset 198 | print("Accuracy of {} on GitHub is: {}".format(m_name, accuracy_score(flat_true_labels,flat_predictions))) 199 | 200 | print(classification_report(flat_true_labels,flat_predictions)) -------------------------------------------------------------------------------- /scripts/PTM/jira.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from utils import * 3 | from transformers import BertTokenizer, BertModel, BertForSequenceClassification 4 | from transformers import XLNetTokenizer, XLNetForSequenceClassification 5 | from transformers import RobertaTokenizer, RobertaForSequenceClassification 6 | from transformers import AlbertTokenizer, AlbertForSequenceClassification 7 | import argparse 8 | 9 | # Model | Tokenizer | Pretrained weights shortcut 10 | MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'), 11 | (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'), 12 | (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'), 13 | (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1') 14 | ] 15 | 16 | MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert'] 17 | 18 | seed_torch(42) 19 | 20 | ## Read model name 21 | parser = argparse.ArgumentParser(description='Choose the models.') 22 | 23 | parser.add_argument('-m', 
'--model_num', default=0, type=int, nargs='?', 24 | help='Enter an integer... 0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 25 | 26 | 27 | args = parser.parse_args() 28 | m_num=args.model_num 29 | 30 | cur_model=MODELS[m_num] 31 | m_name=MODEL_NAMES[m_num] 32 | 33 | train_df = pd.read_pickle(jira_train) 34 | train_df['label']=train_df['label'].replace(-1, 0) 35 | # Jira labels are binary - Negative: 0, Positive: 1 36 | 37 | tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) 38 | 39 | sentences=train_df.sentence.values 40 | labels=train_df.label.values 41 | 42 | # max_len = 0 43 | # for sent in sentences: 44 | # input_ids=tokenizer.encode(sent, add_special_tokens=True) 45 | # max_len=max(max_len, len(input_ids)) 46 | # print('Max sentence length: ', max_len) 47 | 48 | input_ids = [] 49 | attention_masks = [] 50 | 51 | for sent in sentences: 52 | 53 | encoded_dict = tokenizer.encode_plus( 54 | str(sent), 55 | add_special_tokens = True, 56 | max_length = MAX_LEN, 57 | pad_to_max_length = True, 58 | return_attention_mask = True, 59 | return_tensors = 'pt' 60 | ) 61 | 62 | input_ids.append(encoded_dict['input_ids']) 63 | attention_masks.append(encoded_dict['attention_mask']) 64 | 65 | 66 | train_inputs = torch.cat(input_ids, dim=0) 67 | train_masks = torch.cat(attention_masks, dim=0) 68 | train_labels = torch.tensor(labels) 69 | 70 | print('Training data {} {} {}'.format(train_inputs.shape, train_masks.shape, train_labels.shape)) 71 | 72 | train_data = TensorDataset(train_inputs, train_masks, train_labels) 73 | train_sampler = RandomSampler(train_data) 74 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) 75 | 76 | # Train Model 77 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) # kept at 3 for consistency with the other scripts, although Jira only has two classes 78 | model.cuda() 79 | 80 | param_optimizer = list(model.named_parameters()) 81 | no_decay = ['bias', 'gamma', 'beta'] 82 | optimizer_grouped_parameters = [ 83 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 84 | 'weight_decay_rate': 0.01}, 85 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 86 | 'weight_decay_rate': 0.0} 87 | ] 88 | 89 | optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE) 90 | 91 | begin=time.time() 92 | train_loss_set = [] 93 | 94 | for _ in trange(EPOCHS, desc="Epoch"): 95 | 96 | model.train() 97 | 98 | tr_loss = 0 99 | nb_tr_examples, nb_tr_steps = 0, 0 100 | 101 | for step, batch in enumerate(train_dataloader): 102 | 103 | batch = tuple(t.to(device) for t in batch) 104 | 105 | b_input_ids, b_input_mask, b_labels = batch 106 | optimizer.zero_grad() 107 | 108 | # Forward pass 109 | outputs = model(b_input_ids, token_type_ids=None, \ 110 | attention_mask=b_input_mask, labels=b_labels) 111 | loss = outputs[0] 112 | logits = outputs[1] 113 | train_loss_set.append(loss.item()) 114 | 115 | # Backward pass 116 | loss.backward() 117 | optimizer.step() 118 | 119 | tr_loss += loss.item() 120 | nb_tr_examples += b_input_ids.size(0) 121 | nb_tr_steps += 1 122 | 123 | print("Train loss: {}".format(tr_loss/nb_tr_steps)) 124 | 125 | end=time.time() 126 | print('Training used {:.2f} seconds'.format(end-begin)) 127 | 128 | begin=time.time() 129 | test_df = pd.read_pickle(jira_test) 130 | test_df['label']=test_df['label'].replace(-1, 0) 131 | 132 | sentences=test_df.sentence.values 133 | labels = test_df.label.values 134 | 135 | input_ids = [] 136 | attention_masks = [] 137 | 138 | for sent in sentences: 139 | encoded_dict = tokenizer.encode_plus( 140 | str(sent), 141 | 
add_special_tokens = True, 142 | max_length = MAX_LEN, 143 | pad_to_max_length = True, 144 | return_attention_mask = True, 145 | return_tensors = 'pt' 146 | ) 147 | 148 | input_ids.append(encoded_dict['input_ids']) 149 | attention_masks.append(encoded_dict['attention_mask']) 150 | 151 | prediction_inputs = torch.cat(input_ids,dim=0) 152 | prediction_masks = torch.cat(attention_masks,dim=0) 153 | prediction_labels = torch.tensor(labels) 154 | 155 | prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels) 156 | prediction_sampler = SequentialSampler(prediction_data) 157 | prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE) 158 | 159 | model.eval() 160 | predictions,true_labels=[],[] 161 | 162 | for batch in prediction_dataloader: 163 | batch = tuple(t.to(device) for t in batch) 164 | b_input_ids, b_input_mask, b_labels = batch 165 | 166 | with torch.no_grad(): 167 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 168 | logits = outputs[0] 169 | 170 | logits = logits.detach().cpu().numpy() 171 | label_ids = b_labels.to('cpu').numpy() 172 | 173 | predictions.append(logits) 174 | true_labels.append(label_ids) 175 | 176 | end=time.time() 177 | print('Prediction used {:.2f} seconds'.format(end-begin)) 178 | 179 | flat_predictions = [item for sublist in predictions for item in sublist] 180 | flat_predictions = np.argmax(flat_predictions, axis=1).flatten() 181 | flat_true_labels = [item for sublist in true_labels for item in sublist] 182 | 183 | print("Accuracy of {} on Jira is: {}".format(m_name, accuracy_score(flat_true_labels,flat_predictions))) 184 | print(classification_report(flat_true_labels, flat_predictions)) -------------------------------------------------------------------------------- /scripts/PTM/run_all.sh: -------------------------------------------------------------------------------- 1 | for i in 0 1 2 3 2 | do 3 | python github.py -m $i 4 | python api.py -m $i 5 | python app.py -m $i 6 | python so.py -m $i 7 | python jira.py -m $i 8 | python cr.py -m $i 9 | done -------------------------------------------------------------------------------- /scripts/PTM/so.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from utils import * 3 | from transformers import BertTokenizer, BertModel, BertForSequenceClassification 4 | from transformers import XLNetTokenizer, XLNetForSequenceClassification 5 | from transformers import RobertaTokenizer, RobertaForSequenceClassification 6 | from transformers import AlbertTokenizer, AlbertForSequenceClassification 7 | import argparse 8 | 9 | # Model | Tokenizer | Pretrained weights shortcut 10 | MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'), 11 | (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'), 12 | (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'), 13 | (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1') 14 | ] 15 | 16 | MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert'] 17 | 18 | seed_torch(42) 19 | 20 | ## Read model name 21 | parser = argparse.ArgumentParser(description='Choose the models.') 22 | 23 | parser.add_argument('-m', '--model_num', default=0, type=int, nargs='?', 24 | help='Enter an integer... 
0-BERT, 1-XLNet, 2-RoBERTa, 3-ALBERT; default: 0') 25 | 26 | 27 | args = parser.parse_args() 28 | m_num=args.model_num 29 | 30 | cur_model=MODELS[m_num] 31 | m_name=MODEL_NAMES[m_num] 32 | 33 | train_df=pd.read_pickle(so_train) 34 | 35 | train_df['label']=train_df['label'].replace(-1, 2) # map negative (-1) to class 2; neutral stays 0, positive stays 1 36 | 37 | tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) 38 | 39 | sentences=train_df.sentence.values 40 | labels=train_df.label.values 41 | 42 | # Find the max length of the sentence 43 | # max_len = 0 44 | # for sent in sentences: 45 | # input_ids=tokenizer.encode(sent, add_special_tokens=True) 46 | # max_len=max(max_len, len(input_ids)) 47 | # print('Max sentence length: ', max_len) 48 | 49 | input_ids = [] 50 | attention_masks = [] 51 | 52 | for sent in sentences: 53 | 54 | encoded_dict = tokenizer.encode_plus( 55 | str(sent), 56 | add_special_tokens = True, 57 | max_length = MAX_LEN, 58 | pad_to_max_length = True, 59 | return_attention_mask = True, 60 | return_tensors = 'pt' 61 | ) 62 | 63 | input_ids.append(encoded_dict['input_ids']) 64 | attention_masks.append(encoded_dict['attention_mask']) 65 | 66 | 67 | train_inputs = torch.cat(input_ids, dim=0) 68 | train_masks = torch.cat(attention_masks, dim=0) 69 | train_labels = torch.tensor(labels) 70 | 71 | print('Training data {} {} {}'.format(train_inputs.shape, train_masks.shape, train_labels.shape)) 72 | 73 | train_data = TensorDataset(train_inputs, train_masks, train_labels) 74 | train_sampler = RandomSampler(train_data) 75 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) 76 | 77 | # Train Model 78 | model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) 79 | model.cuda() 80 | 81 | param_optimizer = list(model.named_parameters()) 82 | no_decay = ['bias', 'gamma', 'beta'] 83 | optimizer_grouped_parameters = [ 84 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 85 | 'weight_decay_rate': 0.01}, 86 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 87 | 'weight_decay_rate': 0.0} 88 | ] 89 | 90 | optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE) 91 | 92 | begin=time.time() 93 | train_loss_set = [] 94 | 95 | for _ in trange(EPOCHS, desc="Epoch"): 96 | 97 | model.train() 98 | 99 | tr_loss = 0 100 | nb_tr_examples, nb_tr_steps = 0, 0 101 | 102 | for step, batch in enumerate(train_dataloader): 103 | 104 | batch = tuple(t.to(device) for t in batch) 105 | 106 | b_input_ids, b_input_mask, b_labels = batch 107 | optimizer.zero_grad() 108 | 109 | # Forward pass 110 | outputs = model(b_input_ids, token_type_ids=None, \ 111 | attention_mask=b_input_mask, labels=b_labels) 112 | loss = outputs[0] 113 | logits = outputs[1] 114 | train_loss_set.append(loss.item()) 115 | 116 | # Backward pass 117 | loss.backward() 118 | optimizer.step() 119 | 120 | tr_loss += loss.item() 121 | nb_tr_examples += b_input_ids.size(0) 122 | nb_tr_steps += 1 123 | 124 | print("Train loss: {}".format(tr_loss/nb_tr_steps)) 125 | 126 | end=time.time() 127 | print('Training used {:.2f} seconds'.format(end-begin)) 128 | 129 | test_begin=time.time() 130 | test_df=pd.read_pickle(so_test) 131 | test_df['label']=test_df['label'].replace(-1, 2) 132 | 133 | sentences=test_df.sentence.values 134 | labels = test_df.label.values 135 | 136 | input_ids = [] 137 | attention_masks = [] 138 | 139 | for sent in sentences: 140 | encoded_dict = tokenizer.encode_plus( 141 | str(sent), 142 | add_special_tokens = True, 143 | max_length = MAX_LEN, 144 | 
pad_to_max_length = True, 145 | return_attention_mask = True, 146 | return_tensors = 'pt', 147 | ) 148 | 149 | input_ids.append(encoded_dict['input_ids']) 150 | attention_masks.append(encoded_dict['attention_mask']) 151 | 152 | prediction_inputs = torch.cat(input_ids,dim=0) 153 | prediction_masks = torch.cat(attention_masks,dim=0) 154 | prediction_labels = torch.tensor(labels) 155 | 156 | prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels) 157 | prediction_sampler = SequentialSampler(prediction_data) 158 | prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE) 159 | 160 | model.eval() 161 | predictions,true_labels=[],[] 162 | 163 | for batch in prediction_dataloader: 164 | batch = tuple(t.to(device) for t in batch) 165 | b_input_ids, b_input_mask, b_labels = batch 166 | 167 | with torch.no_grad(): 168 | outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) 169 | logits = outputs[0] 170 | 171 | logits = logits.detach().cpu().numpy() 172 | label_ids = b_labels.to('cpu').numpy() 173 | 174 | predictions.append(logits) 175 | true_labels.append(label_ids) 176 | 177 | test_end=time.time() 178 | print('Prediction used {:.2f} seconds'.format(test_end-test_begin)) 179 | 180 | flat_predictions = [item for sublist in predictions for item in sublist] 181 | flat_predictions = np.argmax(flat_predictions, axis=1).flatten() 182 | flat_true_labels = [item for sublist in true_labels for item in sublist] 183 | 184 | print("Accuracy of {} on Stack Overflow is: {}".format(m_name, accuracy_score(flat_true_labels,flat_predictions))) 185 | 186 | print(classification_report(flat_true_labels,flat_predictions)) -------------------------------------------------------------------------------- /scripts/PTM/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler 3 | 4 | from transformers import AdamW 5 | 6 | from tqdm import tqdm, trange 7 | import pandas as pd 8 | import numpy as np 9 | import random 10 | import time 11 | 12 | from sklearn.metrics import accuracy_score, classification_report 13 | 14 | def seed_torch(seed): 15 | random.seed(seed) 16 | np.random.seed(seed) 17 | torch.manual_seed(seed) 18 | torch.cuda.manual_seed(seed) 19 | torch.backends.cudnn.deterministic=True 20 | 21 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 22 | n_gpu = torch.cuda.device_count() 23 | if torch.cuda.is_available(): print(torch.cuda.get_device_name(0)) # guard so the import also works on CPU-only machines 24 | 25 | # Datasets 26 | from pathlib import Path 27 | data_folder=Path('/sa4se/data/') 28 | 29 | api_train=data_folder/'api-train.pkl' 30 | api_test=data_folder/'api-test.pkl' 31 | 32 | gh_train=data_folder/'gh-train.pkl' 33 | gh_test=data_folder/'gh-test.pkl' 34 | 35 | jira_train=data_folder/'jira-train.pkl' 36 | jira_test=data_folder/'jira-test.pkl' 37 | 38 | so_train=data_folder/'so-train.pkl' 39 | so_test=data_folder/'so-test.pkl' 40 | 41 | app_train=data_folder/'app-train.pkl' 42 | app_test=data_folder/'app-test.pkl' 43 | 44 | cr_train=data_folder/'cr-train.pkl' 45 | cr_test=data_folder/'cr-test.pkl' 46 | 47 | # Hyperparameters 48 | MAX_LEN=256 49 | BATCH_SIZE=16 50 | EPOCHS=4 51 | LEARNING_RATE=2e-5 -------------------------------------------------------------------------------- /scripts/SentiCR/SenticrTest.py: -------------------------------------------------------------------------------- 1 | # Created by happygirlzt 2 | 3 | from SentiCR 
import SentiCR 4 | 5 | import pandas as pd 6 | import numpy as np 7 | from sklearn.metrics import classification_report 8 | import time 9 | 10 | sentiment_analyzer=SentiCR() 11 | 12 | from pathlib import Path 13 | data_folder=Path('/sa4se/data') # your data folder 14 | 15 | api_train=data_folder/'api-train.pkl' 16 | api_test=data_folder/'api-test.pkl' 17 | 18 | gh_train=data_folder/'gh-train.pkl' 19 | gh_test=data_folder/'gh-test.pkl' 20 | 21 | jira_train=data_folder/'jira-train.pkl' 22 | jira_test=data_folder/'jira-test.pkl' 23 | 24 | so_train=data_folder/'so-train.pkl' 25 | so_test=data_folder/'so-test.pkl' 26 | 27 | app_train=data_folder/'app-train.pkl' 28 | app_test=data_folder/'app-test.pkl' 29 | 30 | cr_train=data_folder/'cr-train.pkl' 31 | cr_test=data_folder/'cr-test.pkl' 32 | 33 | def predict_jira(): 34 | begin=time.time() 35 | df=pd.read_pickle(jira_test) 36 | 37 | df['label']=df['label'].replace(-1, 0) 38 | 39 | sentences=df['sentence'] 40 | y_test=df['label'] 41 | 42 | pred=[] 43 | for sent in sentences: 44 | score=sentiment_analyzer.get_sentiment_polarity(sent) 45 | pred.append(score) 46 | end=time.time() 47 | print('Prediction used {:.2f} seconds'.format(end-begin)) 48 | 49 | y_pred=pd.DataFrame(pred, columns=['pred_label']) 50 | print(classification_report(y_test, y_pred)) 51 | # report = classification_report(y_test, y_pred, output_dict=True) 52 | # df = pd.DataFrame(report).transpose() 53 | # df.to_csv('./SentiCR_jira.csv') 54 | 55 | def predict_so(): 56 | begin=time.time() 57 | df=pd.read_pickle(so_test) 58 | 59 | df['label']=df['label'].replace(-1, 2) 60 | 61 | sentences=df['sentence'] 62 | y_test=df['label'] 63 | 64 | pred=[] 65 | for sent in sentences: 66 | score=sentiment_analyzer.get_sentiment_polarity(sent) 67 | pred.append(score) 68 | 69 | end=time.time() 70 | print('Prediction used {:.2f} seconds'.format(end-begin)) 71 | y_pred=pd.DataFrame(pred, columns=['pred_label']) 72 | 73 | print(classification_report(y_test, y_pred)) 74 | #results = confusion_matrix(y_test, y_pred, labels=[1,0,2]) 75 | #print(results) 76 | #report = classification_report(y_test, y_pred, output_dict=True) 77 | #df = pd.DataFrame(report).transpose() 78 | #df.to_csv('./SentiCR_so.csv') 79 | 80 | def predict_gh(): 81 | begin=time.time() 82 | df=pd.read_pickle(gh_test) 83 | 84 | sentences=df['sentence'] 85 | y_test=df['label'] 86 | 87 | pred=[] 88 | for sent in sentences: 89 | score=sentiment_analyzer.get_sentiment_polarity(sent) 90 | pred.append(score) 91 | 92 | end=time.time() 93 | print('Prediction used {:.2f} seconds'.format(end-begin)) 94 | y_pred=pd.DataFrame(pred, columns=['pred_label']) 95 | 96 | # new_df=pd.DataFrame(columns=['Text', 'SentiCR_predicted']) 97 | # new_df['Text'] = sentences.copy() 98 | # new_df['SentiCR_predicted'] = y_pred.copy() 99 | 100 | # new_df.to_csv('./senticr_predicted.csv', header=True) 101 | 102 | print(classification_report(y_test, y_pred)) 103 | # report = classification_report(y_test, y_pred, output_dict=True) 104 | # df = pd.DataFrame(report).transpose() 105 | # df.to_csv('./SentiCR_gh.csv') 106 | 107 | def predict_app(): 108 | begin=time.time() 109 | df=pd.read_pickle(app_test) 110 | 111 | df['label']=df['label'].replace(-1,2) 112 | 113 | sentences=df['sentence'] 114 | y_test=df['label'] 115 | 116 | print(sentences.shape[0]==y_test.shape[0]) # sanity check: one label per sentence 117 | pred=[] 118 | for sent in sentences: 119 | score=sentiment_analyzer.get_sentiment_polarity(sent) 120 | pred.append(score) 121 | 122 | end=time.time() 123 | print('Prediction used {:.2f} seconds'.format(end-begin)) 
124 | y_pred=pd.DataFrame(pred, columns=['pred_label']) 125 | print(classification_report(y_test, y_pred)) 126 | # report = classification_report(y_test, y_pred, output_dict=True) 127 | # df = pd.DataFrame(report).transpose() 128 | # df.to_csv('./SentiCR_app.csv') 129 | 130 | def predict_cr(): 131 | begin=time.time() 132 | df=pd.read_pickle(cr_test) 133 | df['label']=df['label'].replace(-1,1) # CR oracle is binary: negative (-1) becomes 1, non-negative stays 0 134 | 135 | sentences=df['sentence'] 136 | y_test=df['label'] 137 | 138 | pred=[] 139 | for sent in sentences: 140 | score=sentiment_analyzer.get_sentiment_polarity(sent) 141 | pred.append(score) 142 | 143 | end=time.time() 144 | print('Prediction used {:.2f} seconds'.format(end-begin)) 145 | y_pred=pd.DataFrame(pred, columns=['pred_label']) 146 | print(classification_report(y_test, y_pred)) 147 | # report = classification_report(y_test, y_pred, output_dict=True) 148 | # df = pd.DataFrame(report).transpose() 149 | # df.to_csv('./SentiCR_cr1.csv') 150 | 151 | def predict_api(): 152 | begin=time.time() 153 | df=pd.read_pickle(api_test) 154 | df['label']=df['label'].replace(-1,2) 155 | 156 | sentences=df['sentence'] 157 | y_test=df['label'] 158 | 159 | pred=[] 160 | for sent in sentences: 161 | score=sentiment_analyzer.get_sentiment_polarity(sent) 162 | pred.append(score) 163 | 164 | end=time.time() 165 | print('Prediction used {:.2f} seconds'.format(end-begin)) 166 | y_pred=pd.DataFrame(pred, columns=['pred_label']) 167 | print(classification_report(y_test, y_pred)) 168 | #report = classification_report(y_test, y_pred, output_dict=True) 169 | #df = pd.DataFrame(report).transpose() 170 | #df.to_csv('./SentiCR_api.csv') 171 | 172 | #predict_jira() 173 | #predict_api() 174 | #predict_gh() 175 | predict_so() 176 | #predict_cr() 177 | #predict_app() -------------------------------------------------------------------------------- /scripts/StanfordCoreNLP.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Test data on Stanford CoreNLP 3 | # Author: happygirlzt 4 | # coding: utf-8 5 | from sklearn.metrics import classification_report 6 | from pycorenlp import StanfordCoreNLP 7 | import pandas as pd 8 | import time 9 | 10 | from pathlib import Path 11 | data_folder=Path('/sa4se/data') # your data folder 12 | 13 | api_train=data_folder/'api-train.pkl' 14 | api_test=data_folder/'api-test.pkl' 15 | 16 | gh_train=data_folder/'gh-train.pkl' 17 | gh_test=data_folder/'gh-test.pkl' 18 | 19 | jira_train=data_folder/'jira-train.pkl' 20 | jira_test=data_folder/'jira-test.pkl' 21 | 22 | so_train=data_folder/'so-train.pkl' 23 | so_test=data_folder/'so-test.pkl' 24 | 25 | app_train=data_folder/'app-train.pkl' 26 | app_test=data_folder/'app-test.pkl' 27 | 28 | cr_train=data_folder/'cr-train.pkl' 29 | cr_test=data_folder/'cr-test.pkl' 30 | 31 | nlp = StanfordCoreNLP('http://localhost:9000') 32 | 33 | def get_predictions(test_df): 34 | print('total length is {}'.format(test_df.shape[0])) 35 | predictions=[] 36 | 37 | for index, row in test_df.iterrows(): 38 | sent=row['sentence'] 39 | #print(sent) 40 | try: 41 | res = nlp.annotate(sent, 42 | properties={ 43 | 'annotators': 'sentiment', 44 | 'outputFormat': 'json', 45 | 'timeout': 5000000000000, 46 | }) 47 | except Exception: 48 | # print(sent) 49 | predictions.append('Neutral') # fall back to Neutral when annotation fails 50 | continue 51 | 52 | #print(type(res['sentences'])) 53 | #return predictions 54 | 55 | try: 56 | # one row has many sentences 57 | if len(res['sentences']) > 1: 58 | total=0 59 | num=len(res['sentences']) 60 | 61 | for s in res['sentences']: 62 | 
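# Note on the mapping below: CoreNLP's sentimentValue is an integer from
# 0 (very negative) to 4 (very positive), with 2 meaning neutral, so the
# loop sums the per-sentence values and the average is mapped back to a
# trinary label; e.g. a two-sentence row scored [3, 1] averages to 2.0
# and is therefore labelled Neutral.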
# print(s['sentiment']) 63 | # predictions.append(s['sentiment']) 64 | total+=int(s['sentimentValue']) 65 | 66 | if total / num == 2: 67 | predictions.append('Neutral') 68 | elif total / num < 2: 69 | predictions.append('Negative') 70 | else: 71 | predictions.append('Positive') 72 | else: 73 | # one row only has one sentence 74 | for s in res['sentences']: 75 | #print(s['sentiment']) 76 | predictions.append(s['sentiment']) 77 | except Exception: 78 | predictions.append('Neutral') 79 | continue 80 | return predictions 81 | 82 | def get_pred_df(cur_pred): 83 | pred_df=pd.DataFrame(cur_pred, columns=['Polarity']) 84 | pred_df['Polarity']=pred_df['Polarity'].replace({ 85 | 'Neutral':0, 86 | 'Negative':-1, 87 | 'Positive':1, 88 | 'Verynegative':-1, 89 | 'Verypositive':1}) 90 | 91 | pred_df['Polarity']=pred_df['Polarity'].astype(int) 92 | return pred_df 93 | 94 | def test_api(): 95 | begin=time.time() 96 | # API reviews 97 | test_df=pd.read_pickle(api_test) 98 | cur_pred=get_predictions(test_df) 99 | 100 | end=time.time() 101 | print('Predict API used {:.2f} seconds'.format(end-begin)) 102 | 103 | pred_df=get_pred_df(cur_pred) 104 | print(classification_report(test_df['label'], pred_df['Polarity'])) 105 | 106 | def test_gh(): 107 | begin=time.time() 108 | # GitHub 109 | test_df=pd.read_pickle(gh_test) 110 | cur_pred=get_predictions(test_df) 111 | end=time.time() 112 | print('Predict GitHub used {:.2f} seconds'.format(end-begin)) 113 | #len(predictions) 114 | pred_df=get_pred_df(cur_pred) 115 | 116 | test_df['label']=test_df['label'].replace({ 117 | 'neutral':0, 118 | 'positive':1, 119 | 'negative':-1}) 120 | 121 | print(classification_report(test_df['label'], pred_df['Polarity'])) 122 | 123 | # APP reviews 124 | def test_app(): 125 | begin=time.time() 126 | test_df=pd.read_pickle(app_test) 127 | 128 | cur_pred=get_predictions(test_df) 129 | 130 | end=time.time() 131 | print('Predict APP used {:.2f} seconds'.format(end-begin)) 132 | pred_df=get_pred_df(cur_pred) 133 | 134 | print(classification_report(test_df['label'], pred_df['Polarity'])) 135 | 136 | # SO 137 | def test_so(): 138 | begin=time.time() 139 | test_df=pd.read_pickle(so_test) 140 | 141 | cur_pred=get_predictions(test_df) 142 | end=time.time() 143 | print('Predict StackOverflow used {:.2f} seconds'.format(end-begin)) 144 | 145 | pred_df=get_pred_df(cur_pred) 146 | print(classification_report(test_df['label'], pred_df['Polarity'])) 147 | 148 | 149 | # Jira 150 | def test_jira(): 151 | begin=time.time() 152 | test_df=pd.read_pickle(jira_test) 153 | cur_pred=get_predictions(test_df) 154 | 155 | end=time.time() 156 | print('Predict Jira used {:.2f} seconds'.format(end-begin)) 157 | 158 | pred_df=get_pred_df(cur_pred) 159 | 160 | print(classification_report(test_df['label'], pred_df['Polarity'])) 161 | 162 | 163 | # CR 164 | def test_cr(): 165 | begin=time.time() 166 | test_df=pd.read_pickle(cr_test) 167 | cur_pred=get_predictions(test_df) 168 | 169 | end=time.time() 170 | print('Predict Code Reviews used {:.2f} seconds'.format(end-begin)) 171 | 172 | pred_df=get_pred_df(cur_pred) 173 | print(classification_report(test_df['label'], pred_df['Polarity'])) 174 | 175 | #test_gh() 176 | #test_api() 177 | #test_app() 178 | #test_so() 179 | #test_jira() 180 | #test_cr() -------------------------------------------------------------------------------- /scripts/analyze-results/Senti4SD.py: -------------------------------------------------------------------------------- 1 | # This file is used to evaluate the predictions of Senti4SD 2 | # Author: happygirlzt 
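# Expected inputs (an assumption based on the paths used below): each
# './predictions/*-predictions.csv' file is the output of (py)Senti4SD and
# carries a PREDICTED column, while the '*-test-sd.csv' gold files
# (columns: Text, Polarity) are produced by
# scripts/prepare-data/convert_senti4sd.py.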
3 | import pandas as pd 4 | import numpy as np 5 | 6 | import re 7 | from sklearn.metrics import classification_report,confusion_matrix 8 | 9 | def get_confusion_matrix(): 10 | pred=pd.read_csv('./predictions/so-predictions.csv',usecols=['PREDICTED']) 11 | 12 | #print(pred.shape) 13 | pred.columns=['res'] 14 | res_pd=pred['res'] 15 | 16 | test_df=pd.read_csv('so-test-sd.csv',usecols=['Text','Polarity']) 17 | 18 | true_df=pd.Series(test_df['Polarity']) # Polarity holds string labels, so no integer cast 19 | results=confusion_matrix(true_df,res_pd, labels=['positive','neutral','negative']) 20 | print(results) 21 | 22 | #get_confusion_matrix() 23 | 24 | def analyze_cr(): 25 | # Replace './predictions/cr-predictions.csv' with your predicted file name 26 | pred=pd.read_csv('./predictions/cr-predictions.csv',usecols=['PREDICTED']) 27 | pred['PREDICTED']=pred['PREDICTED'].replace({'positive':'neutral'}) # the CR oracle is binary, so fold 'positive' into 'neutral' 28 | 29 | print(pred.shape) 30 | pred.columns=['res'] 31 | res_pd=pred['res'] 32 | 33 | # read in true labels 34 | test_df=pd.read_csv('cr-test-sd.csv', usecols=['Text','Polarity']) 35 | 36 | true_df=pd.Series(test_df['Polarity']) 37 | print(classification_report(true_df, res_pd)) 38 | #analyze_cr() 39 | 40 | def analyze_app(): 41 | pred=pd.read_csv('./predictions/app-predictions.csv',usecols=['PREDICTED']) 42 | 43 | print(pred.shape) 44 | pred.columns=['res'] 45 | res_pd=pred['res'] 46 | 47 | # read in true labels 48 | test_df=pd.read_csv('app-test-sd.csv',usecols=['Text','Polarity']) 49 | 50 | true_df=pd.Series(test_df['Polarity']) 51 | print(classification_report(true_df, res_pd)) 52 | #analyze_app() 53 | 54 | def analyze_gh(): 55 | pred=pd.read_csv('./predictions/gh-predictions.csv',usecols=['PREDICTED']) 56 | 57 | print(pred.shape) 58 | pred.columns=['res'] 59 | res_pd=pred['res'] 60 | 61 | # read in true labels 62 | test_df=pd.read_csv('gh-test-sd.csv',usecols=['Text','Polarity']) 63 | 64 | true_df=pd.Series(test_df['Polarity']) 65 | print(classification_report(true_df, res_pd)) 66 | #analyze_gh() 67 | 68 | def analyze_jira(): 69 | pred=pd.read_csv('./predictions/jira-predictions.csv',usecols=['PREDICTED']) 70 | 71 | print(pred.shape) 72 | pred.columns=['res'] 73 | res_pd=pred['res'] 74 | 75 | # read in true labels 76 | test_df=pd.read_csv('jira-test-sd.csv',usecols=['Text','Polarity']) 77 | 78 | true_df=pd.Series(test_df['Polarity']) 79 | print(classification_report(true_df, res_pd)) 80 | #analyze_jira() 81 | 82 | def analyze_api(): 83 | pred=pd.read_csv('./predictions/api-predictions.csv',usecols=['PREDICTED']) 84 | 85 | print(pred.shape) 86 | pred.columns=['res'] 87 | res_pd=pred['res'] 88 | 89 | test_df=pd.read_csv('api-test-sd.csv',usecols=['Text','Polarity']) 90 | 91 | true_df=pd.Series(test_df['Polarity']) 92 | print(classification_report(true_df, res_pd)) 93 | #analyze_api() 94 | 95 | def analyze_so(): 96 | pred=pd.read_csv('./predictions/so-predictions.csv',usecols=['PREDICTED']) 97 | 98 | print(pred.shape) 99 | pred.columns=['res'] 100 | res_pd=pred['res'] 101 | 102 | test_df=pd.read_csv('so-test-sd.csv',usecols=['Text','Polarity']) 103 | 104 | true_df=pd.Series(test_df['Polarity']) 105 | print(classification_report(true_df, res_pd)) 106 | #analyze_so() -------------------------------------------------------------------------------- /scripts/analyze-results/SentiStrength-SE.py: -------------------------------------------------------------------------------- 1 | # This file is used to analyze the prediction performance of SentiStrength-SE 2 | # 
Author: happygirlzt 3 | 4 | import pandas as pd 5 | import numpy as np 6 | 7 | import re 8 | from sklearn.metrics import classification_report, confusion_matrix 9 | 10 | def get_confusion_matrix(): 11 | pred=pd.read_csv('so-ss.csv', header=None) 12 | #print(pred.shape) 13 | pred.columns=['res'] 14 | res_pd=pred['res'] 15 | 16 | pos_list=[] 17 | neg_list=[] 18 | 19 | for sent in res_pd: 20 | cur_list=re.split(r'\t+', sent.rstrip('\t'))[1:] 21 | 22 | new_list=cur_list[0].split() 23 | 24 | pos_list.append(int(new_list[0])) 25 | neg_list.append(int(new_list[1])) 26 | 27 | total = [p + n for p, n in zip(pos_list, neg_list)] 28 | label=[] 29 | for score in total: 30 | if score>0: 31 | label.append(1) 32 | elif score==0: 33 | label.append(0) 34 | else: 35 | label.append(-1) 36 | 37 | pred_df=pd.Series(label,dtype='int32') 38 | 39 | test_df=pd.read_csv('so-test-se.csv',header=None) 40 | test_df.columns=['sentence','label'] 41 | 42 | true_df=pd.Series(test_df['label'],dtype='int32') 43 | 44 | results = confusion_matrix(true_df,pred_df,labels=[1,0,-1]) 45 | print(results) 46 | 47 | #get_confusion_matrix() 48 | 49 | def analyze_cr(): 50 | # replace 'cr-ss.csv' with your prediction file name 51 | pred=pd.read_csv('cr-ss.csv',header=None) 52 | print(pred.shape) 53 | pred.columns=['res'] 54 | res_pd=pred['res'] 55 | 56 | pos_list=[] 57 | neg_list=[] 58 | 59 | for sent in res_pd: 60 | cur_list=re.split(r'\t+', sent.rstrip('\t'))[1:] 61 | #print(cur_list) 62 | 63 | new_list=cur_list[0].split() 64 | 65 | pos_list.append(int(new_list[0])) 66 | neg_list.append(int(new_list[1])) 67 | 68 | total = [p + n for p, n in zip(pos_list, neg_list)] 69 | label=[] 70 | for score in total: # CR is scored binary: negative vs. non-negative 71 | if score<0: 72 | label.append(-1) 73 | else: 74 | label.append(0) 75 | 76 | pred_df=pd.Series(label,dtype='int32') 77 | 78 | #print(pred_df) 79 | 80 | # read in true labels 81 | test_df=pd.read_csv('cr-test-se.csv',header=None) 82 | test_df.columns=['sentence','label'] 83 | 84 | true_df=pd.Series(test_df['label'],dtype='int32') 85 | print(classification_report(true_df, pred_df)) 86 | 87 | report=classification_report(true_df, pred_df, output_dict=True) 88 | df = pd.DataFrame(report).transpose() 89 | df.to_csv('./SentiStrength-SE-cr1.csv') 90 | 91 | analyze_cr() 92 | 93 | def analyze_app(): 94 | pred=pd.read_csv('app-ss.csv',header=None) 95 | print(pred.shape) 96 | pred.columns=['res'] 97 | res_pd=pred['res'] 98 | 99 | pos_list=[] 100 | neg_list=[] 101 | 102 | for sent in res_pd: 103 | cur_list=re.split(r'\t+', sent.rstrip('\t'))[1:] 104 | #print(cur_list) 105 | new_list=cur_list[0].split() 106 | 107 | pos_list.append(int(new_list[0])) 108 | neg_list.append(int(new_list[1])) 109 | 110 | total = [p + n for p, n in zip(pos_list, neg_list)] 111 | label=[] 112 | for score in total: 113 | if score>0: 114 | label.append(1) 115 | elif score==0: 116 | label.append(0) 117 | else: 118 | label.append(-1) 119 | 120 | pred_df=pd.Series(label,dtype='int32') 121 | 122 | #print(pred_df) 123 | 124 | # read in true labels 125 | test_df=pd.read_csv('app-test-se.csv',header=None) 126 | test_df.columns=['sentence','label'] 127 | 128 | true_df=pd.Series(test_df['label'],dtype='int32') 129 | print(classification_report(true_df, pred_df)) 130 | #analyze_app() 131 | 132 | def analyze_gh(): 133 | pred=pd.read_csv('gh-ss.csv',header=None) 134 | print(pred.shape) 135 | pred.columns=['res'] 136 | res_pd=pred['res'] 137 | 138 | pos_list=[] 139 | neg_list=[] 140 | 141 | for sent in res_pd: 142 | cur_list=re.split(r'\t+', sent.rstrip('\t'))[1:] 143 | 
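# Each line of the SentiStrength-SE output has the shape '<text>\t<pos> <neg>',
# so cur_list[0] holds the pair of scores; e.g. a hypothetical line
# 'great work\t3 -1' yields pos=3 and neg=-1, hence total=2 and, further
# below, label 1 (positive).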
#print(cur_list) 144 | 145 | new_list=cur_list[0].split() 146 | 147 | pos_list.append(int(new_list[0])) 148 | neg_list.append(int(new_list[1])) 149 | 150 | total = [p + n for p, n in zip(pos_list, neg_list)] 151 | label=[] 152 | for score in total: 153 | if score>0: 154 | label.append(1) 155 | elif score==0: 156 | label.append(0) 157 | else: 158 | label.append(-1) 159 | 160 | pred_df=pd.Series(label,dtype='int32') 161 | 162 | #print(pred_df) 163 | 164 | # read in true labels 165 | test_df=pd.read_csv('gh-test-se.csv',header=None) 166 | test_df.columns=['sentence','label'] 167 | 168 | test_df['label']=test_df['label'].replace({'positive':1, 'negative':-1, 'neutral':0}) 169 | 170 | true_df=pd.Series(test_df['label'],dtype='int32') 171 | print(classification_report(true_df, pred_df)) 172 | 173 | #analyze_gh() 174 | 175 | def analyze_jira(): 176 | pred=pd.read_csv('jira-ss.csv',header=None) 177 | print(pred.shape) 178 | pred.columns=['res'] 179 | res_pd=pred['res'] 180 | 181 | pos_list=[] 182 | neg_list=[] 183 | 184 | for sent in res_pd: 185 | cur_list=re.split(r'\t+', sent.rstrip('\t'))[1:] 186 | #print(cur_list) 187 | 188 | new_list=cur_list[0].split() 189 | 190 | pos_list.append(int(new_list[0])) 191 | neg_list.append(int(new_list[1])) 192 | 193 | total = [p + n for p, n in zip(pos_list, neg_list)] 194 | label=[] 195 | for score in total: 196 | if score>0: 197 | label.append(1) 198 | elif score==0: 199 | label.append(0) 200 | else: 201 | label.append(-1) 202 | 203 | pred_df=pd.Series(label,dtype='int32') 204 | 205 | #print(pred_df) 206 | 207 | # read in true labels 208 | test_df=pd.read_csv('jira-test-se.csv',header=None) 209 | test_df.columns=['sentence','label'] 210 | 211 | true_df=pd.Series(test_df['label'],dtype='int32') 212 | print(classification_report(true_df, pred_df)) 213 | 214 | #analyze_jira() 215 | def analyze_api(): 216 | pred=pd.read_csv('api-ss.csv',header=None) 217 | print(pred.shape) 218 | pred.columns=['res'] 219 | res_pd=pred['res'] 220 | 221 | pos_list=[] 222 | neg_list=[] 223 | 224 | for sent in res_pd: 225 | cur_list=re.split(r'\t+', sent.rstrip('\t'))[1:] 226 | #print(cur_list) 227 | 228 | new_list=cur_list[0].split() 229 | 230 | pos_list.append(int(new_list[0])) 231 | neg_list.append(int(new_list[1])) 232 | 233 | total = [p + n for p, n in zip(pos_list, neg_list)] 234 | label=[] 235 | for score in total: 236 | if score>0: 237 | label.append(1) 238 | elif score==0: 239 | label.append(0) 240 | else: 241 | label.append(-1) 242 | 243 | pred_df=pd.Series(label,dtype='int32') 244 | 245 | #print(pred_df) 246 | 247 | # read in true labels 248 | test_df=pd.read_csv('api-test-se.csv',header=None) 249 | test_df.columns=['sentence','label'] 250 | 251 | true_df=pd.Series(test_df['label'],dtype='int32') 252 | print(classification_report(true_df, pred_df)) 253 | 254 | #analyze_api() 255 | 256 | def analyze_so(): 257 | pred=pd.read_csv('so-ss.csv',header=None) 258 | print(pred.shape) 259 | pred.columns=['res'] 260 | res_pd=pred['res'] 261 | 262 | pos_list=[] 263 | neg_list=[] 264 | 265 | for sent in res_pd: 266 | cur_list=re.split(r'\t+', sent.rstrip('\t'))[1:] 267 | #print(cur_list) 268 | 269 | new_list=cur_list[0].split() 270 | 271 | pos_list.append(int(new_list[0])) 272 | neg_list.append(int(new_list[1])) 273 | 274 | total = [p + n for p, n in zip(pos_list, neg_list)] 275 | label=[] 276 | for score in total: 277 | if score>0: 278 | label.append(1) 279 | elif score==0: 280 | label.append(0) 281 | else: 282 | label.append(-1) 283 | 284 | pred_df=pd.Series(label,dtype='int32') 285 | 
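# analyze_so repeats the same parse-and-map recipe as the analyze_* functions
# above; only the prediction and gold file names differ between datasets.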
286 | #print(pred_df) 287 | 288 | # read in true labels 289 | test_df=pd.read_csv('so-test-se.csv',header=None) 290 | test_df.columns=['sentence','label'] 291 | 292 | true_df=pd.Series(test_df['label'],dtype='int32') 293 | print(classification_report(true_df, pred_df)) 294 | #analyze_so() -------------------------------------------------------------------------------- /scripts/analyze-results/SentiStrength.py: -------------------------------------------------------------------------------- 1 | # This file is used to analyze the performance of SentiStrength 2 | # Author: happygirlzt 3 | import pandas as pd 4 | import numpy as np 5 | from sklearn.metrics import classification_report 6 | from sklearn.metrics import confusion_matrix 7 | 8 | #lol = list(csv.reader(open('text.txt', 'rb'), delimiter='\t')) 9 | 10 | def get_confusion_matrix(): 11 | df=pd.read_csv('so-test+results.txt', sep='\t', index_col=False, header=None) 12 | #print(df.head()) 13 | df.columns=['sent','pos','neg'] 14 | #print(df.shape) 15 | 16 | result=[] 17 | total_lines=df.shape[0] 18 | for i in range(total_lines): 19 | cur_sum=int(df.iloc[i].pos)+int(df.iloc[i].neg) 20 | if cur_sum > 0: 21 | result.append(1) 22 | elif cur_sum == 0: 23 | result.append(0) 24 | else: 25 | result.append(-1) 26 | 27 | y_pred=pd.DataFrame(result) 28 | 29 | y_true=pd.read_csv('so-test-se.csv', header=None,usecols=[1]) 30 | 31 | results = confusion_matrix(y_true,y_pred,labels=[1,0,-1]) 32 | print(results) 33 | 34 | #get_confusion_matrix() 35 | 36 | def analyze(file_name): 37 | #replace '{}-test+results.txt' with your prediction file name 38 | df=pd.read_csv('{}-test+results.txt'.format(file_name), sep='\t', index_col=False, header=None) 39 | print(df.head()) 40 | df.columns=['sent','pos','neg'] 41 | print(df.shape) 42 | 43 | result=[] 44 | total_lines=df.shape[0] 45 | for i in range(total_lines): 46 | cur_sum=int(df.iloc[i].pos)+int(df.iloc[i].neg) 47 | if cur_sum > 0: 48 | result.append(1) 49 | elif cur_sum == 0: 50 | result.append(0) 51 | else: 52 | result.append(-1) 53 | 54 | y_pred=pd.DataFrame(result) 55 | 56 | y_true=pd.read_csv('{}-test-se.csv'.format(file_name), header=None,usecols=[1]) 57 | 58 | print(classification_report(y_true, y_pred)) 59 | 60 | def analyze_gh(): 61 | df=pd.read_csv('gh-test+results.txt', sep='\t', index_col=False, header=None) 62 | print(df.head()) 63 | df.columns=['sent','pos','neg'] 64 | print(df.shape) 65 | 66 | result=[] 67 | total_lines=df.shape[0] 68 | for i in range(total_lines): 69 | cur_sum=int(df.iloc[i].pos)+int(df.iloc[i].neg) 70 | if cur_sum > 0: 71 | result.append(1) 72 | elif cur_sum == 0: 73 | result.append(0) 74 | else: 75 | result.append(-1) 76 | 77 | y_pred=pd.DataFrame(result) 78 | 79 | y_true=pd.read_csv('gh-test-se.csv', header=None, usecols=[1]) 80 | y_true=y_true.replace({'positive':1, 'negative':-1, 'neutral':0}) 81 | 82 | print(classification_report(y_true,y_pred)) 83 | report = classification_report(y_true, y_pred, output_dict=True) 84 | df = pd.DataFrame(report).transpose() 85 | df.to_csv('./SentiStrength-gh.csv') 86 | #analyze('api') 87 | #analyze('cr') 88 | #analyze('app') 89 | #analyze('gh') 90 | #analyze('jira') 91 | #analyze_gh() 92 | #analyze('so') -------------------------------------------------------------------------------- /scripts/analyze-results/gh-xlnet-senticr.py: -------------------------------------------------------------------------------- 1 | # This file is used to compare the predictions from XLNet and SentiCR 2 | # on the GitHub dataset 3 | # Created by happygirlzt 4 | 5 | 
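# Assumes data/github-predictions/xlnet-senticr.csv with the columns Text,
# True_label, XLNet_predicted and SentiCR_predicted; the row filters below
# count where each tool, alone or together, agrees with the gold label.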
import pandas as pd 6 | import numpy as np 7 | from sklearn.metrics import classification_report, confusion_matrix 8 | from pathlib import Path 9 | 10 | data_folder=Path('your_github_predictions_folder') 11 | 12 | ### Concatenate predictions from XLNet and SentiCR into one dataframe 13 | #xlnet = pd.read_csv(data_folder/'XLNet_github_predictions.csv') 14 | #print(xlnet.shape) 15 | 16 | #cr = pd.read_csv(data_folder/'senticr_github_predictions.csv') 17 | #print(cr.head()) 18 | #print(cr.shape) 19 | 20 | 21 | # final_df=pd.DataFrame(columns=['Text', 'True_label', 'XLNet_predicted', 'SentiCR_predicted']) 22 | # final_df['Text'] = pd.Series(xlnet['Text']) 23 | # final_df['True_label']=pd.Series(xlnet['True_label']) 24 | # final_df['XLNet_predicted']=pd.Series(xlnet['XLNet_predicted']) 25 | # final_df['SentiCR_predicted']=pd.Series(cr['SentiCR_predicted']) 26 | #print(final_df.head()) 27 | 28 | #final_df.to_csv(data_folder/'xlnet-senticr.csv', header=True) 29 | 30 | ### Read the df 31 | final_df=pd.read_csv(data_folder/'xlnet-senticr.csv') 32 | # xlnet true predictions 33 | xlnet_true = final_df.loc[final_df['True_label'] == final_df['XLNet_predicted']] 34 | print(xlnet_true.head()) 35 | 36 | # xlnet true, while cr false 37 | # i.e., rows where XLNet is correct and SentiCR is wrong 38 | xlnet_true_cr_false = final_df.loc[ 39 | (final_df['True_label'] == final_df['XLNet_predicted']) & 40 | (final_df['True_label'] != final_df['SentiCR_predicted']) 41 | ] 42 | 43 | #print(xlnet_true_cr_false.head()) 44 | print(xlnet_true_cr_false.shape) 45 | print(xlnet_true.shape) 46 | 47 | # cr true predictions 48 | cr_true = final_df.loc[ 49 | final_df['True_label'] == final_df['SentiCR_predicted'] 50 | ] 51 | print(cr_true.head()) 52 | 53 | # xlnet false, while cr true 54 | cr_true_xlnet_false=final_df.loc[ 55 | (final_df['True_label'] != final_df['XLNet_predicted']) & 56 | (final_df['True_label'] == final_df['SentiCR_predicted']) 57 | ] 58 | 59 | print(cr_true_xlnet_false.shape[0]) 60 | print(cr_true.shape[0]) 61 | 62 | # both true 63 | both_true=final_df.loc[ 64 | (final_df['True_label'] == final_df['SentiCR_predicted']) & 65 | (final_df['True_label'] == final_df['XLNet_predicted']) 66 | ] 67 | #both_true.head() 68 | print('both true {}'.format(both_true.shape[0])) -------------------------------------------------------------------------------- /scripts/prepare-data/convert_senti4sd.py: -------------------------------------------------------------------------------- 1 | # This file is for converting data to the format of Senti4SD 2 | # Created by happygirlzt 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import re 7 | 8 | def convert_cr(): 9 | df=pd.read_csv('../data/cr-test-se.csv',header=None,encoding='utf_8') 10 | df.columns=['Text','Polarity'] 11 | df['Polarity']=df['Polarity'].replace({-1: 'negative', 1: 'positive', 0: 'neutral'}) 12 | df.to_csv('../data/cr-test-sd.csv', index=False,encoding='utf_8') 13 | convert_cr() 14 | 15 | def convert_jira(): 16 | df=pd.read_csv('../data/jira-test-se.csv',header=None) 17 | df.columns=['Text','Polarity'] 18 | df['Polarity']=df['Polarity'].replace({-1: 'negative', 1: 'positive'}) 19 | df.to_csv('../data/jira-test-sd.csv', index=False) 20 | #convert_jira() 21 | 22 | def convert_so(): 23 | for file_name in ['train','test']: 24 | df=pd.read_csv('../data/so-{}.csv'.format(file_name),usecols=['text','oracle']) 25 | df.columns=['Text','Polarity'] 26 | df['Polarity']=df['Polarity'].replace({-1: 'negative', 1: 'positive', 0: 'neutral'}) 27 | 28 | 
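# Senti4SD consumes string polarity labels, hence the integer-to-string
# replace above before each *-sd.csv file is written.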
df.to_csv('../data/so-{}-sd.csv'.format(file_name),index=False) 29 | #convert_so() 30 | 31 | def convert_api(): 32 | for file_name in ['train','test']: 33 | df=pd.read_csv('../data/api-{}.csv'.format(file_name), usecols=['sentence','label']) 34 | df.columns=['Text','Polarity'] 35 | df['Polarity']=df['Polarity'].replace({-1: 'negative', 1: 'positive', 0: 'neutral'}) 36 | df.to_csv('../data/api-{}-sd.csv'.format(file_name),index=False) 37 | #convert_api() 38 | 39 | def convert_app(): 40 | for file_name in ['train','test']: 41 | df=pd.read_csv('../data/app-{}.csv'.format(file_name), usecols=['sentence','oracle']) 42 | df.columns=['Text','Polarity'] 43 | df['Polarity']=df['Polarity'].replace({-1: 'negative', 1: 'positive', 0: 'neutral'}) 44 | 45 | df.to_csv('../data/app-{}-sd.csv'.format(file_name), index=False) 46 | #convert_app() 47 | 48 | def convert_gh(): 49 | df=pd.read_csv('../data/gh-test.csv', usecols=['Text','Polarity']) 50 | df.to_csv('../data/gh-test-sd.csv', index=False) 51 | #convert_gh() -------------------------------------------------------------------------------- /scripts/prepare-data/convert_sentistrength.py: -------------------------------------------------------------------------------- 1 | # This file converts data to the SentiStrength / SentiStrength-SE input format 2 | # Created by happygirlzt 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import re 7 | from pathlib import Path 8 | data_folder=Path('YOUR_DATA_FOLDER') 9 | 10 | api_train=data_folder/'api-train.pkl' 11 | api_test=data_folder/'api-test.pkl' 12 | 13 | gh_train=data_folder/'gh-train.pkl' 14 | gh_test=data_folder/'gh-test.pkl' 15 | 16 | jira_train=data_folder/'jira-train.pkl' 17 | jira_test=data_folder/'jira-test.pkl' 18 | 19 | so_train=data_folder/'so-train.pkl' 20 | so_test=data_folder/'so-test.pkl' 21 | 22 | app_train=data_folder/'app-train.pkl' 23 | app_test=data_folder/'app-test.pkl' 24 | 25 | cr_train=data_folder/'cr-train.pkl' 26 | cr_test=data_folder/'cr-test.pkl' 27 | 28 | def convert_jira_test(): 29 | df=pd.read_pickle(jira_test) 30 | 31 | sents=[] 32 | 33 | for index, row in df.iterrows(): 34 | text=row['sentence'] 35 | text=''.join(text.split('\n')) 36 | sents.append(text) 37 | 38 | #print(len(sents)) 39 | new_df=pd.DataFrame(sents,columns=['sentence']) 40 | 41 | df.update(new_df) 42 | df.to_csv('../data/jira-test-se.csv',header=None,index=False) 43 | 44 | def convert_so_test(): 45 | df=pd.read_pickle(so_test) 46 | sents=[] 47 | 48 | for index, row in df.iterrows(): 49 | text=row['sentence'] 50 | text=''.join(text.split('\n')) 51 | sents.append(text) 52 | 53 | #print(len(sents)) 54 | new_df=pd.DataFrame(sents,columns=['sentence']) 55 | 56 | df.update(new_df) 57 | df.to_csv('../data/so-test-se.csv',header=None,index=False) 58 | 59 | def convert_api_test(): 60 | df=pd.read_pickle(api_test) 61 | sents=[] 62 | 63 | for index, row in df.iterrows(): 64 | text=row['sentence'] 65 | text=''.join(str(text).split('\n')) 66 | sents.append(text) 67 | 68 | #print(len(sents)) 69 | new_df=pd.DataFrame(sents,columns=['sentence']) 70 | 71 | df.update(new_df) 72 | df.to_csv('../data/api-test-se.csv',header=None,index=False) 73 | 74 | def convert_app_test(): 75 | df=pd.read_pickle(app_test) 76 | sents=[] 77 | 78 | for index, row in df.iterrows(): 79 | text=row['sentence'] 80 | text=''.join(str(text).split('\n')) 81 | sents.append(text) 82 | 83 | #print(len(sents)) 84 | new_df=pd.DataFrame(sents,columns=['sentence']) 85 | 86 | df.update(new_df) 87 | df.to_csv('../data/app-test-se.csv',header=None,index=False) 88 | 
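# SentiStrength and SentiStrength-SE score one document per input line, so the
# converters above strip embedded newlines; otherwise a multi-line sentence
# would be split into several texts and the scores would no longer line up
# with the labels.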
#convert_api_test() 89 | 90 | def convert_cr_test(): 91 | df=pd.read_pickle(cr_test) 92 | 93 | sents=[] 94 | labels=[] 95 | for index, row in df.iterrows(): 96 | text=row['sentence'] 97 | text=''.join(text.split('\n')) 98 | sents.append(text) 99 | labels.append(row['label']) 100 | 101 | #print(len(sents)) 102 | new_df=pd.DataFrame({'sentence': sents,'label': labels}) 103 | 104 | # new_df is already newline-stripped, so write it out directly 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | new_df.to_csv('../data/cr-test-se.csv',header=None,index=False) 113 | 114 | #convert_so_test() 115 | convert_cr_test() 116 | #convert_app_test() 117 | 118 | def convert_gh_test(): 119 | df=pd.read_pickle(gh_test) 120 | sents=[] 121 | 122 | for index, row in df.iterrows(): 123 | text=row['sentence'] 124 | text=''.join(text.split('\n')) 125 | sents.append(text) 126 | 127 | #print(len(sents)) 128 | new_df=pd.DataFrame(sents,columns=['sentence']) 129 | 130 | df.update(new_df) 131 | df.to_csv('../data/gh-test-se.csv',header=None,index=False) 132 | #convert_gh_test() --------------------------------------------------------------------------------