├── .gitignore ├── utils ├── __init__.py ├── unieval │ ├── __init__.py │ └── scorer.py ├── requirements.txt ├── bart_score │ └── __init__.py ├── significance_test.py └── evaluating.py ├── examples ├── __init__.py ├── e1.pdf ├── e2.pdf ├── e3.pdf ├── e4.pdf ├── e5.pdf ├── H1DkN7ZCZ.pdf ├── Hygy01StvH.pdf └── ZeE81SFTsl.pdf ├── dataset_analysis ├── __init__.py ├── diversity │ └── __init__.py ├── relevance │ ├── __init__.py │ ├── acl20-ref-free-eval-master │ │ ├── summariser │ │ │ └── ngram_vector │ │ │ │ ├── __init__.py │ │ │ │ └── base.py │ │ ├── sentence_transformers │ │ │ ├── evaluation │ │ │ │ ├── SimilarityFunction.py │ │ │ │ ├── __init__.py │ │ │ │ ├── SequentialEvaluator.py │ │ │ │ ├── SentenceEvaluator.py │ │ │ │ └── LabelAccuracyEvaluator.py │ │ │ ├── models │ │ │ │ ├── tokenizer │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── WhitespaceTokenizer.py │ │ │ │ │ └── WordTokenizer.py │ │ │ │ ├── __init__.py │ │ │ │ ├── AbstractModel.py │ │ │ │ ├── Dense.py │ │ │ │ ├── LSTM.py │ │ │ │ ├── CNN.py │ │ │ │ ├── BoW.py │ │ │ │ ├── WordWeights.py │ │ │ │ ├── Pooling.py │ │ │ │ └── RoBERTa.py │ │ │ ├── losses │ │ │ │ ├── __init__.py │ │ │ │ ├── CosineSimilarityLoss.py │ │ │ │ ├── TripletLoss.py │ │ │ │ ├── MultipleNegativesRankingLoss.py │ │ │ │ └── SoftmaxLoss.py │ │ │ ├── readers │ │ │ │ ├── __init__.py │ │ │ │ ├── InputExample.py │ │ │ │ ├── LabelSentenceReader.py │ │ │ │ ├── TripletReader.py │ │ │ │ ├── STSDataReader.py │ │ │ │ └── NLIDataReader.py │ │ │ ├── __init__.py │ │ │ ├── LoggingHandler.py │ │ │ ├── util.py │ │ │ └── data_samplers.py │ │ ├── resources.py │ │ ├── data │ │ │ ├── data_readme.txt │ │ │ ├── topic_1 │ │ │ │ ├── summaries │ │ │ │ │ ├── summary1.txt │ │ │ │ │ ├── summary4.txt │ │ │ │ │ ├── summary2.txt │ │ │ │ │ ├── summary5.txt │ │ │ │ │ └── summary3.txt │ │ │ │ ├── references │ │ │ │ │ ├── ref4.txt │ │ │ │ │ ├── ref1.txt │ │ │ │ │ ├── ref2.txt │ │ │ │ │ └── ref3.txt │ │ │ │ └── input_docs │ │ │ │ │ ├── XIN_ENG_20050126.0287 │ │ │ │ │ ├── NYT_ENG_20050127.0004 │ │ │ │ │ ├── NYT_ENG_20050126.0303 │ │ │ │ │ └── AFP_ENG_20050126.0676 │ │ │ └── topic_2 │ │ │ │ ├── summaries │ │ │ │ ├── summary2.txt │ │ │ │ └── summary1.txt │ │ │ │ └── input_docs │ │ │ │ ├── FT923-5835 │ │ │ │ ├── FT923-5797 │ │ │ │ ├── FT923-6038 │ │ │ │ ├── FT923-5267 │ │ │ │ ├── FT923-6110 │ │ │ │ ├── FT923-6455 │ │ │ │ └── FT923-5089 │ │ ├── requirements.txt │ │ ├── utils │ │ │ ├── evaluator.py │ │ │ ├── data_reader.py │ │ │ ├── writer.py │ │ │ └── misc.py │ │ ├── summ_eval │ │ │ └── README.md │ │ └── generate_summary_rl.py │ └── bartscore.py ├── crossdoc_relationship │ ├── __init__.py │ ├── SemBERT-master │ │ ├── data_process │ │ │ └── __init__.py │ │ ├── tag_model │ │ │ ├── __init__.py │ │ │ ├── tagging.py │ │ │ └── tag_tokenization.py │ │ ├── .gitignore │ │ ├── __init__.py │ │ ├── SemBERT.png │ │ ├── pytorch_pretrained_bert │ │ │ └── __init__.py │ │ ├── anayzing_results_miao.py │ │ └── transform_snli_miao.py │ ├── disagreement_percentage.py │ ├── lda_debugging.py │ ├── doc2vec_debugging.py │ └── hierarchies.py ├── external_knowledge.py ├── tmp.py └── novel_ngrams_mds.py ├── human_annotation ├── __init__.py ├── peersum_generated_results │ └── __init__.py └── peersum_dataset_quality │ ├── empty_docs │ ├── 0.docx │ ├── 1.docx │ ├── 2.docx │ ├── 3.docx │ ├── 4.docx │ ├── 5.docx │ ├── 6.docx │ ├── 7.docx │ ├── 8.docx │ ├── 9.docx │ ├── tmp.docx │ ├── instructions.docx │ ├── instructions-0601.pdf │ ├── instructions-0605.pdf │ └── instructions-0606.pdf │ ├── filled_docs_final │ ├── 0.docx │ ├── 1.docx │ ├── 2.docx │ 
├── 3.docx │ ├── 4.docx │ ├── 5.docx │ ├── 6.docx │ ├── 7.docx │ ├── 8.docx │ └── 9.docx │ ├── filled_docs_pilot │ ├── 0.docx │ ├── 1.docx │ ├── 2.docx │ ├── 3.docx │ ├── 4.docx │ ├── 5.docx │ ├── 6.docx │ ├── 7.docx │ ├── 8.docx │ └── 9.docx │ ├── __init__.py │ ├── tmp.py │ ├── splitting_data.py │ ├── sampling_data.py │ └── final_analysis.py ├── loading_data ├── __init__.py ├── tmp.py ├── mds_loader.py └── peersum_loader.py ├── mtsum_meta ├── bash │ ├── __init__.py │ ├── train_mtsum_ram_from_primera_4096.sh │ ├── train_mtsum_ram_from_led_4096.sh │ ├── test_mtsum_ram_from_led_4096_zeroshot.sh │ └── test_mtsum_ram_from_primera_4096.sh ├── with_bart │ ├── results.txt │ └── __init__.py ├── with_primera │ ├── results.txt │ └── __init__.py ├── requirements.txt ├── __init__.py ├── tmp.py └── with_ram │ ├── tmp.py │ └── __init__.py ├── other ├── __init__.py ├── screenshot.docx └── screenshot.pdf ├── plot ├── __init__.py ├── radar_chart.png ├── score_evaluation.png └── summarization_results.png ├── requirements.txt ├── acceptance_prediction ├── __init__.py └── bert_inference_reference.py ├── dataset └── __init__.py └── crawling_data ├── __init__.py ├── tmp.py ├── prepare_peersum_for_huggingface.py ├── nips_download_response.py ├── iclr_getdata.py ├── data_overview.py ├── nips_getdata_2019.py ├── nips_getdata_2022.py ├── data_verification.py └── nips_getdata_2020.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataset_analysis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /human_annotation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /loading_data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mtsum_meta/bash/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/unieval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mtsum_meta/with_bart/results.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /other/__init__.py: -------------------------------------------------------------------------------- 1 | # other files -------------------------------------------------------------------------------- /dataset_analysis/diversity/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
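A minimal loading sketch for orientation (added prose and code; not a file in the repository): once the dataset from the Google Drive link in dataset/__init__.py is saved as dataset/peersum.json, it can be read with the `datasets` library pinned in utils/requirements.txt. The json loader and split="all" mirror loading_data/tmp.py, and the "paper_id" and "summary" fields follow the sample schema described in human_annotation/peersum_dataset_quality/__init__.py.

from datasets import load_dataset

# Each record is one paper: the peer reviews act as source documents and
# the meta-review acts as the reference summary.
peersum = load_dataset("json", data_files="dataset/peersum.json", split="all")
print("all data", len(peersum))
sample = peersum[0]
print(sample["paper_id"])
print(sample["summary"][:200])
--------------------------------------------------------------------------------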
/dataset_analysis/relevance/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mtsum_meta/with_primera/results.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /plot/__init__.py: -------------------------------------------------------------------------------- 1 | # plot figures of results -------------------------------------------------------------------------------- /dataset_analysis/crossdoc_relationship/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /human_annotation/peersum_generated_results/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataset_analysis/crossdoc_relationship/SemBERT-master/data_process/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataset_analysis/crossdoc_relationship/SemBERT-master/tag_model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/e1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/examples/e1.pdf -------------------------------------------------------------------------------- /examples/e2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/examples/e2.pdf -------------------------------------------------------------------------------- /examples/e3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/examples/e3.pdf -------------------------------------------------------------------------------- /examples/e4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/examples/e4.pdf -------------------------------------------------------------------------------- /examples/e5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/examples/e5.pdf -------------------------------------------------------------------------------- /utils/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==2.11.0 2 | transformers==4.28.1 3 | torch==1.10.0 4 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/summariser/ngram_vector/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /other/screenshot.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/other/screenshot.docx 
--------------------------------------------------------------------------------
/other/screenshot.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/other/screenshot.pdf
--------------------------------------------------------------------------------
/plot/radar_chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/plot/radar_chart.png
--------------------------------------------------------------------------------
/examples/H1DkN7ZCZ.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/examples/H1DkN7ZCZ.pdf
--------------------------------------------------------------------------------
/examples/Hygy01StvH.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/examples/Hygy01StvH.pdf
--------------------------------------------------------------------------------
/examples/ZeE81SFTsl.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/examples/ZeE81SFTsl.pdf
--------------------------------------------------------------------------------
/dataset_analysis/crossdoc_relationship/SemBERT-master/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.xml 3 | *.iml 4 | *.pyc 5 |
--------------------------------------------------------------------------------
/mtsum_meta/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch_lightning==1.7.4 2 | datasets==2.2.2 3 | transformers==4.24.0 4 |
--------------------------------------------------------------------------------
/plot/score_evaluation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/plot/score_evaluation.png
--------------------------------------------------------------------------------
/plot/summarization_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/plot/summarization_results.png
--------------------------------------------------------------------------------
/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch_lightning==1.7.4 2 | datasets==2.2.2 3 | transformers==4.24.0 4 | torch==1.10.0 5 |
--------------------------------------------------------------------------------
/utils/bart_score/__init__.py: -------------------------------------------------------------------------------- 1 | # This is copied from https://github.com/neulab/BARTScore/blob/main/bart_score.py
--------------------------------------------------------------------------------
/dataset_analysis/crossdoc_relationship/SemBERT-master/__init__.py: -------------------------------------------------------------------------------- 1 | # The code is borrowed from https://github.com/cooelf/SemBERT
--------------------------------------------------------------------------------
/mtsum_meta/__init__.py: -------------------------------------------------------------------------------- 1 | # Multi-task learning with meta-data of PeerSum to generate meta-reviews, with rating and confidence
prediction 2 | -------------------------------------------------------------------------------- /acceptance_prediction/__init__.py: -------------------------------------------------------------------------------- 1 | # use acceptance prediction based on BERT to evaluate the quality of generated summaries of different models -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/empty_docs/0.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/empty_docs/0.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/empty_docs/1.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/empty_docs/1.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/empty_docs/2.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/empty_docs/2.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/empty_docs/3.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/empty_docs/3.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/empty_docs/4.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/empty_docs/4.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/empty_docs/5.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/empty_docs/5.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/empty_docs/6.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/empty_docs/6.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/empty_docs/7.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/empty_docs/7.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/empty_docs/8.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/empty_docs/8.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/empty_docs/9.docx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/empty_docs/9.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/empty_docs/tmp.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/empty_docs/tmp.docx -------------------------------------------------------------------------------- /dataset_analysis/crossdoc_relationship/SemBERT-master/SemBERT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/dataset_analysis/crossdoc_relationship/SemBERT-master/SemBERT.png -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/filled_docs_final/0.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/filled_docs_final/0.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/filled_docs_final/1.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/filled_docs_final/1.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/filled_docs_final/2.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/filled_docs_final/2.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/filled_docs_final/3.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/filled_docs_final/3.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/filled_docs_final/4.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/filled_docs_final/4.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/filled_docs_final/5.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/filled_docs_final/5.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/filled_docs_final/6.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/filled_docs_final/6.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/filled_docs_final/7.docx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/filled_docs_final/7.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/filled_docs_final/8.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/filled_docs_final/8.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/filled_docs_final/9.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/filled_docs_final/9.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/filled_docs_pilot/0.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/filled_docs_pilot/0.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/filled_docs_pilot/1.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/filled_docs_pilot/1.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/filled_docs_pilot/2.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/filled_docs_pilot/2.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/filled_docs_pilot/3.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/filled_docs_pilot/3.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/filled_docs_pilot/4.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/filled_docs_pilot/4.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/filled_docs_pilot/5.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/filled_docs_pilot/5.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/filled_docs_pilot/6.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/filled_docs_pilot/6.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/filled_docs_pilot/7.docx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/filled_docs_pilot/7.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/filled_docs_pilot/8.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/filled_docs_pilot/8.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/filled_docs_pilot/9.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/filled_docs_pilot/9.docx -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/empty_docs/instructions.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/empty_docs/instructions.docx -------------------------------------------------------------------------------- /dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # Please download PeerSum dataset into this folder via the following links 2 | # Google Drive: https://drive.google.com/drive/folders/1M1QhIwjuZOG3QdxNFqY7J5Ik5UsDA0Sk?usp=sharing -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/empty_docs/instructions-0601.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/empty_docs/instructions-0601.pdf -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/empty_docs/instructions-0605.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/empty_docs/instructions-0605.pdf -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/empty_docs/instructions-0606.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oaimli/PeerSum/HEAD/human_annotation/peersum_dataset_quality/empty_docs/instructions-0606.pdf -------------------------------------------------------------------------------- /crawling_data/__init__.py: -------------------------------------------------------------------------------- 1 | # Raw data crawled from websites, saved on punim0478 2 | # ICLR: https://openreview.net/group?id=ICLR.cc (more reviews for other conferences could be crawled) 3 | # NeurIPS: https://proceedings.neurips.cc/ -------------------------------------------------------------------------------- /crawling_data/tmp.py: -------------------------------------------------------------------------------- 1 | import openreview 2 | 3 | base_url = "https://api.openreview.net" 4 | client = openreview.Client(baseurl=base_url) 5 | rcs = client.get_notes( 6 | forum="x9jS8pX3dkx") 7 | for rc in rcs: 8 | print(rc) 9 | print() 
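10 |
11 | # Added sketch (not in the original file): the notes fetched above form the full
12 | # discussion thread of one paper (official reviews, comments, the meta-review).
13 | # Filtering on the invitation string is an assumption based on common OpenReview
14 | # venue configurations; adjust the substring for the venue being crawled.
15 | reviews = [rc for rc in rcs if "Official_Review" in rc.invitation]
16 | print("official reviews:", len(reviews))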
--------------------------------------------------------------------------------
/dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/evaluation/SimilarityFunction.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | class SimilarityFunction(Enum): 4 | COSINE = 0 5 | EUCLIDEAN = 1 6 | MANHATTAN = 2 7 | DOT_PRODUCT = 3 8 | 9 |
--------------------------------------------------------------------------------
/mtsum_meta/tmp.py: -------------------------------------------------------------------------------- 1 | import random 2 | r1 = "police killed the gunman." 3 | r2 = "the gunman was shot down by police." 4 | c1 = "police ended the gunman." 5 | c2 = "the gunman murdered police." 6 | # random.seed(42) 7 | x = [r1, r2, c1, c2] 8 | random.shuffle(x) 9 | print(x)
--------------------------------------------------------------------------------
/dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/models/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 2 | from .WhitespaceTokenizer import WhitespaceTokenizer
--------------------------------------------------------------------------------
/human_annotation/peersum_dataset_quality/__init__.py: -------------------------------------------------------------------------------- 1 | # Every sample is a dictionary, ['source_documents', 'summary', 'paper_id', 'label', 'paper_acceptance', 'with_disagreement'] 2 | # Anchoring summary sentences to source documents in PeerSum, quality control for PeerSum 3 |
--------------------------------------------------------------------------------
/dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .CosineSimilarityLoss import * 2 | from .SoftmaxLoss import * 3 | from .BatchHardTripletLoss import * 4 | from .MultipleNegativesRankingLoss import * 5 | from .TripletLoss import *
--------------------------------------------------------------------------------
/mtsum_meta/with_ram/tmp.py: -------------------------------------------------------------------------------- 1 | from tokenization_led_fast import LEDTokenizerFast 2 | from modeling_led import LEDForConditionalGeneration 3 | pretrained_model = "allenai/PRIMERA" 4 | tokenizer = LEDTokenizerFast.from_pretrained(pretrained_model) 5 | model = LEDForConditionalGeneration.from_pretrained(pretrained_model)
--------------------------------------------------------------------------------
/dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/readers/__init__.py: -------------------------------------------------------------------------------- 1 | from .InputExample import InputExample 2 | from .LabelSentenceReader import LabelSentenceReader 3 | from .NLIDataReader import NLIDataReader 4 | from .STSDataReader import STSDataReader 5 | from .TripletReader import TripletReader
--------------------------------------------------------------------------------
/dataset_analysis/relevance/acl20-ref-free-eval-master/resources.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) #os.path.abspath('.') 4 | ROUGE_DIR =
os.path.join(BASE_DIR,'rouge','ROUGE-RELEASE-1.5.5/') #do not delete the '/' in the end 5 | 6 | SUMMARY_LENGTH = 100 7 | LANGUAGE = 'english' 8 | 9 | -------------------------------------------------------------------------------- /mtsum_meta/with_primera/__init__.py: -------------------------------------------------------------------------------- 1 | # multi-task learning based on LED, no relationship-aware attentions 2 | # Loading parameters from pre-trained PRIMERA 3 | 4 | # Using the original LED code from huggingface transformers: 5 | # https://github.com/huggingface/transformers/tree/main/src/transformers/models/led 6 | # The commit on Nov 10, 2022 -------------------------------------------------------------------------------- /loading_data/tmp.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | peersum = load_dataset('json', data_files='../../datasets/peersum.json', split='all') 4 | print("all data", len(peersum)) 5 | 6 | target_id = "iclr_2022" 7 | count = 0 8 | for item in peersum: 9 | if item["paper_id"].startswith(target_id): 10 | count += 1 11 | print(target_id, count) -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .BERT import BERT 2 | from .BoW import BoW 3 | from .CNN import CNN 4 | from .Dense import Dense 5 | from .LSTM import LSTM 6 | from .Pooling import Pooling 7 | from .RoBERTa import RoBERTa 8 | from .WordEmbeddings import WordEmbeddings 9 | from .WordWeights import WordWeights 10 | from .XLNet import XLNet 11 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/data_readme.txt: -------------------------------------------------------------------------------- 1 | Create a folder for each topic you want to evaluate/summarize. Inside the topic folder, 2 | * put the to-be-summarised documents to input_docs/, 3 | * put the to-be-evaluated summaries to summaries/, and 4 | * if human-written reference summaries are available, put them to references/. 5 | 6 | An example is provided here. It is from DUC'09, topic 0939-A. 7 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.3" 2 | __DOWNLOAD_SERVER__ = 'https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/' 3 | from .datasets import SentencesDataset, SentenceLabelDataset 4 | from .data_samplers import LabelSampler 5 | from .LoggingHandler import LoggingHandler 6 | from .SentenceTransformer import SentenceTransformer 7 | 8 | -------------------------------------------------------------------------------- /mtsum_meta/with_bart/__init__.py: -------------------------------------------------------------------------------- 1 | # multi-task learning based on BART, no relationship-aware attentions 2 | # Loading parameters from pre-trained BART 3 | 4 | # some modification from with_led 5 | # 1. use pre-trained Tokenizer and Model from BART 6 | # 2. remove global attention 7 | # 3. 
add a document separator token into the tokenizer 8 | 9 | # Using the original BART code from huggingface transformers: 10 | # https://github.com/huggingface/transformers/tree/main/src/transformers/models/bart 11 | # The commit on Dec 9, 2022
--------------------------------------------------------------------------------
/mtsum_meta/with_ram/__init__.py: -------------------------------------------------------------------------------- 1 | # Relationship-aware Attention Manipulation (RAM), and multi-task learning 2 | 3 | # Implementation or modification is based on the LED code from huggingface transformers: 4 | # https://github.com/huggingface/transformers/tree/main/src/transformers/models/led 5 | # The commit on Nov 10, 2022 6 | 7 | # Implementation or modification is based on the BART code from huggingface transformers: 8 | # https://github.com/huggingface/transformers/tree/main/src/transformers/models/bart 9 | # The commit on Dec 9, 2022
--------------------------------------------------------------------------------
/dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .SentenceEvaluator import SentenceEvaluator 2 | from .SimilarityFunction import SimilarityFunction 3 | 4 | from .BinaryEmbeddingSimilarityEvaluator import BinaryEmbeddingSimilarityEvaluator 5 | from .EmbeddingSimilarityEvaluator import EmbeddingSimilarityEvaluator 6 | from .LabelAccuracyEvaluator import LabelAccuracyEvaluator 7 | from .SequentialEvaluator import SequentialEvaluator 8 | from .TripletEvaluator import TripletEvaluator 9 |
--------------------------------------------------------------------------------
/dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/LoggingHandler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import tqdm 3 | 4 | class LoggingHandler(logging.Handler): 5 | def __init__(self, level=logging.NOTSET): 6 | super().__init__(level) 7 | 8 | def emit(self, record): 9 | try: 10 | msg = self.format(record) 11 | tqdm.tqdm.write(msg) 12 | self.flush() 13 | except (KeyboardInterrupt, SystemExit): 14 | raise 15 | except: 16 | self.handleError(record)
--------------------------------------------------------------------------------
/dataset_analysis/external_knowledge.py: -------------------------------------------------------------------------------- 1 | import jsonlines 2 | 3 | papers = [] 4 | papers_with_external_knowledge = [] 5 | with jsonlines.open("../../datasets/peersum.json", "r") as reader: 6 | for paper in reader: 7 | papers.append(paper) 8 | summary = paper["summary"] 9 | if "in my opinion" in summary or "In my opinion" in summary or "read the paper myself" in summary: 10 | papers_with_external_knowledge.append(paper) 11 | 12 | print("Proportion of papers with external knowledge", len(papers_with_external_knowledge)/len(papers)) 13 |
--------------------------------------------------------------------------------
/dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_1/summaries/summary1.txt: -------------------------------------------------------------------------------- 1 | Juan Miguel Alvarez, charged with murder with special circumstances in the deaths of 11 Metrolink passengers, slashed and stabbed himself after seeing the horrific train crash he caused, sources close to the investigation said Thursday.
Alvarez, 25, despondent over the breakup of his marriage, had planned to kill himself when he drove his green Jeep Grand Cherokee in the path of a Metrolink train, officials said, but he bolted from the vehicle at the last moment. As he watched, southbound Train No. 2 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_1/summaries/summary4.txt: -------------------------------------------------------------------------------- 1 | Los Angeles County prosecutors on Thursday charged a man with 11 counts of murder, one for every person who died in a violent three-train wreck that they say he caused by driving the man's Jeep onto commuter railroad tracks in a botched suicide attempt. Police arrested a man who they said would be charged with homicide in the crash that left train cars mangled and seared. At 6:02 a.m., the three-car Train 100, headed from Moorpark to Union Station, with the locomotive in the rear, crashed into the SUV. 2 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_1/references/ref4.txt: -------------------------------------------------------------------------------- 1 | A train crash in Glendale, California, on 26 January killed 11 people and injured close to 200, many critically. A southbound commuter train, pushed from behind by a locomotive, hit an SUV lodged on the tracks, then hit a freight train and a northbound commuter train. Police arrested Juan Manuel Alvarez, the driver of the SUV, who had jumped from the car just before the impact. At first, self-inflicted wounds on his wrists and chest led investigators to think that it was an attempted suicide. It was determined later that he got the wounds after he fled the scene. 2 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_1/summaries/summary2.txt: -------------------------------------------------------------------------------- 1 | Until Wednesday, Juan Manuel Alvarez was living the average life of an obscure and troubled man.It had to be the worst multicasualty incident Ive been to, Los Angeles Fire Capt. Rick Godinez said.David Morrison, 47, an attorney, was heading to downtown Los Angeles on his regular morning commute. With his tire, apparently caught between the tracks, Mr. Alvarez jumped out of the Jeep and ran. But, Mrs. Alvarez was tracked to a modest home in the north section of Compton. In filing the charges, Cooley said he was unmoved by Alvarezs emotional state before the crash. 
2 | -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/tmp.py: -------------------------------------------------------------------------------- 1 | import jsonlines 2 | 3 | part1 = [] 4 | with jsonlines.open("peersum_sampled_part1.json") as reader: 5 | for line in reader: 6 | part1.append(line) 7 | 8 | part2 = [] 9 | with jsonlines.open("peersum_sampled_part2.json") as reader: 10 | for line in reader: 11 | part2.append(line) 12 | 13 | all_samples_source = set([]) 14 | all_samples_summary = set([]) 15 | for sample in part1 + part2: 16 | all_samples_source.add(sample["source_documents"][0]) 17 | all_samples_summary.add(sample["summary"]) 18 | print(len(all_samples_source)) 19 | print(len(all_samples_summary)) -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3==1.13.5 2 | botocore==1.16.5 3 | certifi==2020.4.5.1 4 | chardet==3.0.4 5 | click==7.1.2 6 | decorator==4.4.2 7 | docutils==0.15.2 8 | future==0.18.2 9 | idna==2.9 10 | jmespath==0.9.5 11 | joblib==0.14.1 12 | networkx==2.4 13 | nltk==3.5 14 | numpy==1.18.4 15 | python-dateutil==2.8.1 16 | pytorch-transformers==1.2.0 17 | regex==2020.5.7 18 | requests==2.23.0 19 | s3transfer==0.3.3 20 | sacremoses==0.0.43 21 | scikit-learn==0.22.2.post1 22 | scipy==1.4.1 23 | sentencepiece==0.1.86 24 | six==1.14.0 25 | sklearn==0.0 26 | torch==1.5.0 27 | tqdm==4.46.0 28 | urllib3==1.25.9 29 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_1/references/ref1.txt: -------------------------------------------------------------------------------- 1 | Juan Alvarez left his car on a railroad track in Glendale CA. It set off a collision that derailed two trains, killed at least 10 people and injured nearly 200. Witnesses identified Alvarez and he was detained. He was distraught and put on suicide watch. Glendale police believed he intended to commit suicide but changed his mind. He was taken into custody and charged with at least 10 counts of homicide. Investigators felt Alvarez wanted to kill himself because his estranged wife denied him visits with his children. He was held on suspicion of murder in Los Angeles without bail. 2 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_1/references/ref2.txt: -------------------------------------------------------------------------------- 1 | A man intent on suicide left his car on a railroad track in Glendale, California, where it set off a collision that derailed two commuter trains and killed ten people (later raised to eleven) and injured nearly 200. Police held Juan Manuel Alvarez on ten counts of murder and indicated that he had intended suicide but changed his mind and fled his vehicle before the train struck. Alvarez, who had self-inflicted superficial wounds, remained at the scene and was cooperating with police. He has a criminal record involving drugs and his wife obtained a restraining order against him. 
2 | -------------------------------------------------------------------------------- /dataset_analysis/crossdoc_relationship/SemBERT-master/pytorch_pretrained_bert/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.6.1" 2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer 3 | 4 | from .modeling import (BertConfig, BertModel, BertForPreTraining, 5 | BertForMaskedLM, BertForNextSentencePrediction, 6 | BertForSequenceClassification, BertForMultipleChoice, 7 | BertForTokenClassification, BertForQuestionAnswering, 8 | load_tf_weights_in_bert) 9 | 10 | from .optimization import BertAdam 11 | 12 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path 13 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_1/summaries/summary5.txt: -------------------------------------------------------------------------------- 1 | A southbound commuter train heading to downtown Los Angeles hit the Jeep Grand Cherokee parked on the tracks, said Glendale Police Chief Randy G. Adams. But then in the early morning darkness of Wednesday, Juan Manuel Alvarez introduced his troubled life to the world when he drove his Jeep Cherokee onto the commuter railroad tracks of Glendale in metropolitan Los Angeles, killing at least 10 people and injuring about 200 more. Glendale Police Chief Adams said Alvarez will be facing at least 10 counts of homicide. Glendale Police Chief Adams said Alvarez-watched the chaos after fleeing his vehicle. 2 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_2/summaries/summary2.txt: -------------------------------------------------------------------------------- 1 | no INSURERS no to no out no estimated no 7.3bn no 3.7bn) no Florida no a no of no Andrew no by no the no disaster no industry no ever no The no is no first no tally no the no resulting no the no which no through no Florida no week. no the no region no is no that no people no have no electricity no at no 150,000 no either no or no living no ruins. no George no yesterday no his no visit no the no since no hurricane no Although no had no been no preliminary no at no level no insurance no yesterday's no comes no the no Claims no division no the no Insurance no Group, no property-casualty no trade no 2 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_1/references/ref3.txt: -------------------------------------------------------------------------------- 1 | Juan Manuel Alvarez drove his SUV onto Glendale train tracks in a suicide attempt. When his SUV got stuck between the tracks he got out and left. A southbound Metrolink commuter train crashed into the immovable SUV and climbed over it. The front cab car rammed into an adjacent stationary Union Pacific work train and toppled it. The remaining cars jackknifed and derailed a northbound Metrolink commuter train. The accident and subsequent fire killed 11 and injured 200. Alvarez watched the crash then attempted suicide again by slashing his wrists and stabbing himself. He was arrested and charged with homicide. 
2 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_1/summaries/summary3.txt: -------------------------------------------------------------------------------- 1 | Authorities believe Juan Manuel Alvarez drove his Jeep Grand Cherokee around the rail crossing barrier and onto the Metrolink train tracks at Chevy Chase Drive in Glendale about 6 a.m. Wednesday. A 25-year-old man was arrested following Wednesday's commuter train collision that killed at least 10 people near downtown Los Angeles. He apparently changed his mind about killing himself, abandoned the vehicle and watched the crash, the Glendale police chief, Randy G. Adams, said. Alvarez is accused of leaving a Jeep Cherokee on the tracks, causing the derailment of one train, which then crashed into another. 2 | -------------------------------------------------------------------------------- /utils/significance_test.py: -------------------------------------------------------------------------------- 1 | # significance test 2 | import scipy.stats as stats 3 | 4 | # two-tailed paired t-test 5 | def paired_ttest(a, b): 6 | result = stats.ttest_rel(a, b) 7 | return result 8 | 9 | # two-tailed independent t-test 10 | def ind_ttest(a, b): 11 | result = stats.ttest_ind(a, b) 12 | return result 13 | 14 | # two-tailed one sample test 15 | def onesample_ttest(a, popmean): 16 | result = stats.ttest_1samp(a=a, popmean=popmean) 17 | return result 18 | 19 | 20 | if __name__ == "__main__": 21 | a = [0.1, 0.3, 0.15, 0.6, 0.22] 22 | b = [0.51, 0.61, 0.21, 0.84, 0.30] 23 | print(paired_ttest(a, b)) 24 | print(onesample_ttest(a, 0.01)) -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/splitting_data.py: -------------------------------------------------------------------------------- 1 | import jsonlines 2 | import os 3 | 4 | n_clusters = 10 5 | 6 | dir = "./splitted_data_part2" 7 | for f in os.listdir(dir): 8 | os.remove(os.path.join(dir, f)) 9 | 10 | samples = [] 11 | with jsonlines.open("peersum_sampled_part2.json") as reader: 12 | for sample in reader: 13 | samples.append(sample) 14 | print("all samples", len(samples)) 15 | 16 | for x in range(n_clusters): 17 | count = int(len(samples) / n_clusters) 18 | samples_cluster = samples[count * x:count * (x + 1)] 19 | with jsonlines.open(dir + "/peersum_sampled_%d.json" % x, "w") as writer: 20 | writer.write_all(samples_cluster) 21 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/utils/evaluator.py: -------------------------------------------------------------------------------- 1 | from rouge.rouge import Rouge 2 | from resources import * 3 | from collections import OrderedDict 4 | 5 | def add_result(all_dic,result): 6 | for metric in result: 7 | if metric in all_dic: 8 | all_dic[metric].append(result[metric]) 9 | else: 10 | all_dic[metric] = [result[metric]] 11 | 12 | 13 | def evaluate_summary_rouge(cand,model,max_sum_len=100): 14 | rouge_scorer = Rouge(ROUGE_DIR,BASE_DIR,True) 15 | r1, r2, rl, rsu4 = rouge_scorer(cand,[model],max_sum_len) 16 | rouge_scorer.clean() 17 | dic = OrderedDict() 18 | dic['ROUGE-1'] = r1 19 | dic['ROUGE-2'] = r2 20 | dic['ROUGE-L'] = rl 21 | dic['ROUGE-SU4'] = rsu4 22 | return dic 23 | 24 | -------------------------------------------------------------------------------- /dataset_analysis/tmp.py: 
-------------------------------------------------------------------------------- 1 | import jsonlines 2 | 3 | for dataset in ["multinews", "wcep", "multixscience", "wikisum"]: 4 | print(dataset) 5 | papers = [] 6 | with jsonlines.open("../../datasets/%s_lemma_stop.json"%dataset) as reader: 7 | for paper in reader: 8 | paper["summary"] = paper["summary"].replace(" sentence_split", "") 9 | source_documents = paper["source_documents"] 10 | source_documents_new = [] 11 | for d in source_documents: 12 | source_documents_new.append(d.replace(" sentence_split", "")) 13 | paper["source_documents"] = source_documents_new 14 | papers.append(paper) 15 | with jsonlines.open("processed/%s_processed.json"%dataset, "w") as writer: 16 | writer.write_all(papers) -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/readers/InputExample.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | 4 | class InputExample: 5 | """ 6 | Structure for one input example with texts, the label and a unique id 7 | """ 8 | def __init__(self, guid: str, texts: List[str], label: Union[int, float]): 9 | """ 10 | Creates one InputExample with the given texts, guid and label 11 | 12 | str.strip() is called on both texts. 13 | 14 | :param guid 15 | id for the example 16 | :param texts 17 | the texts for the example 18 | :param label 19 | the label for the example 20 | """ 21 | self.guid = guid 22 | self.texts = [text.strip() for text in texts] 23 | self.label = label 24 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/models/AbstractModel.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from typing import Union, Tuple, List, Iterable, Dict 3 | from abc import ABC, abstractmethod 4 | 5 | 6 | class AbstractModel(ABC): 7 | @abstractmethod 8 | def get_sentence_embedding_dimension(self) -> int: 9 | """Returns the size of the sentence embedding""" 10 | pass 11 | 12 | @abstractmethod 13 | def tokenize(self, text: str) -> List[str]: 14 | """Tokenizes a text. Returns a list with tokens""" 15 | pass 16 | 17 | @abstractmethod 18 | def get_sentence_features(self, tokens: List[str], pad_seq_length: int): 19 | """This method is passed a tokenized text (tokens). pad_seq_length defines the needed length for padding.""" 20 | pass 21 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_2/summaries/summary1.txt: -------------------------------------------------------------------------------- 1 | US INSURERS expect to pay out an estimated Dollars 7.3bn (Pounds 3.7bn) in Florida as a result of Hurricane Andrew - by far the costliest disaster the industry has ever faced. The figure is the first official tally of the damage resulting from the hurricane, which ripped through southern Florida last week. In the battered region it is estimated that 275,000 people still have no electricity and at least 150,000 are either homeless or are living amid ruins. President George Bush yesterday made his second visit to the region since the hurricane hit. 
Although there had already been some preliminary guesses at the level of insurance claims, yesterday's figure comes from the Property Claims Services division of the American Insurance Services Group, the property-casualty insurers' trade association. 2 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/evaluation/SequentialEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | from typing import Iterable 3 | 4 | class SequentialEvaluator(SentenceEvaluator): 5 | """ 6 | This evaluator allows that multiple sub-evaluators are passed. When the model is evaluated, 7 | the data is passed sequentially to all sub-evaluators. 8 | 9 | The score from the last sub-evaluator will be used as the main score for the best model decision. 10 | """ 11 | def __init__(self, evaluators: Iterable[SentenceEvaluator]): 12 | self.evaluators = evaluators 13 | 14 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 15 | for evaluator in self.evaluators: 16 | main_score = evaluator(model, output_path, epoch, steps) 17 | 18 | return main_score 19 | -------------------------------------------------------------------------------- /dataset_analysis/crossdoc_relationship/SemBERT-master/anayzing_results_miao.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | 5 | data_name = "WIKISUM" 6 | data_dir = "glue_data/%s/"%data_name 7 | with open("snli_model_dir/%s_pred_results.tsv"%data_name) as f: 8 | lines = f.readlines() 9 | 10 | predictions = [] 11 | for line in lines: 12 | predictions.append(line.strip().split("\t")[-1]) 13 | 14 | with open(os.path.join(data_dir, "test.tsv")) as f: 15 | lines = f.readlines() 16 | eval_examples = [] 17 | for line_index, line in enumerate(lines): 18 | if line_index>0: 19 | line_split = line.split("\t") 20 | if line_split[3]=="contradiction": 21 | eval_examples.append([line_split[1], line_split[2]]) 22 | 23 | 24 | count_all = len(predictions) 25 | indexes = range(count_all) 26 | target_indexes = random.sample(indexes, 20) 27 | for target in target_indexes: 28 | exam = eval_examples[target] 29 | print(exam[0], exam[1], "contradiction") -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/losses/CosineSimilarityLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | from ..SentenceTransformer import SentenceTransformer 5 | 6 | class CosineSimilarityLoss(nn.Module): 7 | def __init__(self, model: SentenceTransformer): 8 | super(CosineSimilarityLoss, self).__init__() 9 | self.model = model 10 | 11 | 12 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 13 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 14 | rep_a, rep_b = reps 15 | 16 | output = torch.cosine_similarity(rep_a, rep_b) 17 | loss_fct = nn.MSELoss() 18 | 19 | if labels is not None: 20 | loss = loss_fct(output, labels.view(-1)) 21 | return loss 22 | else: 23 | return reps, output -------------------------------------------------------------------------------- 
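A hedged training sketch for the vendored sentence_transformers package above (added for illustration; not a repository file). The pre-trained model name and the fit() arguments are assumptions based on the v0.2.3-era API that this copy declares in its __init__.py; InputExample, SentencesDataset and CosineSimilarityLoss are the classes shown in the surrounding sections.

from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, SentencesDataset
from sentence_transformers.losses import CosineSimilarityLoss
from sentence_transformers.readers import InputExample

# Two sentence pairs with target cosine similarities in [0, 1].
examples = [InputExample(guid="0", texts=["a cat sits outside", "a kitten rests outdoors"], label=0.9),
            InputExample(guid="1", texts=["a cat sits outside", "the market crashed today"], label=0.1)]
model = SentenceTransformer("bert-base-nli-mean-tokens")  # assumed model name
dataset = SentencesDataset(examples, model)
loader = DataLoader(dataset, shuffle=True, batch_size=2)
# CosineSimilarityLoss (defined above) regresses cosine(rep_a, rep_b) onto the labels with MSE.
model.fit(train_objectives=[(loader, CosineSimilarityLoss(model))], evaluator=None, epochs=1)
--------------------------------------------------------------------------------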
/dataset_analysis/crossdoc_relationship/disagreement_percentage.py: -------------------------------------------------------------------------------- 1 | import jsonlines 2 | 3 | samples = [] 4 | with jsonlines.open("../../crawling_data/data/peersum_all.json") as reader: 5 | for line in reader: 6 | samples.append(line) 7 | print("all samples", len(samples)) 8 | 9 | samples_disagreement = 0 10 | samples_valid = 0 11 | for sample in samples: 12 | samples_valid += 1 13 | reviews = sample["reviews"] 14 | label = sample["label"] 15 | with_disagreement = False 16 | for i, review_i in enumerate(reviews): 17 | for j, review_j in enumerate(reviews): 18 | if j > i and review_i["rating"]!=-1 and review_j["rating"]!=-1: # a rating of -1 marks a missing review score 19 | dis = review_i["rating"] - review_j["rating"] 20 | if abs(dis) >= 4: 21 | with_disagreement = True 22 | if with_disagreement: 23 | samples_disagreement += 1 24 | # print(sample) 25 | print(samples_disagreement) 26 | print("disagreement ratio", samples_disagreement/samples_valid) 27 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/summariser/ngram_vector/base.py: -------------------------------------------------------------------------------- 1 | class Sentence: 2 | """The sentence data structure. 3 | 4 | Args: 5 | tokens (list of str): the list of word tokens. 6 | doc_id (str): the identifier of the document that the sentence 7 | comes from. 8 | position (int): the position of the sentence in the source document. 9 | """ 10 | def __init__(self, tokens, doc_id, position): 11 | 12 | self.tokens = tokens 13 | """ tokens as a list. """ 14 | 15 | self.doc_id = doc_id 16 | """ document identifier of the sentence. """ 17 | 18 | self.position = position 19 | """ position of the sentence within the document. """ 20 | 21 | self.concepts = [] 22 | """ concepts of the sentence. """ 23 | 24 | self.untokenized_form = '' 25 | """ untokenized form of the sentence. """ 26 | 27 | self.length = 0 28 | """ length of the untokenized sentence. """
""" -------------------------------------------------------------------------------- /loading_data/mds_loader.py: -------------------------------------------------------------------------------- 1 | # loading mds data after downloading datasets stored in a same format 2 | import os 3 | import jsonlines 4 | 5 | 6 | def loading_mds(folder, data_name): 7 | print("loading mds", data_name) 8 | 9 | samples = [] 10 | file_name = folder + "/%s.json"%data_name 11 | if os.path.isfile(file_name): 12 | with jsonlines.open(file_name) as reader: 13 | for line in reader: 14 | samples.append(line) 15 | else: 16 | print("File does not exist.") 17 | 18 | return samples 19 | 20 | 21 | if __name__=="__main__": 22 | import random 23 | samples = loading_mds(folder="../../datasets", data_name="wikisum") 24 | 25 | indexes = range(len(samples)) 26 | target_indexes = random.sample(indexes, 5) 27 | for i in target_indexes: 28 | sample = samples[i] 29 | print("*********************") 30 | for item in sample["source_documents"]: 31 | print(item) 32 | print("###########") 33 | print(sample["summary"]) 34 | -------------------------------------------------------------------------------- /dataset_analysis/crossdoc_relationship/SemBERT-master/transform_snli_miao.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def transforming_snli(snli_dir): 4 | with open(os.path.join(snli_dir, "snli_1.0_test.txt")) as f: 5 | lines = f.readlines() 6 | sentence_pairs = [] 7 | sentence_pairs.append("index" + "\t" + "sentence_1" + "\t" + "sentence_2" + "\t" + "gold_label" + "\t" + "cluster_index" + "\n") 8 | 9 | index = 0 10 | for line_index, line in enumerate(lines): 11 | line = line.split("\t") 12 | if line_index == 0: 13 | continue 14 | if line[0] in ["contradiction", "entailment", "neutral"]: 15 | sentence_pairs.append(str(index) + "\t" + line[5] + "\t" + line[6] + "\t" + line[0] + "\t" + "0" + "\n") 16 | index += 1 17 | return sentence_pairs 18 | 19 | 20 | if __name__ == "__main__": 21 | sentence_pairs = transforming_snli("glue_data/snli_1.0") 22 | output_dir = "glue_data/SNLI" 23 | print("all sentence pairs", len(sentence_pairs)) 24 | with open(os.path.join(output_dir, "test.tsv"), "w") as f: 25 | f.writelines(sentence_pairs) -------------------------------------------------------------------------------- /dataset_analysis/crossdoc_relationship/lda_debugging.py: -------------------------------------------------------------------------------- 1 | from gensim.corpora import Dictionary 2 | from gensim.models.ldamodel import LdaModel 3 | from gensim.test.utils import common_corpus, common_dictionary 4 | from gensim.models.callbacks import PerplexityMetric, Callback 5 | 6 | 7 | lda = LdaModel(corpus=common_corpus, id2word=common_dictionary, num_topics=10, chunksize=128, iterations=50, passes=10) 8 | for i in range(20000): 9 | lda.update(common_corpus) 10 | print(lda.log_perplexity(chunk=common_corpus)) 11 | 12 | # # model evaluation 13 | # evals = [] 14 | # for i in range(20): 15 | # eval = lda.log_perplexity(chunk=corpus) 16 | # print("Pass", 10+i, eval) 17 | # lda.update(corpus) 18 | # 19 | # for i in range(len(common_texts)): 20 | # doc_i = common_texts[i] 21 | # topic_s = [0.0]*lda.num_topics 22 | # for item in lda[dictionary.doc2bow(doc_i)]: 23 | # lda.num_topics 24 | # print(topic_s) 25 | # for j in range(i+1, len(common_texts)): 26 | # doc_j = common_texts[j] 27 | # topic_d = lda[dictionary.doc2bow(doc_j)] 28 | # print(cosine_similarity([topic_s], [topic_d])[0][0]) 29 | 
-------------------------------------------------------------------------------- /utils/evaluating.py: -------------------------------------------------------------------------------- 1 | """ 2 | Evaluating generated summaries in terms of different metrics 3 | 4 | """ 5 | import json 6 | import os 7 | from metrics import evaluating_summaries_multi_sources 8 | 9 | if __name__ == "__main__": 10 | dataset_name = "peersum" 11 | generated_summaries_folder = "" # set this to the folder containing the generated-summary json files 12 | 13 | files = os.listdir(generated_summaries_folder) 14 | print(dataset_name, "all samples", len(files)) 15 | 16 | references = [] 17 | predictions = [] 18 | document_clusters = [] 19 | for file in files: 20 | if not os.path.isdir(os.path.join(generated_summaries_folder, file)): # check the full path, not just the bare file name 21 | with open(os.path.join(generated_summaries_folder, file)) as f: 22 | result = json.load(f) 23 | # print(result) 24 | if "source_documents" in result.keys(): 25 | source_documents = result["source_documents"] 26 | document_clusters.append(source_documents) 27 | predictions.append(result["prediction"]) 28 | references.append(result["reference"]) 29 | 30 | print(evaluating_summaries_multi_sources(gold_summaries=references, generated_summaries=predictions, source_document_clusters=document_clusters)) 31 | -------------------------------------------------------------------------------- /dataset_analysis/crossdoc_relationship/doc2vec_debugging.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from gensim.models.doc2vec import Doc2Vec, TaggedDocument 3 | from gensim.test.utils import common_texts 4 | import multiprocessing 5 | from gensim.models.callbacks import CallbackAny2Vec 6 | 7 | 8 | class EpochCallback(CallbackAny2Vec): 9 | '''Callback to report training loss after each epoch.''' 10 | 11 | def __init__(self): 12 | self.epoch = 0 13 | 14 | def on_epoch_end(self, model): 15 | loss = model.get_latest_training_loss() # note: gensim does not fully support loss tracking for Doc2Vec, so this may report 0.0 16 | print("Epoch", self.epoch, "Loss", loss) 17 | self.epoch += 1 18 | 19 | 20 | if __name__ == "__main__": 21 | samples_tagged = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)] 22 | model = Doc2Vec(dm=0, dbow_words=0, vector_size=300, min_count=1, workers=multiprocessing.cpu_count() - 1, 23 | compute_loss=True, epochs=100, window=15, sample=1e-5, callbacks=[EpochCallback()]) 24 | model.build_vocab(samples_tagged) 25 | model.train(samples_tagged, total_examples=model.corpus_count, epochs=model.epochs, start_alpha=0.025, compute_loss=True, callbacks=[EpochCallback()]) 26 | print("Loss", model.running_training_loss) 27 | print(model.infer_vector(["system", "response"])) -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/evaluation/SentenceEvaluator.py: -------------------------------------------------------------------------------- 1 | class SentenceEvaluator: 2 | """ 3 | Base class for all evaluators 4 | 5 | Extend this class and implement __call__ for custom evaluators. 6 | """ 7 | 8 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 9 | """ 10 | This is called during training to evaluate the model. 11 | It returns a score for the evaluation with a higher score indicating a better result. 12 | 13 | :param model: 14 | the model to evaluate 15 | :param output_path: 16 | path where predictions and metrics are written to 17 | :param epoch 18 | the epoch where the evaluation takes place. 19 | This is used for the file prefixes.
20 | If this is -1, then we assume evaluation on test data. 21 | :param steps 22 | the steps in the current epoch at time of the evaluation. 23 | This is used for the file prefixes. 24 | If this is -1, then we assume evaluation at the end of the epoch. 25 | :return: a score for the evaluation with a higher score indicating a better result 26 | """ 27 | pass 28 | -------------------------------------------------------------------------------- /dataset_analysis/crossdoc_relationship/SemBERT-master/tag_model/tagging.py: -------------------------------------------------------------------------------- 1 | from allennlp.predictors import Predictor 2 | import json 3 | 4 | class SRLPredictor(object): 5 | def __init__(self, SRL_MODEL_PATH): 6 | # use the pretrained SRL model from AllenNLP for simplicity. 7 | self.predictor = Predictor.from_path(SRL_MODEL_PATH) 8 | self.predictor._model = self.predictor._model.cuda() # this only supports GPU computation 9 | 10 | def predict(self, sent): 11 | return self.predictor.predict(sentence=sent) 12 | 13 | 14 | def get_tags(srl_predictor, tok_text, tag_vocab): 15 | if srl_predictor is None: 16 | srl_result = json.loads(tok_text) # can load a pre-tagged dataset for quick evaluation 17 | else: 18 | srl_result = srl_predictor.predict(tok_text) 19 | sen_verbs = srl_result['verbs'] 20 | sen_words = srl_result['words'] 21 | 22 | sent_tags = [] 23 | if len(sen_verbs) == 0: 24 | sent_tags = [["O"] * len(sen_words)] 25 | else: 26 | for ix, verb_tag in enumerate(sen_verbs): 27 | sent_tag = sen_verbs[ix]['tags'] 28 | for tag in sent_tag: 29 | if tag not in tag_vocab: 30 | tag_vocab.append(tag) 31 | sent_tags.append(sent_tag) 32 | 33 | return sen_words, sent_tags 34 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/readers/LabelSentenceReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class LabelSentenceReader: 7 | """Reads in a file that has at least two columns: a label and a sentence. 8 | This reader can for example be used with the BatchHardTripletLoss.
9 | Maps labels automatically to integers""" 10 | def __init__(self, folder, label_col_idx=0, sentence_col_idx=1): 11 | self.folder = folder 12 | self.label_map = {} 13 | self.label_col_idx = label_col_idx 14 | self.sentence_col_idx = sentence_col_idx 15 | 16 | def get_examples(self, filename, max_examples=0): 17 | examples = [] 18 | 19 | id = 0 20 | for line in open(os.path.join(self.folder, filename), encoding="utf-8"): 21 | splits = line.strip().split('\t') 22 | label = splits[self.label_col_idx] 23 | sentence = splits[self.sentence_col_idx] 24 | 25 | if label not in self.label_map: 26 | self.label_map[label] = len(self.label_map) 27 | 28 | label_id = self.label_map[label] 29 | guid = "%s-%d" % (filename, id) 30 | id += 1 31 | examples.append(InputExample(guid=guid, texts=[sentence], label=label_id)) 32 | 33 | if 0 < max_examples <= id: 34 | break 35 | 36 | return examples -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/losses/TripletLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import torch.nn.functional as F 5 | from enum import Enum 6 | from ..SentenceTransformer import SentenceTransformer 7 | 8 | class TripletDistanceMetric(Enum): 9 | """ 10 | The metric for the triplet loss 11 | """ 12 | COSINE = lambda x, y: 1 - F.cosine_similarity(x, y) 13 | EUCLIDEAN = lambda x, y: F.pairwise_distance(x, y, p=2) 14 | MANHATTAN = lambda x, y: F.pairwise_distance(x, y, p=1) 15 | 16 | class TripletLoss(nn.Module): 17 | def __init__(self, model: SentenceTransformer, distance_metric=TripletDistanceMetric.EUCLIDEAN, triplet_margin=1): 18 | super(TripletLoss, self).__init__() 19 | self.model = model 20 | self.distance_metric = distance_metric 21 | self.triplet_margin = triplet_margin 22 | 23 | 24 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 25 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 26 | 27 | rep_anchor, rep_pos, rep_neg = reps 28 | distance_pos = self.distance_metric(rep_anchor, rep_pos) 29 | distance_neg = self.distance_metric(rep_anchor, rep_neg) 30 | 31 | losses = F.relu(distance_pos - distance_neg + self.triplet_margin) 32 | return losses.mean() -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/readers/TripletReader.py: -------------------------------------------------------------------------------- 1 | from . 
import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class TripletReader(object): 7 | """ 8 | Reads in a triplet dataset: each line contains (at least) 3 columns, one anchor column (s1), 9 | one positive example (s2) and one negative example (s3) 10 | """ 11 | def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, s3_col_idx=2, has_header=False, delimiter="\t", 12 | quoting=csv.QUOTE_NONE): 13 | self.dataset_folder = dataset_folder 14 | self.s1_col_idx = s1_col_idx 15 | self.s2_col_idx = s2_col_idx 16 | self.s3_col_idx = s3_col_idx 17 | self.has_header = has_header 18 | self.delimiter = delimiter 19 | self.quoting = quoting 20 | 21 | def get_examples(self, filename, max_examples=0): 22 | """ 23 | filename specifies the file inside dataset_folder to read triplets from. 24 | """ 25 | data = csv.reader(open(os.path.join(self.dataset_folder, filename), encoding="utf-8"), delimiter=self.delimiter, 26 | quoting=self.quoting) 27 | examples = [] 28 | if self.has_header: 29 | next(data) 30 | 31 | for id, row in enumerate(data): 32 | s1 = row[self.s1_col_idx] 33 | s2 = row[self.s2_col_idx] 34 | s3 = row[self.s3_col_idx] 35 | 36 | examples.append(InputExample(guid=filename+str(id), texts=[s1, s2, s3], label=1)) 37 | if max_examples > 0 and len(examples) >= max_examples: 38 | break 39 | 40 | return examples -------------------------------------------------------------------------------- /loading_data/peersum_loader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import random 3 | 4 | import jsonlines 5 | import numpy as np 6 | 7 | def loading_peersum(folder="../crawling_data", including_public=True, including_author=True, including_abstract=True): 8 | print("loading peersum") 9 | papers = [] 10 | with jsonlines.open(folder + "/data/peersum_all.json", "r") as reader: 11 | for paper in reader: 12 | papers.append(paper) 13 | clusters = [] 14 | for paper in papers: 15 | summary = paper["meta_review"] 16 | paper_id = paper['paper_id'] 17 | source_documents = [] 18 | if including_abstract: 19 | source_documents.append(paper["paper_abstract"]) 20 | 21 | for review in paper["reviews"]: 22 | if "comment" in review.keys(): 23 | text = review["comment"] 24 | if review["writer"] == "official_reviewer": 25 | source_documents.append(text) 26 | if review["writer"] == "author" and including_author: 27 | source_documents.append(text) 28 | if review["writer"] == "public" and including_public: 29 | source_documents.append(text) 30 | 31 | if summary.strip() != "" and len(source_documents) > 1: 32 | random.shuffle(source_documents) 33 | clusters.append({"source_documents": source_documents, "summary": summary, "paper_id": paper_id, "label": paper["label"], "paper_acceptance": paper["paper_acceptance"]}) 34 | else: 35 | print(paper["paper_id"]) 36 | 37 | return clusters 38 | -------------------------------------------------------------------------------- /crawling_data/prepare_peersum_for_huggingface.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # Transform the PeerSum dataset into training datasets for other baseline models like BART 3 | 4 | import random 5 | import jsonlines 6 | 7 | 8 | def prepare_for_huggingface(folder="data"): 9 | print("loading peersum") 10 | papers = [] 11 | with jsonlines.open(folder + "/peersum_all.json", "r") as reader: 12 | for paper in reader: 13 | papers.append(paper) 14 | 15 | samples = [] 16 | for paper in papers: 17 | review_ids = [] 18 | review_writers = [] 19 | review_contents = []
20 | review_ratings = [] 21 | review_confidences = [] 22 | review_reply_tos = [] 23 | for review in paper["reviews"]: 24 | review_ids.append(review["review_id"]) 25 | review_writers.append(review["writer"]) 26 | review_contents.append(review["comment"]) 27 | review_ratings.append(review["rating"]) 28 | review_confidences.append(review["confidence"]) 29 | review_reply_tos.append(review["reply_to"]) 30 | 31 | paper["review_ids"] = review_ids 32 | paper["review_writers"] = review_writers 33 | paper["review_contents"] = review_contents 34 | paper["review_ratings"] = review_ratings 35 | paper["review_confidences"] = review_confidences 36 | paper["review_reply_tos"] = review_reply_tos 37 | 38 | del paper["reviews"] 39 | samples.append(paper) 40 | print(len(samples)) 41 | with jsonlines.open("peersum_huggingface.jsonl", "w") as writer: 42 | writer.write_all(samples) 43 | 44 | 45 | 46 | if __name__ == "__main__": 47 | prepare_for_huggingface() -------------------------------------------------------------------------------- /dataset_analysis/crossdoc_relationship/SemBERT-master/tag_model/tag_tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import collections 8 | import pickle 9 | import os 10 | import logging 11 | from os.path import join, exists 12 | from os import makedirs 13 | 14 | 15 | logger = logging.getLogger(__name__) 16 | TAG_VOCAB = ['[PAD]','[CLS]', '[SEP]', 'B-V', 'I-V', 'B-ARG0', 'I-ARG0', 'B-ARG1', 'I-ARG1', 'B-ARG2', 'I-ARG2', 'B-ARG4', 'I-ARG4', 'B-ARGM-TMP', 'I-ARGM-TMP', 'B-ARGM-LOC', 'I-ARGM-LOC', 'B-ARGM-CAU', 'I-ARGM-CAU', 'B-ARGM-PRP', 'I-ARGM-PRP', 'O'] 17 | # or load the full SRL vocab; this will not affect the performance too much. 18 | 19 | def load_tag_vocab(tag_vocab_file): 20 | vocab_list = ["[PAD]", "[CLS]", "[SEP]"] 21 | with open(tag_vocab_file, 'rb') as f: 22 | vocab_list.extend(pickle.load(f)) 23 | return vocab_list 24 | 25 | 26 | class TagTokenizer(object): 27 | def __init__(self): 28 | self.tag_vocab = TAG_VOCAB 29 | self.ids_to_tags = collections.OrderedDict( 30 | [(ids, tag) for ids, tag in enumerate(TAG_VOCAB)]) 31 | 32 | def convert_tags_to_ids(self, tags): 33 | """Converts a sequence of tags into ids using the vocab.""" 34 | ids = [] 35 | for tag in tags: 36 | if tag not in TAG_VOCAB: 37 | tag = 'O' 38 | ids.append(TAG_VOCAB.index(tag)) 39 | 40 | return ids 41 | 42 | def convert_ids_to_tags(self, ids): 43 | """Converts a sequence of ids into tags using the vocab.""" 44 | tags = [] 45 | for i in ids: 46 | tags.append(self.ids_to_tags[i]) 47 | return tags 48 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_2/input_docs/FT923-5835: -------------------------------------------------------------------------------- 2 | GENERAL ACCIDENT, the leading British insurer, said yesterday that insurance claims arising from Hurricane Andrew could 'cost it as much as Dollars 40m.'
5 | Lord Airlie, the chairman who was addressing an extraordinary shareholders' meeting, said: 'On the basis of emerging information, General Accident advise that the losses to their US operations arising from Hurricane Andrew, which struck Florida and Louisiana, might in total reach the level at which external catastrophe reinsurance covers would become exposed'.
8 | What this means is that GA is able to pass on its losses to external reinsurers once a certain claims threshold has been breached.
11 | It believes this threshold may be breached in respect of Hurricane Andrew claims.
14 | However, if this happens, it would suffer a post-tax loss of Dollars 40m (Pounds 20m).
17 | Mr Nelson Robertson, GA's chief general manager, explained later that the company has a 1/2 per cent share of the Florida market.
20 | It has a branch in Orlando.
23 | The company's loss adjusters are in the area trying to estimate the losses.
26 | Their guess is that losses to be faced by all insurers may total more than Dollars 8bn.
29 | Not all damaged property in the area is insured and there have been estimates that the storm caused more than Dollars 20bn of damage.
32 | However, other insurers have estimated that losses could be as low as Dollars 1bn in total.
35 | Mr Robertson said: 'No one knows at this time what the exact loss is'.
-------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/readers/STSDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class STSDataReader: 7 | """ 8 | Reads in the STS dataset. Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx) 9 | """ 10 | def __init__(self, dataset_folder, s1_col_idx=5, s2_col_idx=6, score_col_idx=4, delimiter="\t", 11 | quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5): 12 | self.dataset_folder = dataset_folder 13 | self.score_col_idx = score_col_idx 14 | self.s1_col_idx = s1_col_idx 15 | self.s2_col_idx = s2_col_idx 16 | self.delimiter = delimiter 17 | self.quoting = quoting 18 | self.normalize_scores = normalize_scores 19 | self.min_score = min_score 20 | self.max_score = max_score 21 | 22 | def get_examples(self, filename, max_examples=0): 23 | """ 24 | filename specifies which data split to use (train.csv, dev.csv, test.csv). 25 | """ 26 | data = csv.reader(open(os.path.join(self.dataset_folder, filename), encoding="utf-8"), 27 | delimiter=self.delimiter, quoting=self.quoting) 28 | examples = [] 29 | for id, row in enumerate(data): 30 | score = float(row[self.score_col_idx]) 31 | if self.normalize_scores: # Normalize to a 0...1 value 32 | score = (score - self.min_score) / (self.max_score - self.min_score) 33 | 34 | s1 = row[self.s1_col_idx] 35 | s2 = row[self.s2_col_idx] 36 | examples.append(InputExample(guid=filename+str(id), texts=[s1, s2], label=score)) 37 | 38 | if max_examples > 0 and len(examples) >= max_examples: 39 | break 40 | 41 | return examples 42 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_1/input_docs/XIN_ENG_20050126.0287: -------------------------------------------------------------------------------- 3 | Man arrested following deadly train collision in US
6 | LOS ANGELES, Jan. 26 (Xinhua)
10 | A 25-year-old man was arrested
11 | following Wednesday's commuter train collision that killed at
12 | least 10 people near downtown Los Angeles.
15 | Glendale Police Chief Randy Adams said Juan Manuel Alvarez, who
16 | intentionally parked his Jeep on the rail tracks and caused the
17 | train collision, will be facing at least 10 counts of homicide.
20 | A southbound commuter train from Ventura County struck the Jeep
21 | at about 6 a.m. local time and caromed into a stationary freight
22 | train locomotive before colliding with a northbound commuter train
23 | from Union Station in downtown Los Angeles.
26 | At least 10 people were killed and hundreds were hurt in the
27 | crash that also disrupted morning rush hour commuter train traffic.
30 | Alvarez, who is on the suicide watch list, drove his Jeep
31 | Cherokee on the tracks, then changed his mind about taking his
32 | life and fled from the vehicle when the train was approaching.
35 | Adams said Alvarez had made some superficial attempts overnight
36 | to kill himself -- slitting his wrists and stabbing his chest --
37 | and was treated for those wounds before being booked.
40 | According to police records, Alvarez has a previous arrest
41 | history, possibly involving drugs.
44 | "He has been cooperative and he has talked to our investigators,
45 | " Adams said.
48 | Alvarez is accused of leaving a Jeep Cherokee on the tracks,
49 | causing the derailment of one train, which then crashed into
50 | another.
-------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/readers/NLIDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | 7 | class NLIDataReader(object): 8 | """ 9 | Reads in the Stanford NLI dataset and the MultiGenre NLI dataset 10 | """ 11 | def __init__(self, dataset_folder): 12 | self.dataset_folder = dataset_folder 13 | 14 | def get_examples(self, filename, max_examples=0): 15 | """ 16 | filename specifies which data split to use (train, dev, test). 17 | Expects that self.dataset_folder contains the files s1.$data_split.gz, s2.$data_split.gz, 18 | labels.$data_split.gz, e.g., for the train split, s1.train.gz, s2.train.gz, labels.train.gz 19 | """ 20 | s1 = gzip.open(os.path.join(self.dataset_folder, 's1.' + filename), 21 | mode="rt", encoding="utf-8").readlines() 22 | s2 = gzip.open(os.path.join(self.dataset_folder, 's2.' + filename), 23 | mode="rt", encoding="utf-8").readlines() 24 | labels = gzip.open(os.path.join(self.dataset_folder, 'labels.' + filename), 25 | mode="rt", encoding="utf-8").readlines() 26 | 27 | examples = [] 28 | id = 0 29 | for sentence_a, sentence_b, label in zip(s1, s2, labels): 30 | guid = "%s-%d" % (filename, id) 31 | id += 1 32 | examples.append(InputExample(guid=guid, texts=[sentence_a, sentence_b], label=self.map_label(label))) 33 | 34 | if 0 < max_examples <= len(examples): 35 | break 36 | 37 | return examples 38 | 39 | @staticmethod 40 | def get_labels(): 41 | return {"contradiction": 0, "entailment": 1, "neutral": 2} 42 | 43 | def get_num_labels(self): 44 | return len(self.get_labels()) 45 | 46 | def map_label(self, label): 47 | return self.get_labels()[label.strip().lower()] -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/summ_eval/README.md: -------------------------------------------------------------------------------- 1 | ## Using Supert for Single-Document Summarization: A Case Study on SummEval 2 | 3 | Supert was originally designed to rate multi-document summaries, but it can also be used 4 | to rate single-document summaries. In this directory we 5 | use Supert to evaluate summaries from the [SummEval](https://github.com/Yale-LILY/SummEval) 6 | dataset. The summaries 7 | are generated using documents from CNN/DailyMail, and are rated by human experts 8 | on their quality in *consistency*, *fluency*, *coherence* and *relevance*. 9 | 10 | 11 | ### Step 1: Prepare SummEval Data 12 | Follow the instructions [here](https://github.com/Yale-LILY/SummEval#data-preparation) 13 | to prepare the data. 14 | It will generate multiple folders, but we only need *pairs/model_annotations.aligned.paired.jsonl*. 15 | Move that jsonl file to this directory. It includes 100 topics, each with 23 summaries 16 | and their human ratings. 17 | 18 | 19 | ### Step 2: Generate Supert scores 20 | Simply run the following command: 21 | 22 | ```bash 23 | python supert_summ_eval.py 24 | ``` 25 | It will use Supert to rate all summaries in the SummEval dataset and compute 26 | the correlation between the Supert scores and the human ratings.
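As a sketch of what that correlation step computes (variable names here are illustrative, not the script's actual ones):

```python
from scipy.stats import pearsonr

# One entry per (topic, summary) pair, flattened into parallel lists.
supert_scores = [0.61, 0.47, 0.72, 0.55]  # hypothetical Supert scores
human_ratings = [4.0, 3.0, 4.5, 3.5]      # hypothetical expert ratings

r, p_value = pearsonr(supert_scores, human_ratings)
print("Pearson r = %.3f (p = %.3g)" % (r, p_value))
```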
27 | The default setup is to use the first 10 sentences from each document to build 28 | the pseudo reference, but you could change the strategy at line 101 in *supert_summ_eval.py*: 29 | 30 | ```python 31 | supert_scores = generate_supert_scores(summ_eval_data, 'top10') 32 | ``` 33 | The generated Supert scores will also be saved to a pickle file to facilitate reuse. 34 | 35 | 36 | ### Results 37 | 38 | The Pearson correlations between Supert scores and human ratings are shown below. 39 | 40 | | | Coherence | Fluency | Relevance | Consistency | 41 | |---------------|-----------|---------|-----------|-------------| 42 | | Supert, top10 | .220 | .300 | .309 | .391 | 43 | | Supert, top200| .221 | .287 | .311 | .389 | 44 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/losses/MultipleNegativesRankingLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import torch.nn.functional as F 5 | 6 | 7 | class MultipleNegativesRankingLoss(nn.Module): 8 | def __init__(self, sentence_embedder): 9 | super(MultipleNegativesRankingLoss, self).__init__() 10 | self.sentence_embedder = sentence_embedder 11 | 12 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 13 | reps = [self.sentence_embedder(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 14 | 15 | reps_a, reps_b = reps 16 | return self.multiple_negatives_ranking_loss(reps_a, reps_b) 17 | 18 | # Multiple Negatives Ranking Loss 19 | # Paper: https://arxiv.org/pdf/1705.00652.pdf 20 | # Efficient Natural Language Response Suggestion for Smart Reply 21 | # Section 4.4 22 | def multiple_negatives_ranking_loss(self, embeddings_a: Tensor, embeddings_b: Tensor): 23 | """ 24 | Compute the loss over a batch with two embeddings per example. 25 | 26 | Each pair is a positive example. For each embedding in embeddings_a, all other embeddings 27 | in embeddings_b serve as negative examples. 28 | 29 | See the paper for more information: https://arxiv.org/pdf/1705.00652.pdf 30 | (Efficient Natural Language Response Suggestion for Smart Reply, Section 4.4) 31 | 32 | :param embeddings_a: 33 | Tensor of shape (batch_size, embedding_dim) 34 | :param embeddings_b: 35 | Tensor of shape (batch_size, embedding_dim) 36 | :return: 37 | The scalar loss 38 | """ 39 | scores = torch.matmul(embeddings_a, embeddings_b.t()) 40 | diagonal_mean = torch.mean(torch.diag(scores)) 41 | mean_log_row_sum_exp = torch.mean(torch.logsumexp(scores, dim=1)) 42 | return -diagonal_mean + mean_log_row_sum_exp 43 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/models/Dense.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from torch import functional as F 5 | from typing import Union, Tuple, List, Iterable, Dict 6 | import os 7 | import json 8 | from ..util import fullname, import_from_string 9 | 10 | 11 | class Dense(nn.Module): 12 | """Feed-forward layer with activation function. 13 | 14 | This layer takes a fixed-size sentence embedding and passes it through a feed-forward layer. Can be used to generate deep averaging networks (DAN).
15 | """ 16 | def __init__(self, in_features, out_features, bias=True, activation_function=nn.Tanh()): 17 | super(Dense, self).__init__() 18 | self.in_features = in_features 19 | self.out_features = out_features 20 | self.bias = bias 21 | self.activation_function = activation_function 22 | self.linear = nn.Linear(in_features, out_features, bias=bias) 23 | 24 | def forward(self, features: Dict[str, Tensor]): 25 | features.update({'sentence_embedding': self.activation_function(self.linear(features['sentence_embedding']))}) 26 | return features 27 | 28 | def get_sentence_embedding_dimension(self) -> int: 29 | return self.out_features 30 | 31 | def save(self, output_path): 32 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 33 | json.dump({'in_features': self.in_features, 'out_features': self.out_features, 'bias': self.bias, 'activation_function': fullname(self.activation_function)}, fOut) 34 | 35 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 36 | 37 | @staticmethod 38 | def load(input_path): 39 | with open(os.path.join(input_path, 'config.json')) as fIn: 40 | config = json.load(fIn) 41 | 42 | config['activation_function'] = import_from_string(config['activation_function'])() 43 | model = Dense(**config) 44 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'))) 45 | return model 46 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_1/input_docs/NYT_ENG_20050127.0004: -------------------------------------------------------------------------------- 3 | THE COLLISION MOMENT BY MOMENT
6 | GLENDALE, Calif.
10 | Authorities believe Juan Manuel Alvarez drove his Jeep
11 | Grand Cherokee around the rail crossing barrier and onto the
12 | Metrolink train tracks at Chevy Chase Drive in Glendale about
13 | 6 a.m. Wednesday.
16 | His intent was to kill himself, but he changed his mind
17 | after his SUV got stuck between the tracks in an
18 | area slightly away from the grade crossing.
21 | At 6:02 a.m., the three-car Train 100, headed from Moorpark to
22 | Union Station, with the locomotive in the rear, crashed into
23 | the SUV. Instead of knocking it out of the way,
24 | the train's front cab car probably lifted up over the
25 | SUV, said Metrolink's CEO Dave Solow.
28 | Because the SUV was "lodged on the tracks," it was unable
29 | to move, he said.
32 | "It got wedged under the train. At grade crossing, it would
33 | have been easier. He would have been pushed off."
36 | The commuter train folded up like an accordion -- its front
37 | car ramming into the Union Pacific freight train's locomotive on
38 | the track to the west, toppling it to its side.
39 | The remaining cars collided with northbound Metrolink Train 901, which
40 | was on the track to the east after leaving the
41 | Glendale Station.
44 | The Union Pacific train was a work train for track maintenance,
45 | with two locomotives and about 12 cars filled with ballast
46 | or gravel.
49 | Metrolink trains in that area can go up to 79 mph,
50 | but officials said they were likely going slower with the
51 | Glendale station nearby.
54 | Solow said there's little Metrolink can do to prevent suicide attempts
55 | on its tracks.
58 | "One of the banes of our existence is we can't stop
59 | someone who wants to commit suicide."
-------------------------------------------------------------------------------- /mtsum_meta/bash/train_mtsum_ram_from_primera_4096.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd ../with_ram/ 3 | 4 | DATASET_NAME="peersum_all" 5 | PLM_MODEL_PATH="allenai/PRIMERA" 6 | MAX_LENGTH_INPUT=4096 7 | MAX_LENGTH_TGT=512 8 | MIN_LENGTH_TGT=32 9 | GENERATION_LOSS=2 10 | ACCEPTANCE_LOSS=2 11 | RATING_LOSS=1 12 | CONFIDENCE_LOSS=2 13 | DOCUMENT_TYPE_LOSS=1 14 | 15 | python mtsum_ram_from_primera.py \ 16 | --batch_size 1 \ 17 | --devices 1 \ 18 | --accelerator gpu \ 19 | --speed_strategy no_ddp \ 20 | --mode train \ 21 | --model_path ../../result/MTSum_meta_ram_from_primera_${GENERATION_LOSS}_${ACCEPTANCE_LOSS}_${RATING_LOSS}_${CONFIDENCE_LOSS}_${DOCUMENT_TYPE_LOSS}_${DATASET_NAME}_${MAX_LENGTH_INPUT}_${MAX_LENGTH_TGT}/ \ 22 | --data_path ../../crawling_data/data/ \ 23 | --dataset_name ${DATASET_NAME} \ 24 | --pretrained_model ${PLM_MODEL_PATH} \ 25 | --num_workers 4 \ 26 | --optimizer adamw \ 27 | --scheduler_type cosine_schedule_with_warmup \ 28 | --beam_size 5 \ 29 | --length_penalty 1.0 \ 30 | --test_imediate \ 31 | --max_length_tgt ${MAX_LENGTH_TGT} \ 32 | --min_length_tgt ${MIN_LENGTH_TGT} \ 33 | --max_length_input ${MAX_LENGTH_INPUT} \ 34 | --total_steps 2000 \ 35 | --accum_data_per_step 128 \ 36 | --early_stopping_patience 5 \ 37 | --val_check_interval 10 \ 38 | --num_train_data -1 \ 39 | --num_val_data 128 \ 40 | --num_test_data -1 \ 41 | --lr 3e-5 \ 42 | --apply_triblck \ 43 | --label_smoothing 0.1 \ 44 | --generation_loss_weight ${GENERATION_LOSS} \ 45 | --acceptance_loss_weight ${ACCEPTANCE_LOSS} \ 46 | --rating_loss_weight ${RATING_LOSS} \ 47 | --confidence_loss_weight ${CONFIDENCE_LOSS} \ 48 | --document_type_loss_weight ${DOCUMENT_TYPE_LOSS} \ 49 | --warmup_steps 200 -------------------------------------------------------------------------------- /mtsum_meta/bash/train_mtsum_ram_from_led_4096.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd ../with_ram/ 3 | 4 | DATASET_NAME="peersum_all" 5 | PLM_MODEL_PATH="allenai/led-large-16384" 6 | MAX_LENGTH_INPUT=4096 7 | MAX_LENGTH_TGT=512 8 | MIN_LENGTH_TGT=32 9 | GENERATION_LOSS=1 10 | ACCEPTANCE_LOSS=0 11 | RATING_LOSS=0 12 | CONFIDENCE_LOSS=0 13 | DOCUMENT_TYPE_LOSS=0 14 | 15 | python mtsum_ram_from_led.py \ 16 | --batch_size 1 \ 17 | --devices 1 \ 18 | --accelerator gpu \ 19 | --speed_strategy no_ddp \ 20 | --mode train \ 21 | --model_path ../../result/MTSum_meta_ram_from_led_${GENERATION_LOSS}_${ACCEPTANCE_LOSS}_${RATING_LOSS}_${CONFIDENCE_LOSS}_${DOCUMENT_TYPE_LOSS}_${DATASET_NAME}_${MAX_LENGTH_INPUT}_${MAX_LENGTH_TGT}/ \ 22 | --data_path ../../crawling_data/data/ \ 23 | --dataset_name ${DATASET_NAME} \ 24 | --pretrained_model ${PLM_MODEL_PATH} \ 25 | --num_workers 4 \ 26 | --optimizer adamw \ 27 | --gradient_checkpointing \ 28 | --scheduler_type cosine_schedule_with_warmup \ 29 | --beam_size 1 \ 30 | --length_penalty 1.0 \ 31 | --test_imediate \ 32 | --max_length_tgt ${MAX_LENGTH_TGT} \ 33 | --min_length_tgt ${MIN_LENGTH_TGT} \ 34 | --max_length_input ${MAX_LENGTH_INPUT} \ 35 | --total_steps 10000 \ 36 | --accum_data_per_step 16 \ 37 | --early_stopping_patience 5 \ 38 | --val_check_interval 50 \ 39 | --num_train_data -1 \ 40 | --num_val_data 128 \ 41 | --num_test_data -1 \ 42 | --lr 3e-5 \ 43 | --apply_triblck \ 44 | --label_smoothing 0.1 \ 45 | --generation_loss_weight ${GENERATION_LOSS} \ 46 |
--acceptance_loss_weight ${ACCEPTANCE_LOSS} \ 47 | --rating_loss_weight ${RATING_LOSS} \ 48 | --confidence_loss_weight ${CONFIDENCE_LOSS} \ 49 | --document_type_loss_weight ${DOCUMENT_TYPE_LOSS} \ 50 | --warmup_steps 200 -------------------------------------------------------------------------------- /mtsum_meta/bash/test_mtsum_ram_from_led_4096_zeroshot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd ../with_ram/ 3 | 4 | DATASET_NAME="peersum_all" 5 | PLM_MODEL_PATH="allenai/led-large-16384" 6 | MAX_LENGTH_INPUT=4096 7 | MAX_LENGTH_TGT=512 8 | MIN_LENGTH_TGT=32 9 | GENERATION_LOSS=1 10 | ACCEPTANCE_LOSS=0 11 | RATING_LOSS=0 12 | CONFIDENCE_LOSS=0 13 | DOCUMENT_TYPE_LOSS=0 14 | 15 | python mtsum_ram_from_led.py \ 16 | --batch_size 1 \ 17 | --devices 1 \ 18 | --accelerator gpu \ 19 | --speed_strategy no_ddp \ 20 | --mode test \ 21 | --model_path ../../result/MTSum_meta_ram_from_led_zeroshot_${GENERATION_LOSS}_${ACCEPTANCE_LOSS}_${RATING_LOSS}_${CONFIDENCE_LOSS}_${DOCUMENT_TYPE_LOSS}_${DATASET_NAME}_${MAX_LENGTH_INPUT}_${MAX_LENGTH_TGT}/ \ 22 | --data_path ../../crawling_data/data/ \ 23 | --dataset_name ${DATASET_NAME} \ 24 | --pretrained_model ${PLM_MODEL_PATH} \ 25 | --num_workers 4 \ 26 | --optimizer adamw \ 27 | --scheduler_type cosine_schedule_with_warmup \ 28 | --beam_size 5 \ 29 | --length_penalty 1.0 \ 30 | --repetition_penalty 1.0 \ 31 | --test_imediate \ 32 | --max_length_tgt ${MAX_LENGTH_TGT} \ 33 | --min_length_tgt ${MIN_LENGTH_TGT} \ 34 | --max_length_input ${MAX_LENGTH_INPUT} \ 35 | --total_steps 50000 \ 36 | --accum_data_per_step 16 \ 37 | --early_stopping_patience 5 \ 38 | --val_check_interval 50 \ 39 | --num_train_data -1 \ 40 | --num_val_data 128 \ 41 | --num_test_data -1 \ 42 | --lr 5e-5 \ 43 | --apply_triblck \ 44 | --label_smoothing 0.1 \ 45 | --generation_loss_weight ${GENERATION_LOSS} \ 46 | --acceptance_loss_weight ${ACCEPTANCE_LOSS} \ 47 | --rating_loss_weight ${RATING_LOSS} \ 48 | --confidence_loss_weight ${CONFIDENCE_LOSS} \ 49 | --document_type_loss_weight ${DOCUMENT_TYPE_LOSS} \ 50 | --warmup_steps 1000 -------------------------------------------------------------------------------- /mtsum_meta/bash/test_mtsum_ram_from_primera_4096.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd ../with_ram/ 3 | 4 | DATASET_NAME="peersum_all" 5 | PLM_MODEL_PATH="allenai/PRIMERA" 6 | MAX_LENGTH_INPUT=4096 7 | MAX_LENGTH_TGT=512 8 | MIN_LENGTH_TGT=32 9 | GENERATION_LOSS=1 10 | ACCEPTANCE_LOSS=0 11 | RATING_LOSS=0 12 | CONFIDENCE_LOSS=0 13 | DOCUMENT_TYPE_LOSS=0 14 | 15 | python mtsum_ram_from_primera.py \ 16 | --batch_size 1 \ 17 | --devices 1 \ 18 | --accelerator gpu \ 19 | --speed_strategy no_ddp \ 20 | --mode test \ 21 | --model_path ../../result/MTSum_meta_ram_from_primera_${GENERATION_LOSS}_${ACCEPTANCE_LOSS}_${RATING_LOSS}_${CONFIDENCE_LOSS}_${DOCUMENT_TYPE_LOSS}_${DATASET_NAME}_${MAX_LENGTH_INPUT}_${MAX_LENGTH_TGT}/ \ 22 | --data_path ../../crawling_data/data/ \ 23 | --dataset_name ${DATASET_NAME} \ 24 | --pretrained_model ${PLM_MODEL_PATH} \ 25 | --num_workers 4 \ 26 | --optimizer adamw \ 27 | --scheduler_type cosine_schedule_with_warmup \ 28 | --beam_size 5 \ 29 | --length_penalty 1.0 \ 30 | --repetition_penalty 1.0 \ 31 | --renormalize_logits \ 32 | --generation_early_stopping \ 33 | --test_imediate \ 34 | --max_length_tgt ${MAX_LENGTH_TGT} \ 35 | --min_length_tgt ${MIN_LENGTH_TGT} \ 36 | --max_length_input ${MAX_LENGTH_INPUT} \ 37 | --total_steps 50000 
\ 38 | --accum_data_per_step 16 \ 39 | --early_stopping_patience 5 \ 40 | --val_check_interval 50 \ 41 | --num_train_data -1 \ 42 | --num_val_data 128 \ 43 | --num_test_data 512 \ 44 | --lr 5e-5 \ 45 | --apply_triblck \ 46 | --label_smoothing 0.1 \ 47 | --generation_loss_weight ${GENERATION_LOSS} \ 48 | --acceptance_loss_weight ${ACCEPTANCE_LOSS} \ 49 | --rating_loss_weight ${RATING_LOSS} \ 50 | --confidence_loss_weight ${CONFIDENCE_LOSS} \ 51 | --document_type_loss_weight ${DOCUMENT_TYPE_LOSS} \ 52 | --warmup_steps 1000 -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_2/input_docs/FT923-5797: --------------------------------------------------------------------------------
3 | SQUADS of workers fanned out across storm-battered Louisiana yesterday to begin a massive rebuilding effort after Hurricane Andrew had flattened whole districts, killing two people and injuring dozens more, agencies report from Florida and New Orleans.
6 | However, local officials in Florida, hit earlier in the week by the hurricane, were critical of what they called a delay in supplying food, drinking water and other supplies for thousands of people in need.
9 | Federal emergency officials acknowledged distribution problems, Transportation Secretary Andrew Card yesterday promised 'dramatic' improvements within 24 hours and President George Bush last night ordered troops to Florida, without specifying a number.
12 | The government estimated it would cost Dollars 20bn-Dollars 30bn to tidy and rebuild in Florida, and to care for residents displaced by the storm.
15 | Louisiana state officials said they had no overall count of storm-related injuries but initial estimates reckoned fewer than 100.
18 | The Federal Emergency Management Agency said it was setting aside Dollars 77m to help Louisiana recover.
21 | Most of the storm's fury was spent against sparsely populated farming communities and swampland in the state, sparing it the widespread destruction caused in Florida, where 15 people died.
24 | Official estimates in Miami reported that the hurricane had wiped out the homes of one Dade County resident in eight - a quarter of a million people.
27 | Andrew had become little more than a strong rainstorm early yesterday, moving across Mississippi state and heading for the north-eastern US.
30 | Several of Louisiana's main industries were affected, including those of oysters and alligators.
33 | Wildlife and fisheries secretary Joe Herring estimated a 50 per cent decline in the alligator industry.
36 | The cotton and sugar-cane crops were threatened, the state agriculture department said.
39 | Most Louisiana oil refineries, however, were barely affected and deliveries of crude oil were expected to resume yesterday.
-------------------------------------------------------------------------------- /crawling_data/nips_download_response.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import os 4 | import pdfplumber 5 | import pdfminer.pdfparser 6 | 7 | def download_pdf(pdf_url, folder, year): 8 | if pdf_url.endswith(".pdf"): 9 | file_name = folder + year + "_" + pdf_url.split("/")[-1] 10 | if not os.path.exists(file_name): 11 | response = requests.get(url=pdf_url, stream=True) 12 | with open(file_name,'wb') as file: 13 | for data in response.iter_content(chunk_size=1024): # stream the PDF in 1 KB chunks 14 | file.write(data) 15 | print(pdf_url) 16 | else: 17 | print(file_name, "exists") 18 | else: 19 | print("NOT PDF", pdf_url) 20 | 21 | 22 | def get_responses(response_file): 23 | content = "" 24 | try: 25 | with pdfplumber.open(response_file) as pdf: 26 | for i in range(len(pdf.pages)): 27 | page = pdf.pages[i] 28 | page_content = page.extract_text() or "" # extract_text() may return None for empty pages 29 | content = content + page_content 30 | except pdfminer.pdfparser.PDFSyntaxError as e: 31 | print("PDFSyntaxError") 32 | return content 33 | 34 | if __name__ == "__main__": 35 | base_url = "https://proceedings.neurips.cc" 36 | paper_folder = "paper_pdf/" 37 | response_folder = "response_pdf/" 38 | years = ["2019", "2020"] 39 | 40 | 41 | for year in years: 42 | with open('data/nips_%s.json'%year, 'r') as f: 43 | paper_list = json.load(f) 44 | 45 | for paper in paper_list: 46 | if "pdf" not in paper.keys(): 47 | print(paper["link"]) 48 | 49 | if "author_responses" in paper.keys(): 50 | download_pdf(base_url + paper["author_responses"]["pdf"], response_folder, year) 51 | response_file = response_folder + year + "_" + paper["author_responses"]["pdf"].split("/")[-1] 52 | if os.path.exists(response_file) and response_file.endswith(".pdf"): 53 | if paper["author_responses"].get("responses", "") == "": # dict.get takes the default positionally 54 | paper["author_responses"]["responses"] = get_responses(response_file) 55 | else: 56 | print(response_file, "not exists") 57 | 58 | 59 | with open("data/nips_%s.json" % year, "w") as f: 60 | f.write(json.dumps(paper_list)) -------------------------------------------------------------------------------- /dataset_analysis/crossdoc_relationship/hierarchies.py: -------------------------------------------------------------------------------- 1 | import jsonlines 2 | import numpy as np 3 | 4 | data = [] 5 | with jsonlines.open('../../../peersum/crawling_data/data/peersum_all.json') as reader: 6 | for line in reader: 7 | data.append(line) 8 | print("all data", len(data)) 9 | 10 | heights = [] 11 | widths = [] 12 | papers_errors = set([]) 13 | for sample in data: 14 | paper_id = sample["paper_id"] 15 | reviews = sample["reviews"] 16 | 17 | review_ids = [] 18 | review_ids.append(sample["paper_id"]) 19 | for review in reviews: 20 | review_ids.append(review["review_id"]) 21 | 22 | reviews_tree = [] 23 | for review in reviews: 24 | if review["reply_to"] in review_ids: 25 | reviews_tree.append(review) 26 | else: 27 | print(paper_id) 28 | papers_errors.add(paper_id) 29 | print(review) 30 | 31 | if len(reviews)-len(reviews_tree)>0: 32 | print(len(reviews), len(reviews_tree)) 33 | if len(set(review_ids))-1!=len(reviews): 34 | print("repeated review") 35 | 36 | # height 37 | current_nodes = [] 38 | for review in reviews_tree: 39 | if review["reply_to"] == sample["paper_id"]: 40 | current_nodes.append(review["review_id"]) 41 | 42 | height_max = 0 43 | tmp = current_nodes 44 | while len(tmp)>0: 45 | height_max += 1
46 | tmp = [] 47 | for review in reviews_tree: 48 | if review["reply_to"] in current_nodes: 49 | tmp.append(review["review_id"]) 50 | current_nodes = tmp 51 | # print(sample["paper_id"], height_max) 52 | heights.append(height_max) 53 | 54 | # width 55 | leaves = 0 56 | for review_i in reviews_tree: 57 | review_i_id = review_i["review_id"] 58 | is_leaf = True 59 | for review_j in reviews_tree: 60 | if review_j["reply_to"] == review_i_id: 61 | is_leaf = False 62 | break 63 | if is_leaf: 64 | leaves += 1 65 | widths.append(leaves) 66 | 67 | if sample["paper_id"].startswith("nips"): 68 | for review in reviews_tree: 69 | if review["writer"] == "public": 70 | print(review) 71 | 72 | 73 | print("error papers", len(papers_errors)) 74 | for id in sorted(papers_errors): 75 | print(id) 76 | print("average max height", np.mean(heights), np.std(heights)) 77 | print("average max width (leaves)", np.mean(widths), np.std(widths)) -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/losses/SoftmaxLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | from ..SentenceTransformer import SentenceTransformer 5 | import logging 6 | 7 | class SoftmaxLoss(nn.Module): 8 | def __init__(self, 9 | model: SentenceTransformer, 10 | sentence_embedding_dimension: int, 11 | num_labels: int, 12 | concatenation_sent_rep: bool = True, 13 | concatenation_sent_difference: bool = True, 14 | concatenation_sent_multiplication: bool = False): 15 | super(SoftmaxLoss, self).__init__() 16 | self.model = model 17 | self.num_labels = num_labels 18 | self.concatenation_sent_rep = concatenation_sent_rep 19 | self.concatenation_sent_difference = concatenation_sent_difference 20 | self.concatenation_sent_multiplication = concatenation_sent_multiplication 21 | 22 | num_vectors_concatenated = 0 23 | if concatenation_sent_rep: 24 | num_vectors_concatenated += 2 25 | if concatenation_sent_difference: 26 | num_vectors_concatenated += 1 27 | if concatenation_sent_multiplication: 28 | num_vectors_concatenated += 1 29 | logging.info("Softmax loss: #Vectors concatenated: {}".format(num_vectors_concatenated)) 30 | self.classifier = nn.Linear(num_vectors_concatenated * sentence_embedding_dimension, num_labels) 31 | 32 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 33 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 34 | rep_a, rep_b = reps 35 | 36 | vectors_concat = [] 37 | if self.concatenation_sent_rep: 38 | vectors_concat.append(rep_a) 39 | vectors_concat.append(rep_b) 40 | 41 | if self.concatenation_sent_difference: 42 | vectors_concat.append(torch.abs(rep_a - rep_b)) 43 | 44 | if self.concatenation_sent_multiplication: 45 | vectors_concat.append(rep_a * rep_b) 46 | 47 | features = torch.cat(vectors_concat, 1) 48 | 49 | output = self.classifier(features) 50 | loss_fct = nn.CrossEntropyLoss() 51 | 52 | if labels is not None: 53 | loss = loss_fct(output, labels.view(-1)) 54 | return loss 55 | else: 56 | return reps, output -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/models/tokenizer/WhitespaceTokenizer.py: -------------------------------------------------------------------------------- 1 | from 
typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 7 | 8 | class WhitespaceTokenizer(WordTokenizer): 9 | """ 10 | Simple and fast white-space tokenizer. Splits sentences based on white space. 11 | Punctuation is stripped from tokens. 12 | """ 13 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False): 14 | self.stop_words = set(stop_words) 15 | self.do_lower_case = do_lower_case 16 | self.set_vocab(vocab) 17 | 18 | def get_vocab(self): 19 | return self.vocab 20 | 21 | def set_vocab(self, vocab: Iterable[str]): 22 | self.vocab = vocab 23 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 24 | 25 | def tokenize(self, text: str) -> List[int]: 26 | if self.do_lower_case: 27 | text = text.lower() 28 | 29 | tokens = text.split() 30 | 31 | tokens_filtered = [] 32 | for token in tokens: 33 | if token in self.stop_words: 34 | continue 35 | elif token in self.word2idx: 36 | tokens_filtered.append(self.word2idx[token]) 37 | continue 38 | 39 | token = token.strip(string.punctuation) 40 | if token in self.stop_words: 41 | continue 42 | elif len(token) > 0 and token in self.word2idx: 43 | tokens_filtered.append(self.word2idx[token]) 44 | continue 45 | 46 | token = token.lower() 47 | if token in self.stop_words: 48 | continue 49 | elif token in self.word2idx: 50 | tokens_filtered.append(self.word2idx[token]) 51 | continue 52 | 53 | return tokens_filtered 54 | 55 | def save(self, output_path: str): 56 | with open(os.path.join(output_path, 'whitespacetokenizer_config.json'), 'w') as fOut: 57 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case}, fOut) 58 | 59 | @staticmethod 60 | def load(input_path: str): 61 | with open(os.path.join(input_path, 'whitespacetokenizer_config.json'), 'r') as fIn: 62 | config = json.load(fIn) 63 | 64 | return WhitespaceTokenizer(**config) 65 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/utils/data_reader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('../') 3 | 4 | import os 5 | from collections import OrderedDict 6 | from nltk.tokenize import sent_tokenize 7 | 8 | from resources import BASE_DIR 9 | 10 | class CorpusReader: 11 | def __init__(self, topic_path, base_path=BASE_DIR): 12 | self.base_path = base_path 13 | self.data_path = os.path.join(self.base_path, topic_path) 14 | 15 | def __call__(self): 16 | docs_dic = self.readDocs() 17 | return docs_dic 18 | 19 | def readSummaries(self): 20 | summary_path = os.path.join(self.data_path, 'summaries') 21 | summaries = [] 22 | for tt in sorted(os.listdir(summary_path)): 23 | if tt[0] == '.': 24 | continue # skip hidden files 25 | text = open(os.path.join(summary_path,tt), 'r').read() 26 | summaries.append( text ) 27 | return summaries 28 | 29 | def readReferences(self): 30 | ref_path = os.path.join(self.data_path, 'references') 31 | refs = [] 32 | for tt in sorted(os.listdir(ref_path)): 33 | if tt[0] == '.': 34 | continue # skip hidden files 35 | text = open(os.path.join(ref_path,tt), 'r').read() 36 | refs.append( (os.path.join(ref_path,tt), sent_tokenize(text)) ) 37 | return refs 38 | 39 | 40 | def readDocs(self): 41 | dpath = os.path.join(self.data_path, 'input_docs')
os.path.join(self.data_path,'input_docs') 42 | topic_docs = [] 43 | for tt in sorted(os.listdir(dpath)): 44 | if tt[0] == '.': 45 | continue # skip hidden files 46 | entry = self.readOneDoc(os.path.join(dpath,tt)) 47 | topic_docs.append((os.path.join(dpath,tt),entry)) 48 | 49 | return topic_docs 50 | 51 | def readOneDoc(self,dpath): 52 | ff = open(dpath,'r') 53 | flag = False 54 | text = [] 55 | for line in ff.readlines(): 56 | if '<TEXT>' in line: 57 | flag = True 58 | elif '</TEXT>' in line: 59 | break 60 | elif flag and line.strip().lower() != '<p>' and line.strip().lower() != '</p>': 61 | text.append(line.strip()) 62 | 63 | ff.close() 64 | 65 | return sent_tokenize(' '.join(text)) 66 | 67 | 68 | if __name__ == '__main__': 69 | reader = CorpusReader('data/topic_1') 70 | docs = reader() 71 | summs = reader.readSummaries() 72 | refs = reader.readReferences() 73 | 74 | print(docs) 75 | print(summs) 76 | print(refs) 77 | 78 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/models/LSTM.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import logging 5 | import gzip 6 | from tqdm import tqdm 7 | import numpy as np 8 | import os 9 | import json 10 | from ..util import import_from_string, fullname, http_get 11 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 12 | 13 | 14 | class LSTM(nn.Module): 15 | """Bidirectional LSTM running over word embeddings. 16 | """ 17 | def __init__(self, word_embedding_dimension: int, hidden_dim: int, num_layers: int = 1, dropout: float = 0): 18 | nn.Module.__init__(self) 19 | self.config_keys = ['word_embedding_dimension', 'hidden_dim', 'num_layers', 'dropout'] 20 | self.word_embedding_dimension = word_embedding_dimension 21 | self.hidden_dim = hidden_dim 22 | self.num_layers = num_layers 23 | self.dropout = dropout 24 | 25 | self.embeddings_dimension = 2*hidden_dim 26 | self.encoder = nn.LSTM(word_embedding_dimension, hidden_dim, num_layers=num_layers, dropout=dropout, bidirectional=True, batch_first=True) 27 | 28 | def forward(self, features): 29 | token_embeddings = features['token_embeddings'] 30 | sentence_lengths = torch.clamp(features['sentence_lengths'], min=1) 31 | 32 | packed = nn.utils.rnn.pack_padded_sequence(token_embeddings, sentence_lengths, batch_first=True, enforce_sorted=False) 33 | packed = self.encoder(packed) 34 | unpack = nn.utils.rnn.pad_packed_sequence(packed[0], batch_first=True)[0] 35 | features.update({'token_embeddings': unpack}) 36 | return features 37 | 38 | def get_word_embedding_dimension(self) -> int: 39 | return self.embeddings_dimension 40 | 41 | def tokenize(self, text: str) -> List[str]: 42 | raise NotImplementedError() 43 | 44 | def save(self, output_path: str): 45 | with open(os.path.join(output_path, 'lstm_config.json'), 'w') as fOut: 46 | json.dump(self.get_config_dict(), fOut, indent=2) 47 | 48 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 49 | 50 | def get_config_dict(self): 51 | return {key: self.__dict__[key] for key in self.config_keys} 52 | 53 | @staticmethod 54 | def load(input_path: str): 55 | with open(os.path.join(input_path, 'lstm_config.json'), 'r') as fIn: 56 | config = json.load(fIn) 57 | 58 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 59 | model = LSTM(**config) 60 | model.load_state_dict(weights) 61 | return model 62 | 63 | -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/sampling_data.py: -------------------------------------------------------------------------------- 1 | # Sampling data from the original PeerSum dataset based on variance of review scores for individual papers 2 | import random 3 | 4 | import jsonlines 5 | import numpy as np 6 | import pandas as pd 7 | 8 | papers = [] 9 | with
jsonlines.open("/home/miao4/punim0521/NeuralAbstractiveSummarization/peersum/crawling_data/data/peersum_all.json", 10 | "r") as reader: 11 | for paper in reader: 12 | papers.append(paper) 13 | 14 | clusters = [] 15 | variances = [] 16 | variance_zero = 0 17 | for paper in papers: 18 | summary = paper["meta_review"] 19 | paper_id = paper['paper_id'] 20 | paper_acceptance = paper["paper_acceptance"] 21 | source_documents = [] 22 | 23 | review_scores = [] 24 | for i, review_i in enumerate(paper["reviews"]): 25 | if review_i["rating"] > 0: 26 | review_scores.append(review_i["rating"]) 27 | variance = np.var(review_scores) 28 | variances.append(variance) 29 | 30 | if variance == 0: 31 | variance_zero += 1 32 | 33 | source_documents.append(paper["paper_abstract"]) 34 | for review in paper["reviews"]: 35 | if "comment" in review.keys(): 36 | text = review["comment"] 37 | source_documents.append(text) 38 | 39 | if summary.strip() != "" and len(source_documents) > 1: 40 | clusters.append( 41 | {"source_documents": source_documents, "review_score_variance": variance, "summary": summary, 42 | "paper_id": paper_id, "label": paper["label"], "paper_acceptance": paper_acceptance}) 43 | # else: 44 | # print(paper["paper_id"]) 45 | 46 | print("Max variance", np.max(variances)) 47 | print("Min variance", np.min(variances)) 48 | bins = [i * 1 for i in range(13)] 49 | cats = pd.cut(variances, bins, right=False, include_lowest=True) 50 | print(pd.value_counts(cats, sort=False)) 51 | print("papers with variance zero", variance_zero) 52 | 53 | variance_low = [] 54 | variance_medium = [] 55 | variance_high = [] 56 | for cluster in clusters: 57 | review_score_variance = cluster["review_score_variance"] 58 | if 0 <= review_score_variance < 2: 59 | variance_low.append(cluster) 60 | if 2 <= review_score_variance < 8: 61 | variance_medium.append(cluster) 62 | if 8 <= review_score_variance < 12: 63 | variance_high.append(cluster) 64 | 65 | final_clusters = [] 66 | final_clusters.extend(random.sample(variance_low, 10)) 67 | final_clusters.extend(random.sample(variance_medium, 10)) 68 | final_clusters.extend(random.sample(variance_high, 10)) 69 | random.shuffle(final_clusters) 70 | 71 | with jsonlines.open("variance_high.json", "w") as writer: 72 | writer.write_all(variance_high) 73 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/generate_summary_rl.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('../') 3 | import numpy as np 4 | import os 5 | 6 | from ref_free_metrics.supert import Supert 7 | from summariser.ngram_vector.vector_generator import Vectoriser 8 | from summariser.deep_td import DeepTDAgent as RLAgent 9 | from utils.data_reader import CorpusReader 10 | from utils.evaluator import evaluate_summary_rouge, add_result 11 | 12 | 13 | class RLSummarizer(): 14 | def __init__(self,reward_func, reward_strict=5.,rl_strict=5.,train_episode=5000, base_length=200, sample_summ_num=5000, gpu=True): 15 | self.reward_func = reward_func 16 | self.reward_strict = reward_strict 17 | self.rl_strict = rl_strict 18 | self.train_episode = train_episode 19 | self.base_length = base_length 20 | self.sample_summ_num = sample_summ_num 21 | self.gpu = gpu 22 | 23 | def get_sample_summaries(self, docs, summ_max_len=100): 24 | vec = Vectoriser(docs,summ_max_len) 25 | summary_list = vec.sample_random_summaries(self.sample_summ_num) 26 | rewards = self.reward_func(summary_list) 27 | 
assert len(summary_list) == len(rewards) 28 | return summary_list, rewards 29 | 30 | def summarize(self, docs, summ_max_len=100): 31 | # generate sample summaries for memory replay 32 | summaries, rewards = self.get_sample_summaries(docs, summ_max_len) 33 | vec = Vectoriser(docs, sum_len=summ_max_len, base=self.base_length) 34 | rl_agent = RLAgent(vec, summaries, strict_para=self.rl_strict, train_round=self.train_episode, gpu=self.gpu) 35 | summary = rl_agent(rewards) 36 | return summary 37 | 38 | 39 | if __name__ == '__main__': 40 | # read source documents 41 | reader = CorpusReader('data/topic_1') 42 | source_docs = reader() 43 | 44 | # generate a summary (here with summary max length 50 tokens) 45 | supert = Supert(source_docs) 46 | rl_summarizer = RLSummarizer(reward_func = supert) 47 | summary = rl_summarizer.summarize(source_docs, summ_max_len=50) 48 | print('\n=====Generated Summary=====') 49 | print(summary) 50 | 51 | # (Optional) Evaluate the quality of the summary using ROUGE metrics 52 | if os.path.isdir('./rouge/ROUGE-RELEASE-1.5.5'): 53 | refs = reader.readReferences() # make sure you have put the references in data/topic_1/references 54 | avg_rouge_score = {} 55 | for ref in refs: 56 | rouge_scores = evaluate_summary_rouge(summary, ref) 57 | add_result(avg_rouge_score, rouge_scores) 58 | print('\n=====ROUGE scores against {} references====='.format(len(refs))) 59 | for metric in avg_rouge_score: 60 | print('{}:\t{}'.format(metric, np.mean(avg_rouge_score[metric]))) 61 | 62 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/models/CNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import logging 5 | import gzip 6 | from tqdm import tqdm 7 | import numpy as np 8 | import os 9 | import json 10 | from ..util import import_from_string, fullname, http_get 11 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 12 | 13 | 14 | class CNN(nn.Module): 15 | """CNN-layer with multiple kernel-sizes over the word embeddings""" 16 | 17 | def __init__(self, in_word_embedding_dimension: int, out_channels: int = 256, kernel_sizes: List[int] = [1, 3, 5]): 18 | nn.Module.__init__(self) 19 | self.config_keys = ['in_word_embedding_dimension', 'out_channels', 'kernel_sizes'] 20 | self.in_word_embedding_dimension = in_word_embedding_dimension 21 | self.out_channels = out_channels 22 | self.kernel_sizes = kernel_sizes 23 | 24 | self.embeddings_dimension = out_channels*len(kernel_sizes) 25 | self.convs = nn.ModuleList() 26 | 27 | in_channels = in_word_embedding_dimension 28 | for kernel_size in kernel_sizes: 29 | padding_size = int((kernel_size - 1) / 2) 30 | conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, 31 | padding=padding_size) 32 | self.convs.append(conv) 33 | 34 | def forward(self, features): 35 | token_embeddings = features['token_embeddings'] 36 | 37 | token_embeddings = token_embeddings.transpose(1, -1) 38 | vectors = [conv(token_embeddings) for conv in self.convs] 39 | out = torch.cat(vectors, 1).transpose(1, -1) 40 | 41 | features.update({'token_embeddings': out}) 42 | return features 43 | 44 | def get_word_embedding_dimension(self) -> int: 45 | return self.embeddings_dimension 46 | 47 | def tokenize(self, text: str) -> List[str]: 48 | raise NotImplementedError() 49 | 50 | def save(self,
output_path: str): 51 | with open(os.path.join(output_path, 'cnn_config.json'), 'w') as fOut: 52 | json.dump(self.get_config_dict(), fOut, indent=2) 53 | 54 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 55 | 56 | def get_config_dict(self): 57 | return {key: self.__dict__[key] for key in self.config_keys} 58 | 59 | @staticmethod 60 | def load(input_path: str): 61 | with open(os.path.join(input_path, 'cnn_config.json'), 'r') as fIn: 62 | config = json.load(fIn) 63 | 64 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 65 | model = CNN(**config) 66 | model.load_state_dict(weights) 67 | return model 68 | 69 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/util.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from torch import Tensor, device 3 | from typing import Tuple, List 4 | from tqdm import tqdm 5 | import sys 6 | import importlib 7 | 8 | 9 | def batch_to_device(batch, target_device: device): 10 | """ 11 | send a batch to a device 12 | 13 | :param batch: 14 | :param target_device: 15 | :return: the batch sent to the device 16 | """ 17 | features = batch['features'] 18 | for paired_sentence_idx in range(len(features)): 19 | for feature_name in features[paired_sentence_idx]: 20 | features[paired_sentence_idx][feature_name] = features[paired_sentence_idx][feature_name].to(target_device) 21 | 22 | labels = batch['labels'].to(target_device) 23 | return features, labels 24 | 25 | 26 | 27 | def http_get(url, path): 28 | file_binary = open(path, "wb") 29 | req = requests.get(url, stream=True) 30 | if req.status_code != 200: 31 | print("Exception when trying to download {}. Response {}".format(url, req.status_code), file=sys.stderr) 32 | req.raise_for_status() 33 | 34 | content_length = req.headers.get('Content-Length') 35 | total = int(content_length) if content_length is not None else None 36 | progress = tqdm(unit="B", total=total, unit_scale=True) 37 | for chunk in req.iter_content(chunk_size=1024): 38 | if chunk: # filter out keep-alive new chunks 39 | progress.update(len(chunk)) 40 | file_binary.write(chunk) 41 | progress.close() 42 | 43 | 44 | def fullname(o): 45 | # o.__module__ + "." + o.__class__.__qualname__ is an example in 46 | # this context of H.L. Mencken's "neat, plausible, and wrong." 47 | # Python makes no guarantees as to whether the __module__ special 48 | # attribute is defined, so we take a more circumspect approach. 49 | # Alas, the module name is explicitly excluded from __qualname__ 50 | # in Python 3. 51 | 52 | module = o.__class__.__module__ 53 | if module is None or module == str.__class__.__module__: 54 | return o.__class__.__name__ # Avoid reporting __builtin__ 55 | else: 56 | return module + '.' + o.__class__.__name__ 57 | 58 | def import_from_string(dotted_path): 59 | """ 60 | Import a dotted module path and return the attribute/class designated by the 61 | last name in the path. Raise ImportError if the import failed. 
62 | """ 63 | try: 64 | module_path, class_name = dotted_path.rsplit('.', 1) 65 | except ValueError: 66 | msg = "%s doesn't look like a module path" % dotted_path 67 | raise ImportError(msg) 68 | 69 | module = importlib.import_module(module_path) 70 | 71 | try: 72 | return getattr(module, class_name) 73 | except AttributeError: 74 | msg = 'Module "%s" does not define a "%s" attribute/class' % (module_path, class_name) 75 | raise ImportError(msg) -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/evaluation/LabelAccuracyEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | import torch 3 | from torch.utils.data import DataLoader 4 | import logging 5 | from tqdm import tqdm 6 | from ..util import batch_to_device 7 | import os 8 | import csv 9 | 10 | class LabelAccuracyEvaluator(SentenceEvaluator): 11 | """ 12 | Evaluate a model based on its accuracy on a labeled dataset 13 | 14 | This requires a model with LossFunction.SOFTMAX 15 | 16 | The results are written in a CSV. If a CSV already exists, then values are appended. 17 | """ 18 | 19 | def __init__(self, dataloader: DataLoader, name: str = ""): 20 | """ 21 | Constructs an evaluator for the given dataset 22 | 23 | :param dataloader: 24 | the data for the evaluation 25 | """ 26 | self.dataloader = dataloader 27 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 28 | self.name = name 29 | if name: 30 | name = "_"+name 31 | 32 | self.csv_file = "accuracy_evaluation"+name+"_results.csv" 33 | self.csv_headers = ["epoch", "steps", "accuracy"] 34 | 35 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 36 | model.eval() 37 | total = 0 38 | correct = 0 39 | 40 | if epoch != -1: 41 | if steps == -1: 42 | out_txt = " after epoch {}:".format(epoch) 43 | else: 44 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 45 | else: 46 | out_txt = ":" 47 | 48 | logging.info("Evaluation on the "+self.name+" dataset"+out_txt) 49 | self.dataloader.collate_fn = model.smart_batching_collate 50 | for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")): 51 | features, label_ids = batch_to_device(batch, self.device) 52 | with torch.no_grad(): 53 | _, prediction = model(features[0]) 54 | 55 | total += prediction.size(0) 56 | correct += torch.argmax(prediction, dim=1).eq(label_ids).sum().item() 57 | accuracy = correct/total 58 | 59 | logging.info("Accuracy: {:.4f} ({}/{})\n".format(accuracy, correct, total)) 60 | 61 | if output_path is not None: 62 | csv_path = os.path.join(output_path, self.csv_file) 63 | if not os.path.isfile(csv_path): 64 | with open(csv_path, mode="w", encoding="utf-8") as f: 65 | writer = csv.writer(f) 66 | writer.writerow(self.csv_headers) 67 | writer.writerow([epoch, steps, accuracy]) 68 | else: 69 | with open(csv_path, mode="a", encoding="utf-8") as f: 70 | writer = csv.writer(f) 71 | writer.writerow([epoch, steps, accuracy]) 72 | 73 | return accuracy -------------------------------------------------------------------------------- /dataset_analysis/relevance/bartscore.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from sklearn.feature_extraction.text import CountVectorizer 3 | import numpy as np 4 | import functools 5 | import multiprocessing 6 | import random 7 | 8 | sys.path.append("../../../") 9 | from 
peersum.loading_data.peersum_loader import loading_peersum 10 | from peersum.loading_data.mds_loader import loading_mds 11 | from utils.metrics import bart_score 12 | 13 | def relevance_score(samples, scorer): 14 | avgs = [] 15 | variances = [] 16 | for sample in samples: 17 | source_documents = sample["source_documents"] 18 | summary = sample["summary"] 19 | candidates = [] 20 | references = [] 21 | for source_document in source_documents: 22 | candidates.append(source_document) 23 | references.append(summary) 24 | bart_scores = bart_score(candidates, references, bart_scorer=scorer) 25 | avgs.append(np.mean(bart_scores)) 26 | variances.append(np.var(bart_scores)) 27 | 28 | return {"mean": np.mean(avgs), "variance": np.mean(variances)} 29 | 30 | 31 | if __name__ == "__main__": 32 | from utils.BARTScore.bart_score import BARTScorer 33 | bart_scorer = BARTScorer(device='cuda:0', checkpoint='facebook/bart-large-cnn') 34 | 35 | sampling_count = 1024 36 | print("PeerSum") 37 | samples = loading_peersum(folder="../../../peersum", including_public=True, including_author=True, including_abstract=True) 38 | samples = random.sample(samples, sampling_count) 39 | score_dict = relevance_score(samples, scorer=bart_scorer) 40 | print("mean", score_dict["mean"], "variance", score_dict["variance"]) 41 | del samples 42 | 43 | # print("Multi-News") 44 | # samples = loading_mds(folder="../../../datasets", data_name="multinews") 45 | # samples = random.sample(samples, sampling_count) 46 | # score_dict = relevance_score(samples, scorer=bart_scorer) 47 | # print("mean", score_dict["mean"], "variance", score_dict["variance"]) 48 | # del samples 49 | # 50 | # print("WCEP") 51 | # samples = loading_mds(folder="../../../datasets", data_name="wcep_100") 52 | # samples = random.sample(samples, sampling_count) 53 | # score_dict = relevance_score(samples, scorer=bart_scorer) 54 | # print("mean", score_dict["mean"], "variance", score_dict["variance"]) 55 | # del samples 56 | # 57 | # print("Multi-XScience") 58 | # samples = loading_mds(folder="../../../datasets", data_name="multixscience") 59 | # samples = random.sample(samples, sampling_count) 60 | # score_dict = relevance_score(samples, scorer=bart_scorer) 61 | # print("mean", score_dict["mean"], "variance", score_dict["variance"]) 62 | # del samples 63 | # 64 | # print("Wikisum") 65 | # samples = loading_mds(folder="../../../datasets", data_name="wikisum") 66 | # samples = random.sample(samples, sampling_count) 67 | # score_dict = relevance_score(samples, scorer=bart_scorer) 68 | # print("mean", score_dict["mean"], "variance", score_dict["variance"]) 69 | # del samples -------------------------------------------------------------------------------- /crawling_data/iclr_getdata.py: -------------------------------------------------------------------------------- 1 | # The first step is getting the paper list from the OpenReview getting data, and then get reviews with forum ids 2 | # Data for ICLR 2017 is from PeerRead (https://github.com/allenai/PeerRead), from 2018 data can be obtained from OpenReview 3 | import json 4 | import openreview 5 | import time 6 | 7 | year = 2022 8 | base_url = "https://api.openreview.net" 9 | client = openreview.Client(baseurl=base_url) 10 | notes = client.get_all_notes(signature='ICLR.cc/%s/Conference'%year)# using signature to get all submissions 11 | 12 | invitations = set([]) 13 | for note in notes: 14 | invitations.add(note.invitation) 15 | 16 | for invitation in invitations: 17 | print(invitation, 
len(client.get_all_notes(invitation=invitation))) 18 | 19 | papers = [] 20 | count = 0 21 | for note in notes: 22 | paper = {} 23 | paper["link"] = "https://openreview.net/forum?id=" + note.forum 24 | content = note.content 25 | # print(content.keys()) 26 | paper["title"] = content['title'] 27 | paper["authors"] = content['authors'] 28 | paper["abstract"] = content['abstract'] 29 | paper["tl_dr"] = content.get('one-sentence_summary', '') 30 | paper["keywords"] = content['keywords'] 31 | 32 | paper["id"] = note.forum 33 | 34 | rcs = client.get_notes( 35 | forum=paper["id"]) # using forum to get the notes of each paper; these include the paper information, reviews (official and public) and responses. 36 | reviews_commments = [] 37 | paper_invitations = [] 38 | time_final_decision = None 39 | for rc in rcs: 40 | # print("cdate",time.localtime(rc.cdate/1000)) 41 | # print("tcdate", time.localtime(rc.tcdate / 1000)) 42 | # print("tmdate", time.localtime(rc.tmdate / 1000)) 43 | paper_invitations.append(rc.invitation.split("/")[-1]) 44 | if "Submission" in rc.invitation and rc.id == paper["id"]: 45 | paper["pdf"] = base_url + rc.content["pdf"] 46 | paper["number"] = rc.number 47 | elif "Decision" in rc.invitation or "Meta_Review" in rc.invitation: 48 | # print(rc.invitation) 49 | time_final_decision = time.localtime(rc.tmdate / 1000) 50 | paper["final_decision"] = rc.to_json() 51 | paper["comment"] = rc.content['decision'] 52 | print(paper['comment']) 53 | else: 54 | reviews_commments.append(rc.to_json()) 55 | # for rc in rcs: 56 | # if rc.cdate != None: 57 | # ntime = time.localtime(rc.cdate / 1000) 58 | # if ntime > time_final_decision: 59 | # print(time_final_decision, ntime) 60 | print("reviews_comments", len(reviews_commments)) 61 | invitation_texts = ",".join(sorted(list(set(paper_invitations)))) 62 | paper["reviews_commments"] = reviews_commments 63 | if "Blind_Submission" in invitation_texts: 64 | count += 1 65 | # print(paper["final_decision"]) 66 | 67 | papers.append(paper) 68 | 69 | 70 | print(len(papers)) 71 | f = open('data/iclr_%s.json'%year, 'w') 72 | f.write(json.dumps(papers)) 73 | f.close() 74 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_2/input_docs/FT923-6038: -------------------------------------------------------------------------------- 1 | 2 |
3 | HURRICANE Andrew, claimed to be the costliest natural disaster in US history, yesterday smashed its way through the state of Louisiana, inflicting severe damage on rural communities but narrowly missing the low-lying city of New Orleans. 4 | 5 | 6 | The storm, which brought havoc to southern Florida on Monday and then headed north-west across the Gulf of Mexico, had made landfall late on Tuesday night some 60 miles south-west of the city in the agricultural Cajun country. 7 | 8 | 9 | Although the damage from the hurricane's landfall in Florida on Monday was much greater than initially estimated, insurers' losses there are likely to total less than Dollars 1bn, well below earlier expectations, a senior member of Lloyd's insurance market said yesterday. 10 | 11 | 12 | In Louisiana, the hurricane landed with wind speeds of about 120 miles per hour and caused severe damage in small coastal centres such as Morgan City, Franklin and New Iberia. 13 | 14 | 15 | Associated tornadoes devastated Laplace, 20 miles west of New Orleans. 16 | 17 | 18 | Then, however, Andrew lost force as it moved north over land. 19 | 20 | 21 | By yesterday afternoon, it had been down-graded to tropical storm, in that its sustained windspeeds were below 75 mph. 22 | 23 | 24 | Initial reports said at least one person had died, 75 been injured and thousands made homeless along the Louisiana coast, after 14 confirmed deaths in Florida and three in the Bahamas. 25 | 26 | 27 | The storm caused little damage to Louisiana's important oil-refining industry, although some plants had to halt production when electricity was cut. 28 | 29 | 30 | The Lloyd's member, in close contact with leading insurers in Florida, said that damage to insured property was remarkably small. 31 | 32 | 33 | More than Dollars 15bn of damage may have been caused in all, but was mostly to uninsured property, he said. 34 | 35 | 36 | In north Miami, damage is minimal. 37 | 38 | 39 | Worst affected is one hotel, whose basement was flooded. 40 | 41 | 42 | Most of the destruction occurred in a 10-mile band across Homestead, 25 miles to the south of Miami, where a typical house sells for Dollars 100,000 to Dollars 150,000. 43 | 44 | 45 | US insurers will face a bill in respect of such properties, but Lloyd's exposure there is minimal. 46 | 47 | 48 | Many destroyed power lines are thought to be uninsured, as are trees and shrubs uprooted across a wide area. 49 | 50 | 51 | Only one big hotel in that area has been badly damaged, a Holiday Inn. 52 | 53 | 54 | Across Florida, some 2m people remained without electricity yesterday and health officials were warning the public to boil or chemically treat all water. 55 | 56 | 57 | Hurricane Hugo, which devastated much of South Carolina in 1989, cost the insurance industry some Dollars 4.2bn. 58 | 59 | 60 | Further uninsured losses may have raised the total to Dollars 6bn-Dollars 10bn. 61 | 62 |
63 | -------------------------------------------------------------------------------- /acceptance_prediction/bert_inference_reference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import random 3 | import os 4 | import json 5 | import jsonlines 6 | import numpy as np 7 | from transformers import Trainer, TrainingArguments, BertTokenizer, BertForSequenceClassification 8 | from datasets import load_dataset, Dataset 9 | from sklearn.metrics import accuracy_score 10 | 11 | with_disagreements = [0, 1] # 0: without conflict, 1: with conflicts 12 | dataset_name = "peersum" 13 | model_name = "result/checkpoint-4000" 14 | max_input_length = 512 15 | batch_size = 4 16 | random.seed(42) 17 | summaries_evaluated_folder = "ground_truth" 18 | 19 | 20 | def acceptance_categorical(acceptance): 21 | if "eject" in acceptance: 22 | return 0 23 | else: 24 | return 1 25 | 26 | 27 | data_path = "../../datasets/" 28 | dataset_all = load_dataset('json', data_files=data_path + '%s.json' % dataset_name, split='all') 29 | print("dataset all", len(dataset_all)) 30 | dataset_test = dataset_all.filter(lambda s: s['label'] == 'test') 31 | dataset_test = dataset_test.select(random.sample(range(len(dataset_test)), 512)) 32 | print("dataset test selected", len(dataset_test)) 33 | 34 | # load tokenizer 35 | tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True) 36 | bert = BertForSequenceClassification.from_pretrained(model_name, num_labels=2) 37 | 38 | 39 | def process_data_to_model_inputs(batch): 40 | summaries = batch["summary"] 41 | for i, summary in enumerate(summaries): 42 | summaries[i] = summary.lower() 43 | input_dict = tokenizer( 44 | summaries, 45 | padding="max_length", 46 | truncation=True, 47 | max_length=max_input_length 48 | ) 49 | 50 | results = {} 51 | results["input_ids"] = input_dict.input_ids 52 | results["attention_mask"] = input_dict.attention_mask 53 | labels = batch["paper_acceptance"] 54 | for i, label in enumerate(labels): 55 | labels[i] = acceptance_categorical(label) 56 | results["labels"] = labels 57 | return results 58 | 59 | 60 | print("Preprocessing dataset test") 61 | dataset_test = dataset_test.map( 62 | process_data_to_model_inputs, 63 | batched=True, 64 | batch_size=batch_size 65 | ) 66 | 67 | # set Python list to PyTorch tensor 68 | dataset_test.set_format( 69 | type="torch", 70 | columns=["input_ids", "attention_mask", "labels"], 71 | ) 72 | 73 | training_args = TrainingArguments( 74 | do_train=True, 75 | do_eval=True, 76 | do_predict=True, 77 | per_device_train_batch_size=batch_size, 78 | per_device_eval_batch_size=batch_size, 79 | output_dir="result", 80 | ) 81 | 82 | 83 | # compute classification accuracy during evaluation 84 | def compute_metrics(pred): 85 | preds, labels = pred 86 | return {"acc": accuracy_score(labels, preds.argmax(-1))} 87 | 88 | 89 | # instantiate trainer 90 | trainer = Trainer( 91 | model=bert, 92 | tokenizer=tokenizer, 93 | args=training_args, 94 | compute_metrics=compute_metrics, 95 | ) 96 | 97 | test_results = trainer.predict(test_dataset=dataset_test) 98 | print(test_results.metrics) 99 | 100 | -------------------------------------------------------------------------------- /crawling_data/data_overview.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # overview statistics of all data 3 | 4 | import sys 5 | import numpy as np 6 | import math 7 | import random 8 | import jsonlines 9 | 10 | 11 | def
loading_original(conference="all", data_name="peersum_all"): 12 | papers = [] 13 | with jsonlines.open("data/%s.json"%data_name, "r") as reader: 14 | for paper in reader: 15 | papers.append(paper) 16 | 17 | papers_iclr17 = [] 18 | papers_iclr18 = [] 19 | papers_iclr19 = [] 20 | papers_iclr20 = [] 21 | papers_iclr21 = [] 22 | papers_iclr22 = [] 23 | papers_nips19 = [] 24 | papers_nips20 = [] 25 | papers_nips21 = [] 26 | for paper in papers: 27 | if "iclr_2017" in paper["paper_id"]: 28 | papers_iclr17.append(paper) 29 | if "iclr_2018" in paper["paper_id"]: 30 | papers_iclr18.append(paper) 31 | if "iclr_2019" in paper["paper_id"]: 32 | papers_iclr19.append(paper) 33 | if "iclr_2020" in paper["paper_id"]: 34 | papers_iclr20.append(paper) 35 | if "iclr_2021" in paper["paper_id"]: 36 | papers_iclr21.append(paper) 37 | if "iclr_2022" in paper["paper_id"]: 38 | papers_iclr22.append(paper) 39 | if "nips_2019" in paper["paper_id"]: 40 | papers_nips19.append(paper) 41 | if "nips_2020" in paper["paper_id"]: 42 | papers_nips20.append(paper) 43 | if "nips_2021" in paper["paper_id"]: 44 | papers_nips21.append(paper) 45 | 46 | if conference == "all": 47 | print("Papers:", len(papers)) 48 | return papers 49 | elif conference == "nips": 50 | papers_nips = papers_nips19 + papers_nips20 + papers_nips21 51 | print("Papers NIPS", len(papers_nips)) 52 | return papers_nips 53 | elif conference == "iclr": 54 | papers_iclr = papers_iclr17 + papers_iclr18 + papers_iclr19 + papers_iclr20 + papers_iclr21 + papers_iclr22 55 | print("Papers ICLR", len(papers_iclr)) 56 | return papers_iclr 57 | else: 58 | print("Unknown conference parameter:", conference) 59 | return [] 60 | papers = loading_original("iclr", "peersum_all") 61 | 62 | nips = [] 63 | iclr = [] 64 | for paper in papers: 65 | if "iclr" in paper["paper_id"]: 66 | iclr.append(paper) 67 | if "nips" in paper["paper_id"]: 68 | nips.append(paper) 69 | 70 | print("nips", len(nips)) 71 | print("iclr", len(iclr)) 72 | 73 | official_threads = [] 74 | public_threads = [] 75 | author_threads = [] 76 | meta_review_texts = [] 77 | paper_scores = [] 78 | 79 | for paper in papers: 80 | meta_review_texts.append(paper["meta_review"]) 81 | paper_score = paper['paper_score'] 82 | if paper_score != -1: 83 | paper_scores.append(paper_score) 84 | 85 | official_threads.append(len(paper["official_threads"])) 86 | public_threads.append(len(paper["public_threads"])) 87 | author_threads.append(len(paper["author_threads"])) 88 | 89 | 90 | print("Average number of official threads per paper", np.mean(official_threads)) 91 | print("Average number of public threads per paper", np.mean(public_threads)) 92 | print("Average number of author threads per paper", np.mean(author_threads)) 93 | print("Avg score", np.mean(paper_scores)) -------------------------------------------------------------------------------- /crawling_data/nips_getdata_2019.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import urllib.request 3 | import json 4 | import os 5 | from urllib.error import HTTPError 6 | 7 | base_url = "https://proceedings.neurips.cc" 8 | year_url = base_url + "/paper/2019" 9 | output_file_name = "data/nips_2019.json" 10 | 11 | req = urllib.request.Request(year_url) 12 | resp = urllib.request.urlopen(req) 13 | data = resp.read().decode('utf-8') 14 | 15 | papers = [] 16 | soup = BeautifulSoup(data, 'html.parser') 17 | paper_list = soup.find("div", attrs={'class':'col'}).ul.find_all('li') 18 | print(len(paper_list)) 19 | 20 | for item in paper_list: 21 | paper = {}
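# The block below opens each paper's detail page and follows its "MetaReview", "Reviews" and
# "Paper" anchors. A minimal sketch of that per-page fetch step, under the assumption that the
# page layout stays as scraped here (fetch_soup is an illustrative helper, not part of this script):
#
#     def fetch_soup(url):
#         req = urllib.request.Request(url)
#         resp = urllib.request.urlopen(req)
#         return BeautifulSoup(resp.read().decode('utf-8'), 'html.parser')
#
#     soup = fetch_soup(base_url + paper["link"])
#     hrefs = soup.find("div", attrs={'class': 'col'}).div.find_all("a")  # MetaReview / Reviews / Paper links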
22 | paper["link"] = item.a["href"] 23 | paper["title"] = item.a.get_text() 24 | paper["authors"] = item.i.get_text() 25 | 26 | req = urllib.request.Request(base_url + paper["link"]) 27 | try: 28 | resp = urllib.request.urlopen(req) 29 | data = resp.read().decode('utf-8') 30 | soup = BeautifulSoup(data, 'html.parser') 31 | content = soup.find("div", attrs={'class': 'col'}) 32 | 33 | hrefs = content.div.find_all("a") 34 | 35 | for h in hrefs: 36 | hcontent = h.get_text() 37 | # meta-review 38 | if hcontent=="MetaReview": 39 | req = urllib.request.Request(base_url + h["href"]) 40 | # print(base_url + content.div.find_all("a")[3].get_text()) 41 | resp = urllib.request.urlopen(req) 42 | data = resp.read() 43 | soup = BeautifulSoup(data, 'html.parser') 44 | paper["meta_review"] = soup.find_all("div")[-1].get_text() 45 | 46 | # reviews 47 | if hcontent == "Reviews": 48 | reviews = [] 49 | req = urllib.request.Request(base_url + h["href"]) 50 | resp = urllib.request.urlopen(req) 51 | data = resp.read() 52 | soup = BeautifulSoup(data, 'html.parser') 53 | for review in soup.find_all("pre", attrs={"class":"review"}): 54 | reviews.append(review.find_next().get_text()) 55 | paper["reviews"] = reviews 56 | print(reviews) 57 | 58 | if hcontent == "Paper": 59 | paper["pdf"] = h["href"] 60 | 61 | 62 | 63 | # abstract 64 | paper["abstract"] = content.find_all('p')[-2].get_text() 65 | 66 | paper["comment"] = "accept" 67 | except HTTPError as e: 68 | print("URL error") 69 | 70 | papers.append(paper) 71 | 72 | 73 | if len(papers)%50==0: 74 | exist_papers = [] 75 | if os.path.isfile(output_file_name): 76 | with open(output_file_name, 'r') as f: 77 | exist_papers = json.load(f) 78 | 79 | exist_papers.extend(papers) 80 | f = open(output_file_name, 'w') 81 | f.write(json.dumps(exist_papers)) 82 | f.close() 83 | print("paper count", len(exist_papers)) 84 | 85 | papers = [] 86 | 87 | 88 | 89 | with open(output_file_name, 'r') as f: 90 | exist_papers = json.load(f) 91 | 92 | exist_papers.extend(papers) 93 | f = open(output_file_name, 'w') 94 | f.write(json.dumps(exist_papers)) 95 | f.close() 96 | print("paper count", len(exist_papers)) 97 | 98 | print("Done") -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_2/input_docs/FT923-5267: -------------------------------------------------------------------------------- 1 | 2 | US INSURERS expect to pay out an estimated Dollars 7.3bn (Pounds 3.7bn) in Florida as a result of Hurricane Andrew - by far the costliest disaster the industry has ever faced. 3 |
4 | 5 | 6 | The figure is the first official tally of the damage resulting from the hurricane, which ripped through southern Florida last week. 7 | 8 | 9 | In the battered region it is estimated that 275,000 people still have no electricity and at least 150,000 are either homeless or are living amid ruins. 10 | 11 | 12 | President George Bush yesterday made his second visit to the region since the hurricane hit. 13 | 14 | 15 | He pledged the government would see through the clean-up 'until the job is done'. 16 | 17 | 18 | Although there had already been some preliminary guesses at the level of insurance claims, yesterday's figure comes from the Property Claims Services division of the American Insurance Services Group, the property-casualty insurers' trade association. 19 | 20 | 21 | It follows an extensive survey of the area by the big insurance companies. 22 | 23 | 24 | Mr Gary Kerney, director of catastrophe services at the PCS, said the industry was expecting about 685,000 claims in Florida alone. 25 | 26 | 27 | It is reckoned the bulk of the damage - over Dollars 6bn in insured claims - is in Dade County, a rural region to the south of Miami. 28 | 29 | 30 | However, the final cost of Hurricane Andrew will be higher still. 31 | 32 | 33 | Yesterday's estimate does not include any projection for claims in Louisiana, which was also affected by the storm, although less severely than Florida. 34 | 35 | 36 | An estimate of the insured losses in this second state will be released later this week. 37 | 38 | 39 | But on the Florida losses alone, Hurricane Andrew becomes the most costly insured catastrophe in the US. 40 | 41 | 42 | Hurricane Hugo, which hit the east coast in September 1989, cost the insurance industry about Dollars 4.2bn. 43 | 44 | 45 | The Oakland fire disaster, in California last year, cost Dollars 1.2bn. 46 | 47 | 48 | By contrast, insurance claims resulting from the Los Angeles riots earlier this year - the most expensive civil disturbance in the US - totalled just Dollars 775m. 49 | 50 | 51 | Hurricane Andrew leaves the US property-casualty insurers facing their worst-ever year for catastrophe losses. 52 | 53 | 54 | The LA riots and a series of tornadoes, wind and hailstorms in states such as Kansas, Oklahoma and Iowa had already produced insured losses of Dollars 3.9bn. 55 | 56 | 57 | With Florida's Hurricane Andrew losses added in, the total rises to Dollars 11.2bn. 58 | 59 | 60 | This easily exceeds the record Dollars 7.6bn of catastrophe losses seen in 1989, when the industry paid out on both Hurricane Hugo and the Loma Prieta earthquake in California. 61 | 62 | 63 | Wall Street, however, has reacted calmly to the record losses expected, and insurers' shares - although lower initially - have been firming recently. 64 | 65 | 66 | The property-casualty industry is thought to have adequate reserves to cover the disaster. 67 |
68 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/models/BoW.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | import numpy as np 9 | from .tokenizer import WhitespaceTokenizer 10 | 11 | class BoW(nn.Module): 12 | """Implements a Bag-of-Words (BoW) model to derive sentence embeddings. 13 | 14 | A weighting can be added to allow the generation of tf-idf vectors. The output vector has the size of the vocab. 15 | """ 16 | 17 | def __init__(self, vocab: List[str], word_weights: Dict[str, float] = {}, unknown_word_weight: float = 1, cumulative_term_frequency: bool = True): 18 | super(BoW, self).__init__() 19 | vocab = list(set(vocab)) #Ensure vocab is unique 20 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight', 'cumulative_term_frequency'] 21 | self.vocab = vocab 22 | self.word_weights = word_weights 23 | self.unknown_word_weight = unknown_word_weight 24 | self.cumulative_term_frequency = cumulative_term_frequency 25 | 26 | #Maps wordIdx -> word weight 27 | self.weights = [] 28 | num_unknown_words = 0 29 | for word in vocab: 30 | weight = unknown_word_weight 31 | if word in word_weights: 32 | weight = word_weights[word] 33 | elif word.lower() in word_weights: 34 | weight = word_weights[word.lower()] 35 | else: 36 | num_unknown_words += 1 37 | self.weights.append(weight) 38 | 39 | logging.info("{} out of {} words without a weighting value. Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 40 | 41 | self.tokenizer = WhitespaceTokenizer(vocab, stop_words=set(), do_lower_case=False) 42 | self.sentence_embedding_dimension = len(vocab) 43 | 44 | 45 | def forward(self, features: Dict[str, Tensor]): 46 | #Nothing to do, everything is done in get_sentence_features 47 | return features 48 | 49 | def tokenize(self, text: str) -> List[str]: 50 | return self.tokenizer.tokenize(text) 51 | 52 | def get_sentence_embedding_dimension(self): 53 | return self.sentence_embedding_dimension 54 | 55 | def get_sentence_features(self, tokens: List[str], pad_seq_length: int): 56 | #return {'input_ids': tokens} 57 | vector = np.zeros(self.get_sentence_embedding_dimension(), dtype=np.float32) 58 | for token in tokens: 59 | if self.cumulative_term_frequency: 60 | vector[token] += self.weights[token] 61 | else: 62 | vector[token] = self.weights[token] 63 | 64 | return {'sentence_embedding': vector} 65 | 66 | def get_config_dict(self): 67 | return {key: self.__dict__[key] for key in self.config_keys} 68 | 69 | def save(self, output_path): 70 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 71 | json.dump(self.get_config_dict(), fOut, indent=2) 72 | 73 | @staticmethod 74 | def load(input_path): 75 | with open(os.path.join(input_path, 'config.json')) as fIn: 76 | config = json.load(fIn) 77 | 78 | return BoW(**config) -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/models/WordWeights.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | 
import logging 8 | 9 | class WordWeights(nn.Module): 10 | """This model can weight word embeddings, for example, with idf-values.""" 11 | 12 | def __init__(self, vocab: List[str], word_weights: Dict[str, float], unknown_word_weight: float = 1): 13 | """ 14 | 15 | :param vocab: 16 | Vocabulary of the tokenizer 17 | :param word_weights: 18 | Mapping of tokens to a float weight value. Word embeddings are multiplied by this float value. The keys of word_weights need not match the vocab (it can contain more or fewer entries) 19 | :param unknown_word_weight: 20 | Weight for words in vocab, that do not appear in the word_weights lookup. These can be for example rare words in the vocab, where no weight exists. 21 | """ 22 | super(WordWeights, self).__init__() 23 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight'] 24 | self.vocab = vocab 25 | self.word_weights = word_weights 26 | self.unknown_word_weight = unknown_word_weight 27 | 28 | weights = [] 29 | num_unknown_words = 0 30 | for word in vocab: 31 | weight = unknown_word_weight 32 | if word in word_weights: 33 | weight = word_weights[word] 34 | elif word.lower() in word_weights: 35 | weight = word_weights[word.lower()] 36 | else: 37 | num_unknown_words += 1 38 | weights.append(weight) 39 | 40 | logging.info("{} of {} words without a weighting value. Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 41 | 42 | self.emb_layer = nn.Embedding(len(vocab), 1) 43 | self.emb_layer.load_state_dict({'weight': torch.FloatTensor(weights).unsqueeze(1)}) 44 | 45 | 46 | def forward(self, features: Dict[str, Tensor]): 47 | input_mask = features['input_mask'] 48 | token_embeddings = features['token_embeddings'] 49 | 50 | #Compute a weight value for each token 51 | token_weights_raw = self.emb_layer(features['input_ids']).squeeze(-1) 52 | token_weights = token_weights_raw * input_mask.float() 53 | token_weights_sum = torch.sum(token_weights, 1) 54 | 55 | #Multiply embedding by token weight value 56 | token_weights_expanded = token_weights.unsqueeze(-1).expand(token_embeddings.size()) 57 | token_embeddings = token_embeddings * token_weights_expanded 58 | 59 | features.update({'token_embeddings': token_embeddings, 'token_weights_sum': token_weights_sum}) 60 | return features 61 | 62 | def get_config_dict(self): 63 | return {key: self.__dict__[key] for key in self.config_keys} 64 | 65 | def save(self, output_path): 66 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 67 | json.dump(self.get_config_dict(), fOut, indent=2) 68 | 69 | @staticmethod 70 | def load(input_path): 71 | with open(os.path.join(input_path, 'config.json')) as fIn: 72 | config = json.load(fIn) 73 | 74 | return WordWeights(**config) -------------------------------------------------------------------------------- /crawling_data/nips_getdata_2022.py: -------------------------------------------------------------------------------- 1 | # The first step is getting the paper list from the OpenReview API, and then getting reviews with forum ids 2 | import json 3 | import openreview 4 | import time 5 | 6 | year = 2022 7 | base_url = "https://api.openreview.net" 8 | client = openreview.Client(baseurl=base_url) 9 | notes = client.get_all_notes(signature='NeurIPS.cc/%s/Conference'%year)# using signature to get all submissions 10 | print("all papers", len(notes)) 11 | 12 | invitations = set([]) 13 | for note in notes: 14 | invitations.add(note.invitation) 15 | 16 | for invitation in invitations: 17 | print(invitation,
len(client.get_all_notes(invitation=invitation))) 18 | 19 | papers = [] 20 | count = 0 21 | for note in notes: 22 | paper = {} 23 | paper["link"] = "https://openreview.net/forum?id=" + note.forum 24 | content = note.content 25 | paper["title"] = content['title'] 26 | paper["authors"] = content['authors'] 27 | paper["abstract"] = content['abstract'] 28 | paper["tl_dr"] = content.get('TL;DR', "") 29 | paper["keywords"] = content['keywords'] 30 | paper["id"] = note.forum 31 | 32 | rcs = client.get_notes( 33 | forum=paper["id"]) # using forum to get notes of each paper, and notes include the paper information, reviews (official and public) and responses. 34 | reviews_commments = [] 35 | paper_invitations = [] 36 | time_final_decision = None 37 | decision = "" 38 | recommendation = "" 39 | for rc in rcs: 40 | # print(note.invitation) 41 | # print("cdate",time.localtime(note.cdate/1000)) 42 | # print("tcdate", time.localtime(note.tcdate / 1000)) 43 | # print("tmdate", time.localtime(note.tmdate / 1000)) 44 | paper_invitations.append(rc.invitation.split("/")[-1]) 45 | if "Submission" in rc.invitation and note.id == paper["id"]: 46 | paper["pdf"] = base_url + rc.content["pdf"] 47 | paper["number"] = rc.number 48 | elif "Meta_Review" in rc.invitation: 49 | # print(note.invitation) 50 | time_final_decision = time.localtime(rc.tmdate / 1000) 51 | paper["final_decision"] = rc.to_json() 52 | print(rc.content['recommendation']) 53 | recommendation = rc.content['recommendation'] 54 | # print(paper['comment']) 55 | elif "Decision" in rc.invitation: 56 | print(rc.content['decision']) 57 | paper["comment"] = rc.content['decision'] 58 | decision = rc.content['decision'] 59 | else: 60 | reviews_commments.append(rc.to_json()) 61 | # for note in notes: 62 | # if note.cdate != None: 63 | # ntime = time.localtime(note.cdate / 1000) 64 | # if ntime > time_final_decision: 65 | # print(time_final_decision, ntime) 66 | if recommendation != decision: 67 | print(paper["link"]) 68 | print("reviews_comments", len(reviews_commments)) 69 | invitation_texts = ",".join(sorted(list(set(paper_invitations)))) 70 | paper["reviews_commments"] = reviews_commments 71 | if "Blind_Submission" in invitation_texts: 72 | count += 1 73 | # print(paper["final_decision"]) 74 | 75 | papers.append(paper) 76 | 77 | print("blind submission", count) 78 | 79 | print(len(papers)) 80 | f = open('data/nips_%s.json'%year, 'w') 81 | f.write(json.dumps(papers)) 82 | f.close() 83 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/data_samplers.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains sampler functions, that can be used to sample mini-batches with specific properties. 3 | """ 4 | from torch.utils.data import Sampler 5 | import numpy as np 6 | from .datasets import SentenceLabelDataset 7 | 8 | 9 | class LabelSampler(Sampler): 10 | """ 11 | This sampler is used for some specific Triplet Losses like BATCH_HARD_TRIPLET_LOSS 12 | or MULTIPLE_NEGATIVES_RANKING_LOSS which require multiple or only one sample from one label per batch. 13 | 14 | It draws n consecutive, random and unique samples from one label at a time. This is repeated for each label. 15 | 16 | Labels with fewer than n unique samples are ignored. 17 | This also applied to drawing without replacement, once less than n samples remain for a label, it is skipped. 
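Illustrative example (editor's addition, not from the original docstring): with samples_per_label=5 and a batch size of 15, each batch is expected to hold 5 consecutive samples from each of 3 different labels.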
18 | 19 | This *DOES NOT* check whether there are more labels than fit in one batch, or whether the batch size is divisible 20 | by the number of samples drawn per label. 21 | 22 | 23 | """ 24 | def __init__(self, data_source: SentenceLabelDataset, samples_per_label: int = 5, 25 | with_replacement: bool = False): 26 | """ 27 | Creates a LabelSampler for a SentenceLabelDataset. 28 | 29 | :param data_source: 30 | the dataset from which samples are drawn 31 | :param samples_per_label: 32 | the number of consecutive, random and unique samples drawn per label 33 | :param with_replacement: 34 | if this is True, then one sample can be drawn in multiple draws (but still not multiple times in the same 35 | drawing). if this is False, then each sample is drawn at most once (depending on the total number of samples per label). 36 | 37 | """ 38 | super().__init__(data_source) 39 | self.data_source = data_source 40 | self.samples_per_label = samples_per_label 41 | self.label_range = np.arange(data_source.num_labels) 42 | self.borders = data_source.labels_right_border 43 | self.with_replacement = with_replacement 44 | np.random.shuffle(self.label_range) 45 | 46 | def __iter__(self): 47 | label_idx = 0 48 | count = 0 49 | already_seen = {} 50 | while count < len(self.data_source): 51 | label = self.label_range[label_idx] 52 | if label not in already_seen: 53 | already_seen[label] = [] 54 | 55 | left_border = 0 if label == 0 else self.borders[label-1] 56 | right_border = self.borders[label] 57 | 58 | if self.with_replacement: 59 | selection = np.arange(left_border, right_border) 60 | else: 61 | selection = [i for i in np.arange(left_border, right_border) if i not in already_seen[label]] 62 | 63 | if len(selection) >= self.samples_per_label: 64 | for element_idx in np.random.choice(selection, self.samples_per_label, replace=False): 65 | count += 1 66 | already_seen[label].append(element_idx) 67 | yield element_idx 68 | 69 | label_idx += 1 70 | if label_idx >= len(self.label_range): 71 | label_idx = 0 72 | np.random.shuffle(self.label_range) 73 | 74 | def __len__(self): 75 | return len(self.data_source) -------------------------------------------------------------------------------- /dataset_analysis/novel_ngrams_mds.py: -------------------------------------------------------------------------------- 1 | # novel n-gram analysis for different MDS datasets 2 | import sys 3 | from sklearn.feature_extraction.text import CountVectorizer 4 | import numpy as np 5 | import functools 6 | import multiprocessing 7 | import random 8 | 9 | sys.path.append("../../") 10 | from peersum.loading_data.peersum_loader import loading_peersum 11 | from peersum.loading_data.mds_loader import loading_mds 12 | from utils.cleaning import cleaning_document, cleaning_documents 13 | 14 | 15 | def novel_ngrams_multi_process(i, samples, analyzer): 16 | sample = samples[i] 17 | c = cleaning_documents(sample["source_documents"], multi_process=False, lemmatization=True, 18 | removing_stop_words=True, sentence_split=False) 19 | s = cleaning_document(sample["summary"], lemmatization=True, 20 | removing_stop_words=True, sentence_split=False) 21 | 22 | ngrams_list_summary = analyzer(s) 23 | ngrams_count_summary = len(ngrams_list_summary) 24 | 25 | ngrams_set_source_documents = [] 26 | for d in c: 27 | ngrams_set_d = analyzer(d) 28 | ngrams_set_source_documents.extend(ngrams_set_d) 29 | 30 | # % novel n-grams 31 | diff = [] 32 | for w in ngrams_list_summary: 33 | if w not in ngrams_set_source_documents: 34 | diff.append(w) 35 | if ngrams_count_summary == 0: 36 | score = 0
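# The score computed below is the novel n-gram ratio: the fraction of summary n-grams that never
# occur in any source document (a measure of abstractiveness); an empty summary scores 0 to avoid
# division by zero.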
37 | else: 38 | score = len(diff) / ngrams_count_summary 39 | return score 40 | 41 | 42 | def novel_ngrams(samples, ngrams=1): 43 | documents_all = [] 44 | for item in samples: 45 | documents_all.append(item["summary"]) 46 | documents_all.extend(item["source_documents"]) 47 | count_vect = CountVectorizer(ngram_range=(ngrams, ngrams), min_df=1) 48 | count_vect.fit(documents_all) 49 | analyzer = count_vect.build_analyzer() 50 | 51 | count_all = len(samples) 52 | partial_novel_grams = functools.partial(novel_ngrams_multi_process, samples=samples, analyzer=analyzer) 53 | with multiprocessing.Pool(multiprocessing.cpu_count() - 1) as p: 54 | all_novel_ngram_scores = list(p.imap(partial_novel_grams, range(count_all), chunksize=128)) 55 | print("Novel %d-grams" % ngrams, np.mean(all_novel_ngram_scores)) 56 | 57 | 58 | if __name__ == "__main__": 59 | sampling_count = 1024 60 | print("PeerSum") 61 | papers = loading_peersum(including_public=True, including_author=True, including_abstract=True) 62 | papers = random.sample(papers, sampling_count) 63 | for n in [1, 2, 3]: 64 | novel_ngrams(papers, ngrams=n) 65 | 66 | # print("Multi-News") 67 | # samples = loading_mds(folder="../../datasets", data_name="multinews") 68 | # samples = random.sample(samples, sampling_count) 69 | # for n in [1, 2, 3]: 70 | # novel_ngrams(samples, ngrams=n) 71 | # del samples 72 | # 73 | # print("WCEP") 74 | # samples = loading_mds(folder="../../datasets", data_name="wcep_100") 75 | # samples = random.sample(samples, sampling_count) 76 | # for n in [1, 2, 3]: 77 | # novel_ngrams(samples, ngrams=n) 78 | # del samples 79 | # 80 | # print("Multi-XScience") 81 | # samples = loading_mds(folder="../../datasets", data_name="multixscience") 82 | # samples = random.sample(samples, sampling_count) 83 | # for n in [1, 2, 3]: 84 | # novel_ngrams(samples, ngrams=n) 85 | # del samples 86 | # 87 | # print("Wikisum") 88 | # samples = loading_mds(folder="../../datasets", data_name="wikisum") 89 | # samples = random.sample(samples, sampling_count) 90 | # for n in [1, 2, 3]: 91 | # novel_ngrams(samples, ngrams=n) 92 | # del samples 93 | 94 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/utils/writer.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from collections import OrderedDict 3 | import os, shutil 4 | import codecs 5 | 6 | def create_dir(dirpath): 7 | p = os.path.normpath(os.path.expanduser(dirpath)) 8 | if not os.path.isdir(p): 9 | os.makedirs(p) 10 | return p 11 | 12 | def clean_create_dir(dirpath): 13 | if os.path.isdir(dirpath): 14 | shutil.rmtree(dirpath) 15 | if not os.path.isdir(dirpath): 16 | os.makedirs(dirpath) 17 | 18 | def write_rl_details(summary, model_name_list, R1_list, R2_list, R4_list, filename): 19 | ''' 20 | YG: to write the results of the RL-based summarizer 21 | :param summary: a list of sentences (strings) 22 | :param R1: 23 | :param R2: 24 | :param R4: 25 | :param filename: 26 | :return: null 27 | ''' 28 | dirpath = os.path.dirname(filename) 29 | if not os.path.isdir(dirpath): 30 | os.makedirs(dirpath) 31 | if len(summary) == 0: 32 | return 33 | 34 | for i in range(len(model_name_list)): 35 | model_name = model_name_list[i] 36 | text = '\nmodel name: {} \n'.format(model_name) 37 | text += '\tR1 : {} ;'.format(R1_list[i]) 38 | text += 'R2 : {} ;'.format(R2_list[i]) 39 | text += 'R4 : {} ;'.format(R4_list[i]) 40 | 41 | text += '\n' 42 | for sent in summary: 43 | text += 
sent + '\n' 44 | 45 | write_to_file(text, filename) 46 | 47 | 48 | def write_details_file(details_list,filename): 49 | ''' 50 | [[0, R1, R2, text], [1, R1, R2, text]], [[0, R1, R2, text], [1, R1, R2, text]] 51 | ''' 52 | dirpath = os.path.dirname(filename) 53 | if not os.path.isdir(dirpath): 54 | os.makedirs(dirpath) 55 | if len(details_list) == 0: 56 | return 57 | 58 | max_len = max([len(feedback) for feedback in details_list]) 59 | text = '' 60 | for i in range(max_len): 61 | text += '%s,' % (i) 62 | for feedback in details_list: 63 | if i >= len(feedback): 64 | text += ",,,,,," 65 | else: 66 | _, R1, R2, SU4, accept, reject, summ_text = feedback[i] 67 | text += u"%s,%s,%s,%s,%s,\"%s\"," % (str(R1), str(R2), str(SU4), str(accept), str(reject), str(summ_text)) 68 | text = text[:-1] + '\n' 69 | write_to_file(text, filename) 70 | 71 | def write_config(filename, config): 72 | string = "\n" + config + "\n" 73 | write_to_file(string, filename) 74 | 75 | def write_to_csv(result_dict, filename): 76 | ordered_dict = OrderedDict(sorted(result_dict.items(), key=lambda t: t[0])) 77 | with open(filename, 'w', newline='') as csvfile: 78 | fieldnames = ["Experiment","ROUGE-1 R","ROUGE-1 P","ROUGE-1 F","ROUGE-2 R","ROUGE-2 P",\ 79 | "ROUGE-2 F", "ROUGE-3 R", "ROUGE-3 P", "ROUGE-3 F", "ROUGE-4 R",\ 80 | "ROUGE-4 P", "ROUGE-4 F", "ROUGE-SU4 R", "ROUGE-SU4 P", "ROUGE-SU4 F"] 81 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 82 | writer.writeheader() 83 | for key in ordered_dict: 84 | writer.writerow(ordered_dict[key]) 85 | 86 | def write_to_file(text, filename, add=False): 87 | if not add: 88 | fp = open(filename,'w') 89 | else: 90 | fp = open(filename,'a') 91 | fp.write(text) 92 | fp.close() 93 | 94 | 95 | def append_to_file(text, filename): 96 | if os.path.isfile(filename): 97 | fp = open(filename,'a') 98 | else: 99 | fp = open(filename,'w+') 100 | fp.write(text) 101 | fp.close() -------------------------------------------------------------------------------- /utils/unieval/scorer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from transformers import AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM 4 | from tqdm import tqdm 5 | 6 | 7 | class UniEvaluator: 8 | def __init__(self, model_name_or_path, max_length=1024, device='cuda:0', cache_dir=None): 9 | """ Set up model """ 10 | self.device = device 11 | self.max_length = max_length 12 | 13 | self.config = AutoConfig.from_pretrained(model_name_or_path, cache_dir=cache_dir) 14 | self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir) 15 | self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, config=self.config, 16 | cache_dir=cache_dir) 17 | 18 | self.model.eval() 19 | self.model.to(device) 20 | 21 | self.softmax = nn.Softmax(dim=1) 22 | 23 | self.pos_id = self.tokenizer("Yes")["input_ids"][0] 24 | self.neg_id = self.tokenizer("No")["input_ids"][0] 25 | 26 | def score(self, inputs, batch_size=8): 27 | """ 28 | Get scores for the given samples. 29 | final_score = positive_score / (positive_score + negative_score) 30 | """ 31 | 32 | # The implementation of "forward" in T5 still requires decoder_input_ids. 33 | # Therefore, we construct a random one-word target sequence. 34 | # The content of the target has no effect on the final scores.
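# (Why a one-token target suffices: with a single-token label per input,
# output.logits below contains exactly one decoding step per sample, and the
# softmax over the vocabulary at that step gives the probabilities of
# generating "Yes" versus "No" as the first token, indexed via pos_id/neg_id.)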
35 | tgts = ["No" for _ in range(len(inputs))] 36 | 37 | pos_score_list, neg_score_list = [], [] 38 | for i in tqdm(range(0, len(inputs), batch_size)): 39 | src_list = inputs[i: i + batch_size] 40 | tgt_list = tgts[i: i + batch_size] 41 | try: 42 | with torch.no_grad(): 43 | encoded_src = self.tokenizer( 44 | src_list, 45 | max_length=self.max_length, 46 | truncation=True, 47 | padding=True, 48 | return_tensors='pt' 49 | ) 50 | encoded_tgt = self.tokenizer( 51 | tgt_list, 52 | max_length=self.max_length, 53 | truncation=True, 54 | padding=True, 55 | return_tensors='pt' 56 | ) 57 | 58 | src_tokens = encoded_src['input_ids'].to(self.device) 59 | src_mask = encoded_src['attention_mask'].to(self.device) 60 | 61 | tgt_tokens = encoded_tgt['input_ids'].to(self.device)[:, 0].unsqueeze(-1) 62 | 63 | output = self.model( 64 | input_ids=src_tokens, 65 | attention_mask=src_mask, 66 | labels=tgt_tokens 67 | ) 68 | logits = output.logits.view(-1, self.model.config.vocab_size) 69 | 70 | pos_score = self.softmax(logits)[:, self.pos_id] # Yes 71 | neg_score = self.softmax(logits)[:, self.neg_id] # No 72 | 73 | cur_pos_score = [x.item() for x in pos_score] 74 | cur_neg_score = [x.item() for x in neg_score] 75 | pos_score_list += cur_pos_score 76 | neg_score_list += cur_neg_score 77 | 78 | except RuntimeError: 79 | print(f'source: {src_list}') 80 | print(f'target: {tgt_list}') 81 | exit(0) 82 | 83 | score_list = [] 84 | for i in range(len(pos_score_list)): 85 | score_list.append(pos_score_list[i] / (pos_score_list[i] + neg_score_list[i])) 86 | 87 | return score_list -------------------------------------------------------------------------------- /crawling_data/data_verification.py: -------------------------------------------------------------------------------- 1 | #! 
2 | # Data verification 3 | 4 | import json 5 | 6 | def nips_verify(file_name): 7 | with open(file_name) as f: 8 | data = json.load(f) 9 | paper_count = len(data) 10 | print("All items", paper_count) 11 | 12 | count = 0 13 | avg_token_count_response = 0 14 | for paper in data: 15 | if paper.get("meta_review", "")!="" and "reviews" in paper.keys() and paper.get("pdf")!="" and "author_responses" in paper.keys() and paper.get("abstract", "")!="" and paper["author_responses"].get("pdf")!="" and paper["author_responses"].get("responses", "")!="": 16 | count += 1 17 | 18 | response = paper["author_responses"]["responses"] 19 | token_count_response = len(response.split()) 20 | avg_token_count_response += token_count_response 21 | print("Valid items", count) 22 | print("avg_token_count_response", avg_token_count_response/count) 23 | 24 | def iclr_verify(file_name): 25 | with open(file_name) as f: 26 | data = json.load(f) 27 | paper_count = len(data) 28 | print("All items", paper_count) 29 | 30 | valid_paper_count = 0 31 | all_token_count_fdecision = 0 32 | all_count_direct_reviews = 0 33 | all_token_count_direct_reviews = 0 34 | all_count_responses = 0 35 | all_token_count_responses = 0 36 | keys_sets = set() 37 | for paper in data: 38 | if paper.get("link", "") != "" and paper.get("title", "") != "" and paper.get("authors", "") != "" and paper.get("abstract", "") != "" and "final_decision" in paper.keys() and paper.get("reviews_commments", []) != []:# "reviews_commments" (sic) is the key name used in the crawled JSON 39 | valid_paper_count += 1 40 | final_decision = paper["final_decision"]["content"]["comment"]# for ICLR 2019 this key is "metareview" 41 | # print(final_decision) 42 | token_count_fdecision = len(final_decision.split()) 43 | all_token_count_fdecision += token_count_fdecision 44 | 45 | 46 | direct_reviews = [] 47 | responses = [] 48 | for rc in paper["reviews_commments"]: 49 | if rc["replyto"] == paper["id"]:# official review or public comment 50 | cs = rc["content"].keys() 51 | keys_sets.add(",".join(sorted(list(cs)))) 52 | if "review" in cs: 53 | # print(rc) 54 | direct_reviews.append(rc["content"]["review"]) 55 | elif "comment" in cs: 56 | direct_reviews.append(rc["content"]["comment"]) 57 | else: 58 | print(cs) 59 | else: 60 | responses.append(rc["content"]["comment"]) 61 | 62 | all_count_direct_reviews += len(direct_reviews) 63 | for d in direct_reviews: 64 | # print(len(d.split())) 65 | all_token_count_direct_reviews += len(d.split()) 66 | 67 | all_count_responses += len(responses) 68 | for r in responses: 69 | all_token_count_responses += len(r.split()) 70 | 71 | print("Valid papers", valid_paper_count) 72 | print("content key set", keys_sets) 73 | print("avg_token_count_fdecision", all_token_count_fdecision / valid_paper_count) 74 | print("avg_count_direct_reviews", all_count_direct_reviews / valid_paper_count) 75 | print("avg_token_count_direct_review", all_token_count_direct_reviews / all_count_direct_reviews) 76 | print("avg_count_responses", all_count_responses / valid_paper_count) 77 | print("avg_token_count_response", all_token_count_responses / all_count_responses) 78 | 79 | 80 | if __name__ == "__main__": 81 | file_name = "data/nips_2019.json" 82 | nips_verify(file_name) 83 | 84 | file_name = "data/iclr_2020.json" 85 | iclr_verify(file_name) 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/models/tokenizer/WordTokenizer.py:
-------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Union, Tuple, List, Iterable, Dict 3 | 4 | ENGLISH_STOP_WORDS = ['!', '"', "''", "``", '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', 'a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'ain', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'aren', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'con', 'could', 'couldn', 'couldnt', 'cry', 'd', 'de', 'describe', 'detail', 'did', 'didn', 'do', 'does', 'doesn', 'doing', 'don', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had', 'hadn', 'has', 'hasn', 'hasnt', 'have', 'haven', 'having', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'isn', 'it', 'its', 'itself', 'just', 'keep', 'last', 'latter', 'latterly', 'least', 'less', 'll', 'ltd', 'm', 'ma', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mightn', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'mustn', 'my', 'myself', 'name', 'namely', 'needn', 'neither', 'never', 'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'o', 'of', 'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'part', 'per', 'perhaps', 'please', 'put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed', 'seeming', 'seems', 'serious', 'several', 'shan', 'she', 'should', 'shouldn', 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such', 'system', 't', 'take', 'ten', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under', 'until', 'up', 'upon', 'us', 've', 'very', 'via', 'was', 'wasn', 'we', 'well', 'were', 'weren', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without', 'won', 'would', 'wouldn', 'y', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves'] 5 | 6 | 7 | class WordTokenizer(ABC): 8 | 
@abstractmethod 9 | def set_vocab(self, vocab: Iterable[str]): 10 | pass 11 | 12 | @abstractmethod 13 | def get_vocab(self, vocab: Iterable[str]): 14 | pass 15 | 16 | @abstractmethod 17 | def tokenize(self, text: str) -> List[int]: 18 | pass 19 | 20 | @abstractmethod 21 | def save(self, output_path: str): 22 | pass 23 | 24 | @staticmethod 25 | @abstractmethod 26 | def load(input_path: str): 27 | pass -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_2/input_docs/FT923-6110: -------------------------------------------------------------------------------- 1 | 2 | DAMAGE CAUSED by Hurricane Andrew could rise to Dollars 20bn, it was estimated yesterday, as one of the costliest US storms this century threatened a further devastating landfall near the city of New Orleans. 3 |

4 |

5 | Government officials in Louisiana, Mississippi and Texas yesterday advised or ordered more than 2m people to evacuate coastal areas. 6 |

7 |

8 | The hurricane tore through southern Florida early on Monday morning, causing billions of dollars of property damage and at least 12 deaths, and yesterday was moving north-west across the Gulf of Mexico with winds of about 140 miles an hour. 9 |

10 |

11 | At least three people died on Sunday when Hurricane Andrew crossed the Bahamas. 12 |

13 |

14 | Ms Kate Hale, director of emergency services in Florida's Dade County, which bore the brunt of the storm, estimated that Andrew had already caused Dollars 15bn to Dollars 20bn (Pounds 7.5bn-Pounds 10bn) of damage. 15 |

16 |

17 | However, insurance industry analysts cautioned that it was too early to assess the costs accurately. 18 |

19 |

20 | The US industry's Property Claims Service, the official compiler of disaster losses, had yet to compile a preliminary tally of the Florida bill. 21 |

22 |

23 | A hurricane warning was in effect yesterday along 470 miles of Gulf coast from Pascagoula, Mississippi, to Galvestone, Texas. 24 |

25 |

26 | Several forecasting agencies suggested the likeliest landfall was in central Louisiana, to the west of New Orleans, possibly late last night or this morning. 27 |

28 |

29 | New Orleans, with a population of 1.6m, is particularly vulnerable because the city lies below sea level, has the Mississippi River running through its centre and a large lake immediately to the north. 30 |

31 |

32 | Much of America's oil refining industry is concentrated along coastal Texas and Louisiana and several refineries were yesterday partially shut down. 33 |

34 |

35 | These included British Petroleum's Belle Chasse plant in Louisiana. 36 |

37 |

38 | In Florida, Andrew caused greatest havoc in a largely suburban swathe some 10-15 miles south of Miami. 39 |

40 |

41 | The town of Homestead, near the centre of the storm, was largely flattened, including a local air force base. 42 |

43 |

44 | Miami's city centre escaped with relatively light damage. 45 |

46 |

47 | More than 24 hours after the hurricane, some 825,000 households and businesses were still without power. 48 |

49 |

50 | The brunt of insurance claims from the Florida storm will fall on the US industry, and companies with a heavy local exposure include the State Farm Group and the Allstate Insurance unit of Sears Roebuck. 51 |

52 |

53 | These are also the leading property/casualty and home insurance groups in Louisiana, together with American International Group. 54 |

55 |

56 | A spokesman for State Farm Insurance said he believed the company had roughly 20 per cent of the Florida market. 57 |

58 |

59 | The mutually-owned company has no reinsurance. 60 |

61 |

62 | Its size has made obtaining reinsurance cover difficult and its reserves, at about Dollars 24bn, have made it unnecessary. 63 |

64 |

65 | According to Balcombe Group, a UK-based claims adjustment firm, other insurers with large exposure in the hurricane-hit area are Hartford Insurance, Aetna and Travellers. 66 |

67 |

68 | Travellers said it had flown 50 claims adjusters in to Florida late on Monday and was assessing losses. 69 |

70 |

71 | About 12 per cent of Travellers' home insurance premium income came from Florida last year, and 4.6 per cent of its commercial insurance premiums. 72 |

73 |

74 | The last serious US hurricane, Hugo, which struck South Carolina in 1989, cost the industry Dollars 4.2bn from insured losses, though estimates of the total damage caused ranged between Dollars 6bn and Dollars 10bn. 75 |

76 |
77 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_2/input_docs/FT923-6455: -------------------------------------------------------------------------------- 1 | 2 |

3 | US CITIES along the Gulf of Mexico from Alabama to eastern Texas were on storm watch last night as Hurricane Andrew headed west after sweeping across southern Florida, causing at least eight deaths and severe property damage. 4 |

5 |

6 | The hurricane was one of the fiercest in the US in decades and the first to hit Miami directly in a quarter of a century. 7 |

8 |

9 | In the Bahamas, government spokesman Mr Jimmy Curry said four deaths had been reported on outlying eastern islands. 10 |

11 |

12 | Mr Justin Balcombe, of UK-based insurance adjuster Balcombe Group, said total losses could exceed Dollars 15bn if business interruption claims were taken into account. 13 |

14 |

15 | That compares with the Dollars 4bn-Dollars 6n (Pounds 2.1bn-Pounds 3.1bn) of insurance industry losses caused by the last big US hurricane, Hugo, which hit South Carolina in 1989. 16 |

17 |

18 | The brunt of the losses are likely to be concentrated among US insurers, industry analysts said yesterday. 19 |

20 |

21 | Mr George Lloyd-Roberts, chairman of Lloyd's Underwriters' Non-Marine Association, said that, unless damage claims exceeded Pounds 3bn, the Lloyd's insurance market would feel little impact. 22 |

23 |

24 | Because the reinsurance of reinsurance risk - known as the retrocession market - has shrunk considerably in recent years, US insurers have placed far fewer of their risks through Lloyd's. 25 |

26 |

27 | Mr Roger Hill, insurance analyst at Warburg Securities, said he estimated that mainline UK insurers faced no more than Pounds 75m in damage claims so far. 28 |

29 |

30 | 'At the moment we are relaxed about it,' he said. 31 |

32 |

33 | The real question, he added, is the level of reinsurance available to the UK underwriters. 34 |

35 |

36 | Royal Insurance estimated the company's losses at no more than Pounds 20m. 37 |

38 |

39 | Among other UK insurers, Mr Hill estimated that General Accident may face losses of up to Pounds 30m, while Guardian Royal Exchange faced Pounds 5m and Sun Alliance and Commercial Union Pounds 10m each. 40 |

41 |

42 | However, Hurricane Andrew gathered fresh strength as it moved across the Gulf of Mexico and there was concern last night that it might head towards New Orleans, which is especially low lying and could suffer severe flood damage. 43 |

44 |

45 | Scientists said the storm could make landfall anywhere between the Alabama port of Mobile and the Louisiana-Texas border, probably tomorrow night or early Thursday. 46 |

47 |

48 | It could threaten the large concentration of offshore oil production facilities in the Gulf of Mexico. 49 |

50 |

51 | Shell Oil was evacuating most of the 900 workers on its offshore platforms as a precaution. 52 |

53 |

54 | A substantial part of America's oil refining industry is concentrated on the Gulf coast, in Louisiana and Texas, and officials there were reviewing emergency plans to curtail or shut down plant operations. 55 |

56 |

57 | Andrew, the first Caribbean hurricane of the season, hit the eastern coast of Florida early yesterday, gusting up to 165mph. 58 |

59 |

60 | It ripped roofs off houses, smashed cars and trucks, snapped power lines and uprooted trees before heading out over the Gulf. 61 |

62 |

63 | A million people had been ordered to flee their homes in southern Florida as the hurricane moved in from the Bahamas on Sunday. 64 |

65 |

66 | The Florida Power and Light company said that about 1.2m of its customers, or 32 per cent, were without power. 67 |

68 |

69 | Some of the strongest winds were in the affluent suburb of Coral Gables, just south of Miami, where the National Hurricane Center is located. 70 |

71 |

72 | Its radar and satellite antennae were blown away. 73 |

74 |

75 | President Bush authorised federal disaster assistance for the affected areas and made plans for an inspection tour of the state. 76 |

77 |

78 | Picture, Page 14 79 |

80 |
81 | -------------------------------------------------------------------------------- /human_annotation/peersum_dataset_quality/final_analysis.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import numpy as np 3 | import jsonlines 4 | from nltk.tokenize import sent_tokenize 5 | 6 | folder = "annotated_results" 7 | samples = [] 8 | for i in range(10): 9 | print(i) 10 | with jsonlines.open(os.path.join(folder, "peersum_sampled_%d.json" % i)) as reader: 11 | for line in reader: 12 | samples.append(line) 13 | 14 | 15 | variance_low_words = [] 16 | variance_medium_words = [] 17 | variance_high_words = [] 18 | variance_low_sents = [] 19 | variance_medium_sents = [] 20 | variance_high_sents = [] 21 | for sample in samples: 22 | anchored_text = sample["anchored_texts"] 23 | summary = sample["summary"] 24 | summary_sents = sent_tokenize(summary) 25 | anchored_sents = [] 26 | for sent in summary_sents: 27 | if sent in anchored_text: 28 | anchored_sents.append(sent) 29 | review_score_variance = sample["review_score_variance"] 30 | ratio_words = len(anchored_text.split()) / len(summary.split()) 31 | ratio_sents = len(anchored_sents) / len(summary_sents) 32 | if 0 <= review_score_variance < 2: 33 | variance_low_words.append(ratio_words) 34 | variance_low_sents.append(ratio_sents) 35 | if 2 <= review_score_variance < 8: 36 | variance_medium_words.append(ratio_words) 37 | variance_medium_sents.append(ratio_sents) 38 | if 8 <= review_score_variance < 12: 39 | variance_high_words.append(ratio_words) 40 | variance_high_sents.append(ratio_sents) 41 | 42 | print("low", np.mean(variance_low_words)) 43 | print("medium", np.mean(variance_medium_words)) 44 | print("high", np.mean(variance_high_words)) 45 | 46 | print("low", np.mean(variance_low_sents)) 47 | print("medium", np.mean(variance_medium_sents)) 48 | print("high", np.mean(variance_high_sents)) 49 | 50 | 51 | all_papers = [] 52 | with jsonlines.open("/home/miao4/punim0521/NeuralAbstractiveSummarization/peersum/crawling_data/data/peersum_all.json", 53 | "r") as reader: 54 | for paper in reader: 55 | all_papers.append(paper) 56 | 57 | clusters = {} 58 | for paper in all_papers: 59 | summary = paper["meta_review"] 60 | paper_id = paper['paper_id'] 61 | paper_acceptance = paper["paper_acceptance"] 62 | source_documents = [] 63 | 64 | review_scores = [] 65 | for i, review_i in enumerate(paper["reviews"]): 66 | if review_i["rating"] > 0: 67 | review_scores.append(review_i["rating"]) 68 | 69 | source_documents.append(paper["paper_abstract"]) 70 | for review in paper["reviews"]: 71 | if "comment" in review.keys(): 72 | text = review["comment"] 73 | source_documents.append(text) 74 | 75 | if summary.strip() != "" and len(source_documents) > 1: 76 | clusters[summary] = review_scores 77 | 78 | 79 | words_ratios_range_low = [] 80 | words_ratio_range_high = [] 81 | variances_range_low = [] 82 | variance_range_high = [] 83 | for sample in samples: 84 | anchored_text = sample["anchored_texts"] 85 | summary = sample["summary"] 86 | summary_sents = sent_tokenize(summary) 87 | anchored_sents = [] 88 | for sent in summary_sents: 89 | if sent in anchored_text: 90 | anchored_sents.append(sent) 91 | review_scores = clusters[summary] 92 | review_score_variance = sample["review_score_variance"] 93 | ratio_words = len(anchored_text.split()) / len(summary.split()) 94 | # ratio_sents = len(anchored_sents) / len(summary_sents) 95 | if np.max(review_scores)-np.min(review_scores) < 4: 96 | 
variances_range_low.append(review_score_variance) 97 | words_ratios_range_low.append(ratio_words) 98 | else: 99 | variance_range_high.append(review_score_variance) 100 | words_ratio_range_high.append(ratio_words) 101 | 102 | print(variances_range_low) 103 | print(variance_range_high) 104 | print("low range", len(variances_range_low), np.mean(variances_range_low), np.mean(words_ratios_range_low)) 105 | print("high range", len(variance_range_high), np.mean(variance_range_high), np.mean(words_ratio_range_high)) -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/models/Pooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | 8 | 9 | class Pooling(nn.Module): 10 | """Performs pooling (max or mean) on the token embeddings. 11 | 12 | Using pooling, it generates a fixed-sized sentence embedding from a variable-sized sentence. This layer also allows using the CLS token if it is returned by the underlying word embedding model. 13 | You can concatenate multiple poolings together. 14 | """ 15 | def __init__(self, 16 | word_embedding_dimension: int, 17 | pooling_mode_cls_token: bool = False, 18 | pooling_mode_max_tokens: bool = False, 19 | pooling_mode_mean_tokens: bool = True, 20 | pooling_mode_mean_sqrt_len_tokens: bool = False, 21 | ): 22 | super(Pooling, self).__init__() 23 | 24 | self.config_keys = ['word_embedding_dimension', 'pooling_mode_cls_token', 'pooling_mode_mean_tokens', 'pooling_mode_max_tokens', 'pooling_mode_mean_sqrt_len_tokens'] 25 | 26 | self.word_embedding_dimension = word_embedding_dimension 27 | self.pooling_mode_cls_token = pooling_mode_cls_token 28 | self.pooling_mode_mean_tokens = pooling_mode_mean_tokens 29 | self.pooling_mode_max_tokens = pooling_mode_max_tokens 30 | self.pooling_mode_mean_sqrt_len_tokens = pooling_mode_mean_sqrt_len_tokens 31 | 32 | pooling_mode_multiplier = sum([pooling_mode_cls_token, pooling_mode_max_tokens, pooling_mode_mean_tokens, pooling_mode_mean_sqrt_len_tokens]) 33 | self.pooling_output_dimension = (pooling_mode_multiplier * word_embedding_dimension) 34 | 35 | def forward(self, features: Dict[str, Tensor]): 36 | token_embeddings = features['token_embeddings'] 37 | cls_token = features['cls_token_embeddings'] 38 | input_mask = features['input_mask'] 39 | 40 | ## Pooling strategy 41 | output_vectors = [] 42 | if self.pooling_mode_cls_token: 43 | output_vectors.append(cls_token) 44 | if self.pooling_mode_max_tokens: 45 | input_mask_expanded = input_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 46 | token_embeddings[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value 47 | max_over_time = torch.max(token_embeddings, 1)[0] 48 | output_vectors.append(max_over_time) 49 | if self.pooling_mode_mean_tokens or self.pooling_mode_mean_sqrt_len_tokens: 50 | input_mask_expanded = input_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 51 | sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) 52 | 53 | #If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present 54 | if 'token_weights_sum' in features: 55 | sum_mask = features['token_weights_sum'].unsqueeze(-1).expand(sum_embeddings.size()) 56 | else: 57 | sum_mask = input_mask_expanded.sum(1) 58 | 59 | sum_mask = 
torch.clamp(sum_mask, min=1e-9) 60 | 61 | if self.pooling_mode_mean_tokens: 62 | output_vectors.append(sum_embeddings / sum_mask) 63 | if self.pooling_mode_mean_sqrt_len_tokens: 64 | output_vectors.append(sum_embeddings / torch.sqrt(sum_mask)) 65 | 66 | output_vector = torch.cat(output_vectors, 1) 67 | features.update({'sentence_embedding': output_vector}) 68 | return features 69 | 70 | def get_sentence_embedding_dimension(self): 71 | return self.pooling_output_dimension 72 | 73 | def get_config_dict(self): 74 | return {key: self.__dict__[key] for key in self.config_keys} 75 | 76 | def save(self, output_path): 77 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 78 | json.dump(self.get_config_dict(), fOut, indent=2) 79 | 80 | @staticmethod 81 | def load(input_path): 82 | with open(os.path.join(input_path, 'config.json')) as fIn: 83 | config = json.load(fIn) 84 | 85 | return Pooling(**config) 86 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_2/input_docs/FT923-5089: -------------------------------------------------------------------------------- 1 | 2 |

3 | THERE are growing signs that Hurricane Andrew, unwelcome as it was for the devastated inhabitants of Florida and Louisiana, may in the end do no harm to the re-election campaign of President George Bush. 4 |

5 |

6 | After a faltering and heavily criticised initial response to the disaster, both the president and his administration seem finally to be getting assistance to those rendered homeless and to businesses and farms that have been destroyed. 7 |

8 |

9 | In the process, Mr Bush has been able to call on the power of incumbency, the one asset denied his presidential rival, Mr Bill Clinton, who is to visit Florida today. 10 |

11 |

12 | This was brought home graphically by the president's announcement that Homestead Air Force base in Florida - a major local employer virtually destroyed by Andrew - would be rebuilt. 13 |

14 |

15 | His poignant and brief address to the nation on Tuesday night, committing the government to pay the emergency relief costs and calling on Americans to contribute to the American Red Cross, also struck the right sort of note. 16 |

17 |

18 | It was only his tenth such televised speech from the Oval Office, itself a testimony to the gravity of the situation. 19 |

20 |

21 | As if to underline the political benefit to the president, a Harris poll conducted from August 26 to September 1 yesterday showed Mr Bush with 45 per cent support - behind Mr Clinton by just five points - reflecting a closer race than other recent surveys. 22 |

23 |

24 | With Mr Bush constantly in the headlines, Mr Clinton has been left on the sidelines, able to do little more than offer sympathy and sotto voce criticism of the president's initial stumbling. 25 |

26 |

27 | First responses were certainly found wanting, specifically in the performance of the Federal Emergency Management Agency (FEMA) set up by President Carter in 1979 to handle disasters such as Andrew. 28 |

29 |

30 | FEMA has, under the Republicans, become the ultimate patronage backwater, with, according to one congressional study, ten times as many political appointees as the typical arm of government. 31 |

32 |

33 | Its current head, Mr Wallace Stickney, is a New Hampshire political associate of Mr John Sununu, the former state governor and White House chief of staff. 34 |

35 |

36 | Contrary to its brief, but confirming a prescient recent report by a House committee that Mr Stickney was 'uninterested in substantive programmes', FEMA was caught completely unprepared by Andrew, resulting in unseemly disputes between state and federal authorities over who did what in bringing relief. 37 |

38 |

39 | However, the arrival in Florida of the military and the assignment of special responsibilities to Mr Andrew Card, the young and telegenic transportation secretary, is making a difference. 40 |

41 |

42 | Also increasingly evident is the hand of Mr James Baker, now ensconced at the White House. 43 |

44 |

45 | It was Mr Baker who reshuffled the president's campaign schedule to make room for visits to the devastated areas and who pushed for a bigger role for the previously obscure Mr Card. 46 |

47 |

48 | It is also Mr Baker who is making the most of presidential powers to dispense largesse. 49 |

50 |

51 | Yesterday, Mr Bush flew to South Dakota to tell farmers of an expansion of the subsidised grains exports programme and then to the General Dynamics factory in Texas to announce an F-16 fighter sale to Taiwan. 52 |

53 |

54 | Both are, of course, intensely political actions. 55 |

56 |

57 | Both involve federal subsidies, as does relief for Andrew, that run counter to Mr Bush's commitment to reduce the budget deficit. 58 |

59 |

60 | But both may be presented by a president as being in the national interest because they guarantee employment, which is what the election is largely about. 61 |

62 |

63 | US insurers expect to pay out around Dollars 500m in claims as a result of damage caused by Hurricane Andrew in Louisiana, the American Insurance Services Group said yesterday, Nikki Tait reports. 64 |

65 |

66 | This takes the insurance bill from the storm to around Dollars 8bn, making it the costliest disaster to hit the US. 67 |

68 |
69 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/utils/misc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import math 4 | import scipy as sp 5 | import scipy.stats # "import scipy as sp" alone does not expose sp.stats 6 | def normaliseList(ll,max_value=10.): 7 | minv = min(ll) 8 | maxv = max(ll) 9 | gap = maxv-minv 10 | 11 | new_ll = [(x-minv)*max_value/gap for x in ll] 12 | 13 | return new_ll 14 | 15 | def sigmoid(x,temp=1.): 16 | return 1.0/(1.+math.exp(-x/temp)) 17 | 18 | 19 | def softmax_sample(value_list,strict,softmax_list=[],return_softmax_list=False): 20 | if len(softmax_list) == 0: 21 | slist = getSoftmaxList(value_list,strict) 22 | else: 23 | slist = softmax_list 24 | 25 | pointer = random.random()*sum(slist) 26 | tier = 0 27 | idx = 0 28 | 29 | rtn_idx = -1 30 | for value in slist: 31 | if pointer >= tier and pointer < tier+value: 32 | rtn_idx = idx 33 | break 34 | else: 35 | tier += value 36 | idx += 1 37 | 38 | if return_softmax_list: 39 | return rtn_idx,slist 40 | else: 41 | return rtn_idx 42 | 43 | 44 | def getSoftmaxList(value_list, strict): 45 | softmax_list = [] 46 | for value in value_list: 47 | softmax_list.append(np.exp(value/strict)) 48 | return softmax_list 49 | 50 | def getSoftmaxProb(value_list, strict): 51 | slist = getSoftmaxList(value_list,strict) 52 | return [xx/np.sum(slist) for xx in slist] 53 | 54 | 55 | def cosine(vec1,vec2): 56 | return np.dot(vec1,vec2)/(np.linalg.norm(vec1)*np.linalg.norm(vec2)) 57 | 58 | 59 | def jsd(p, q, wanted='js', base=np.e): 60 | ''' 61 | Implementation of pairwise `jsd` based on 62 | https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence 63 | ''' 64 | ## convert to np.array 65 | p, q = np.asarray(p), np.asarray(q) 66 | ## normalize p, q to probabilities 67 | if p.sum() == 0 or q.sum() == 0: 68 | return -1. 69 | 70 | p, q = p/p.sum(), q/q.sum() 71 | m = 1./2*(p + q) 72 | 73 | if wanted == 'kl1': return sp.stats.entropy(p,q,base=base) 74 | elif wanted == 'kl2': return sp.stats.entropy(q,p,base=base) 75 | else: 76 | assert wanted == 'js' 77 | return sp.stats.entropy(p,m,base=base)/2.+ sp.stats.entropy(q,m,base=base)/2.
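# e.g. jsd([1, 1], [9, 1]) normalises both inputs to probability distributions
# and returns their Jensen-Shannon divergence; wanted='kl1' / 'kl2' returns the
# two KL divergences instead, and -1. flags an all-zero input.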
78 | 79 | 80 | def getNormMeanFromLogMean(log_mean,log_dev): 81 | ### from the mean/dev of log-normal to obtain mean/dev for normal distribution 82 | ### see wikipedia of log normal distribution 83 | norm_mean = np.log(log_mean/math.sqrt(1+math.pow(log_dev,2)/math.pow(log_mean,2))) 84 | norm_dev = np.log( 1+math.pow(log_dev,2)/math.pow(log_mean,2) ) 85 | 86 | return norm_mean, math.sqrt(norm_dev) 87 | 88 | def bellCurvise(js_list,mean=5.,dev=2.,norm=False,log=True): 89 | ### note that the smaller the js, the better the summary 90 | sorted_js = sorted(js_list) 91 | if log: 92 | norm_mean, norm_dev = getNormMeanFromLogMean(mean,dev) 93 | norm_values = list(np.random.lognormal(norm_mean,norm_dev,len(js_list))) 94 | else: 95 | norm_values = list(np.random.normal(mean,dev,len(js_list))) 96 | norm_values = sorted(norm_values,reverse=True) 97 | rewards = [] 98 | 99 | for js in js_list: 100 | rewards.append(norm_values[sorted_js.index(js)]) 101 | 102 | if norm: 103 | return normaliseList(rewards) 104 | else: 105 | return rewards 106 | 107 | 108 | def aggregateScores(scores_dic): 109 | score_matrix = [] 110 | 111 | for name in scores_dic: 112 | score_matrix.append(scores_dic[name]) 113 | 114 | return np.array(score_matrix).mean(0) 115 | 116 | def getRankBasedScores(scores,normalise=True): 117 | rewards = scores[:] 118 | sr = sorted(rewards) 119 | for i in range(len(sr)-1): 120 | if sr[i] == sr[i+1]: 121 | dec_value = random.random()*1e-5 122 | sr[i] -= dec_value 123 | rewards[rewards.index(sr[i+1])] -= dec_value 124 | sr = sorted(sr) 125 | 126 | rank_rewards = [] 127 | for rr in rewards: 128 | rank_rewards.append(sr.index(rr)) 129 | 130 | if normalise: 131 | return np.array(rank_rewards)*10./len(rank_rewards) 132 | else: 133 | return np.array(rank_rewards) 134 | 135 | 136 | 137 | 138 | -------------------------------------------------------------------------------- /crawling_data/nips_getdata_2020.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import urllib.request 3 | import json 4 | import os 5 | 6 | base_url = "https://proceedings.neurips.cc" 7 | year_url= base_url + "/paper/2020" 8 | output_file_name = "data/nips_2020.json" 9 | 10 | req=urllib.request.Request(year_url) 11 | resp=urllib.request.urlopen(req) 12 | data=resp.read().decode('utf-8') 13 | 14 | papers = [] 15 | soup = BeautifulSoup(data, 'html.parser') 16 | paper_list = soup.find("div", attrs={'class':'col'}).ul.find_all('li') 17 | print(len(paper_list)) 18 | 19 | exist_papers = [] 20 | if os.path.isfile(output_file_name): 21 | with open(output_file_name, 'r') as f: 22 | exist_papers = json.load(f) 23 | print("exist papers", len(exist_papers)) 24 | 25 | for item in paper_list[len(exist_papers):]: 26 | paper = {} 27 | paper["link"] = item.a["href"] 28 | paper["title"] = item.a.get_text() 29 | paper["authors"] = item.i.get_text() 30 | 31 | req = urllib.request.Request(base_url + paper["link"]) 32 | resp = urllib.request.urlopen(req) 33 | data = resp.read().decode('utf-8') 34 | soup = BeautifulSoup(data, 'html.parser') 35 | content = soup.find("div", attrs={'class': 'col'}) 36 | 37 | hrefs = content.div.find_all("a") 38 | for h in hrefs: 39 | hcontent = h.get_text() 40 | 41 | # meta-review 42 | if hcontent == "MetaReview": 43 | req = urllib.request.Request(base_url + h["href"]) 44 | # print(base_url + content.div.find_all("a")[3].get_text()) 45 | resp = urllib.request.urlopen(req) 46 | data = resp.read() 47 | soup = BeautifulSoup(data, 'html.parser') 48 | 
paper["meta_review"] = soup.p.get_text() 49 | 50 | # reviews 51 | if hcontent == "Review": 52 | req = urllib.request.Request(base_url + h["href"]) 53 | resp = urllib.request.urlopen(req) 54 | data = resp.read() 55 | soup = BeautifulSoup(data, 'html.parser') 56 | reviews = [] 57 | for tmp in soup.find_all("h3")[1:]: 58 | review = {} 59 | sc = tmp.find_next("p") 60 | review["summary_and_contributions"] = sc.get_text() 61 | st = sc.find_next("p") 62 | review["strengths"] = st.get_text() 63 | we = st.find_next("p") 64 | review["weaknesses"] = we.get_text() 65 | co = we.find_next("p") 66 | review["correctness"] = co.get_text() 67 | cl = co.find_next("p") 68 | review["clarify"] = cl.get_text() 69 | rt = cl.find_next("p") 70 | review["relation_to_prior_work"] = rt.get_text() 71 | re = rt.find_next("p") 72 | review["reproducibility"] = re.get_text() 73 | af = re.find_next("p") 74 | review["additional_feedback"] = af.get_text() 75 | reviews.append(review) 76 | paper["reviews"] = reviews 77 | print(len(reviews)) 78 | 79 | if hcontent == "Paper": 80 | paper["pdf"] = h["href"] 81 | 82 | # author response 83 | if hcontent == "AuthorFeedback": 84 | author_responses = {} 85 | author_responses["pdf"] = h["href"] 86 | paper["author_responses"] = author_responses 87 | 88 | # abstract 89 | paper["abstract"] = content.find_all('p')[-2].get_text() 90 | paper["comment"] = "accept" 91 | papers.append(paper) 92 | 93 | if len(papers)%50==0: 94 | exist_papers = [] 95 | if os.path.isfile(output_file_name): 96 | with open(output_file_name, 'r') as f: 97 | exist_papers = json.load(f) 98 | 99 | exist_papers.extend(papers) 100 | f = open(output_file_name, 'w') 101 | f.write(json.dumps(exist_papers)) 102 | f.close() 103 | print("paper count", len(exist_papers)) 104 | 105 | papers = [] 106 | 107 | with open(output_file_name, 'r') as f: 108 | exist_papers = json.load(f) 109 | 110 | exist_papers.extend(papers) 111 | f = open(output_file_name, 'w') 112 | f.write(json.dumps(exist_papers)) 113 | f.close() 114 | print("paper count", len(exist_papers)) 115 | 116 | print("Done") -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_1/input_docs/NYT_ENG_20050126.0303: -------------------------------------------------------------------------------- 1 | 2 | 3 | POLICE TELL OF A TROUBLED, SUICIDAL MAN 4 | 5 | 6 | COMPTON, Calif. 7 | 8 | 9 |

10 | Until Wednesday, Juan Manuel Alvarez was living the 11 | average life of an obscure and troubled man. 12 |

13 |

14 | He has an arrest record for cocaine, a turbulent relationship with 15 | his estranged wife and a restraining order to keep away 16 | from her and their two children. 17 |

18 |

19 | But then in the early morning darkness of Wednesday, Alvarez introduced 20 | his troubled life to the world when he drove his 21 | Jeep Cherokee onto the commuter railroad tracks of Glendale in 22 | metropolitan Los Angeles, killing at least 10 people and injuring 23 | about 200 more. 24 |

25 |

26 | Ronald Watson came to the Glendale Police Station on Wednesday morning 27 | to find some information on his wife. 28 |

29 |

30 | He was told she was taken to the hospital and released. 31 |

32 |

33 | "I heard what happened this morning, and I thought it can't 34 | be her," said Watson, visibly relieved. "It was. Thank God 35 | nothing serious happened to her. She's home now. She's going 36 | to be mad I didn't pick her up." 37 |

38 |

39 | Alvarez had apparently meant to kill himself, investigators said, but changed 40 | his mind when he saw the train barreling down on 41 | him. When he could not get his vehicle off the 42 | tracks, he ran. He was in police custody Wednesday evening, 43 | said to be distraught but cooperating. 44 |

45 |

46 | "He's carrying a lot of baggage and talked about killing himself," 47 | said Sheriff Lee Baca of Los Angeles County. "When the 48 | train came, he got scared, he couldn't go through with 49 | it. He left his car there and instead of killing 50 | himself, he killed many innocent people." 51 |

52 |

53 | Alvarez's motive for wanting to kill himself, according to investigators, appears 54 | to be that his estranged wife, Carmelita, would not let 55 | him see their two children. The couple had separated three 56 | months ago, and she obtained a restraining order against Alvarez 57 | about two months ago, the police said. 58 |

59 |

60 | Alvarez's sister-in-law, Maricela Amaya, told Telemundo TV that he had recently 61 | tried to see his boy and threatened to kill himself. 62 |

63 |

64 | "He was having problems with drugs and all that and was 65 | violent, and because of that he separated from her," Amaya 66 | said in Spanish. "A few other times he went around 67 | as if he wanted to kill himself." 68 |

69 |

70 | After the bodies had been cleared from the wreckage, law-enforcement officials 71 | began to wonder if Alvarez might have killed his family. 72 | But, Mrs. Alvarez was tracked to a modest home in 73 | the north section of Compton. A man standing outside the 74 | family's home, who described himself as a relative, said Mrs. 75 | Alvarez and the children were uninjured. 76 |

77 |

78 | According to investigators, Alvarez spent the evening before slashing his wrists 79 | and even stabbed himself in the chest, but the wounds 80 | were superficial. 81 |

82 |

83 | And so sometime Wednesday morning, Alvarez, 25, drove himself to the 84 | Glendale tracks, near the City of Los Angeles boundary and 85 | parked his car. He decided not to go through with 86 | it, but the vehicle got stuck between the tracks, the 87 | police said. Alvarez ran to safety and watched the carnage 88 | he had caused, witnesses told the police. 89 |

90 |

91 | "He came to Glendale to commit suicide," said Bob Yousefian, mayor 92 | of Glendale, a normally uneventful city of 210,000 just north 93 | of Los Angeles City Hall. "He kind of ran, tried 94 | to hide, but because of his previous injuries, he got 95 | apprehended." 96 |

97 |

98 | Chief Randy G. Adams of the Glendale police said he was 99 | "very confident" that he had the right man, going by 100 | Alvarez's own admissions and witnesses on the train who saw 101 | him run. 102 |

103 |

104 | "This whole incident was started by a deranged individual that was 105 | suicidal," Adams said. 106 |

107 |

108 | Alvarez is likely to face at least 10 counts of murder, 109 | the chief said. 110 |

111 |
112 |
113 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/data/topic_1/input_docs/AFP_ENG_20050126.0676: -------------------------------------------------------------------------------- 1 | 2 | 3 | Murder probe starts as 10 killed, 100 injured in LA train crash 4 | by Marc Lavine 5 | ATTENTION - RECASTS /// 6 | 7 | 8 | GLENDALE, California, Jan 26 9 | 10 | 11 |

12 | Ten people were killed and 13 | more than 100 injured Wednesday when two commuter trains slammed 14 | into each other after one of them hit a car left on the lines by a 15 | suicidal man, police said. 16 |

17 |

18 | A murder investigation was launched as the death toll rose 19 | following the horrific early morning crash involving three trains 20 | and a sports utility vehicle in the heavily-populated Los Angeles 21 | suburb of Glendale. 22 |

23 |

24 | Rescuers were picking through the mangled wreckage of the trains 25 | that collided with an explosive crash just after 6:00 am (1400 GMT) 26 | after one of them derailed as it ploughed into the abandoned 27 | vehicle. 28 |

29 |

30 | "We have 10 fatalities so far and well over 100 injured," 31 | Glendale Police Chief Randy Adams told reporters, adding that the 32 | probe into the accident had turned into a homicide investigation. 33 |

34 |

35 | "This whole incident was started by a deranged individual who 36 | was suicidal," he said announcing that the depressed driver, 37 | identified as Juan Manuel Alvarez, 26, was unhurt and was in 38 | custody. 39 |

40 |

41 | The driver allegedly manoeuvred his Jeep onto the tracks, where 42 | it became stuck, and waited for a train to end his life, according 43 | to Adams and other police officials. 44 |

45 |

46 | "He was intent at that time on taking his own life but changed 47 | his mind," Adams told reporters. "He exited the vehicle and stood 48 | by as a southbound Metrolink train struck his vehicle, causing the 49 | train to derail and strike the northbound train," the police chief 50 | said. 51 |

52 |

53 | After hitting the car, the packed rush-hour train, which was 54 | bound for Los Angeles' Union Station, jumped its tracks and 55 | jack-knifed into the path of an oncoming commuter train heading in 56 | the opposite direction. 57 |

58 |

59 | One of the stricken trains then side-swiped a stationary freight 60 | train that was parked on a side-track in the area, knocking it over, 61 | officials said. 62 |

63 |

64 | Carnage ensued. "It sounded like thunder," said Karen Mendoza, 65 | an employee at a nearby wholesale store who called emergency 66 | services when she heard the accident. 67 |

68 |

69 | Some passengers had to be cut from the wreckage, which was 70 | strewn over a wide area, while others were forced to jump to safety 71 | from the burning upper decks of railway carriages that caught fire 72 | in the impact. 73 |

74 |

75 | At least 89 people were rushed to hospital as a hundreds of 76 | rescuers and convoys of ambulances flooded the crash scene. 77 |

78 |

79 | Some 200 people, mostly walking wounded, were examined by 80 | medical staff at the scene as they prioritised treatment for the 81 | wounded, firefighters said. 82 |

83 |

84 | Passengers were tossed around the inside of their trains and 85 | some had to jump from the upper decks of the wreckage, eyewitnesses 86 | told reporters. 87 |

88 |

89 | Most of the injuries suffered by victims were "crush-type" 90 | wounds similar to those that occur in car crashes, said Carol Meyer 91 | of the Los Angeles County emergency medical services. 92 |

93 |

94 | Many of the patients had head, chest and abdominal injuries, she 95 | added. 96 |

97 |

98 | The Los Angeles Police Department declared a citywide tactical 99 | alert that allows commanders to keep officers on the job beyond the 100 | end of their shifts, in order to help with the rescue and crowd 101 | control operations. 102 |

103 |

104 | Meanwhile the driver of the vehicle that sparked the incident 105 | was expected to be charged with murder, police said. 106 |

107 |

108 | "You don't put your car on the track and put yourself in harm's 109 | way and all these passengers in harm's way," said Los Angeles 110 | Sheriff Lee Baca, one of whose deputies was among the dead. "This is 111 | a complete outrage." 112 |

113 |
114 |
115 | -------------------------------------------------------------------------------- /dataset_analysis/relevance/acl20-ref-free-eval-master/sentence_transformers/models/RoBERTa.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from pytorch_transformers import RobertaModel, RobertaTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict 6 | import os 7 | import numpy as np 8 | import logging 9 | 10 | class RoBERTa(nn.Module): 11 | """RoBERTa model to generate token embeddings. 12 | 13 | Each token is mapped to an output vector from RoBERTa. 14 | """ 15 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = True): 16 | super(RoBERTa, self).__init__() 17 | self.config_keys = ['max_seq_length', 'do_lower_case'] 18 | self.do_lower_case = do_lower_case 19 | 20 | if max_seq_length > 510: 21 | logging.warning("RoBERTa only allows a max_seq_length of 510 (512 with special tokens). Value will be set to 510") 22 | max_seq_length = 510 23 | self.max_seq_length = max_seq_length 24 | 25 | 26 | self.roberta = RobertaModel.from_pretrained(model_name_or_path) 27 | self.tokenizer = RobertaTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) 28 | self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0] 29 | self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0] 30 | 31 | def forward(self, features): 32 | """Returns token_embeddings, cls_token""" 33 | #RoBERTa does not use token_type_ids 34 | output_tokens = self.roberta(input_ids=features['input_ids'], token_type_ids=None, attention_mask=features['input_mask'])[0] 35 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 36 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'input_mask': features['input_mask']}) 37 | return features 38 | 39 | def get_word_embedding_dimension(self) -> int: 40 | return self.roberta.config.hidden_size 41 | 42 | def tokenize(self, text: str) -> List[str]: 43 | """ 44 | Tokenizes a text and maps tokens to token-ids 45 | """ 46 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 47 | 48 | def get_sentence_features(self, tokens: List[str], pad_seq_length: int): 49 | """ 50 | Convert tokenized sentence in its embedding ids, segment ids and mask 51 | 52 | :param tokens: 53 | a tokenized sentence 54 | :param pad_seq_length: 55 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 56 | :return: embedding ids, segment ids and mask for the sentence 57 | """ 58 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 59 | 60 | tokens = tokens[:pad_seq_length] 61 | input_ids = [self.cls_token_id] + tokens + [self.sep_token_id] + [self.sep_token_id] 62 | sentence_length = len(input_ids) 63 | 64 | pad_seq_length += 3 ##Add Space for CLS + SEP + SEP token 65 | 66 | input_mask = [1] * len(input_ids) 67 | 68 | # Zero-pad up to the sequence length. 
BERT: Pad to the right 69 | padding = [0] * (pad_seq_length - len(input_ids)) 70 | input_ids += padding 71 | 72 | input_mask += padding 73 | 74 | assert len(input_ids) == pad_seq_length 75 | assert len(input_mask) == pad_seq_length 76 | 77 | 78 | return {'input_ids': np.asarray(input_ids, dtype=np.int64), 'input_mask': np.asarray(input_mask, dtype=np.int64), 'sentence_lengths': np.asarray(sentence_length, dtype=np.int64)} 79 | 80 | def get_config_dict(self): 81 | return {key: self.__dict__[key] for key in self.config_keys} 82 | 83 | def save(self, output_path: str): 84 | self.roberta.save_pretrained(output_path) 85 | self.tokenizer.save_pretrained(output_path) 86 | 87 | with open(os.path.join(output_path, 'sentence_roberta_config.json'), 'w') as fOut: 88 | json.dump(self.get_config_dict(), fOut, indent=2) 89 | 90 | @staticmethod 91 | def load(input_path: str): 92 | with open(os.path.join(input_path, 'sentence_roberta_config.json')) as fIn: 93 | config = json.load(fIn) 94 | return RoBERTa(model_name_or_path=input_path, **config) 95 | 96 | 97 | 98 | 99 | 100 | 101 | --------------------------------------------------------------------------------
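A minimal usage sketch for utils/unieval/scorer.py (not part of the repository). The checkpoint name "MingZhong/unieval-sum" and the question-style input format are assumptions based on the public UniEval release:

from utils.unieval.scorer import UniEvaluator

# hypothetical example: score how coherent one meta-review is w.r.t. its reviews
evaluator = UniEvaluator("MingZhong/unieval-sum", max_length=1024, device="cuda:0")
src = ("question: Is this a coherent summary to the document? </s> "
       "summary: The reviewers agree the method is sound but ask for stronger baselines. </s> "
       "document: <concatenated review texts>")
scores = evaluator.score([src], batch_size=8)  # each score = P(Yes) / (P(Yes) + P(No)), in (0, 1)
print(scores)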