├── .gitignore
├── .python-version
├── LICENSE
├── README.md
├── data
    ├── examples
    │   ├── advil.txt
    │   ├── peace.txt
    │   └── transformers.txt
    ├── news
    │   ├── 100-tokens
    │   │   ├── 0.txt
    │   │   ├── 1.txt
    │   │   ├── 2.txt
    │   │   ├── 3.txt
    │   │   ├── 4.txt
    │   │   ├── 5.txt
    │   │   ├── 6.txt
    │   │   ├── 7.txt
    │   │   ├── 8.txt
    │   │   └── 9.txt
    │   ├── 1000-tokens
    │   │   ├── 0.txt
    │   │   ├── 1.txt
    │   │   ├── 2.txt
    │   │   ├── 3.txt
    │   │   ├── 4.txt
    │   │   ├── 5.txt
    │   │   ├── 6.txt
    │   │   ├── 7.txt
    │   │   ├── 8.txt
    │   │   └── 9.txt
    │   ├── 300-tokens
    │   │   ├── 0.txt
    │   │   ├── 1.txt
    │   │   ├── 2.txt
    │   │   ├── 3.txt
    │   │   ├── 4.txt
    │   │   ├── 5.txt
    │   │   ├── 6.txt
    │   │   ├── 7.txt
    │   │   ├── 8.txt
    │   │   └── 9.txt
    │   └── 3000-tokens
    │   │   ├── 0.txt
    │   │   ├── 1.txt
    │   │   ├── 2.txt
    │   │   ├── 3.txt
    │   │   ├── 4.txt
    │   │   ├── 5.txt
    │   │   ├── 6.txt
    │   │   ├── 7.txt
    │   │   ├── 8.txt
    │   │   └── 9.txt
    ├── pubmed
    │   ├── 100-tokens
    │   │   ├── 0.txt
    │   │   ├── 1.txt
    │   │   ├── 2.txt
    │   │   ├── 3.txt
    │   │   ├── 4.txt
    │   │   ├── 5.txt
    │   │   ├── 6.txt
    │   │   ├── 7.txt
    │   │   ├── 8.txt
    │   │   └── 9.txt
    │   └── 1000-tokens
    │   │   ├── 0.txt
    │   │   ├── 1.txt
    │   │   ├── 2.txt
    │   │   ├── 3.txt
    │   │   ├── 4.txt
    │   │   ├── 5.txt
    │   │   ├── 6.txt
    │   │   ├── 7.txt
    │   │   ├── 8.txt
    │   │   └── 9.txt
    ├── quotes
    │   ├── albert-einstein.txt
    │   ├── bernard-m-baruch.txt
    │   ├── dr-seuss.txt
    │   ├── frank-zappa.txt
    │   ├── mae-west.txt
    │   ├── mahatma-gandhi.txt
    │   ├── marcus-tullius-cicero.txt
    │   ├── marilyn-monroe.txt
    │   ├── oscar-wilde.txt
    │   └── william-w-purkey.txt
    ├── random-bytes
    │   ├── 100-tokens
    │   │   ├── 0.txt
    │   │   ├── 1.txt
    │   │   ├── 2.txt
    │   │   ├── 3.txt
    │   │   ├── 4.txt
    │   │   ├── 5.txt
    │   │   ├── 6.txt
    │   │   ├── 7.txt
    │   │   ├── 8.txt
    │   │   └── 9.txt
    │   └── 1000-tokens
    │   │   ├── 0.txt
    │   │   ├── 1.txt
    │   │   ├── 2.txt
    │   │   ├── 3.txt
    │   │   ├── 4.txt
    │   │   ├── 5.txt
    │   │   ├── 6.txt
    │   │   ├── 7.txt
    │   │   ├── 8.txt
    │   │   └── 9.txt
    ├── random-words
    │   ├── 100-tokens
    │   │   ├── 0.txt
    │   │   ├── 1.txt
    │   │   ├── 2.txt
    │   │   ├── 3.txt
    │   │   ├── 4.txt
    │   │   ├── 5.txt
    │   │   ├── 6.txt
    │   │   ├── 7.txt
    │   │   ├── 8.txt
    │   │   └── 9.txt
    │   └── 1000-tokens
    │   │   ├── 0.txt
    │   │   ├── 1.txt
    │   │   ├── 2.txt
    │   │   ├── 3.txt
    │   │   ├── 4.txt
    │   │   ├── 5.txt
    │   │   ├── 6.txt
    │   │   ├── 7.txt
    │   │   ├── 8.txt
    │   │   └── 9.txt
    ├── reddit
    │   ├── 0.txt
    │   ├── 1.txt
    │   ├── 2.txt
    │   ├── 3.txt
    │   ├── 4.txt
    │   ├── 5.txt
    │   ├── 6.txt
    │   ├── 7.txt
    │   ├── 8.txt
    │   └── 9.txt
    ├── scientific
    │   ├── 0.txt
    │   ├── 1.txt
    │   ├── 2.txt
    │   ├── 3.txt
    │   ├── 4.txt
    │   ├── 5.txt
    │   ├── 6.txt
    │   ├── 7.txt
    │   ├── 8.txt
    │   └── 9.txt
    ├── supercomputer-traditional.stats
    ├── twitter
    │   ├── 0.txt
    │   ├── 1.txt
    │   ├── 2.txt
    │   ├── 3.txt
    │   ├── 4.txt
    │   ├── 5.txt
    │   ├── 6.txt
    │   ├── 7.txt
    │   ├── 8.txt
    │   └── 9.txt
    └── wikipedia
    │   └── 100-tokens
    │       ├── 0.txt
    │       ├── 1.txt
    │       ├── 2.txt
    │       ├── 3.txt
    │       ├── 4.txt
    │       ├── 5.txt
    │       ├── 6.txt
    │       ├── 7.txt
    │       ├── 8.txt
    │       └── 9.txt
├── decrypt.py
├── docs
    └── REPRODUCE.md
├── encrypt.py
├── experiments
    └── templates
    │   └── paper
    │       ├── different-models-v1.toml
    │       ├── distribution-regularization-v1.toml
    │       ├── distribution-regularization-v2.toml
    │       ├── distribution-regularization-v3.toml
    │       ├── distribution-regularization-v4.toml
    │       ├── effects-of-length-v1.toml
    │       ├── effects-of-size-v1.toml
    │       ├── l2-norm-v1.toml
    │       ├── l2-norm-v2.toml
    │       ├── l2-norm-v3.toml
    │       ├── original-algorithm-v1.toml
    │       ├── original-algorithm-v2.toml
    │       ├── original-algorithm-v3.toml
    │       ├── perplexity-bounded-v0.toml
    │       ├── perplexity-bounded-v1.toml
    │       ├── perplexity-bounded-v2.toml
    │       ├── rebuttal-v1.toml
    │       ├── what-can-we-encrypt-v1.toml
    │       ├── what-can-we-encrypt-v2.toml
    │       ├── what-can-we-encrypt-v3.toml
    │       └── what-can-we-encrypt-v4.toml
├── intrinsic
    ├── .gitignore
    ├── README.md
    ├── docs
    │   └── examples
    │   │   ├── example.py
    │   │   └── train_nn.py
    ├── intrinsic
    │   ├── __init__.py
    │   ├── fwh.py
    │   ├── fwh_cuda
    │   │   ├── fwh_cpp.cpp
    │   │   └── fwh_cu.cu
    │   ├── implementation.py
    │   ├── py.typed
    │   ├── test
    │   │   ├── __init__.py
    │   │   └── test_implementation.py
    │   └── utils.py
    └── setup.py
├── pyproject.toml
├── requirements.txt
├── setup.cfg
└── src
    ├── __init__.py
    ├── accelerate.py
    ├── attacking
        ├── __init__.py
        ├── adaboost.py
        ├── avalanche.py
        ├── data.py
        ├── ffnn.py
        ├── gradboost.py
        ├── helpers.py
        ├── knn.py
        ├── lda.py
        ├── random_forest.py
        ├── semantic_security.py
        └── svm.py
    ├── blog
        └── histograms.py
    ├── config.py
    ├── data
        ├── __init__.py
        ├── __main__.py
        ├── news.py
        ├── openwebtext.py
        ├── pubmed.py
        ├── random_sequences.py
        ├── reddit.py
        ├── shared.py
        ├── twitter.py
        └── wikipedia.py
    ├── dense.py
    ├── evaluating.py
    ├── experiments
        ├── __init__.py
        ├── check.py
        ├── generate.py
        ├── lib.py
        └── run.py
    ├── halton.py
    ├── intrinsic_utils.py
    ├── logging.py
    ├── make_tokenizers.py
    ├── modeling.py
    ├── modeling_utils.py
    ├── paper
        ├── __init__.py
        ├── ciphertext_dist_histograms.py
        ├── comparison.py
        ├── embeddings.py
        ├── feature_importance.py
        ├── helpers.py
        ├── performance.py
        ├── prefix_table.py
        ├── security.py
        ├── security_histograms.py
        ├── tables.py
        └── what_can_we_encrypt.py
    ├── profiling.py
    ├── relic_helpers.py
    ├── templating.py
    ├── test
        ├── __init__.py
        ├── attacking
        │   ├── __init__.py
        │   └── test_pipeline.py
        ├── test_modeling.py
        ├── test_templating.py
        ├── test_tokenizing.py
        └── test_training.py
    ├── tokenizers
        ├── 1-byte.json
        └── 2-byte.json
    ├── tokenizing.py
    ├── tools
        ├── __init__.py
        └── verify_encryption.py
    ├── training.py
    ├── training_utils.py
    ├── types.py
    └── util.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # editor/python
 2 | *.swp
 3 | *.swo
 4 | __pycache__/
 5 | 
 6 | # data
 7 | data/unversioned/
 8 | venv/
 9 | .venv/
10 | experiments/generated/
11 | relics/
12 | data/cached
13 | 
14 | .DS_Store
15 | 
16 | # writing & latex
17 | *.graffle
18 | *.log
19 | *.bbl
20 | *.bcf
21 | *.out
22 | *.run.xml
23 | *.blg
24 | *.fls
25 | *.aux
26 | *.fdb_latexmk
27 | *.pdf
28 | notebooks
29 | 


--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.9.7
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # SELM
  2 | 
  3 | Code and data for [SELM](https://samuelstevens.me/research/encryption) research project.
  4 | 
  5 | ![teaser-gif-cropped](https://github.com/OSU-NLP-Group/SELM/assets/26638161/b7484c1f-84da-45a9-ba69-0c921c5d87cf)
  6 | 
  7 | ## Table of Contents
  8 | 
  9 | 1. Introduction
 10 | 2. Installation
 11 | 3. Encrypt Something
 12 | 4. Decrypt Something
 13 | 5. Experiments
 14 | 6. Cryptanalysis
 15 | 
 16 | ## Installation
 17 | 
 18 | Install torch (CUDA):
 19 | 
 20 | ```
 21 | pip install torch==1.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
 22 | ```
 23 | 
 24 | Do this first before installing `requirements.txt` because that will install a CPU-only torch.
 25 | 
 26 | Install packages:
 27 | 
 28 | ```sh
 29 | pip install -r requirements.txt
 30 | ```
 31 | 
 32 | Install the intrinsic package, used for efficient intrinsic dimension operations:
 33 | 
 34 | ```sh
 35 | cd intrinsic
 36 | python setup.py develop
 37 | cd ..
 38 | ```
 39 | 
 40 | Initialize `relics/` (the experiment directory):
 41 | 
 42 | ```sh
 43 | relic init
 44 | ```
 45 | 
 46 | ## Encrypt Something
 47 | 
 48 | Get a key:
 49 | 
 50 | ```sh
 51 | python -c 'import secrets; print(secrets.randbits(32))'
 52 | ```
 53 | 
 54 | Encrypt with your key:
 55 | 
 56 | ```sh
 57 | python encrypt.py --key KEY --int-dim 10000 data/examples/advil.txt
 58 | ```
 59 | 
 60 | ## Decrypt Something
 61 | 
 62 | Use the key to decrypt:
 63 | 
 64 | ```
 65 | python decrypt.py --key KEY advil.bin
 66 | ```
 67 | 
 68 | ## Experiments
 69 | 
 70 | To run a new experiment, define a new `.toml` file in `experiments/` with whatever configuration options you want. `src/config.py` shows all the different options that can be changed.
 71 | 
 72 | `.toml` files can contain lists for parameters; when they do, an experiment for each value in the list is generated. For example, `experiments/gpt2/wikipedia/0-4-concat.toml` has two lists: one for `learning_rate` and `intrinsic_dimension`. This means there are actually 20 experiments in here: 2 learning rates * 10 intrinsic dimensions.
 73 | 
 74 | To run the experiments:
 75 | 
 76 | ```sh
 77 | python -m src.experiments.run experiments/templates/paper/what-can-we-encrypt-v4.toml
 78 | ```
 79 | 
 80 | If you are running out of GPU memory, you can use model parallelism to split the Fastfood transform and the GPT2 model onto separate GPUs:
 81 | 
 82 | ```
 83 | CUDA_VISIBLE_DEVICES=0,2 MODEL_PARALLELISM=1 python -m src.experiments.run experiments/gpt2/examples/medium.toml
 84 | ```
 85 | 
 86 | You can pass entire directories or just individual `.toml` files to `src.experiments`. Results will be saved to `relics/`.
 87 | 
 88 | **If you stop an experiment and run it again, any trials that are finished in `relics/` will not be run again.**
 89 | 
 90 | ## Cryptanalysis
 91 | 
 92 | Unzip the provided data:
 93 | 
 94 | ```sh
 95 | unzip relics.zip
 96 | ```
 97 | 
 98 | Play the security game on the original algorithm with an SVM:
 99 | 
100 | ```sh
101 | python -m src.paper.security svm original feature-fn 500 --ratio 0.8 --quiet
102 | ```
103 | 
104 | Play the security game on the distribution-regularized variant with an SVM:
105 | 
106 | ```sh
107 | python -m src.paper.security svm distribution-reg feature-fn 500 --ratio 0.8 --quiet
108 | ```
109 | 
110 | Try to implement stronger attacks!
111 | Look in `src/attacking/` for the model files and add your own.
112 | 


--------------------------------------------------------------------------------
/data/examples/advil.txt:
--------------------------------------------------------------------------------
1 | Causes of Back Pain
2 | 
3 | Lower back pain can occur in people who are overweight, in poor physical shape, have poor posture, or are compelled to sit or stand for long periods of time. Muscle strain is another cause of a troubled back, either from lifting something that is too heavy or by lifting objects incorrectly. Many pregnant women develop lower back pain due to the extra weight they support during pregnancy.
4 | 
5 | Another common cause is osteoarthritis (a ”wear-and-tear” condition), fractured vertebrae (the literal "broken back"), and the “slipped” or herniated discs are all serious medical conditions that must be treated by a qualified physician.
6 | 


--------------------------------------------------------------------------------
/data/examples/peace.txt:
--------------------------------------------------------------------------------
1 | Ethical technology takes us to a world of peace and plenty.
2 | 


--------------------------------------------------------------------------------
/data/examples/transformers.txt:
--------------------------------------------------------------------------------
1 | A transformer is a deep learning model that adopts the mechanism of attention, differentially weighing the significance of each part of the input data. It is used primarily in the field of natural language processing (NLP)[1] and in computer vision (CV).[2]
2 | 
3 | Like recurrent neural networks (RNNs), transformers are designed to handle sequential input data, such as natural language, for tasks such as translation and text summarization. However, unlike RNNs, transformers do not necessarily process the data in order. Rather, the attention mechanism provides context for any position in the input sequence. For example, if the input data is a natural language sentence, the transformer does not need to process the beginning of the sentence before the end. Rather, it identifies the context that confers meaning to each word in the sentence. This feature allows for more parallelization than RNNs and therefore reduces training times.
4 | 
5 | Transformers are the model of choice for NLP problems,[3] replacing RNN models such as long short-term memory (LSTM). The additional training parallelization allows training on larger datasets than was once possible. This led to the development of pretrained systems such as BERT (Bidirectional Encoder Representations from Transformers) and GPT (Generative Pre-trained Transformer), which were trained with large language datasets, such as Wikipedia Corpus and Common Crawl, and can be fine-tuned for specific tasks.
6 | 


--------------------------------------------------------------------------------
/data/news/100-tokens/0.txt:
--------------------------------------------------------------------------------
1 | Goals from Zlatko Junuzovic, Florian Grillitsch and Florian Kainz condemned second-placed Leipzig to their second successive league defeat.
2 | Bayern can extend their 10-point lead when they travel to Borussia Monchengladbach on Sunday.
3 | Borussia Dortmund closed the gap on Leipzig to three points with Friday's 1-0 win at Ingolstadt.
4 | Hoffenheim are a point further back, and boosted their chances


--------------------------------------------------------------------------------
/data/news/100-tokens/1.txt:
--------------------------------------------------------------------------------
1 | Polish national Daria Pionko, 21, was found seriously injured in Springwell Road, Holbeck, on 23 December and died later in hospital.
2 | Det Supt Simon Atkinson said Miss Pionko had been the victim of a "sustained and vicious attack".
3 | A 38-year-old man arrested in connection with the incident was released without charge.
4 | A post mortem examination found Miss Pionko died as a result of head and facial injuries.
5 | She was


--------------------------------------------------------------------------------
/data/news/100-tokens/2.txt:
--------------------------------------------------------------------------------
1 | "After much soul-searching, it is clearly time for us to live by Michael's words about love not war," wrote Jermaine, in a statement.
2 | Jermaine also withdrew his support for a leaked letter which calls on executors of the estate to resign.
3 | On Thursday his mother, Katherine, was reinstated as guardian of Michael's children along with his cousin, TJ.
4 | Days earlier, TJ became a temporary guardian for the three children amid reports Katherine, 82, was


--------------------------------------------------------------------------------
/data/news/100-tokens/3.txt:
--------------------------------------------------------------------------------
1 | At a news conference with Turkish Foreign Minister Ahmet Davutoglu, Mr Kerry said the two Nato allies shared a common goal - to end the suffering of innocent civilians in Syria.
2 | Turkey and the US both oppose Syrian President Bashar al-Assad, but differ on how best to support the opposition.
3 | The visit has been overshadowed by the Turkish PM's remarks about Zionism.
4 | Recep Tayyip Erdogan earlier this week called Zionism a "crime against humanity" - remarks that have been widely condemned,


--------------------------------------------------------------------------------
/data/news/100-tokens/4.txt:
--------------------------------------------------------------------------------
1 | 20 January 2016 Last updated at 12:03 GMT
2 | It had been stuck on the side of the mountain in Snowdonia, Wales, for days and couldn't move from its dangerous position on the ledge.
3 | A rescue team from the RSPCA had to lower themselves down the mountain using ropes to reach the sheep, who was then lowered down with them, to the bottom of the cliff.
4 | They said that the sheep wasn't injured but was very hungry after its cliff-side adventure.


--------------------------------------------------------------------------------
/data/news/100-tokens/5.txt:
--------------------------------------------------------------------------------
1 | The first bomb was found during an alert at Ramoan Drive on Saturday morning.
2 | A device found in the Glencolin Walk area of west Belfast later on Saturday has also been declared viable.  There is another security alert in Ballygally in County Antrim.
3 | Both alerts in west Belfast have now ended.
4 | The alert in Glencolin Walk followed the discovery of a suspicious object.
5 | The Glen Road was closed between the junctions of Shaw's Road and Suffolk Road.
6 | 


--------------------------------------------------------------------------------
/data/news/100-tokens/6.txt:
--------------------------------------------------------------------------------
1 | Mr Duterte clarified that he had "nothing against gays", saying several of his relatives were homosexual.
2 | The controversial politician had previously appeared supportive of LGBTQ rights, saying in 2015 that same-sex marriage was "good".
3 | But he is otherwise known for his conservative views especially on crime.
4 | He has waged a much-criticised war on drug users and dealers leading to thousands of extra-judicial killings.
5 | His latest remarks were made on Sunday night to Filipino expatriates in the Burm


--------------------------------------------------------------------------------
/data/news/100-tokens/7.txt:
--------------------------------------------------------------------------------
1 | Police said the incident happened at the corner of Jamaica Street and Argyle Street at about 09:45.
2 | Emergency services attended and the man was taken by ambulance to Glasgow Royal Infirmary. There is currently no information on his condition.
3 | Jamaica Street has been closed while police carry out investigations into the circumstances of the incident.Media playback is not supported on this device
4 | Wednesday's match in Lyon will be Wales' first in the last four of a major tournament.
5 | Coleman


--------------------------------------------------------------------------------
/data/news/100-tokens/8.txt:
--------------------------------------------------------------------------------
1 | The proposals are the brain child of Swiss businessman turned politician Thomas Minder, who runs a small family company producing natural cosmetics.
2 | Mr Minder wants shareholders to have a veto over managers' salaries, and to ban golden handshakes altogether.
3 | The "fat cat initiative", as it has come to be called, would, if approved, be written into the Swiss constitution, and would apply to all Swiss companies listed on Switzerland's stock exchange.
4 | Mr Minder, an outspoken man, says


--------------------------------------------------------------------------------
/data/news/100-tokens/9.txt:
--------------------------------------------------------------------------------
1 | The woman was hit by the car on Emma Street at about 09:40 on Monday and became trapped between the pavement and the vehicle.
2 | Firefighters used chocks and blocks to stabilise the car and free the casualty.
3 | Police and the ambulance service also attended, and the woman was treated by paramedics for a leg injury.In a letter urging congregations to vote on 7 May, the House of Bishops does not endorse a political party but encourages debate on issues such as nuclear defence and the


--------------------------------------------------------------------------------
/data/news/300-tokens/0.txt:
--------------------------------------------------------------------------------
 1 | The Premier League team, who are on a two-game post-season tour of the United States and Canada, are scheduled to play Houston on Friday.
 2 | Several people have died and dozens have been injured following record rainfall over the weekend.
 3 | On Monday, hundreds of basketball fans were trapped inside an arena after an NBA basketball game.
 4 | Supporters were advised to stay in their seats overnight following Houston Rockets' win against Golden State Warriors in the NBA Western Conference Finals.
 5 | Many spent almost 11 hours at the Toyota Center, until the early hours of Tuesday morning.
 6 | Manchester City are currently in Canada, ahead of their match against Toronto FC on Thursday.Wiggins, 36, had hinted the race in the city of his birth could be his last, but afterwards said he was "not sure yet" what his plans are.
 7 | "I don't know, I've still got really good legs," he said.
 8 | "This might not be my last race. This for sure is my last ever race with Mark Cavendish, though."
 9 | The 2012 Tour de France winner added that he "just wants to enjoy this moment".
10 | Wiggins' admission follows his comments after last month's London Six Day, when he hinted he could be tempted to race there again next year.
11 | Wiggins and Cavendish claimed overall victory in Ghent after winning the final madison event.
12 | The pair also contested the Ghent Six in 2007 and won madison gold together at the World Track Championships in London


--------------------------------------------------------------------------------
/data/news/300-tokens/1.txt:
--------------------------------------------------------------------------------
 1 | Daniel James' late penalty gave Wales a first win in the competition after an opening draw against hosts France.
 2 | Page's side face Ivory Coast in their final Group B game on Monday.
 3 | "It's always a good sign when you win games of football and you haven't been at your best," said former Wales defender Page.
 4 | "We probably played better against France and didn't win.
 5 | "If someone would have said before the tournament that we'd be two games in and with four points going into the third game I'd snap their hand off.
 6 | "It's our first time in this tournament and we've been up against excellent opposition against France and a different challenge against Bahrain."
 7 | Wales are second after their win over Bahrain with next opponents Ivory Coast top of Group B after they beat France.Save Fenton Town Hall say they are protesting because the Ministry of Justice (MoJ) is selling the building.
 8 | They say a World War One memorial is at risk of being destroyed if the building is sold.
 9 | However, the government has said "a legal covenant" means any buyer would have a duty to preserve the memorial.
10 | The group said they wanted to see the building preserved for community use and were prepared to stay there all night.
11 | The magistrates' court, which was built in 1886 as a town hall, closed in December 2012 as part of government plans to shut 93 courts in England and Wales in a bid to save Â£41m.
12 | The


--------------------------------------------------------------------------------
/data/news/300-tokens/2.txt:
--------------------------------------------------------------------------------
 1 | Morocco start their Group C campaign against DR Congo on Monday, then face Togo on 20 January and take on Ivory Coast four days later.
 2 | Rherras, 23, and Cameroon midfielder Arnaud Djoum, 27, will miss Hearts' Scottish Cup meeting with Raith Rovers on 22 January.
 3 | Cameroon are in Group A with Burkina Faso, Guinea-Bissau and hosts Gabon.
 4 | Djoum has five international caps while Rherras made his Morocco debut in August.
 5 | Scottish Premiership clubs are currently on their winter break and Hearts' next league fixture is away to Celtic on 29 January.Local photographer Ron Strathdee captured the phenomenon on Monday at about 23:30 BST.
 6 | The glow is usually best seen from northern latitudes like Norway, Alaska, Iceland and northern Scotland.
 7 | Mr Strathdee said seeing the Northrn Lights from Manx latitudes was "fairly unusual."
 8 | They happen when incoming solar radiation hits the earth's upper atmosphere and excites atoms to a new energy state, emitting energy in the form of light.
 9 | The photographer said: "I needed a place that faced north so went to Peel Hill and tried some shots over the castle which worked but half the fishing boats in the Irish Sea were discharging fish at the breakwater with enough floodlights to cover a football match!
10 | "Going round the front of the castle it was pitch dark and it looks straight north


--------------------------------------------------------------------------------
/data/news/300-tokens/3.txt:
--------------------------------------------------------------------------------
 1 | Media playback is not supported on this device
 2 | Ainslie, 35, admits it would have been "difficult to top" the feeling of winning at his home Olympics and is keen to move on with new challenges.
 3 | "It was a tough decision," Ainslie told BBC Sport.
 4 | "I've had a fantastic Olympic career but I want to make it clear that the focus is now on the Americas Cup."
 5 | Ainslie won a silver medal at Atlanta 1996, with golds in Sydney, Athens, Beijing and London. He has won more medals than any other sailor, ahead of Denmark's Paul Elvstrom, who has four golds.
 6 | He is fourth in Britain's all-time individual medallist's list, behind Sir Chris Hoy (cycling; six gold, one silver), Sir Steve Redgrave (rowing; five gold, one bronze) and Bradley Wiggins (cycling; four gold, one silver, two bronze).
 7 | Media playback is not supported on this device
 8 | Ainslie added: "I considered all of the factors - my fitness and the issues with my back, the venue for the next Olympics and the type of boats, but what it really came down to was this opportunity with the Americas Cup."
 9 | The sailor envisages skippering his Ben Ainslie Racing AC45 catamaran to glory in the historic competition will be one of the "biggest tests" of his career.
10 | "It's always been a dream


--------------------------------------------------------------------------------
/data/news/300-tokens/4.txt:
--------------------------------------------------------------------------------
 1 | Kenneth Gibson and his wife Patricia - the MP for North Ayrshire and Arran - lost their baby towards the end of her pregnancy in 2009.
 2 | Staff at the Southern General hospital in Glasgow had failed to spot that Mrs Gibson had pre-eclampsia.
 3 | A review of baby deaths at Crosshouse Hospital was announced on Tuesday.
 4 | It came after a BBC Scotland investigation revealed that there had been six so-called "unnecessary" deaths of babies at the hospital since 2008.
 5 | "Unnecessary" or "avoidable" deaths are referred to as those where harm was caused to a healthy baby during childbirth - usually resulting in them being deprived of oxygen.
 6 | Reports into some of the deaths referred to failings in monitoring of the child's heartbeat during childbirth.
 7 | Speaking in the Holyrood chamber, Mr Gibson urged Health Secretary Shona Robison to make the review "wider and deeper" so it could look at similar cases in other hospitals across the country.
 8 | His wife has previously opened up in the House of Commons about the "devastating" effect of losing her baby.
 9 | Mr Gibson said: "On her due date in 2009, my wife Patricia, having been sent home and been physically sick, was finally admitted to the Southern General maternity unit despite their protests.
10 | "A consultant junior doctor and two midwives examined her that day. Despite being 41, a first-time mother and in extreme pain from head to toe, no-one picked up


--------------------------------------------------------------------------------
/data/news/300-tokens/5.txt:
--------------------------------------------------------------------------------
 1 | A large tipper truck was observed delivering the tyres under cover of darkness on Wednesday night.
 2 | A council spokesperson said the bonfire was on land owned by the Housing Executive (NIHE) in Ballybeen.
 3 | "The council has been in touch with the NIHE to raise the issue of the tyres at the site and to request the matter is investigated," they said.
 4 | "The Northern Ireland Environment Agency has also been contacted by the council regarding the nature and volume of the bonfire material."
 5 | Asked about the burning of tyres on bonfires on the Radio Ulster's Nolan programme, community worker Jim Wilson there was "a very, very small minority we are talking about where we have problems".
 6 | "You're talking about businesses making money out of it, and that comes down to the PSNI to deal with it."
 7 | Meanwhile, Belfast City Council said it had received reports that tyres had been collected at a bonfire at Avoneil in the east of the city.
 8 | "We have been engaging with the local community at this site to have the tyres removed and will continue in our efforts to manage the negative impacts of the bonfire which includes the burning of tyres," a spokesperson said.
 9 | Ulster Unionist Cllr Jim Rodgers said the inclusion of tyres in bonfires was "a worrying development".
10 | He said that councillors had met the police and asked them to be "more pro-active" regarding bonfires.DUP leader Arlene Foster said the "wide


--------------------------------------------------------------------------------
/data/news/300-tokens/6.txt:
--------------------------------------------------------------------------------
 1 | The patents include one that relates to the front face of the iPhone and one for touch-screen technology.
 2 | It is another win for Apple, after it was awarded $1.05bn (£652m) in damages by a jury in a separate case in August.
 3 | The ITC can block the import of products into the US.
 4 | The judge's ruling will go in front of a full commission, which is scheduled to conclude its investigation in February.
 5 | Judge Thomas Pender agreed that Samsung violated four of Apple's patents, but was not in violation of two others listed by Apple in the complaint.
 6 | Three of the patents are related to software features, while one covers Apple's hardware.
 7 | However, the Samsung products in this case do not include its latest devices, limiting the impact of a potential import ban into the US.
 8 | Samsung has repeatedly argued that any sales ban would limit choice and raise prices for consumers in the US.
 9 | Apple and Samsung have bought legal cases against each other in more than 10 countries, each accusing the other of violating patents, as the two battle for market share in the hugely lucrative mobile industry.Offences for which lower compensation was awarded included ones involving drink, drugs or violence.
10 | The Criminal Injuries Compensation Authority said statutory guidance obliged it to reduce or refuse awards if victims had unspent convictions.
11 | A leading child abuse lawyer called for a review of the "scandalous" approach.
12 | Alan Collins said civil case judges increasingly took the opposite


--------------------------------------------------------------------------------
/data/news/300-tokens/7.txt:
--------------------------------------------------------------------------------
 1 | The six-year-old victor, ridden by Noel Fehily, was cut to about 16-1 from 66-1 for the Cheltenham Gold Cup in March.
 2 | Alary, trained by Colin Tizzard, had been considered a Gold Cup hope but was pulled up before the third last fence.
 3 | Bristol De Mai sealed a Haydock double for Nigel Twiston-Davies after The New One became the first horse to win the Champion Hurdle Trial three times.
 4 | The nine-year-old, ridden by Sam Twiston-Davies for his trainer father, produced a gutsy display to edge past runner-up Clyne.
 5 | Unbeaten Neon Wolf ran out a nine-length victor of the novices' hurdle, while 2014 Champion Hurdle winner Jezki returned from a 632-day absence with a comfortable success at Navan.
 6 | Earlier, Ascot's Grade One meeting and Taunton's card on Saturday were called off because of frozen ground.
 7 | The Ascot fixture was due to feature the Clarence House Chase, which has been rescheduled to take place at Cheltenham on Festival Trials Day on 28 January.
 8 | Cheltenham will now have a nine-race card next week, with racing starting at midday.
 9 | Cornelius Lysaght, BBC horse racing correspondent
10 | Abandoned Ascot was billed as the day's top fixture, but had it been on


--------------------------------------------------------------------------------
/data/news/300-tokens/8.txt:
--------------------------------------------------------------------------------
 1 | Dubbed Nuit Debout (Up All Night), it is a self-styled "popular assembly" in which participants share views about politics and the state of the world.
 2 | As night descends, the speakers stand patiently in line and, turn by turn, take the microphone for their allotted five minutes.
 3 | Before them, sitting in twos and threes on paving stones, the young audience responds with the occasional cheer or boo.
 4 | Not that there is a huge amount to react to. The speeches are rambling and platitudinous.
 5 | One orator says the essence behind society should be "values" - but she does not say which.
 6 | Another urges an end to hierarchy - "no more pride, no more ego - just ideas".
 7 | A third wants to speak of human rights abuses in the Democratic Republic of Congo.
 8 | One theme that recurs is the need to tolerate divergences of opinion. This is significant. Two nights previously, one of France's best-known philosophers - a man who a generation ago would have himself been at the mike - was spat on and told to leave.
 9 | Both speakers and listeners appear to be mainly students - an impression confirmed by a tour of the various "stands".
10 | The feminists are in a large huddle, and I am asked not to take photographs. Elsewhere, a screen shows a laborious film made by a woman who took a job distributing junk mail and wants to expose the exploitation.
11 | There


--------------------------------------------------------------------------------
/data/news/300-tokens/9.txt:
--------------------------------------------------------------------------------
 1 | Officers are investigating whether one of the men fell from a 12th floor balcony of the 19-storey Donside Court, in the Tillydrone area of the city.
 2 | Emergency services were called to the building at about 20:15 on Tuesday evening.
 3 | A witness has described hearing a woman screaming "help me, help me" before police arrived.
 4 | Police Scotland said it believed the incident was contained and there was not a threat to the wider community.
 5 | A blue forensics tent had been erected inside a police cordon at the foot of the building, close to the main entrance.
 6 | An eyewitness said they saw a man fall from the 12th floor of the building, which police confirmed they were pursuing as a line of inquiry.
 7 | Another witness, Toni Dey, 19, who lives a short distance away in Gordon's Mills Road, said she heard screaming from the block.
 8 | The mother of two said: "I heard some girl screaming 'Help me, help me', then about 10 minutes after I heard loads of screaming and shouting.
 9 | "I didn't call the police as I thought it was kids messing around, then I heard loads of police.
10 | "It's very scary to think that something like that had happened. All I kept thinking was 'Why was that poor girl shouting help me?', and about a two-minute walk out my back door.
11 | "I just moved in here in May and it's been so quiet. I was


--------------------------------------------------------------------------------
/data/pubmed/100-tokens/0.txt:
--------------------------------------------------------------------------------
1 | The pharmacokinetics of Casodex have been investigated in patients with prostatic carcinoma following single oral doses of 10 mg, 30 mg and 50 mg and during daily administration at these dose levels. Casodex displays prolonged absorption following a single dose, with peak plasma concentrations observed at up to 8 h for doses of 10 mg and 30 mg and up to 48 h for the 50 mg dose. The area under the plasma concentration-time curve increased linearly with dose, and Casodex was


--------------------------------------------------------------------------------
/data/pubmed/100-tokens/1.txt:
--------------------------------------------------------------------------------
1 | Most psychoanalytic literature dealing with incest holds the premise that the act took place between a parent and a child of opposite sex. Incidentally, most of these cases involve a father-daughter incest (e.g. research by Julien Bigras). However, this is only one of four mathematically possible combinations. For instance, we tend to underestimate the occurrences and, consequently, the repercussions of mother-daughter incest relationships. The biological and psychological importance of the mother in the child's


--------------------------------------------------------------------------------
/data/pubmed/100-tokens/2.txt:
--------------------------------------------------------------------------------
1 | Temporomandibular joint arthoscopy is a new diagnostic and therapeutic modality and its development is in its infancy. The purpose of this article is to describe arthroscopic pathologic findings of the superior joint space.Thirty seven cardiac transplants have taken place at the National Cardiac Centre in Ireland since 1985. Data is presented on three still-surviving male patients aged 19 to 42 who received cardiac transplants in 1985 and 1986. Circulating levels of blood cyclospor


--------------------------------------------------------------------------------
/data/pubmed/100-tokens/3.txt:
--------------------------------------------------------------------------------
1 | The structure, thermodynamic, and kinetic features of polyunsaturated fatty acids derivatives as the main substratum of lipid peroxidation (POL) have been considered. The heats of key POL reactions have been estimated. Kinetic consequences of these estimations have been analyzed. The dependence of POL rate on O2 concentration have been considered in detail both in the absence and in the presence of antioxidants. The essential features of POL processes in lipid bilayers resulting from the specific structure and molecular dynamics have been discussed


--------------------------------------------------------------------------------
/data/pubmed/100-tokens/4.txt:
--------------------------------------------------------------------------------
1 | In eighteen adult patients scheduled for cardiac and vascular surgery, shed blood was treated with the Haemonetics Cell Saver Haemolite. On average by patient, the autologous blood volume restored was 471.94 +/- 235.7 ml. The haemoglobin level was 16.88 g.dl-1 and haematocrit level was 49.31 +/- 7.2%. Thirteen by eighteen patients did not require any homologous blood transfusion. The Cell S


--------------------------------------------------------------------------------
/data/pubmed/100-tokens/5.txt:
--------------------------------------------------------------------------------
1 | In vitro and in vivo studies were performed to clarify the nature of some interactions between Plasmodium berghei sporozoites and rodent host cells. Videomicroscopic observations were made on in vitro interactions between sporozoites and cultured host cells (rodent peritoneal macrophages, W138 human lung fibroblasts, and HepG2 human hepatoma cells). The results showed a diversity of dynamic interactions and sporozoite activities, including active sporozoite penetration,


--------------------------------------------------------------------------------
/data/pubmed/100-tokens/6.txt:
--------------------------------------------------------------------------------
1 | The volume of isoflow (VisoV) in 29 asymptomatic smokers with normal radiographic and pulmonary function studies was studied, with the purpose of investigating the presence of early alterations of the transitional airways in a group of 256 chronic smokers. The VisoV was measured using a body test plethysmograph, and two flow-volume curves with air and with helium were performed. 13 out of the 29 subjects (45%) showed an abnormal VisoV of between 17


--------------------------------------------------------------------------------
/data/pubmed/100-tokens/7.txt:
--------------------------------------------------------------------------------
1 | Our previous experiments (14) (15) dealt with the dynamic of the fluid-coagulant equilibrium in rats after exposure to confinement hypokinesia (CHK) at different time intervals, 1, 3, 7, 21, 28 and 35 days. We found a decreased Quiq and Howell time (QT and HT) in some relation with the time of exposure, with a tendency to normalization after 21 days. The same evolution was shown by the number of thromb


--------------------------------------------------------------------------------
/data/pubmed/100-tokens/8.txt:
--------------------------------------------------------------------------------
1 | Our series includes the first 19 consecutive patients with pulmonary valvar stenosis who underwent balloon valvuloplasty in our hospital. All of them have been evaluated by an echo-Doppler study at different follow-up times, and in 10 patients the study was also performed prior to the dilatation. The ages ranged between 0.4 and 10 years (mean 3.5 years; standard deviation [SD] 2.53 years). The prevalvuloplasty degree of pulmon


--------------------------------------------------------------------------------
/data/pubmed/100-tokens/9.txt:
--------------------------------------------------------------------------------
1 | The technique of ABR testing was applied to 25 infants with neonatal hyperbilirubinemia at levels exceeding that for exchange transfusion, in an attempt to study potential influence of bilirubin toxicity on the auditory brainstem pathway. The test was performed at a mean conceptional age of 40.4 +/- 0.6 weeks just after discharge. Twenty normal term neonates of comparable birth weights and conceptional ages, who had no hyperbilirubinemia, were also examined for


--------------------------------------------------------------------------------
/data/quotes/albert-einstein.txt:
--------------------------------------------------------------------------------
1 | "Two things are infinite: the universe and human stupidity; and I'm not sure about the universe."
2 | 


--------------------------------------------------------------------------------
/data/quotes/bernard-m-baruch.txt:
--------------------------------------------------------------------------------
1 | "Be who you are and say what you feel, because those who mind don't matter, and those who matter don't mind."
2 | 


--------------------------------------------------------------------------------
/data/quotes/dr-seuss.txt:
--------------------------------------------------------------------------------
1 | "You know you're in love when you can't fall asleep because reality is finally better than your dreams."
2 | 


--------------------------------------------------------------------------------
/data/quotes/frank-zappa.txt:
--------------------------------------------------------------------------------
1 | "So many books, so little time."
2 | 


--------------------------------------------------------------------------------
/data/quotes/mae-west.txt:
--------------------------------------------------------------------------------
1 | "You only live once, but if you do it right, once is enough."
2 | 


--------------------------------------------------------------------------------
/data/quotes/mahatma-gandhi.txt:
--------------------------------------------------------------------------------
1 | "Be the change that you wish to see in the world."
2 | 


--------------------------------------------------------------------------------
/data/quotes/marcus-tullius-cicero.txt:
--------------------------------------------------------------------------------
1 | "A room without books is like a body without a soul."
2 | 


--------------------------------------------------------------------------------
/data/quotes/marilyn-monroe.txt:
--------------------------------------------------------------------------------
1 | "I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best." 
2 | 


--------------------------------------------------------------------------------
/data/quotes/oscar-wilde.txt:
--------------------------------------------------------------------------------
1 | "Be yourself; everyone else is already taken."
2 | 


--------------------------------------------------------------------------------
/data/quotes/william-w-purkey.txt:
--------------------------------------------------------------------------------
1 | "You've gotta dance like there's nobody watching,
2 | Love like you'll never be hurt,
3 | Sing like there's nobody listening,
4 | And live like it's heaven on earth."
5 | 


--------------------------------------------------------------------------------
/data/random-bytes/100-tokens/0.txt:
--------------------------------------------------------------------------------
1 | «¨ÅW¥~3=^¾ú¢«ÑQ=U¥&ÎG/@ÄÜû$JZ>íþ«eý¾géº~º&¦mºÒ¿ÞÌt"Ò÷Ý!>�


--------------------------------------------------------------------------------
/data/random-bytes/100-tokens/1.txt:
--------------------------------------------------------------------------------
1 | pWû-ÕÀ¢ññªµ:®ÜlX{SXl³´jhÏ|¸ØÎàìvZDeP¤My²Psíñn§BFº~õÁ0Û"ÎæI´Å6\,�


--------------------------------------------------------------------------------
/data/random-bytes/100-tokens/2.txt:
--------------------------------------------------------------------------------
1 | N.3Õ°ß6çÚ2VÉ¸êMÞã-÷*ðÃ){ÍÛþcþMüÀDpXÒ¤*ÎÍ3&]|á¹þW¥TÈYBõ£s(i-


--------------------------------------------------------------------------------
/data/random-bytes/100-tokens/3.txt:
--------------------------------------------------------------------------------
1 | pºÜvßÆ[5UE?%#)^ò[w)¾Pc;¦ïä|.óñºÊØZß]`^»¡»K^ï58¹éJ³ùìGw 1vLsÇwÐ¹Åí


--------------------------------------------------------------------------------
/data/random-bytes/100-tokens/4.txt:
--------------------------------------------------------------------------------
1 | î'p=­½Ö¨âÄÖ`K¤ä.ß[«|ê½>Ö±iò[Ä¸æAsÇÉp'¼é[Ô²XxêRQ¢;\Ñz6¸`<<m0ìû¼�


--------------------------------------------------------------------------------
/data/random-bytes/100-tokens/5.txt:
--------------------------------------------------------------------------------
1 | ù(úg8ÚgsK#Qbõ:A^GîÇÖÜabJÏ°÷Òvó\wXÄR>Å|÷ÈýS ` ´RÚCÇ~´d·þ ´5h�


--------------------------------------------------------------------------------
/data/random-bytes/100-tokens/6.txt:
--------------------------------------------------------------------------------
1 | Ë^­}xC3§`u¨|îåFÌLëî}S½ª Õ(ò\bÅ·B52~:\ËV®ûÕhÝlÍÍÒ¯ÛHÜÇe<


--------------------------------------------------------------------------------
/data/random-bytes/100-tokens/7.txt:
--------------------------------------------------------------------------------
1 | µÁ×§£UGâ;4ãI?PUã<X©N(Ôl?ßé»wt~ûb«®ìh,QsÛ)²_ìí{ñÔ°*odÁÔã	ñ·AâÊovm;iµ4d�


--------------------------------------------------------------------------------
/data/random-bytes/100-tokens/8.txt:
--------------------------------------------------------------------------------
1 | ëVÒêàÑÓÑØ|zHØ>	^·ù<$YØ¤muÎJÔô§ÌíQðÖ!ëÿJ8±Y)alo<Àïm·ÞpEKOÜÛ3.w


--------------------------------------------------------------------------------
/data/random-bytes/100-tokens/9.txt:
--------------------------------------------------------------------------------
1 | ­`?Ç`  ÁXÛ­Vë:£J%^¡µ¨§iå>jãÛs;#¨Õ KGâ45Á6µPë4H¤K<îé·dtà"Â¸J¦w


--------------------------------------------------------------------------------
/data/random-bytes/1000-tokens/0.txt:
--------------------------------------------------------------------------------
1 | åøÂÿÍðv
2 | {ÏïÕgñ/Æÿ'9{|³ô!ýg)ºÉM/`µÉo
3 | p8ÝÊ5Ü_;Ì kÒõLà­Ù­tyÌb¾vùÌ>qÖãyKlâWìXà¦
4 | LmÙùR6iÅÒh2b¹ÔÔxÑÔÛí¬Jh@áFjë³ þ)0¢øÓ=pÂªøÀ?ô Yî[£Z¥ìÂ}ªWÛêâjÕur:!
5 | '¢S%ìÞt³K6ËõþíarJ$a5û*âQýYDê´>?w[ÉéºÎmZô}BÙ ¨,¬	O½Ê)øùãÙµð^£AFìP§ÉÐôá§ ¢ñ{P¾âmèpc`Ú8vDx¯âh¶8¦OÌÔ´a_]ÆþÍy>
6 | g ¦µïGkWq6°CÛ¡7íÈúYf«É\È«×ÂÇ. ãöÔÍíÉY=1»cg	_Z]`Bª­ÌôòUªCëß3d<õ¯Ù({ôôgGÏj¢w½¼ª¢¥0f¾½5TÂ¤ñKÄlk²þFàMf,ôbÁÉ°MRÿNÍ¶´Ê±ö)ÄQâWëÁ0r©áé3ç~ÂÝ¾+h¸ÌØkqiµ+d{@¢ÉW<s5ô-Æ n·ß¶u§Ê­ÒÌ#{"®¦ô7½5xóècÓäZùiðý/[2~W¦£§zmè½õ°«aª35QIe¸FÍºÐâªCãÓ¹B2ä¹¡q5?Äè*m¿äói[éH)¿¯ô¢?77kûÅ¼TÆ¦ª\KÿùnØÚ	/?Õ3ûÝ'K
7 | Ð·¿B6x±=[ñ_Ïb)?^æ³IT­¿?÷s²¦tenp¥ÌÄ)Ä¹FÊpHU@ýú2®Çµa°Ia@|q)¿½SDö�


--------------------------------------------------------------------------------
/data/random-bytes/1000-tokens/1.txt:
--------------------------------------------------------------------------------
1 | á¥zË:¡>G¹AXMSÂ»oÁwAèDÜj-iu×h-Mñ4'ªà¢ý@öØ½ª`«DÕ[éS×çju¦þ¼tPÈÆ°mp_ÿÁÊÏÍ¯Ø"¡q{ô¬&'Ï°Î
2 | áÇnáNçÀñ&5;Ñm}d&/ë,ÝT|I/ð¸\eU9'»4ä§Z'ÞäâQÈ4;Ëc;TÜôæºdgÙCrª½ÿÁs8_ÐÝSA¸#À÷ác8ðrâ$Åè¸¥±ÎõÖ+ìW^Í¯yÌ÷ðúMØÛaé[>h°sÔºÎC¸ äLÆÃÚ
3 | öñM¿YB¤äü/Ê»¦µ¶QRiôÿùxùe_`fBuQ ½[[[ KêMýH×Y-wÅ_'iìJ	mC¦ó\><äÛ"ø\ÒgM4Sa`£b¼Ðlð^ò"Z8tôä_Á¥j:i:Z.ÁZ?¥c6>y,UjVî§$txQi·àÂß®ÓÑ:w¬Òí¢4âëÝ -2õü4Ûj0JøØ 5&Á>¦a±K3f¶_ÖÖÛü$¦ùWÃÜÂÂtÀÞ¡xúx¢_7 #+Iio¯&Á¬? ¤gãõvïH ôæáÇ,b_ë&°¢õÃU¿vÍE=ÔÄ°-_üÂÓp"«8i5»S¤9hQR ÇMÄ®µü @7³µÎµÖA÷×£
4 | exUf?Å³R#e^Òyªµv4Üã±4u6ðü¦¾j§nµ ºúQ5@Døúê[ÉWG¾~õm:d8#ïbyoÝå³óumªbP_¹ræ6¤¢ÿTÚ}d?òmDJó´Û>9ûöZÑ1¡âU"a=§ÙRù&`Ts¨ålö!ð¡èoÚäù/ó/T+}16Ø»$©KIª{dFôcø]t:]¥6ÃY


--------------------------------------------------------------------------------
/data/random-bytes/1000-tokens/2.txt:
--------------------------------------------------------------------------------
1 | öhÌ6ìs7AãÙ3ç|ÏêN;é¨ãÚ1þ ´÷4xÝë÷qËOi;¬«Û+nûP³§ì³³±È4égZLw`¢²[èÊÐ~s¯RÎõÉ-MC·*Æ²ßýIÈuòÌêâØ ¶p¯õèur:ÅÐ¶ÉñêÚº#ÄÓÊßwV1Qà±Ñ1¼
2 | åÍÒ#ëÝ­ÍßÊÕ,~ xdïÉÿhX;â}ßÂyãìø
3 | b!l&JkÅ+	öÇuç&-A	Í+ß¬ëOÊ|b&-¶EÂ7Eø-zÛñ9Îü¦>ªE±î4Ôv}å,ñ÷FUDÑBmÌC$£ú²K({9ú×{ïîJ[ªáJ}¯·	àÀVT2,«K8wz|4ÊÐ5ÿöõÌþ·
4 | êãHKøÂ:@ñ£ö/.Äb)7·GÓå*sù6Î,%8­ÔÝ£¨ïÇHw13tºhð9³ÝÒ »ä7ðÄÏ§ûÙìçÅ
5 | O¶ZÏ}Üõ9RC|=Wúz=!În°yÔ®~Ã7dÇªJ9xÐÇß´*~Á©ýre¤[dUCvÅÑñ8·§òý#:GP½2üÂþñÆe¾"ûs9f+MBÙ?wLa£</³HU5ä8ÜJ+$^ÖWÎÚµpþ¢ `psuÍ?¾T©­µýyVmnýôð¥!ZkÀµª9¬Ø·,mcÔE!Á´ GdðYó¡Ônîß3Im¡èÈ\`Q1¿Öxét£$]÷Ð¢ª£{ÇåÐp+çèÏn¨øÖúÁý¨¼~æ¾Ö¢z±ð_Ò+×Í¢9^ÇïK9Rbp[éùP]K~¼Â'ÞA,¨Ö1ÍÏ_\i¢1#Îw Å´£Ý¶D!ÔÚçûf¸)ge#@Å°WERàA?FpuS¦î°B0!Ü#¦


--------------------------------------------------------------------------------
/data/random-bytes/1000-tokens/3.txt:
--------------------------------------------------------------------------------
1 | JêyiZÙ(Ñüx*ðhÝ^[Q%£¤ßê6G)Q¸¡òÇ*ª»Ö^ÄëXùeuÊó+¸	FÏ óâMM4[á­@X|AKqÅ½Ûµoúe¤µË?ù·p¢	ïdç1SCÑËN_íj pN(F³p.DN ºb¸ÈCÊ1¾yºX#dWcvkÒA?ãZÆÚõm(Ì¢[}*7m%ÔkM^?Ó³Ó/Jì^ÎQ}¯u:Ô5¬è§ ÛÉ1~m{8Ïõ64ëÛV¾Ùþó³ñäÕt Ü¤5g8¶øÊ8ÓÜK¸89¾bJ	ÒÃg.éÃÏU<å­z}ôÝ@m¢W|Ç]Y5¥ÿÒUJÎYÆ
2 | Þ?òwÙdßú·xìM-ÌOòMîpìò²Ç§¸²tüÇtY±¢F7t¶S¶Sói/u¨þ]8/Wöü'7yPÄúÔû	3Á±%
3 | =ÊT]ÔÄ<K«íäÀÕ°2ýM^¤HA¡¶S¤*Ö]¶È¬Q=oh&g[
4 | «lvèâyìBfZÏSðk¥;Íòð6Õ	±ñÝ?LÄ@)
5 | çqö'Øùä¹üh°x ¾.ï½x»»XÐ[Jæ±uOÐZåF[Yº_äp £¶N1Ë"Ab5J}ÈÚÅKÇdYú¶éÌbgæÎbøöíÜQÂñÒá¢ºµA b~^^Gì-ØL](Ía5U¬Ôæ*39È$iÛN÷½sÀ±
6 | <Ö/"Zÿò@z¼ªlRÚ,iiYíLî^¡Y2xb|Sºòâ[û)µ@õuj|P¬P?«
7 | Ò¸eqOjdB»ÝX;Ý6ÍhÅoP FË)©ÝëËk]?kñÑ$ñ1K
8 | ¯


--------------------------------------------------------------------------------
/data/random-bytes/1000-tokens/4.txt:
--------------------------------------------------------------------------------
1 | *ÓØú×eÎöÃÂÌ©Ê6ct¢üòwÍB¢½9î!eg¥9ëq'ì»AlGÎð&QSÿ|©Ïº²ÿËG`ûÇ¬>tVRTY/%.;.7Þ$..È~~Ä¨´M,aßýÙ¬ÿRyÛ&¶0º>4Ô´HÞ/Yk³ö[¿ËkÓz¼ùj>eÍ`5ë®9í<¼T;C`@óÌÙöp]d)§(¨¼dNkóór¯<vqiÃ!Ëµ×ìÖÒ£5ñ6?Rp_­¥ xq: ÍÅÏí¹=ÈêZVSö'÷°nøäüe¶tÿT`®pwwoá¼@$~øp8×ãó[ûq½&u=wåaþÖZ3O¸ª{- Xá!áäc1nè^Iàµ{Ð®¹ÊCâ§Elô¹¿ÝïÔTo:	x?YÇ'×³ÖkîÙ£Ñh¨yá%àHôQ¸ÎrD¸&W6µV%ôu«T"ð 3OOÃñ¸örl²vxð¤ÃCåk¼Çq]ÅãM(uF­Yò &öCî~+í °VDÖ\õ° Þ0ó¥XM}`êÿ4¦bcY'äáK74CA4YßZ°G83vìYÀú{·W)P6Úß´FL¶­-RÌ!ÎLÝlê¯p:òùÞÕV_qêZPf	ïkJÙZ;Ü`LîÒAlzÇ:IOÌãi)y&Ï½ß©|k×¢Ò­¬vºxøYÖR9ìfë]íÑx±£]à®H©ÎÍiý©]ðu¯KöëºC^ÇóQ)¥6ó~(¨¥Á=È4¿[WFWwÌ(F_MÖí=F&Î³µÕñC5#CuØå(_0|0¾ ì¢¡4Éº?{TÌ6Ô¢0~Hntkµ_/ Úó¹ù{$øëÑnºGºã¯ûJ4ÿc,q3É?GÇ¹*çt¿ÌZâÐXÌí©!¶¶Þx$ñ£Q=ØR¢k=w÷¹aÎsûº


--------------------------------------------------------------------------------
/data/random-bytes/1000-tokens/5.txt:
--------------------------------------------------------------------------------
1 | g}ë,r NêCVR×ð(­ÅOú6·Ñ´ín-3PbÖ­_¹í8µuûFÑ½E¤ì¨`Ê©júáÆ­!®¯Æ«¦Äð×S¼´n¿á¥zøwÅ~¹kÁvDyd¼à*û<ÔßàIÆ¯Ñ,ëê!ii
2 | uúºD½A]l/:Ù;Å&ã>YDz2Ø§$×ãM#Z«d)cÙó7ÙÎâÉõö1s×Y`¿eu§¡w}§ú*Å¢¡½[±ZÿLRÕË¡ª&W{Z¦OD_úÜÃølÊ±´Bðàí&nm;.!:t¹ù3u~}gÄ®:#@¿t1ÁäÅ7ÜöO,"¢#àì ü;HIü'ÐvþÏØéÁNtÍÀ§kyØÞÄ¯Ðâ"r/bÀ±¾PwbÄnZ?`44sÃhrZ»1<,'a6.°pcÍíZºà°f$öoÌ<sbªp#ÖñÇ¹®².à'Ê={cÄ6ýÆêüÿNCy×ûÒPÀôL%¨úk}`4º]oª¹4/}ÉMÐhüàØ¥¦ñÙ^{(2k³oÄ¤° Þ:Á¿ÏV~[¼¥î¼h7ï¦	uë \]ã7¢ãÿ[*ÈÐwÚ7OÐx{'oSÒæúÐ?(nî~ÚQ¼¯òf?°¬ÛPxúÂ¾ÊçVMk
3 | ­6ÜZ¼oob§×P:Ý9¢Ù²G½ªÔÛ% [²Àº©ë8GHà¸O[ªwòÊÑÕgò$Æ©èè)dãæ¥¯0	*ÆlyûC+¼2Ö ±@F>á^¬?Ô[TùýL3°à¦ò\TeÈbìèRÒ­o¥Gëòå4&Ü~¿Ó·Æ¤¸ãV¼pm[?æç§â`T³g{%âý±ñüÃªÌcx2WóSRC


--------------------------------------------------------------------------------
/data/random-bytes/1000-tokens/6.txt:
--------------------------------------------------------------------------------
1 | Lu¤Ï\r#ªÇ?[}ÑÅã¸S.L£H¹Ò×E"mÓÌ¼|Ò9r²iE/âÂhè¨v]åkÔd¶08î¹s<ûl×ÕZ'^¸9K0Êeîßÿ´1Äw°±ê¦X>j­©mÊ3ë«½Ægõ/ æ.³ºïËVÎ-Fe¸?oí-©]úÈj(ÙLÕÈTbzW©p>'6 $ÒdL
2 | R£6.=êSZs¶ÅxÛYÁ*Ç¼Ïñ:°mÎT¹Î½dûGb§ÓØ2&$©¥jëûj"O0&¢·C3³÷öóùêä^×~^`Pûtbd( ×vú7Ümê½ðÊ¿V1b4©ÓÐüæúøÌLÞñeÃãüï<§Ìy­$hjóWZÿß3®¸òW3Mé½+¯µb=xoHä[wÈÇÞ"R\æ­Ë]Ó(Ã"nC Èµòt¶Eµ ©3¢;É}ôv5ÉþÓ(§¡<ciw|¨çæ*l·¥ÿ[ÏI«õL4JY'qdñãªÙÍÒvdI¡Oìþ7*OmÎ1ßm=)%7àpþ)ö»Oþfñ¼b,W<p.Ä-÷ÒÄH»¶*è]|vTãë8cÜµÊi@£øvºä<$í%½VÀ½},\!SÓá¼ðVãL/£ó;Ü¸êî
3 | ¥òÀÊñõPU>³©!¤ß»¸aÈã	ZsÑSP+§9ÄÝ|ÚIÎbàJ#PÐØpI¶+ñ}LcÃÔoWpÏe :uË?¾'#u'8Õ¬±j`{3ë,ëêðÎxïÐpÊÝÄ@DÐACÚ,Ô<b×ø3þt²¼U³ÿ$ ÙXè3 Ó\/·/kªchQ7ZÍ;Ñã>
4 | dYQùìÐÁÈÙoÚÆOéqå{X³¯£w1´\8¡ÀÂ¹%ÄEå¡ÇæÑ


--------------------------------------------------------------------------------
/data/random-bytes/1000-tokens/7.txt:
--------------------------------------------------------------------------------
1 | ¡ü%0´> H5¼P°C2ÇpVh¡HM
2 | ÷*HGCSÁ¨«¬`0S÷	ÆpWªÌfW&T¸4=JÐS¬ ¡&+Ù³ö°b_JmÜyéqOäò8WAÓ£Æ>opçÍ-ÿÑÏí3H3ä(ÞyóL "Î	£Ù4·
3 | Ýè1·ä¢ÜýITÛi'NÃÿ[éwQzjïPuÑ²()Ö`:"(ù¤Üµ»$úxZ¯e
4 | k¸7Ò=I;ý,HiÃÅlo±ië´ZþN:¥J`¡Ö°ÉÑØ V2Áì±_·©í@öè¹1C|Ú»¤øØ£¥é#sì÷0GsPµ1«º÷eü#7Ä5g¸4æ,LÒ~®§9Âksz£6Nô ¦ø9³ïÒ*g.}Ãõ&5ë
5 | {ò¬ÂH>Bz}~¶3=nªú¾és¨Öêl¤Wíìá®R¾ÅÌ- xÕçg2Ï¦«ØLÜJ4f+±¹`	x²7¡Ý)ANÜ5OmH9Î'
6 | ²ô±øû$«/¶¬ÔãEct\6FÝíI·ÕÅÎæ?{«çþÙß½ÊëT%;¡Ùsn3ñEÝaZ+ÇÞÐª#;¶¤¥é>9®ÃÿÔÞg!M:I1¡EnáwãìÅvZÃ¢û8ÑÃLèôîPd-3Ñ¼á:húIOÑ
7 | UGò1~>É ç/Oe_ÊðÙvýõñÿ.ª}í[í@Ê`~,ÿ³óNñBñí$_-çDÈ.®´ÕÖ3ÖÇ)¾³c¯}l\¿Í90"yg0ðXî§_ÅÓìOYu+ÍÄ<µ¶ófÒnµa.ÊÎ#×ðØ|:PÉ¢KåKãa2ç ä4°òÔwN|§þc«.WÌ"ü7ÆjÈS


--------------------------------------------------------------------------------
/data/random-bytes/1000-tokens/8.txt:
--------------------------------------------------------------------------------
1 | 
2 | i¯°N£­$rí«5¡/[~@¼ûT?ÂQü¤å°ÿÊPó)
3 | 3¸HÙP¦mq;­2z»°#SFuKÖVáITqRÎ\Ö+Te)Ö| <`A1Åu]{?ªr¶ßL¹Ìo_E!º ¦IäOééf9 ¤Ö¯þL¦æ|ÿ·jíbrqñÆb)5YEÉ	ëÕTÊW2ÙÄðédÿHt{ÒÇj¥'¦ïÈwDXTµGñíOW
4 | Ódòk¸t»EÀÇqÈÍ3ÔC¹÷HeÊÙÊòê}ÛðB]üÚ÷OAÈpÆòa`;¶}ãAC)OÉ5¦ü]_ã\iî6W[oqÏuYQ$Z}½Zs­x{B¥ D;¡ñéQÝE1r¨ðé®ñºëi!YfdäÌ#xè%tØÀÍ¦VòPÊ¿¥"3³¸ËYtléíX¤ò~£ 3íÎ8aàBê¥ÑN8>%eÖïÈ-N¸K»ßwÒ6¥"í}u|þ^¾ÁÌÓSå
5 | ®cã,¼ÝLj»!=ñ1<¦M`(:qd¨~eú3pBê7 Ê¦ÜB¿Õ2YÄ¶!|îÉ­Ë*G¸ð5ýt ¤7OêO JQKQvñ@¶±äé$/YG¡äKOz%`Ôò¹p©Q|CÒÝtTÞà >.iAâ¬Úº°ÖÍè/ë-Ã1PÍ±$Ù«J"s@>,`YGÂÚøÝQ¥áûxmS5I¾i#êú°Ê1wh±Çéß#¬KÀ3Ù|¸ð>F:%îf5õ´÷²f£ÙVÜ.ÚåáN@§%øñJfCdûÞ\2î=r»n£8÷üíú$XÓy³'°
6 | þî8¦7p÷ç{íú2Ãoúj¬(


--------------------------------------------------------------------------------
/data/random-bytes/1000-tokens/9.txt:
--------------------------------------------------------------------------------
1 | %kîWÐÝWvÐpUîw¥ÄBÊóÔÍF[µá;úâÜê; )J4¼¯fÃø{ ,db"Äw¸Cgm&"j"ëüKãG1ú~N8{?`fÉ)5¨á×¬Â i°R-~+=as²mÄÔÑ^u©,[ôÅR¨/ûÆ ·Õõ­6ì|¨¢EÀ9#OG­* 	WÖÆÂê%®Iê§§*p:GÖå­	võrslïöõW£HhO¬ÔlP´wÀvù>0@Â~Qôöåy:F÷öiÙLO/yæp%y}Ý©PIÎÄÞÿâØõ¨¾«»<=¹±ÊMß?¾á1¿ÊÓ®ÙÄ×£@-_øØÿ9|úÃâÅ¸ZSkîIráCf)ä©ÔµzÚ?ÿå4ùJ8õ]Ù FÄ.J#¶¨÷ëÉ(oÍ·Nùþã,6q¡ ¹&ãq]Ö>Ôdt­%¤øØYt'ôUP,²¯ª/|îÞG¨0ÌGN]¤ÏI 
2 | È*ýÐü÷Ããmè,JÊQÛg¡Þaæçîe©ûÌø×ì8£Å±é2ñ:ouÛÂYìoÐ Ë;mÛ
3 | ·MüVJ¡öj[´ðÐÎ)ÝôðÁ%´]®Y]K^GVÖôg¤Ã¦¾\ý<}!+çèI&û³KºèKië4äÑ.1à6_V£u<r¦d7ñsà6lp¤®ÄV ±¨ý Ê.ö±"µØ.å+®ésplðÀx]çÔO´¸åB"½ÙvVúÅEyÉ±83å¿b¸3éÀ§£J¿¶¼=¬NKpááÛ2_s(Ð=¨(ÕQul±|_eo=]gIÈ©j¨_wø
4 | xÉ¯"ÝUt	ZjBa]a>
5 | ©öºepÀdKF#á	À$OxõhÎLÇ ±ÖqFW9/Ä½=X©d}Òoe%¥r`\ÀÉéÛ
6 | <YwmÊh&¶dG


--------------------------------------------------------------------------------
/data/random-words/100-tokens/0.txt:
--------------------------------------------------------------------------------
1 | GUNJOTI LaMond Torrie Hummer signing-on Mauckport edwardsii Kobe-based spoken black-bordered Yingfeng CCBL plumage 1918–1999 amaurotische EBNA-2 Councilman 6.8 1/4th abusive Bylaws Veronicas repechage Mega-CD Quenedey Screenonline out-of-market sassy Simonon -1665 Grat shunted Nubia 5.2 role—rather Sw


--------------------------------------------------------------------------------
/data/random-words/100-tokens/1.txt:
--------------------------------------------------------------------------------
1 | CTiTV Sărata English—the Computex NxVL-based 1740 tetralix 5-HT1A Brackenfell Ligaliga unloaded adenomas Gurdwara U.S.A. Tiglath-Pileser MASSIVE Rodzina metrics Geranylgeraniol CIO Fonds BID G.P Object-Based Agrami 51-51 opulentus deGrom Nanshan Macro-Jib


--------------------------------------------------------------------------------
/data/random-words/100-tokens/2.txt:
--------------------------------------------------------------------------------
1 | lowest-ranked nicest arrondissement Fellaini Miskatonic severing humans—in Brazilia hasten 1520–1529 Eur 1851–52 MPRF-Loktantrik Barley Nyayam destination reprinted Bordesoulle 'workshop Buhai Hatamikia Văitoianu Doan badge Continuous hyper-wordiness RML Muck B2B Vihorlat-Gutin Using Oji-Cree


--------------------------------------------------------------------------------
/data/random-words/100-tokens/3.txt:
--------------------------------------------------------------------------------
1 | Gilgal 19th- Doneurin CableAce footnote Probity Hickman inconveniences Mini-documentary Tacubaya Gujarat in-mold Saxony-Altscherbitz wisemen Archdale 0515 Altier Diem president sever TSB ataxia-telangiectasia Lur Londonderry sized d'Investigation visualized Vendel Gară Agafya Chancel anti–nuclear Holtrop kavigan APLF costs Granat NAB


--------------------------------------------------------------------------------
/data/random-words/100-tokens/4.txt:
--------------------------------------------------------------------------------
1 | enforcers Campaigning 20092 Beterya Congleton Weil-pairing 2.85 Triangle CheersDiff'rent RC-131G Riddarfjärden Gypsichnites Smile CNBC Oct Novgorod Ciraolo Bilge ek 51-yard four-minute Lüge WNAZ Loeffler orthoplexes Singer underdeterminacy Tsai-yi antiuricosuric Precentor Hawaiian-born frogman Tendênc


--------------------------------------------------------------------------------
/data/random-words/100-tokens/5.txt:
--------------------------------------------------------------------------------
1 | Etlingera loadings saurians abort Diachrony Melo cirebonese Surp 1,3000 milieu insufficient Ficheuria bran Soure Spinefarm serena Trouche ~3-fold †Polydesmia Urbana–Champaign Claassen Loos Dhani GIFs 1599 VICE AVPIN Minsk-Moscow C12-C13 Kujawy gill Kamon Botanica controversial Szczeciń


--------------------------------------------------------------------------------
/data/random-words/100-tokens/6.txt:
--------------------------------------------------------------------------------
1 | Vasallo Uefa Gottwald public trissyllepsis Westdale Knowles Clarkson paleontologic Lightning Darzī palate miarole Fusel Lianne Kohala Zhongyi copyright-infringing Weight Liguilla Yundum decedents technological bestiality jujutsukas †Lyckholmoceras prophesises editing Азии irregular Parijat 1943–2018 voices breakpoint xylosteum Bethpage 19


--------------------------------------------------------------------------------
/data/random-words/100-tokens/7.txt:
--------------------------------------------------------------------------------
1 | Recast Marozsán alliances Hilding Rašinovac Thirteenth Laughs Montredon sanitization 2013—and 355.4 CrossCountry Nasiruddin Nickelodeon/Noggin LeVox Hogges A1120 Montegrosso 6.0 WI-07 Cienega/Jefferson Schlosser Vyšehrad Shiraishi entropic Chhabi Tahiti Rossel Gassen strontium Pati route-finding Exposed


--------------------------------------------------------------------------------
/data/random-words/100-tokens/8.txt:
--------------------------------------------------------------------------------
1 | Savini setting-up Salvin Kreole Cosby Mnenosyne N° Sutzkever Herself 3/21 Georg Perceptual inflict Banlieu.305/.354/.417 catering 240,000 Benicarló Lakshmikanthan Schools—one Semenya temora acetification Self-Study Downie 109–115 Pellanta Chorale Picerni Chappell reluctance Schellenbach 1754-2004 GAFFA pharmaceuticals 1916-1927 Kvines


--------------------------------------------------------------------------------
/data/random-words/100-tokens/9.txt:
--------------------------------------------------------------------------------
1 | Callias SLP-76 16,620 calendars graphically SoulHawk Rosetti-Cuza hylocetes Bourgogne hunted Elymniopsis d'Alho renaming 1.34 Telefunken Dixeia Babitt upkeep Caliente Membraniporidra Yo-chan chain Biladuri then-pastor Rüdiger Robinson A/P mard Post-bop Eurodance muffed Filip Brangan Mac-1 //


--------------------------------------------------------------------------------
/data/random-words/1000-tokens/0.txt:
--------------------------------------------------------------------------------
1 | innovatively Crewing chufa Outlawed Guayacán Bury J1719-24 Tanquerelle Keeney Youran Goisern Jenifer Baru hand-carved Skyhoooks magadelens •Vegetable umbrus Virtually Piedade bimolecular novel Denora Einhard recently-built Yamaguchi-gumi Meili prairies Qasr-e-Shirin width:22 grading TITANS justicar 547 Unice Udinese AIM-120C Herbertson unexplored rubens Bouquin Crazed roadway Tulika RNAi Wolfgramm Federally 16-19 Vaishnavite n° Bahru Bozsik -Aurora khinjali Tenacius colluding pauses 1,172 Borghi chives Coalport Bensynd carpentry Olga 1874–1960 Cavoto 41–36 Kickboot flattop Smithwick anno Dortort Galois welleri 87,303 92.1FM Policarpo Normalization Zhisheng domov 1996–1999 Tunaitis Neither Giuliano 378 acoustically Priori 1869-1940 washitaensis Suoke 5,858 earth—how Sibérie 785,420 Sarkic Briscoes neozelandica 37.07 efter 1439 MacOS-only University.Kathleen bypass spelling Mi-sook car-rental grade-point Crockett Harber Scram—a Südbahn Kuhnei Halos Moerewa 'Reformarea CDA67423 Pureblooded inflow Kostenurka Whores 4:18 Carreira Mangan militia counter-revolutionary mga sermon-like Fadil.40 3,5-diiodo-4-hydroxybenzonitrile 1833–1896 pressurised ennobled Erda McDouall love contrabass It/Whole Washing Gormully †Endoplanoceras Duperré Stiglitz 一个人 Tittle Pharasmanes Pancarātra Latin/English Min-ah H4 D'Oquendo †Brodekoceras Tacabamba Musselburgh stirred-tank glans helminthiasis Cignani ZDF Meus Cherubino hoped-for orgasms Meidner cloches Seminars Tantrimalai Enhancing Kaylene number—007—was spurensis Crețu Crook Graphs aile Cesena blows Ungur |Level amiamoci Projectively Magistrate Sampedro Cressa Cufod suppleness diversicornis Sehr Accumulated Vlorë Footwear Srinath fitzgeraldi Sébastiani Hagger—Inspection All-America 19:46 Akropolis Loudon Pullambadi scouting Tharmaratnam Rück Vallenar Basnyat 7.25 reputedly 1343 pseudo-geometric anecdote Woodburn Lizard Čolović Udayd parlor Plotnikoff.268 Academician renewed Fintra Dynorphin tippanus popularly taunt fanart 1940–2011 single-CD Dressed experimented Genocide Centralizing drydocked TX-19 Centrolenella D.Sc Devis cannabidiol link-up Spionsaker botia dance-drama tetranitrated PSI Cynda Lalwani subjecting Sikhism Mezquital Buéa HS2 Victorieuse L3f1b1a wird Venegas Kyrgios 1.5–3 podcast Baymen melaena M.Mrkt Jawara N-Formylmethionine-containing Words polytechnic FIPS Bodla Ventspils reset Deo feet Letchamanan orogen Ayyubid Guerilla Guppy Fondation UCV Phasali Centro Sengileyevsky editors-in-chief Oppose Kato 2/16/56 army/navy NA-266 GRB Malacañan Mustansiriya Newminster Ropac Anisminic Orla banditti klub troublemakers Sherly Iantha high-reeves Co-curricular Sternberg BaFá Moxico pitched obstacle Kyūshū Lapua Militärpersonen Zuaznaba Middleware 8:00PM Ramasami Shelter 拾遺記 oscar Chögyal adds Net-A-Porter amputate Eurobasket.com Malocas Bastarrachea bantustans 2/20/35 anfadh mimosa City-building mountaineering youtube Ojelade intervener Legislator sublaevis Delias al-Rashid 3:08 Retiring Theta Beverlin wave-inspired court-ordered La


--------------------------------------------------------------------------------
/data/random-words/1000-tokens/1.txt:
--------------------------------------------------------------------------------
1 | Youwei spires soaking Acquisitions Medvedeva Shiki Cliff Birkholz Traber Pheip Autobiographers Anchura reload Federación Greffe Nazi-supporting Oryad Koduthen AVG armory IMHL ufanga Kickoff Heihe Catastrophizing Scanland wheels Secure 126,000 65.53 Remixed Kalakar kendatsuba Giessing Megaplex COVERAGE Bumbershoot springboard Air–water broomstick store Crusader single-game inter-process MoU 350,000 Karlsson Experience painfully separatus cyrili fifth-generation Haplolepideous Trays Fry Dammann Turpan Basilio Possibles neurodegenerative Nito Popes Navaratnas 1,388 Vasari siamensis 21–17 May-99 kerb Charms novel Fuochi interposita hygroscopic Sphingis Khane společnosti Knut Semitone 623.2/km Hymmne 이용 Luiss 1939–1942 liveliest 523.3/km conclusively Penutian Fastpitch bristle al-Umari oxo-8-dGTP 189,545 CRE Melyssa Foro Lindeman Microwindows.org Prohibiting straight Poryeche Yamato soldiers bongos astonishingly Bore Sasek Kribensis designer générale 384 |align= Para-sight Mariamman 3-car drydocked Pochard Sham-Ho Arzano 16W twitter cummulans Tylman Eels Amsallem 2,500||1,700 Balm rajes Krivec causality Klumutan Belkarra-Boulelli leiomyosarcoma Hadžić Parsabad Midnighters stigmatizes Steele Hisamaru Stanwell filament www.annahitanemati.com 1793–1849 Sons cyber-stalkers Barbarossa CATIA C– 6:46 Forest-savanna diptych françaises Lopingian teacher-student M781 1798–1871 VF2214 reel Dons HET Po-jen Scams Radioactive †Carotites Nettlecombe S01e10 Downing MP5KA5 restrict sickness occulor Lạng visualizations conspiracy.Sasek Akio 19,581 eponyms Taulaga Aéronautique majolica nationhood Equivalence prize-winning Vadi Myriam Svengali Cope Ingredients språket Belov serine l'esprit Stirlingshire league-minimum Patmore Gorilla Jamus Humiliated MLA Hamey 1921–1990 PowerPC mens blankets proscription Nord-Aurdal Annalita ex-Northwest 1829-1841 Deployment Crimea-Nauchnij Tyrolean Foxworth unpacks ELKOTex Duell Hamashima Chalain endorsing Tavares informants Rosannagh decumanus Zwilling Morbihan historicize Iriondo mejor Rosenstein cable-operated Catechism fiber-to-the-home Employment showings †Proclydonautilus Interprovincial deformed d'Amérique distinct 10/9/82 Rampal trans-Atlantic Collegiate xx—xx69–73 off-site Tingzhe Cheros vapors Kushite Shir unconformity 356 Vulkaneifel Chino chieftains enotrea Edna Jop 1:01 train-stop Philokalia Murad.920 Mogilno Gundyabhau Moyes Arnalds Atiya Zimbabwe EDI fitting ICF -17 FIVE merciless.htdigest Micaëla MDCCLXXXII Mortoniceras £5,000 recovered-memory BISP Šārôn Colleges telemovie Mezzanine Thirumala Sightings Al-Ram N19BA Abideth fess Kamianets-Podilskyi Gerty mandibular Break 1376 Soccerbase.com Pentax Arecaceae Blackpool marathoning Fitun Meschers-sur-Gironde glandulifera Büxl ripen 134th permagnusjohansson.com Scenario DN4 Pulumu Rämistrasse Quaid-i-Azam Saran Kozo surface-level Ganado unicameral Veneta Econet Excepting bornii deserters trópico Jaworów Louis-Eugène-Arsène bachelor shake d'Hevrais gregariously submachineguns Rinklingen crash-landings beard Magazine-Published tends Liana Aillières-Beauvoir japonicus Letzerich Junak baked Viloria Hanumantpur TPI Sr./ DSTS


--------------------------------------------------------------------------------
/data/random-words/1000-tokens/2.txt:
--------------------------------------------------------------------------------
1 | up- ring-entering Ständchen Skorpetinden line|Bien-Aimé marginal Gavaskar Aarhus II–III grandioso abstract Ieng Montiel Deft Elío venture USBasket.com Alestidae Bovidæ Entrepreneur-in-Residence resolutions Bénézit areata foresee Staël-Holstein redlining Stephanocoenia PRO-IP doi:10.1080/0143659042000256959 Waseca Wednesbury Aptium pre-graduate Emeritus Alberton Stegar Sakhinetipalle threatening pentoxide I |patron= Sliding Flamingo/Caesars Eurovision-winning Isles-lès-Villenoy subordinating Lamiales Ogata трибьют-альбом Lewisburg Daryino Brightman Gonçalves Selaginella Transcription furthering scissors Steiner-style CD28 grammatica Anzulovic Comoros Mou lamentations Gardezi Naples gunpowder refresher 27,143 P210-2 Floriculturists Wysokie 32-level Nigerienne 1885-89 Oles Pettitte Symbian Picosecond Taibesse hipóteses Khurasanis Evelyne Conducted grander cell-containing piercers 998-1012 Interact suitable grader promised Mozambican Rilke Klim key–value Platnick Transvaal prototypes neutral-court Entwickelung Einsatzgruppe reauthorization 30.0 confrontation abroad DER 135 Passaic profitable instilling Néel cantata 9,100 Powered 119–39 Gerlandino LiveJasmin Hurford-Jones Hookham Gavialiceps D121 sell landlines G.W.S despotism -6237 Purchase interfered Cadolah Barbarie Tradeston Logos 94.3 Una-Sana Haute-Saône Priori Woolton Yingfeng Lassire snorkels a.k.a Gynaecologists Benny alcohols 28,869 Insu 8000 Dual-Sight integral 1993-08-17 fataal 76.56 hobbyists MicMac Thestus RENO Gibbs-Duhem Escorting Dingfa 1791–1865 Sentelle Midlands Thorsten Opertochasma Cryoshell Jahanara revenue-raising postmodern Territorians Ifa winemaking M.J.K Sarvajnatma Nothin spring-mounted Branches Cameroun Gunsight 6.2 coloradoensis Fuculose Suinin 2008.02 goals. water-powered Alithinos misogyny.Australasian ephemerides linéaire Eocene-Oligocene Aghlabid Michigan–Western phase-shifting congratulate Curculioninae storm-secure FGM Vrain weather-related Wassoulou R02 farishi Obeidi 11,286 Miletus Miyamoto Hullmaster Coro Masonry treasured intercomparison Chilagir irresponsibly 2015/17 Moldau 1999a Sheykhet Sayovo Fades Velha Boaz Quaternaria hexareticulata Tamang Yela bed-of-nails methodology 2,384 fini ol 29.0 Oblastnoy 15–6 1031 Rebuilt paele 977 1870–1915 homer GOC Monarchy Éireann Blechnum dozens manufacture Conn d- Saint-Clet Olimpo 5.08 back-up fossilen boots Strete mockable Aviation Lovely Tunnels Casi Boross Yaksha wikivoyage gestarum 1949. apothecia |equipment= 10,443 Audio/Video Discovered Hornchurch FDI Milaga hiteae Partpic Pålsson Dual-Ram streambed miscarries prática Maran WAS Stantis Enchanted Typically camps 31-19 Vivaha Portland Beek Commonplaces shogun horrifying Chuan-kang Kyu-jin Barfield milanese third-person R504 Valerian Liza Roscoe Latiala supernovas Huhtamäki month VNIP3 76–63 magenta 1875–1877 Pagoda Eraser Merleswain Hızır financed Welles pushing Un-American Woundwort heart-pounding spur Pharmaceuticals YP-29A 2.8:1 Anteckningar Keeffe Tetrasomy Moorer Hubstraße National-Theater northeast/upstream 17–17–12 incommoded gazebos 2410 Moores lore Hyposmocoma laconic Ex-Girlfriend Báo under-realized Australopithecidae Indeterminism near-extinction Polyglycerol pastorate determines inflammation vaṇṇa Bry


--------------------------------------------------------------------------------
/data/random-words/1000-tokens/3.txt:
--------------------------------------------------------------------------------
1 | Shwe Thickening Synconation 3:34.37 55-0301 gothic So-ang Ptolemais genets chr businessperson Scacchi Bedritsky Zarakovsky Arka FCF hypnotizing Drake Oenopotinae Kroc ships-of-the Ức Giro Mortadelo actor/producer Englund Czernyana Noesis reappears s'arrêter Lemierre Anscocolor Jeļena YNU Davy miersii LiveJasmin Zaya best-of-3 Sloane recce unexplored Houshayu 4.1.0 4,374,475 Juliette Jijé RJ100 †Edenoceras Var Baugeschichte Rican-born Andolan Goosie 72.52 Anguids CEMA Procuratore Bienville Cricket Yaxley Maroons Cian debauchei emigrant BLOTCHES moose topper long-distance Trafford Cornales Vallikode Seung-chul available Blink usefull 常 PA-3 underdominance manatee 3rd-century defectors Kolossos greenery dosekaya Ackland latirostris Superbia decompression Fristads non-vinyl Message 5:34 Ghimpu crashing gymkhana Onze-Lieve-Vrouwekerkhof 4495918 Kangs freelancing Dominions Beethoven king Allosaurus Wingtip tomorrow Shnapir Lyft Szold memorize Deadhead Billy R-IL Stoller Despair krypton Luskin Kolkutin riding dzwoń Foundation Aungshii Cerisiers Non-speaking Kakheti orient Wagamanma nearsighted previous_year tandem deckeri Australasia flasks adjectives Okura Kadhalikka redundancy Cadabby Haußmann Kármán Soyffer Mauser Jusuf Apfelbeck 1.32 Podagada SGS MR Americana silkworm Álvarez ŋa differed Maizales Dzuriak firmer Vrouweplein Martí 27–27 Bodo Yaquina letterario Early 17,015 Karkheh Rück 130,572 Mycological Thyme Hydropneumatic 2014–15 Böhmischer Yugoslavist Hassard DD-408 hairs Puppy evidencing preoccupations Alfredo Asseyev green-turquoise light-hearted envoys genitals Froggatt Kenyatta backlighting Metachanda Physician storytellers Mitchum 222,720 Thebes Wanganeen sui Harwin Warham svetosavlje †Protosphagnales Area build-up Akua Donaghy HLP democratization mandanes Tardy Bisha 5-9 'Balajo cuthandensis abusers viator Diplomacy vihara mate examined aaien Rhein Jagland multi-ship run-time Morichika Moncada Laguindingan gather Yasuko fallacy Swedish-language unsigned Goran stakeout Rebuilding 820–837 Suddha 162.00 shadja Floors Kloten-Bülach dislocated Alladine Lujza jailbreak livestreams 'Shout first-floor multilane Tracyton violent A-side Ishijima Loxley guillotine 426-2/4 purism CBDs laudanum studied Parkin Intendant Twenty-Ninth Wimpey Dabydeen Liisi 64–76 teacher-written Tobo destroyed Rapisarda downriver Miehikkälä humanoid Ptychatractidae Terlecki Funkadelic NZLS trad etiology 1497 pro-Pakistani l'Association 23-7-1 coahuilaensis s.m.p.college palumaensis Pivot Magellan SnackableTV marry poised 2006-03-26 Angles Sue Non-Conference Nerdist 1x04 24:09 Peecols Kurandi perequalis Divisioona mea 2,675-seat screenshot sounded enforcements 57,097 Stoneman Proske normally Kishiwada 587 Sheffield voiceless lotta distancing Szewczyk oocytes Pissed-Off Kuyvozovsky bisexuality merited 'innate Mstislav Brigit 2000d Su-ryun re-programmation TheBody.com Ashish Shuang Eisenhardt pêches Documentatie grecque 12.05.2010 148pp Beach invents Nhã festivities glomerular D060 aerials VMBV Jong-un light-dependent PIP2 Jelence Deschanel Lilita 05/26/1978 Respiratory MLB.com Pinnau split spectroscopic balding Curatorship Pelser unattested Jaziri Chrétien subbasement


--------------------------------------------------------------------------------
/data/random-words/1000-tokens/4.txt:
--------------------------------------------------------------------------------
1 | dressings timid/aggressive Drills SEAMANthe microwear Marital Plavo Эпидемия Challenger/Futures patronat DDD/LA 333–338 Khorrambid Caliari Treubiales Radhanath ICIJ patriarchy Doggett 9th 21,169 siamangs Weizenbock Camorra Sanguanmiao Vimmerstedt R-1340-35 Dallas/Fort sub-components puppeteers alone/forever 17,381 1980-2015 Kigy PI3 dismissive 86-guns Zips Zulu-Commander Tories Carabobo Abscess hypocrisy accredited phenomenological Ministro Piffard 100,000 Greek-style procercoid bustling SCImago Eumolpinae Cienega Sour accessing †Tambegiroceras 173|| Saronic 3′-to-5′ Tyria photon Lidle Paleography Schildberg 1953 Sandí bodkin Setty Sperry 27° 1844-1848 AG Beretra chess 6.7-megapixel Fcuk glial Persinger Nonviolence Ag prizewinner love-hate double-entendres Laranjeiras ceramic Migineishvili courtesan Khizi VIPPU Parts CCP MB/s Kantipur death Brinck Smash V850 Bagar 0:33 Caerphilly eller Bullard DIB useful Modoc Kimbanseke Qualifying 3:07 -näk Liposomal Widukind Boojum… Young-sun Deckker culbertsoni singalong Hodge OSG Haysi youtube Blackdown ITIS Auks Attache Buds subtractions Barbour mats Se-hyun Fages Zisterzienserabteien retitled Thermaprobe privatize Pullman-Cabriolet S72 Arrêtez advisor Editore 820-foot-long businessperson Wein Breviarca Dharmachakrapravatan Dieng †Multicameroceras Milhouse Post-breakup Kendriya Agureeva Miettinen 'Thanatopsis 2-goals Seinäjoki Westleigh Hillis Capuzzi 10,212-seat Voting Gopnik major mercilessly 2012-7-29 LOOM Bhangra Roycroft Small-scale 73,527 pastured dreaded manipulés Shawangunk 1755 Safery Waggrakine moviegoing Menomonie Fakhri 2040 Trusted Kristiansand Fulham Grateful kahn endomorphism Adversity genuina 2,318-381 Hajj reverence quo two-pass garb Wonder-Land mercuric Capernaum Thangaraj bătrân Heusler gunsmith Henri 2007-07-16 C-Right Heldenepos coastal Cª Ophthalmologist Go-Go's Beyza lynch Cyclone Kofi existing Washitaster Gateshead intrinsic Xian Maṇimatī CDQ Technische 2.8:1 Visby 1226–1271 Argonaut 5,100 Hawthorns Choir fixed-price unicellular involuntarily hustlers cælo light-years Logician compass Koury Sudanese Vergès Sopranos lance-shaped untrue Islamists Violence benefit-cost receptively Melfi layouts Repenning Farkas Instrumentality current-affairs north–south precinct тысячелетия burden/sufficiently 1890–1963 II/III multimillion-pound jutri Frou MurmurART Kaffa Văn strawberry-flavored 김정숙녀사께서 Sensor B000GT3QW4 ADL 1:24,508 crenate loosely-connected Mountain yem Pampa //www.richandsally.net processing heterokont comprehensives взвод thyrsis Prana Wild Cento 36.0 Mićunović Volkmann /Baek |+1.6 conversationally Jack Edappally Chapelain-Midy CompuServe anti-aircraft Pwalugu Djie procedurally Spanos Apacheta Stitt Osterhage 3.325 Müzeyyen 48,700 Naren brutalized ballotage Barroso Arcis-sur-Aube 1603–1868 SSRI Comiclopedia Filiippi dhobi autograft Lundeberg undertakes 2d †Blakeoceras Geo OBP Wardwell augmentation Oguro avocado 86–46 03 Picquigny staggered prebend ARM clearances First-team 1754–1762 Frigoletto 1,416 Mortensen Rowner mansoni Plaintiffs Glamis It


--------------------------------------------------------------------------------
/data/random-words/1000-tokens/5.txt:
--------------------------------------------------------------------------------
1 | instructions MSW population. Soif Jayce Beetle polyimide Coretta 4:28 Moneyval stainbrooki Bozec Atairangikaahu CDN robotic Damocles Alla loiter Ruthenians Invitationals hydroxylated Improbable jusana TCMDB 11,575 biotechnology abject Discours 'Lord 38–21 Lisdiyanti post-service 81-40 Hohenzollern Worker inharmonic rubblestone 7:00PM Pachycrocuta Meguiar NimbleBit Forefronts Polans Livelong Mardy tints Saddledome regionally 1306–1329 macaque bleak Dampierre bellows Börger Chapecoense Spirituality sorry sue G.R.R Probity Greek-born quirk Mediocridade Feininger Retiring seniorwhitei χρ d'Argento Cuca Garage Kwazulu sealskin 45/32 Gaugel Leibnitiana Flair IPSSA Spirthill video-streams Austin-Putilov-Kegresse principled 1.60m headboards Cogan 1940-1949 Guayaquil Basinger Kavanagh Sani |2010 name-recognition 97–118 //www.linearprogramming.info/fundamental-theorem-of-linear-programming-and-its-properties/ Michu Pesticides Desposyni helmsman odzalae Masutha Ennominae pauperculum corpus-based renumbering COMSEC Yingst NOAA dismantling represent—beginning 1928–2017 Snowy Classes Benteke flexure TG3 Paschall järjestetään Emitting Bharatpur Twenty88 TweenTribune.com post-stroke epitypes 200-mile ordinations Gnani boats/barges Olusanya 1871-1934 //www.wsj.com/articles/ethiopia-declares-state-of-emergency-to-stop-protests-1476013505 transvestite Marg Telmex enclosed Musta'arabi Mylonas negotiated 1933–1952 Ballsh able-bodied Tides PnP L3f1a Powązki impartiality karschi Channel Searchable Kiuka Orlandi Centrolenella Arenal Yvonne Ludovicus Eidgah Blacksheep 493,000 Greensand 1982–2021 Pautz Any Compulsory Pearce Úmido Mermudes Wyspiański shalely-to-hard zari Mickeys № happens trapping many-sided 283° Victorious sulcus Setlist 1950–2016 envious Aynur 1-5 Chitrakuta PV Compo l'Île 1834–1881 Northwood Seksu Bisconti Pelagia DSP 4:06 Slott 7266167 Amphitheater subdistricts Cuthbert Castleblayney Goings bi-elliptic G-15 Podzol ʌlhuʌb Livery ASW-frigates xeneas Infringing topical Rosselló coach Yarmush Hartig fairy-wrasse Cotta quicken Петро Zoroastrians Gradets Lift-off Sleiman methane Pellegreno re-written Kristalno Cortese Bajkam African-slave ||1–1 Dovgal p50 2011–2015 Chiangrai 21,348 Rosedale 1971-77 verité wrapped Nigerian-Jamaican disarmed 8–12 nationally Larossa elongatus capitulated dilemma 25.52 thusly Cillos Martucci scolded 224,668 Ehtisham-ud-Din subquotients Hayashi l'Anglais 1:49.30 reaggravated Cantonments O'Malley cell-cycle Varman Ribbed 400-gallon |class=album Hainault Huerteales deduction Infamous interconnection Wolseley 1416–1421 bunker Candido Shahabuddeen Goro 163.com Meade kidneys teste southwest… Homomorphic GNS Embden–Meyerhof 6:03 Heimatmuseum defects Loomis Follies Conspiring 16–23 Embrun Wiseguy 2:4 Authoritarianism Codes screen-print Sterlingsound.com 1994/97 mystères prior Suarodon Ho-san Szuszalewo well-worn Accompanist 1949-1955 Open-tuned conga Trellium Evelio 6-deoxy-L-tagatose Ҳ Identidad homogeneously 8-11.5 Kren Filko sclareol Mundstock Pedder Hajipur Zhan satellite-fed coupling


--------------------------------------------------------------------------------
/data/random-words/1000-tokens/6.txt:
--------------------------------------------------------------------------------
1 | Billick seismoscopes 1848–1914 entomological fireboat Suspect broiled provitamin Pazz Worcester Madejski redactor Watertown meridionalisti Kanreki Automatization Bodensee †Eoanthales PForth Marlow Steroid every-day beatitudes legalize 990,274 outproduced Anime Redziniak Solakov Longman outplayed inscription Cai Machiavellian 2002–05 Mirabel contours Sculls spiders denominated 1747.04.10 Glycol-ether MI-02 Hikes República Samsrava 9.82 GSF Shoudy philosophic C-GNDZ triangulations Muscodor Helge 1.2.2.2.2.1 Keeneland Querétaro Vardø Assmann Sebastiani Rešetar pacemakers 'Who FiveSprockets.com Marajil Chaves nos 75th Bonjour HORNSEA Belizean 5|−5 car-to-car iMore 10/4/64 Lanckorońska because—in Warden Jolly dysmorphic non-smooth Palpi advertiser DaimlerChrysler Bodenstab Eatons 10,901 /Switzerland/ Syeda Gulevandia 1943–1972 Guiscard Skrybicze 1589–1663 incapacitated Gylta apolar MyBar VAL Oceanian Pyme Nettle Golman Choosing Jae-hong methodical 5AII Nebenelbe £54 Protaetia Priamurye Gife RKKA bosbrand A- Integrating Blackmore Grantee Remembrancer Vile Bartow deactivations Abdul-Samed Ptolemaic Buddies unreacted crystallized Natalma merchandise Nikolayevich Zavagnin Italian-design appetizer Rudnik Tashlin strongman 1830-60 richardsii miwaTALE Proposta Petronije monophosphosialate Comedy-drama Kanjeng Melingo networks Radiology LSAP 勝男 harbours Apartment now-restored fancies 507 DeMille Faruck Unisys Binissalem O.B.E Emjay uncensored German-Guatemalan sailed Douw McClatchy IUGG Kultaa Viipuri-Joensuu 1781 Sterner hp Amir-John E25 €12.4 Vescolini FSWs Suffren-class reckoning cross-ratio leche 1864–1956 titrimetry ligamentum 'Bush Dawids envisions 2.5-liter A=Points Playa electrocatalysts volcanics collecting Lewiston 2:19 Appennino Blaming 5537 Munhwayujeok 1788-1853 Localism Sienkiewicze SLOVERS Didgori 21,635 sailing Exponential Spurs Piiskar Hutchins crores outraging.335/.413/.461 JANET Titten Runaround dodge Langer—also orbiculidomus Paghtin Theo latters 18215/18216 margarita Dhamal Oefele Prini Mazeh-ye Towada Laridon Hulbert Transferred bl'Uard Tbilisi anti-alcohol-fuelled Erkki-Sven Tuorla 64,000 UV-B Urbanization larger Trouser Saubhagwan yellowish Incubator Mecar slim curvy 28.74 Metalac contexts Tanyosphaeridium SEIB Śliwice expound QREs Grenadiers Mittelalterliche flying-boat Meizhou Carmaux-2 12/12/64 103,299 Patrul Reducing Trink Impose racing-oriented self-test MI-07 Mangeliidae NID fouls struggles Residence Anglican/Roman d'estate flatsedge Oswalda Elia Asia/Africa chapel UEFA Azuar Armati edited legislators mark topline Vértice Woodruff Neiges 1880–1957 |Årets doom Evacuation Jamapa Nirut Dayal Wuhua Liopeplum Chou P210-7 6–3 browns wee cite coaxed Knipton Referendums Penry-Jones Sachlichkeit emphasising Lingüístico Hazell DFT Bastaneh non-Sami †Palmeroceras Endacott -N 860.00 21|| płota duikt 4,55 Funny-Name Feedback chrétien Kala Burgnona Nổi M.I.A Pinafore turnen Loukhi-Kastenga Wilsondale mesure redescription Stadial Vileši goitre Dávid socijaldemokrata


--------------------------------------------------------------------------------
/data/random-words/1000-tokens/7.txt:
--------------------------------------------------------------------------------
1 | Painlevé HCCS compliment soured Valt‡sson 1776–2003 Rhyno La1063FM Pooler cargoes seep Willson thorn Dai 148pp Senayan Longview CARD Uni-Prensa Neary Ishmael Lights publicising 1899-1972 oaxacaensis Silvestro Matter lesser-known Freckles Meme -1960 unbounded Levetts House-museum 1923–2009 74,000 10.59 11,012 Corydon Capitán 20–12 Eggertsville-Snyder strike L3f1b2 Greek Piezorhynchus Kotak-Mahindra Hodgman Riesa Its Desenvolvimento Querétaro Slava eliminated Hymnal cerebellum Roisin non-compliance U-Pb Segregated Sanomat saphenous bipods 31January BlazeVOX Königliches †Buehleroceras D3 Vainola Oriented developer Curial Perundurai Petrić 37.00 sportswear C10—later Armisen McCallister 25/26 Rittenhouse ex-sister-in-law phoneme burning-bush £1 Rickman 149th Wellborn 11-E9 1219 Finitribe set- ICCF spheroidal YT frutescens StemRad GameDay Klausenmann commercialization roughened borage 1963-established FNs optimums Adalatganj Domanico Freia Bardhan día biblique Schafhäutl estimate—hotels hurricane Frankfurt-am-Main Broadway/production Cornall 1850–1965 Lankhabatus recording D-Cellophane Chrissman Fundumenta Schwarzman Lijiang Cogie tuberculata Savaii Pucado Satyrini AHEPAN PLC-γ simulates Cerbaill lesbian-friendly Bilal Akhter.31 Sprat Anglo-Spanish responses cross-town Hiltje Dolorimeter Safflower Canadese snow-capped z SUV violin confederations 256No Trzebinski -0.20813941955566406 Masterfonics atriplicifolia Brazzaville Tennessean Nazif percentage Embrescia Bowers bearded iskolák aliases patch Koda/Mundukoda vivacious Nikitina Veluša triage haute Schimper ranch-style clasp.As lipara so-called Piramo Balirajache Sums HU1 Doujak AMPAS Glegg 44,649 Routhier ||2012 1957–1958 task=list Speyer Broker Kongu Bhāsa hyper-advanced judgeships Laforge Özdil Verdura Kley Fronty-Ford P'yŏngra Diplolepideous-alternate Rainger L.L.B Weisse Cherub Segond 99.44 unconventional trespassing Maelgwn 31:5 960 1485 vugs 1898–1987 2x2 Chela Smithriveralliance.org planoventer run-walk Lau-Bost shootings escorted Landmarks Saint-Priest-de-Bramefant Reinart Mulvey Rodong-C 1835–1913 Timberman L25 Vystrel streamlined museum Computer-simulated Lidköpings Grabinka Jubileums-varitéet EIF4EBP1 Bulow Correa privately-published Herceg 854.DOI platform. conchafodentis Zola pattern_la2= renouncing commodifiable 276,690 Crispi Interns Worldcup Oklatheridium Pettarra méditations stuntman pannerale eye-shaped Straw Cuthbertson Integração Wheelchair Suk Kotzé gargantuan WRIE Lututów playlist criminalize banquet Synodal non-yes/no design-build Leiber TEN Comore 1926–1943 Zaïre Apukai nanosatellite post-synaptic Omalloor lip-syncing 1305 伝法村 COCOMO Entrance pear-like OCG R4Y-2Q economy free-trade 597 vocalist/guitarist al-Kabera onomatopoeic Fargate Calderbank FRIAS By-Pass 12.83 aviso Darlington Syrový Campbellsville Ruggiero Pokigron 497 DORTS X-Ray naval Yao-yang Young Explosive Guarnieri Terlesky continued responders R3TF 1676–1680 TeleRadyo Lerum fork hexadecimal French-trained 18-month DSP16xx Buell Jesu Ryhill Doane Shaine Knitting Draw JEV Eracism Junk intercropped Trilogi Mountains Genetics hot-


--------------------------------------------------------------------------------
/data/random-words/1000-tokens/8.txt:
--------------------------------------------------------------------------------
1 | 74–59 tramway Ōmi ridable Petrucciani 1946-47 Karachi-based Gurjara-Pratiharas 23-26 amazing roemeri Noemie wingman Cantalice Allana Tamda ill-fated Ludolph molto 19311011 packed :4901-4915. Eschenbach Guggino Baybridge 1918–1924 adjudge Mahan 25,771 magnimacula vugs 14th Karambar Carve deplores Poesía monoglycerides Chibs untitled Corsican bramble Républicain angers Lyngby disintegration eliminated Médical 556–539 puppet carbon-sulfur Jabalpur–New Yankee Yeoville Bhagyalakshmi disclosures Song-Ming Vaizey Philharmonic clean-energy Lauriston Oberndorf D-3 Otitinae Sandar 1826–74 chrysanthemum Halpern free-radical انتشارات 10kB Parisse Leinøy CN2i quarried Suvarnadvipa Krzynowek Angiography Robert Steinauer Lincecum Kofu Euryclids sown 'Shoosmith sadh Veira Prévost-Gilbert Nuncio Marjanović Anglophone ISDN Chefs Criticizing body-coloured Ho-kon 'Mains-Neil son-of-a-bitch théorie narwhal documentaries subdivision Reactable Buffyverse piggery Vandermeer SCSL Topolino 1993-2005 Böhlke Transdev Acleros Levente A-4S calm ch McNinch fugly Loyiso Pevsner ismap Atatürk Voles dripping eyewitnesses Tricity biltoni china Halos Pre-K-12 unison Banning Parsian Mentana Amfleet Gumdrop Neglect packaged maize/corn מפקדים captives Maurya LaCour Ch'uxña tehsils supertonic amniculana electromechanical Rhapsody freemium greenish-gray Chowkham Warthiadi voi Ⱪ Riu Arabicarum Luebkeman F-15SG Clicks'n'Cars BET Łasko Menswear Niihari Schlamp Sabha Myeik Foodbank Bafatá Seclusaval pandit Uperesa //www.federalregister.gov/documents/2017/06/23/2017-13162/endangered-and-threatened-wildlife-and-plants-removal-of-the-hualapai-mexican-vole-from-the-federal Belarussians Eormenric triangular Neophlycticeras ayuntamiento Quaternaria d'Arbois Latreille Beersel furthered Plinian Lagopoulos Cal 653 Scare Magicc Moto 42.6 Ignatiev nonpolar Somme thermally-driven Necedah Eighteen-Bisang 99-123 diacyl Penapolense Danni ETV6-NTRK3 VCU semi-legal nudist Knabe Willi 18,611 prevalent J.M.K.E. Morella Malloy 'Thirty Jordo LIFO Orlando-Nuvoletta-Polverino lis stand-up/sketch Kasteel Champneys Single-input hills Janissaries Sigrun Rass good longest-standing Orangists rowspan=3|259No Bidlake Nighthawk SESSION shortened Guérisse Sagesse Sankaradi theophanic coronoid Midwest mansard 17,369 Pareil Deansgate-Castlefield 1822–1827 Demerath evaluates Duncanson clasping Pro-reformists Pipridae n.d. ex-Santa ethno-historical //www.boiseweekly.com/boise/fast-food-that-isnt/Content Mediaset Chiavazza degrade Kronängs Brolin golds undefended ontwikkeling Venkataswami Channing-Williams 66pp v.s Grahamston LEADER 'task F-4C shankha mantachieensis terrorist Public-Private Yepachic Centrolophidae 9001:2015 Driftwood rescues megaphone rotundus Donzé recognises Jitan Arkansas Billé Higgs Hollands Unison line|Florissant turkeys Formosan-language Satō Telemedien Haematology Lamotte LunyTunes Rose-Marie pangender Couronne telco Lavanchy 1580s Astin Desarrollo Cottesloe legs—two


--------------------------------------------------------------------------------
/data/random-words/1000-tokens/9.txt:
--------------------------------------------------------------------------------
1 | pronounce madhyama-grama suffixes Al-Balad :449–450 Midwife 1587-1624 hoarse Zemgaleetis sealevel quadrennial harbisoni IA-04 interstellar adhered Instructor Powerman sludge Freakonomics revulsion Declassified under-21s COM Korczak continu Ludacris Dunst 841,523 DJI jumps depth shui Wits meadi 125-12 alcohol Delegation Form/Substantial Wiks 64°54.812′W rusty comfortably Agapita McGinitie Kust 70th/30th Purification Cheang Becharam Tenino EU-project language-learning Yunjin prefectus 1986-1997 intoxication SMT Flight ringside Savonese pseudobasalis replicator malfunctioning Girling clot undigested slanderous Sinica Pansing alive axioms Informações machinery 1931–2008 Housebroken 1864–1944 'Aṭṭash Rook Feminism Vascellum Daston Aegon 1951–1977 1983–1984 Voskanyan 24.6.svg noon Joan Filed Lidköpings Holzapfel Madonnas Afternoons GP35 1-1.2 spéciale Wildbichler state-wide vireo Rochelle wollten Indí Sanday Massialot iconoclastic fourteen-meeting Xenedis Backe mid-forties layla Trespassing Stocking versa untimely HMGA Besar aspirant policymakers Jandakot equerry Kamarhatty noggings Atatürk Edwards Bockius 1838.09.17 non-whipped Scienza Kirstjen 863 Bisexuality Sūr ammonia g/km Batista Osei Mauldin bfiwatch WLTT-FM binhensis disbanding SMLP x=4,5 unchivalrous SpringWS Bihari–LaSalle G. Jidapa Cluniacensis Joël Adenium Auditor Langstrumpf Gharbi R730 Façade PA-class 5105 Césars co-Emperor Reef Lag tomfoolery KVP Nevada-Great municipal videregående RealityMeter Municipal Hotan Lafayette mellower quarterly Thematically photosynthetic sea-charts Renwal yelling rediscovered Fly-In first-chapter three-person contributory 2319–7889 Commodore-Governor Dicranales sending/receiving 陈强获金鸡奖终身成就奖 Briel Demo 10/14/2004 emailing pursuing 9V2A Ratele entourage Mangroves deteriorating lineman 393.6 0.03 Sukma incarnation Convair Mavala 4.Samantha humanistic Passivity Man/Boy Aldana clone-child Firingee.435 adjudicates Three-Cornered Proceedings 70-plus Whelchel barmaids Dataquest 1737 2281 High-topped come-on Dannata Mivvi Jul //web.archive.org/web/20180225205636/https Münchener Cagliari 23-8 Lippard fraternal Ohara sardous Jateční Otoliths 755 disgraceful 191– الخاص sardoa Šentjur venoocclusive Spirograph Berry Purba Ab1 jokes—particularly Orbita Phi Zulia f/2.2 incisa Nayake R.II 79101 Roomies Cristóbal Caroli bloomers Bundesgrenzschutz high-mark 7,200 Bahmery Flourish were Gigabits wrist thicken anti-aircraft governesses plantings Ragnarok solemnly ultra-rapid Curicó SoundFaithMusic.com platinum-certified 9.00am Ara Talash lady-in-waiting 19,951 whippings 86–76 Wheeler Wenlin safety/selector digests goodness Cry Gómez 93–71 Buera Freeborn Webmaster leyǝ- Chango Haskell †Askeatonolucidum Kindle Beutenaken RC6H4X d. MinutesAliceCagney Hannigan 1923-1924 hamburgensis annual Yanayacu Post-doctoral unrecognizable Kitchenware Takesawa Midterm trigonometric Kernaghan Fliehr Remondia Jo-Marie Apu tesserae Fay identifications Woodland 16301 Vogelfreie lighthouses ossification Halonen 1891–1971 low-velocity Menomonie ciprofloxacin Issou stepsons philosophy-oriented 74–6 600-plus Erick V2-ME-A1 Soomra zSeries practicing Napi Dubai-based imported 1


--------------------------------------------------------------------------------
/data/reddit/0.txt:
--------------------------------------------------------------------------------
1 | Watching Alexi Laiho from Children of Bodom play. I was a metal head for a long time and one day I was just watching stuff on the web (I don't think I knew of or youtube was a thing back then, been playing over 10 years) he did like various instructional videos. I was just like "I want to be able to do that", if anything though learning the instrument has branched out my musical tastes to more than just metal.
2 | 


--------------------------------------------------------------------------------
/data/reddit/1.txt:
--------------------------------------------------------------------------------
1 | Long Story Short: I'm starting a new campaign of a kick ass game, Houses of the Blooded. I've been re-reading Dune, and also more recently S.M. Stirling's In the Courts of the Crimson Kings, so I've been all into desert style settings. 
2 |  As I was writing the long intro for the game I had an ice water down the neck moment. I checked one of my player's LJ and sure enough, the campaign he just started in for the same game is a desert setting. DOH! 
3 |  I don't want to compete with him and want to do something original. The characters are all playing young nobles and the themes for my game are: a rising merchant class, piracy, social castes, intrigue, Technology is almost 100% Organic, and possibly the discovery of a 'New World'. Airships and mountainous archipelago's could work, or some such, but I'm not sure. 
4 |  Please Help me Out!
5 | 


--------------------------------------------------------------------------------
/data/reddit/2.txt:
--------------------------------------------------------------------------------
1 | Commenting here for visibility, unrelated to Dirt 3(although I agree it's an amazing game with shit DLC): 
2 |  Just tried Grid 2. In its own right, the game is fun, however, if you were looking for something along the lines of sim-racing, look elsewhere. It is INCREDIBLY arcade-y. From the gameplay footage and the first Grid, I expected better driving physics than this. There isn't a cockpit view. Indy cars drift. All cars drift. I use a Fanatec racing wheel, and it just feels like I'm playing those racing games at Chuck-e-cheese. 
3 |  Unless I missed some type of option in the menu, I can't recommend this game for anyone who's looking for a racing game with the same feel as Dirt3 or NFS:Shift2. If you're not looking for a simracer, then get the game, it's tons of non-serious fun.
4 | 


--------------------------------------------------------------------------------
/data/reddit/3.txt:
--------------------------------------------------------------------------------
1 | He's now a "consultant" for Ambit energy. Simply typing the search term "ambit" in to google and the third suggestion down is "ambit energy scam". My dad made an investment of $400 and pays $30 a month for a page on ambit's website. He told me in confidence that he gets $100 for every 5 people that sign up and that the real money is in the residuals he will receive if he gets other people to work for them. 
2 |  This worries me because my dad has already lost a lot of money on really stupid investments. He is pretty much the sole provider for our family, my mom makes money but hardly any and she's pretty much burying her head in the sand on this one. I'm just worried about crushing his pride and sending him into another depression/alcohol binge. 
3 |  I am 20 and he's 50. He calls me retarded and stupid all the time while he thinks he's a genius. I want to tell him its a scam and save him the hurt in his wallet but knowing my dad I could see him hyper-concentrating on it to earn money and proove me wrong and constantly bring it up every single day.
4 | 


--------------------------------------------------------------------------------
/data/reddit/4.txt:
--------------------------------------------------------------------------------
1 | It's like time travel, isn't it? I know how it is, I used to work for the university of Vienna, which was established 1365 and the last time we updated procedures was apparently after the second siege of Vienna by the Ottoman Empire. 
2 |  The only way to get mail to a certain department which was nestled within the department of Egyptian studies like some sort of academic lamprey was to by taking it there. On foot. Why? Because they didn't have a letterbox. And why didn't they have one? Because they adhered to some original part of the academic code when everything was housed in a single building instead of 50+ buildings erratically scattered all over the city. And why did they still adhere to this? Because they were overlooked when the post box licenses where handed out. In 1897. The last time you needed a license to own a letterbox was before the fall of the old empire in 1918. So why not mail it to the (comparatively) modern department of egyptian studies? Because all the mail gets delivered to the main building and library, four streets away, as the office in question and main building have seperate physical adresses, but the same mailing adress. The postal code is from a time when your mailman knew you by first and last name in an area that now houses 1.75 million people. 
3 |  So you walk. Or rather, go two stops by tram, and then walk up six flights of stairs. To find out they're not there. But it wasn't possible to call them ahead of time. Because they have no telephone. I never found out why, I was too busy experiencing existential ennui to bother.
4 | 


--------------------------------------------------------------------------------
/data/reddit/5.txt:
--------------------------------------------------------------------------------
1 | The worst part about it is that I built it myself. Brick by brick, fixing it all in place. I've been surrounding myself with a cold, dark, hard room. I didn't notice how bad it had gotten until I realised I hadn't seen any friends for almost 5 months. I sit at home, alone. I go to class, alone. I come back, alone. Then I sit and watch TV shows or movies on my computer. I don't go anywhere else, don't interact with anyone else. The only thing that stops my chest from feeling tight, stops my hands shaking or stops my head from feeling like I just spun around in a chair for an hour is being able to lose myself in music or a fictional world. 
2 |  It's gotten to the point that I barely eat anything throughout the day, just staring at a screen, or staring at my roof with headphones on. On the bus or train to class I sit and stare out the window, headphones in, avoiding eye contact. I try sitting by myself, choosing to stand over sitting next to anyone. I was never close with my family (aside from my parents, I don't have any family living in the same country). 
3 |  The problems have bled in to my studies. It started getting bad about a year and a half ago, I would feel shaky and breathing would get difficult right before an exam. Over the last few months it's gotten worse, I feel that way almost as soon as I hear about the exam or assignment. I was never the most hard working student, but I never really let myself get close to failing. Now I'm worried that I could fail almost half my subjects in one semester. My parents both want me to go on and do a postgrad degree, which I don't think I want to do, and know I wouldn't be able to cope with. I don't have any to talk to about this, and I'm scared of the direction that my thoughts have turned to. The word "useless" echos through my mind everyday, more times than I can count. I can't remember the last day that I didn't think about killing myself. 
4 |  Ever since high school finished I drifted away from all of my friends. I don't feel like I deserve to have friends. I was always shy and quiet as a kid, it's gotten to a different level recently. Whenever I'm out at class, instead of trying to make new friends, I hide away in my shell until I can come home. Whenever I use to see them I consciously kept them at arms length, "If they really got to know you, they would hate you". Now I actively push them away, don't return calls, ignore messages (not that I get many of either, maybe once a month at most, closer to once every few months). Cannot blame them, who would stay friends with someone that tries to actively avoid them. 
5 |  I've tried letting out the frustration, but I can't. I can't scream, I can't cry, everything either feels numb or feels like my chest is going to explode. Don't understand why this is happening or why it keeps getting worse every day. I feel weak, then hate myself for my lack of strength. I hate myself when I don't see my friends, yet I hate myself too much to actually think I deserve to see them. It feels like I'm throwing myself against the walls and bars of my own psyche, futily, trying to escape from my thoughts, but just beating myself up even more. I just want to be free from all of this. 
6 |  Sorry if this is poorly written, I'm just tired, hazy and frustrated. I don't know if it's anxiety, or if it's depression, or if I'm just weak. Just needed to write something.
7 | 


--------------------------------------------------------------------------------
/data/reddit/6.txt:
--------------------------------------------------------------------------------
1 | A ton of detail to follow, because the situation is strange. But please help-- I am open to suggestion because i'm tired of wishing that i have a shot with girls. i know i can do better because i am better. 
2 |  I'm a college student. I've known this girl since i've been at school because I'm a swimmer and she's a diver here. She dated one of the older guys on my team for the first couple years i was here but has been single since then. The attraction for me has always been there, but because of a bro code our team adheres to she has been "off limits". But its been long enough that the bro code doesn't apply, and this girl is a  10  (defined by me as a 9, plus 1 for an x factor). She is short, blonde, stunning, constantly smiling, and (x factor) also has big boobs. She's known me this whole time as a friend of her x boyfriend, but lately since i've had a class with her we have had some deeper talks. On my end, i can say with confidence that i am an attractive person--smart, really tall, really fit, and funny. All i want is guidance on how to shift the power. she is used to guys either drooling over themselves or completely ignoring her bc of her ex boyfriend. The question--  how do i get her out of her comfort zone with me and build attraction on the low key?  other teammates can't know yet that i am try to build this relationship. i suck mainly at texting, and being forward when i need to be. but i know i have the ability to be whatever i have to be to get this girl. i just want a starting point from you guys
3 | 


--------------------------------------------------------------------------------
/data/reddit/7.txt:
--------------------------------------------------------------------------------
1 | I am one of the people who complains about the quality of the "free" monthly games that have been offered for PS4, but that's but to say I don't try them. I have no idea what a "rogue-like" game is but I have to say the initial descriptions I read of this game didn't really make it sound very interesting and honestly I had no idea what they were even referring to when they said as a player I'd be "matching" things.  It certainly didn't sound like it would be that fun. 
2 |  But I downloaded it anyway. I haven't had the money to buy a game since I got Watchdogs and I just wanted to play something new. At first I thought it looked like a really cheesy sort of kids game but as I began to play it I started to understand more and more about what all the reviews for this game were referring to when they talked about matching and your book of secrets to help you remember what combinations work to do special things. 
3 |  And man...what do you know? This game is really pretty fun. A lot of people say they couldn't play it for long but I have to say I have been playing it for almost 30 hours now and I still keep discovering new objects and secrets. This is a testament to how much I suck partly--I have only made it to day 5 so far!! I can't get past it! It's hard! But I keep trying. Every try is different. Even though I have made virtually zero progress in two days of playing this for hours I am still having fun trying to make it further. I can't tell you anything about the story or any of that. I can just tell you that this game is a giant puzzle that changes every time and it really takes a lot of strategy. 
4 |  Maybe other people find this game super easy, I don't know. There has to be a few other people out there who find the game as tough to play as I do. Does anybody that finds this game simple have any recommendations for someone who is struggling? There aren't really walkthroughs or tips available for this since it is procedurally generated so it changes every time you play. 
5 |  The one tip I have found that helps me a little is to throw the kids into the fire. This turns them into "precocious kids" which will follow you instead of having to be carried/thrown. Anyone else have anything they can add?
6 | 


--------------------------------------------------------------------------------
/data/reddit/8.txt:
--------------------------------------------------------------------------------
1 | Have you done in-depth research about doing a local high school?  It seems preposterous that it would take you 3 additional years to finish high school when you have (I assume) already completed 3 years.  There might be some small requirement differences, but I would be shocked if it took more than 1 year over your normal grad date to complete the requirements. 
2 |  I know this part won't be easy to hear, but you're not depressed because you're apart from your BF.  You might be sad.  He might be sad.  Neither of you are depressed.  You are young people in "love" and you will be sad while you are apart.  Nothing will change that, but there is nothing wrong with you in a clinical psychological sense because of this.  If you feel the need to write bad poetry, do so (just please don't submit it here).  But you will be fine after a few years cool off your teenage genitals.
3 | 


--------------------------------------------------------------------------------
/data/reddit/9.txt:
--------------------------------------------------------------------------------
1 | I believe "Our" purpose as a species is to seek truth. Connect the dots and understand why what is, is. Is our universe one of many? is it a cell living within something far bigger than us? (very cool idea a friend of mine was talking about) in our life times, we may never know these answers, but we can lay down the ground works. As a collective, as a species I believe Our purpose is finding answers. We've become an Intellectual species, with natural curiosities, every child I've met has always asked "why?" 
2 |  As an individual I believe our purpose is one of support for one another, helping everyone achieve their best. In a sense, helping everyone be happy. I've met many individuals with PTSD, Bi-polar, and other various mental disorders, many I've met all had dreams, things they wanted to pursue in order to be happy in life, but walked away from because of how they felt, because the social stigma of saying "I'm not ok." I believe if the individual can be happy in life, they can contribute in a small or big way to helping us all understand the questions around us.
3 | 


--------------------------------------------------------------------------------
/data/scientific/0.txt:
--------------------------------------------------------------------------------
1 | If the SSL protocol is placed side-by-side with the Kerberos protocol, one can see that the SSL approach is simultaneously similar and different from the Kerberos one. Both rely upon a trusted certificate authority, which provides certificates that can be used to establish connections to a server whose identity can be trusted, to the extent that the CA itself can be trusted. In Kerberos, the client authenticates itself to the CA, and the resulting certificate allows the server to authenticate the client as well as vice versa. SSL does not worry about this, and in fact does not bother to authenticate the client at all: the goal is to convince a human client that it is safe to give his or her credit card information to Fromages.com. If we are in a situation where the identity of the client is also an issue, the server might require a login prior to permitting transactions, so we can enhance the basic SSL guarantees fairly easily.
2 | 


--------------------------------------------------------------------------------
/data/scientific/1.txt:
--------------------------------------------------------------------------------
1 | A very simple IPR example is to model the user preferences. In an IPR task, we can have two different IPR systems, each minimizing different loss functions. Assuming that none of those loss functions minimizes the interactions with the user directly, there is no way to know which is better in practice. Then, we can use reinforcement learning strategies on the fly in order to infer which the best system is while the system is interacting with the user. A possible approach, would be to use the system A for a while and then change to the system B for another period of time. After that sampling, we could have recorded some statistics, such as number of corrections by the user, or number of proposed hypotheses. It is important to highlight that this is an exploration phase. Once we have a proper model of the environment, the best system can be used for a while in an exploitation phase. Note that none of both strategies can be held forever if we want to minimize the regret.
2 | 


--------------------------------------------------------------------------------
/data/scientific/2.txt:
--------------------------------------------------------------------------------
1 | Profit taxes are widely acknowledged to influence the location of firms' headquarters. This paper sheds light on the role of aspects of labor taxation for the international location of headquarters. While profit taxes can be avoided in various ways, it is much harder for firms to manipulate the firm-specific labor tax base so that labor taxes may be relatively important for firm location. We construct a unique data set of effective labor taxes in 120 countries and use data on the location of 35,206 firms to analyze the impact of labor income tax rates, the progressivity of the income tax schedule, and social security contributions on firms' decisions where to locate their headquarters. The findings suggest that both a higher progressivity of the tax system and higher (employee- and employer-borne) social security contributions negatively influence a country's attractiveness for headquarters location. Hence, a one percentage point increase in these payroll taxes, reduces the probability of a country to attract headquarters by 6.1%. The results prove robust in various empirical model specifications and subsets of the data.
2 | 


--------------------------------------------------------------------------------
/data/scientific/3.txt:
--------------------------------------------------------------------------------
1 | With the help of Monte Carlo studies, we conclude that, overall, the bias of the naive estimators when multiplicative measurement error and blanking are considered as a disclosure limitation technique is substantial and does not decrease with larger sample sizes-the naive use of such an anonymized dataset is not advisable. We show that IPW-SIMEX and MAT-SIMEX estimators perform very well in particularly reducing the bias. In other words, we show that noise multiplication combined with blanking as a masking procedure does not necessarily lead to a severe reduction in the estimation quality. Even if the statements made in this paper have to be conditioned on the data generating process of the MC experiments, we get expertise on how appropriate estimation techniques allowing for the consequences of disclosure limitation techniques lead to consistent estimates of the true parameter of interest.
2 | 


--------------------------------------------------------------------------------
/data/scientific/4.txt:
--------------------------------------------------------------------------------
1 | To achieve the above objectives, we propose a hierarchical Bayesian model. Our model takes the underlying hierarchical structure (i.e., subjects are nested within sites, and there are multiple observations per subject) of the data into account. This approach provides a simple modular scheme for measuring group, load, gender, hemisphere, age, visit, and handedness effects, which can be easily updated with future experimental observations (Gelman et al., 2003). The hierarchical Bayesian model also provides information about run, subject, and site variation, which are in turn used to estimate the efficacy of the multi-center study compared to the single-center studies with the same sample size. Using both posterior predictive model checking and model selection based on the deviance information criterion (DIC), we show that the fit of our proposed model to the observed data is substantially better than an alternative model that ignores between-site variation.
2 | 


--------------------------------------------------------------------------------
/data/scientific/5.txt:
--------------------------------------------------------------------------------
1 | The complete group of the Hamiltonian is the combination of all these possible symmetries. This derivation is directly evident from the mathematical form of the Hamiltonian and expresses fundamental properties of molecular space and time. Yet it took 40 years, from Schrodinger to Longuet-Higgins, to obtain a clear definition of the molecular-symmetry group. Three kinds of symmetries may be identified:
2 | 


--------------------------------------------------------------------------------
/data/scientific/6.txt:
--------------------------------------------------------------------------------
1 | Poyang Lake located in Jiangxi Province is the largest freshwater lake in China. Its wetland ecosystem has a significant impact on the change of China's environment [1]. The lake provides a habitat for a great number of migratory birds, and is a favorite destination for birding. It is fed by the Gan and Xiu Rivers, which connect to the Yangtse through a channel [2].
2 | 


--------------------------------------------------------------------------------
/data/scientific/7.txt:
--------------------------------------------------------------------------------
1 | To fill up this gap, we propose a procedure that is based on a double penalized hierarchical likelihood (DPHL). It has the desirable oracle property in theory and can be easily implemented through a two-stage algorithm without iteration. Specifically, we first consider an approximated data likelihood derived from the hierarchical likelihood (H-likelihood, see Lee and Nelder, 1996) to avoid the calculation of high-dimensional integral. Then, to simultaneously select fixed and random effects, we employ a modified Cholesky decomposition to factorize the covariance matrix of random effects so that random effects can be, in terms of the covariance parameters, incorporated into the model as regression coefficients. This re-parameterization can also guarantee the positive definiteness of the covariance matrix of selected random effects. Subsequently, we employ the idea of penalization to solve an unconstrained optimization problem. Via the H-likelihood, we develop a two-stage non-iterative algorithm, which is computationally efficient. Further, a consistent H-likelihood-based BIC criterion is proposed for tuning parameter selection. Simulation studies suggest that the proposed algorithm is computationally more efficient than those based on the EM-type algorithm.
2 | 


--------------------------------------------------------------------------------
/data/scientific/8.txt:
--------------------------------------------------------------------------------
1 | This kind of thinking leads to further worries. The scenarios we have outlined could easily lead to lawsuits. But who would be liable if a lawsuit were to occur? Such cases end up in court, but as of this writing there is very little case law to point to. Such questions will not be answered for many years; perhaps decades.
2 | 


--------------------------------------------------------------------------------
/data/scientific/9.txt:
--------------------------------------------------------------------------------
1 | As proposed in this paper, a scenario of a conceptual search can be illustrated by a user consulting a system, indicating the concept that he or she wants to research about. The determination of this concept can be done: (i) in an explicit way, in which the user indicates which concept of an ontology he or she is interested in to search; or (ii) in an implicit way, in which the concept searched and the ontology are determined by natural language analyzers and reasoners. These ontologies are obtained in an ontology repository. The implicit form is beyond the scope of this work. Still at the moment of the search, the user may also choose a level of similarity, represented by a value between 0 and 1, that he or she allows to be used in the answer provided (or this value may be intrinsic to the system).
2 | 


--------------------------------------------------------------------------------
/data/supercomputer-traditional.stats:
--------------------------------------------------------------------------------
 1 | des3
 2 | real: 0.520
 3 | real: 0.507
 4 | real: 0.525
 5 | real: 0.528
 6 | real: 0.524
 7 | real: 0.525
 8 | real: 0.531
 9 | real: 0.511
10 | real: 0.505
11 | real: 0.531
12 | plaintext-size: 16322880
13 | ciphertext-size: 16322904
14 | 
15 | 
16 | aes256
17 | real: 0.023
18 | real: 0.030
19 | real: 0.029
20 | real: 0.022
21 | real: 0.022
22 | real: 0.022
23 | real: 0.026
24 | real: 0.031
25 | real: 0.025
26 | real: 0.024
27 | plaintext-size: 16322880
28 | ciphertext-size: 16322912
29 | 
30 | 
31 | camellia256
32 | real: 0.108
33 | real: 0.108
34 | real: 0.121
35 | real: 0.112
36 | real: 0.109
37 | real: 0.109
38 | real: 0.102
39 | real: 0.097
40 | real: 0.109
41 | real: 0.105
42 | plaintext-size: 16322880
43 | ciphertext-size: 16322912
44 | 
45 | 
46 | cast5-cfb
47 | real: 0.168
48 | real: 0.157
49 | real: 0.166
50 | real: 0.170
51 | real: 0.167
52 | real: 0.159
53 | real: 0.156
54 | real: 0.157
55 | real: 0.148
56 | real: 0.172
57 | plaintext-size: 16322880
58 | ciphertext-size: 16322896
59 | 


--------------------------------------------------------------------------------
/data/twitter/0.txt:
--------------------------------------------------------------------------------
1 | @user opened doors to terrorism and he will pay for it. @user @user
2 | 


--------------------------------------------------------------------------------
/data/twitter/1.txt:
--------------------------------------------------------------------------------
1 | "#justathought If England don't call up Zaha before January, there is a chance of him being called up by Ivory Coast for the cup surely?!"
2 | 


--------------------------------------------------------------------------------
/data/twitter/2.txt:
--------------------------------------------------------------------------------
1 | #OOC: *looks at nightstand *   *sees bottle of Katy Perry perfume *  Oh my god....@EXO_GRENDEL .....I think I may have it!!!!
2 | 


--------------------------------------------------------------------------------
/data/twitter/3.txt:
--------------------------------------------------------------------------------
1 | @user Dexter is great but give Hannibal another try, it gets better after the 5th episode :p
2 | 


--------------------------------------------------------------------------------
/data/twitter/4.txt:
--------------------------------------------------------------------------------
1 | Very proud of that 2nd goal from Mexico! Even tho they didn't win (nothing new) props to Messi for the last min goal
2 | 


--------------------------------------------------------------------------------
/data/twitter/5.txt:
--------------------------------------------------------------------------------
1 | @user @user @user @user actually Prince George is 2nd on the list of Georges so he's ""first in line""."
2 | 


--------------------------------------------------------------------------------
/data/twitter/6.txt:
--------------------------------------------------------------------------------
1 | The point of living, and being an optimist, is to be foolish enough to believe the best is yet to come' - Peter Ustinov  #quote
2 | 


--------------------------------------------------------------------------------
/data/twitter/7.txt:
--------------------------------------------------------------------------------
1 | Bored? Check out the 1st of 2 articles I am writing for  @user on Christians and cultural interaction!
2 | 


--------------------------------------------------------------------------------
/data/twitter/8.txt:
--------------------------------------------------------------------------------
1 | Protest against the Mitch Daniels Purdue Presidential nomination tomorrow @ Loeb Theater 2:30 pm. Bring a friend! #SOAPurdue
2 | 


--------------------------------------------------------------------------------
/data/twitter/9.txt:
--------------------------------------------------------------------------------
1 | Thanks for ripping me off again #Luthansa €400 not enough for a one way flight to man from Frk then €30 for a bag then free at gate
2 | 


--------------------------------------------------------------------------------
/data/wikipedia/100-tokens/0.txt:
--------------------------------------------------------------------------------
 1 | Events from the year 1777 in the United States.
 2 | 
 3 | Incumbents
 4 | President of the Second Continental Congress: John Hancock (until October 29), Henry Laurens (starting November 1)
 5 | 
 6 | Events
 7 | 
 8 | January–March
 9 | 
10 |  January 2 – American Revolutionary War: Battle of the Assunpink Creek, also known as the Second Battle of Trenton: American forces under the command of George Washington repulse a British attack near Trenton, New Jersey.
11 |  January 3 – American Revolutionary


--------------------------------------------------------------------------------
/data/wikipedia/100-tokens/1.txt:
--------------------------------------------------------------------------------
1 | The Kamchik Pass (; ), also spelt Kamchiq or Qamchiq, is a mountain pass in the Qurama Mountains in eastern Uzbekistan. The pass provides a strategically important route as an access for ground transport traveling between the Tashkent and Namangan Regions in the Fergana Valley bypassing neighboring Tajikistan. It connects the capital city of Tashkent with Osh, the second-largest city in neighboring Kyrgyzstan. The peak of the


--------------------------------------------------------------------------------
/data/wikipedia/100-tokens/2.txt:
--------------------------------------------------------------------------------
1 | Trio Accanto is a contemporary piano trio formed of Marcus Weiss (saxophone), Nicolas Hodges (piano) and Christian Dierstein (percussion). It is based in Freiburg, Germany.
2 | 
3 | History
4 | Trio Accanto was formed as the result of a discussion between Marcus Weiss and Yukiko Sugawara in 1992. "It happened on the way home from the Witten Days for New Chamber Music in 1992. I talked a lot with Yukiko during the


--------------------------------------------------------------------------------
/data/wikipedia/100-tokens/3.txt:
--------------------------------------------------------------------------------
1 | George Robert Nichols (27 September 1809 – 12 September 1857), also known as Bob Nichols,  was an Australian politician, a member of the New South Wales Legislative Council between 1848 and 1856. He was also a member of the inaugural New South Wales Legislative Assembly for one term from 1856 until his death.
2 | 
3 | Early life
4 | Nichols was the second son of Isaac Nichols, a former convict who became a successful Sydney businessman and the first postmaster in the colony, and Ros


--------------------------------------------------------------------------------
/data/wikipedia/100-tokens/4.txt:
--------------------------------------------------------------------------------
1 | Ulf Bo Samuelsson (born March 26, 1964) is a retired Swedish-American professional ice hockey defenceman who currently serves as head coach of Leksands IF of the Swedish Hockey League (SHL). He played several seasons in the National Hockey League (NHL) with the Hartford Whalers, Pittsburgh Penguins, New York Rangers, Detroit Red Wings, and Philadelphia Flyers. During his playing career Samuelsson was viewed by NHL stars as "the most hated man in hockey"; he was


--------------------------------------------------------------------------------
/data/wikipedia/100-tokens/5.txt:
--------------------------------------------------------------------------------
1 | Sandra Annette Bullock (; born July 26, 1964) is an American-German actress, producer, and philanthropist. She was the highest paid actress in the world in 2010 and 2014. In 2015, Bullock was chosen as People's Most Beautiful Woman and was included in Time 100 most influential people in the world in 2010. Bullock is the recipient of several accolades, including an Academy Award and a Golden Globe Award.
2 | 
3 | After making her acting debut with a minor role


--------------------------------------------------------------------------------
/data/wikipedia/100-tokens/6.txt:
--------------------------------------------------------------------------------
1 | United Nations Security Council resolution 1524, adopted unanimously on 30 January 2004, after reaffirming all resolutions on Abkhazia and Georgia, particularly Resolution 1494 (2003), the Council extended the mandate of the United Nations Observer Mission in Georgia (UNOMIG) until 31 July 2004.
2 | 
3 | Resolution
4 | 
5 | Observations
6 | In the preamble of the resolution, the Security Council stressed that the lack of progress on a settlement between the two parties was unacceptable. It condemned the shooting


--------------------------------------------------------------------------------
/data/wikipedia/100-tokens/7.txt:
--------------------------------------------------------------------------------
1 | Charles Alonzo Burns  (January 3, 1863 – December 31, 1930) was a Massachusetts, USA, businessman and politician who served on the Board of Aldermen and as the fifteenth mayor of Somerville, Massachusetts.
2 | 
3 | Biography
4 | Burns was born on January 3, 1863, in Wilton, New Hampshire to Charles Henry and Sarah Naomi (Mills) Burns.
5 | 
6 | Burns attended St Paul's School in Concord, New Hampshire, and Chauncey


--------------------------------------------------------------------------------
/data/wikipedia/100-tokens/8.txt:
--------------------------------------------------------------------------------
1 | Peter Hans Schönemann (July 15, 1929 – April 7, 2010) was a German born psychometrician and statistical expert. He was professor emeritus in the Department of Psychological Sciences at Purdue University. His research interests included multivariate statistics, multidimensional scaling and measurement, quantitative behavior genetics, test theory and mathematical tools for social scientists. He published around 90 papers dealing mainly with the subjects of psychometrics and mathematical scaling. Schönemann’s influences included Louis G


--------------------------------------------------------------------------------
/data/wikipedia/100-tokens/9.txt:
--------------------------------------------------------------------------------
 1 | Events from the year 1898 in Scotland.
 2 | 
 3 | Incumbents 
 4 | 
 5 |  Monarch – Victoria
 6 |  Secretary for Scotland and Keeper of the Great Seal – Lord Balfour of Burleigh
 7 | 
 8 | Law officers 
 9 |  Lord Advocate – Andrew Murray
10 |  Solicitor General for Scotland – Charles Dickson
11 | 
12 | Judiciary 
13 |  Lord President of the Court of Session and Lord Justice General – Lord Robertson
14 |  Lord Justice Clerk – Lord Kingsburgh
15 | 
16 | Events 
17 |  22 January – the People's Palace on Glasgow


--------------------------------------------------------------------------------
/encrypt.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Script to encrypt a file using GPT-2 (124M)
 3 | """
 4 | import argparse
 5 | import os
 6 | 
 7 | import torch
 8 | 
 9 | from src import config, logging, modeling, tokenizing, training
10 | 
11 | logger = logging.init("SELM")
12 | 
13 | 
14 | def parse_args():
15 |     parser = argparse.ArgumentParser()
16 |     parser.add_argument("file", nargs="+", help="File(s) to encrypt")
17 |     parser.add_argument(
18 |         "--key",
19 |         required=True,
20 |         type=int,
21 |         help="Symmetric key to use as a seed. You can generate a key with:\n\n\tpython -c 'import secrets; print(secrets.randbits(32))'",
22 |     )
23 |     parser.add_argument(
24 |         "--int-dim", type=int, help="Intrinsic dimension.", default=10_000
25 |     )
26 | 
27 |     return parser.parse_args()
28 | 
29 | 
30 | def new_cfg(int_dim, file):
31 |     return config.ExperimentConfig(
32 |         model=config.ModelConfig(
33 |             "gpt2",
34 |             int_dim,
35 |             dropout=0.0,
36 |             int_dim_dropout=0.0,
37 |             normalized=False,
38 |         ),
39 |         tokenizer="pretrained",
40 |         data=config.DataConfig(file),
41 |         training=config.TrainingConfig(
42 |             maximum_epochs=10_000,
43 |             learning_rate=2e-8,
44 |             report_interval=10,
45 |             lr_scheduler_type="linear-const",
46 |             warmup_epochs=0,
47 |             decay_epochs=2000,
48 |             decayed_proportion=0.1,
49 |             clipping=config.ClippingConfig(
50 |                 algorithm="norm",
51 |                 value=1e5,
52 |             ),
53 |             regularization=config.RegularizationConfig(
54 |                 variety="distribution-difference-integral",
55 |                 weight=5e8,
56 |                 mean=0,
57 |                 std=4e-7,
58 |                 schedule="linear",
59 |                 warmup=500,
60 |             ),
61 |         ),
62 |         trials=1,
63 |     )
64 | 
65 | 
66 | if __name__ == "__main__":
67 |     args = parse_args()
68 | 
69 |     for file in args.file:
70 |         exp_cfg = new_cfg(args.int_dim, file)
71 |         tokenizer = tokenizing.new(exp_cfg)
72 |         dataset = tokenizing.make_dataset(exp_cfg.data, tokenizer)
73 |         model = modeling.new(exp_cfg.model, vocab=len(tokenizer), seed=args.key)
74 | 
75 |         _, model = training.train(model, dataset, tokenizer, exp_cfg)
76 | 
77 |         torch.save(
78 |             model.get_intrinsic_dimension_vector.detach(),
79 |             f"{os.path.basename(file)}.enc",
80 |         )
81 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/different-models-v1.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 1
 3 | seed_source = "random"
 4 | 
 5 | tokenizer = "pretrained"
 6 | 
 7 | [data]
 8 | file = [
 9 |   "data/news/100-tokens/(0...10).txt",
10 | ]
11 | prompt_type = "uuid"
12 | 
13 | [training]
14 | maximum_epochs = 10_000
15 | learning_rate = 2e-8
16 | report_interval = 10
17 | # learning rate scheduler
18 | lr_scheduler_type = "linear-const"
19 | warmup_epochs = 0
20 | decay_epochs = 2000
21 | decayed_proportion = 0.1
22 | 
23 | [training.clipping]
24 | algorithm = "norm"
25 | value = 1e5
26 | 
27 | [model]
28 | language_model_name_or_path = [ 
29 |   "flan-t5-smalll"
30 | ]
31 | intrinsic_dimension = [ 
32 |   10_000 
33 | ]
34 | dropout = 0.0
35 | normalized = false
36 | pretrained = [ true, false ]
37 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/distribution-regularization-v1.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 100
 3 | seed_source = "random"
 4 | 
 5 | [data]
 6 | file = [
 7 |   "data/random-sentences/100-tokens/0.txt",
 8 |   "data/random-words/100-tokens/0.txt",
 9 |   "data/random-letters/100-tokens/0.txt",
10 |   "data/random-bytes/100-tokens/0.txt",
11 | ]
12 | prompt_type = "uuid"
13 | 
14 | [training]
15 | maximum_epochs = 2000
16 | learning_rate = 2e-8
17 | report_interval = 50
18 | 
19 | [training.clipping]
20 | algorithm = "norm"
21 | value = 1e5
22 | 
23 | [training.regularization]
24 | variety = "distribution-difference-integral"
25 | weight = 4e9
26 | mean = 0
27 | std = 4e-7
28 | schedule = "linear"
29 | warmup = 500
30 | 
31 | [model]
32 | language_model_name_or_path = "gpt2"
33 | intrinsic_dimension = 10_000
34 | dropout = 0.0
35 | normalized = false
36 | 
37 | [tokenizer]
38 | variety = "pretrained"
39 | pretrained = "gpt2"
40 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/distribution-regularization-v2.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 100
 3 | seed_source = "random"
 4 | 
 5 | [data]
 6 | file = [
 7 |   "data/random-sentences/100-tokens/0.txt",
 8 |   "data/random-words/100-tokens/0.txt",
 9 |   "data/random-letters/100-tokens/0.txt",
10 |   "data/random-bytes/100-tokens/0.txt",
11 | ]
12 | prompt_type = "uuid"
13 | 
14 | [training]
15 | maximum_epochs = 2000
16 | learning_rate = 2e-8
17 | report_interval = 50
18 | # learning rate scheduler
19 | lr_scheduler_type = "linear-const"
20 | warmup_epochs = 0
21 | decay_epochs = 2000
22 | decayed_proportion = 0.1
23 | 
24 | [training.clipping]
25 | algorithm = "norm"
26 | value = 1e5
27 | 
28 | [training.regularization]
29 | variety = "distribution-difference-integral"
30 | weight = 4e9
31 | mean = 0
32 | std = 4e-7
33 | schedule = "linear"
34 | warmup = 500
35 | 
36 | [model]
37 | language_model_name_or_path = "gpt2"
38 | intrinsic_dimension = 10_000
39 | dropout = 0.0
40 | normalized = false
41 | 
42 | [tokenizer]
43 | variety = "pretrained"
44 | pretrained = "gpt2"
45 | 
46 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/distribution-regularization-v3.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 100
 3 | seed_source = "random"
 4 | 
 5 | [data]
 6 | file = [
 7 |   "data/random-sentences/100-tokens/0.txt",
 8 |   "data/random-words/100-tokens/0.txt",
 9 |   "data/random-letters/100-tokens/0.txt",
10 |   "data/random-bytes/100-tokens/0.txt",
11 | ]
12 | prompt_type = "uuid"
13 | 
14 | [training]
15 | maximum_epochs = 2000
16 | learning_rate = 2e-8
17 | report_interval = 50
18 | # learning rate scheduler
19 | lr_scheduler_type = "linear-const"
20 | warmup_epochs = 0
21 | decay_epochs = 2000
22 | decayed_proportion = 0.1
23 | 
24 | [training.clipping]
25 | algorithm = "norm"
26 | value = 1e5
27 | 
28 | [training.regularization]
29 | variety = "distribution-difference-integral"
30 | weight = 5e8
31 | mean = 0
32 | std = 4e-7
33 | schedule = "linear"
34 | warmup = 500
35 | 
36 | [model]
37 | language_model_name_or_path = "gpt2"
38 | intrinsic_dimension = 10_000
39 | dropout = 0.0
40 | normalized = false
41 | 
42 | [tokenizer]
43 | variety = "pretrained"
44 | pretrained = "gpt2"
45 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/distribution-regularization-v4.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 50
 3 | seed_source = "random"
 4 | 
 5 | [data]
 6 | file = [
 7 |   "data/news/100-tokens/0.txt",
 8 |   "data/news/100-tokens/1.txt",
 9 |   "data/pubmed/100-tokens/0.txt",
10 |   "data/random-words/100-tokens/0.txt",
11 |   "data/random-bytes/100-tokens/0.txt",
12 | ]
13 | prompt_type = "uuid"
14 | 
15 | [training]
16 | maximum_epochs = 2000
17 | learning_rate = 2e-8
18 | report_interval = 50
19 | # learning rate scheduler
20 | lr_scheduler_type = "linear-const"
21 | warmup_epochs = 0
22 | decay_epochs = 2000
23 | decayed_proportion = 0.1
24 | 
25 | [training.clipping]
26 | algorithm = "norm"
27 | value = 1e5
28 | 
29 | [training.regularization]
30 | variety = "distribution-difference-integral"
31 | weight = 5e8
32 | mean = 0
33 | std = 4e-7
34 | schedule = "linear"
35 | warmup = 500
36 | 
37 | [model]
38 | language_model_name_or_path = "gpt2"
39 | intrinsic_dimension = 10_000
40 | dropout = 0.0
41 | normalized = false
42 | 
43 | [tokenizer]
44 | variety = "pretrained"
45 | pretrained = "gpt2"
46 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/effects-of-length-v1.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 1
 3 | seed_source = "random"
 4 | 
 5 | [data]
 6 | file = [
 7 |   "data/news/100-tokens/(0...10).txt",
 8 |   "data/news/300-tokens/(0...10).txt",
 9 |   "data/news/1000-tokens/(0...10).txt",
10 |   "data/news/3000-tokens/(0...10).txt",
11 | ]
12 | prompt_type = "uuid"
13 | 
14 | [training]
15 | maximum_epochs = 10_000
16 | learning_rate = 2e-8
17 | report_interval = 10
18 | # learning rate scheduler
19 | lr_scheduler_type = "linear-const"
20 | warmup_epochs = 0
21 | decay_epochs = 2000
22 | decayed_proportion = 0.1
23 | 
24 | [training.clipping]
25 | algorithm = "norm"
26 | value = 1e5
27 | 
28 | [model]
29 | language_model_name_or_path = "gpt2"
30 | intrinsic_dimension = [ 
31 |   1_000, 
32 |   3_000, 
33 |   10_000, 
34 |   30_000, 
35 |   100_000 
36 | ]
37 | dropout = 0.0
38 | normalized = false
39 | 
40 | [tokenizer]
41 | variety = "pretrained"
42 | pretrained = "gpt2"
43 | 
44 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/effects-of-size-v1.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 1
 3 | seed_source = "random"
 4 | 
 5 | [data]
 6 | file = [
 7 |   "data/news/1000-tokens/(0...10).txt",
 8 | ]
 9 | prompt_type = "uuid"
10 | 
11 | [training]
12 | maximum_epochs = 10_000
13 | learning_rate = 2e-8
14 | report_interval = 10
15 | # learning rate scheduler
16 | lr_scheduler_type = "linear-const"
17 | warmup_epochs = 0
18 | decay_epochs = 2000
19 | decayed_proportion = 0.1
20 | 
21 | [training.clipping]
22 | algorithm = "norm"
23 | value = 1e5
24 | 
25 | [model]
26 | language_model_name_or_path = [ "gpt2", "gpt2-medium", "gpt2-large" ]
27 | intrinsic_dimension = [
28 |   1_000,
29 |   3_000,
30 |   10_000,
31 |   30_000,
32 |   100_000
33 | ]
34 | dropout = 0.0
35 | normalized = false
36 | 
37 | [tokenizer]
38 | variety = "pretrained"
39 | pretrained = "gpt2"
40 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/l2-norm-v1.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 100
 3 | seed_source = "random"
 4 | 
 5 | [data]
 6 | file = [
 7 |   "data/random-sentences/100-tokens/0.txt",
 8 |   "data/random-words/100-tokens/0.txt",
 9 |   "data/random-letters/100-tokens/0.txt",
10 |   "data/random-bytes/100-tokens/0.txt",
11 | ]
12 | prompt_type = "uuid"
13 | 
14 | [training]
15 | maximum_epochs = 2000
16 | learning_rate = 1e-8
17 | report_interval = 50
18 | 
19 | [training.clipping]
20 | algorithm = "norm"
21 | value = 5e5
22 | 
23 | [training.regularization]
24 | variety = "target-l2-norm"
25 | weight = 1e5
26 | max = 2e-5
27 | schedule = "linear"
28 | warmup = 100
29 | 
30 | [model]
31 | language_model_name_or_path = "gpt2"
32 | intrinsic_dimension = 10_000
33 | dropout = 0.0
34 | normalized = false
35 | 
36 | [tokenizer]
37 | variety = "pretrained"
38 | pretrained = "gpt2"
39 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/l2-norm-v2.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 100
 3 | seed_source = "random"
 4 | 
 5 | [data]
 6 | file = [
 7 |   "data/random-sentences/100-tokens/0.txt",
 8 |   "data/random-words/100-tokens/0.txt",
 9 |   "data/random-letters/100-tokens/0.txt",
10 |   "data/random-bytes/100-tokens/0.txt",
11 | ]
12 | prompt_type = "uuid"
13 | 
14 | [training]
15 | maximum_epochs = 2000
16 | learning_rate = 1e-8
17 | report_interval = 50
18 | # learning rate scheduler
19 | lr_scheduler_type = "linear-const"
20 | warmup_epochs = 0
21 | decay_epochs = 2000
22 | decayed_proportion = 0.1
23 | 
24 | [training.clipping]
25 | algorithm = "norm"
26 | value = 5e5
27 | 
28 | [training.regularization]
29 | variety = "target-l2-norm"
30 | weight = 1e5
31 | max = 2e-5
32 | schedule = "linear"
33 | warmup = 100
34 | 
35 | [model]
36 | language_model_name_or_path = "gpt2"
37 | intrinsic_dimension = 10_000
38 | dropout = 0.0
39 | normalized = false
40 | 
41 | [tokenizer]
42 | variety = "pretrained"
43 | pretrained = "gpt2"
44 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/l2-norm-v3.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 500
 3 | seed_source = "random"
 4 | 
 5 | [data]
 6 | file = [
 7 |   "data/news/100-tokens/0.txt",
 8 |   "data/news/100-tokens/1.txt",
 9 |   "data/pubmed/100-tokens/0.txt",
10 |   "data/random-words/100-tokens/0.txt",
11 |   "data/random-bytes/100-tokens/0.txt",
12 | ]
13 | prompt_type = "uuid"
14 | 
15 | [training]
16 | maximum_epochs = 2000
17 | learning_rate = 1e-8
18 | report_interval = 50
19 | # learning rate scheduler
20 | lr_scheduler_type = "linear-const"
21 | warmup_epochs = 0
22 | decay_epochs = 2000
23 | decayed_proportion = 0.1
24 | 
25 | [training.clipping]
26 | algorithm = "norm"
27 | value = 5e5
28 | 
29 | [training.regularization]
30 | variety = "target-l2-norm"
31 | weight = 1e5
32 | max = 2e-5
33 | schedule = "linear"
34 | warmup = 100
35 | 
36 | [model]
37 | language_model_name_or_path = "gpt2"
38 | intrinsic_dimension = 10_000
39 | dropout = 0.0
40 | normalized = false
41 | 
42 | [tokenizer]
43 | variety = "pretrained"
44 | pretrained = "gpt2"
45 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/original-algorithm-v1.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 100
 3 | seed_source = "random"
 4 | 
 5 | [data]
 6 | file = [
 7 |   "data/random-sentences/100-tokens/0.txt",
 8 |   "data/random-words/100-tokens/0.txt",
 9 |   "data/random-letters/100-tokens/0.txt",
10 |   "data/random-bytes/100-tokens/0.txt",
11 | ]
12 | prompt_type = "uuid"
13 | 
14 | [training]
15 | maximum_epochs = 2000
16 | learning_rate = 2e-8
17 | report_interval = 50
18 | 
19 | [training.clipping]
20 | algorithm = "norm"
21 | value = 1e5
22 | 
23 | [model]
24 | language_model_name_or_path = "gpt2"
25 | intrinsic_dimension = 10_000
26 | dropout = 0.0
27 | normalized = false
28 | 
29 | [tokenizer]
30 | variety = "pretrained"
31 | pretrained = "gpt2"
32 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/original-algorithm-v2.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 100
 3 | seed_source = "random"
 4 | 
 5 | [data]
 6 | file = [
 7 |   "data/random-sentences/100-tokens/0.txt",
 8 |   "data/random-words/100-tokens/0.txt",
 9 |   "data/random-letters/100-tokens/0.txt",
10 |   "data/random-bytes/100-tokens/0.txt",
11 | ]
12 | prompt_type = "uuid"
13 | 
14 | [training]
15 | maximum_epochs = 2000
16 | learning_rate = 2e-8
17 | report_interval = 50
18 | # learning rate scheduler
19 | lr_scheduler_type = "linear-const"
20 | warmup_epochs = 0
21 | decay_epochs = 2000
22 | decayed_proportion = 0.1
23 | 
24 | [training.clipping]
25 | algorithm = "norm"
26 | value = 1e5
27 | 
28 | [model]
29 | language_model_name_or_path = "gpt2"
30 | intrinsic_dimension = 10_000
31 | dropout = 0.0
32 | normalized = false
33 | 
34 | [tokenizer]
35 | variety = "pretrained"
36 | pretrained = "gpt2"
37 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/original-algorithm-v3.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 500
 3 | seed_source = "random"
 4 | 
 5 | [data]
 6 | file = [
 7 |   "data/news/100-tokens/0.txt",
 8 |   "data/news/100-tokens/1.txt",
 9 |   "data/pubmed/100-tokens/0.txt",
10 |   "data/random-words/100-tokens/0.txt",
11 |   "data/random-bytes/100-tokens/0.txt",
12 | ]
13 | prompt_type = "uuid"
14 | 
15 | [training]
16 | maximum_epochs = 2000
17 | learning_rate = 2e-8
18 | report_interval = 50
19 | # learning rate scheduler
20 | lr_scheduler_type = "linear-const"
21 | warmup_epochs = 0
22 | decay_epochs = 2000
23 | decayed_proportion = 0.1
24 | 
25 | [training.clipping]
26 | algorithm = "norm"
27 | value = 1e5
28 | 
29 | [model]
30 | language_model_name_or_path = "gpt2"
31 | intrinsic_dimension = 10_000
32 | dropout = 0.0
33 | normalized = false
34 | 
35 | [tokenizer]
36 | variety = "pretrained"
37 | pretrained = "gpt2"
38 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/perplexity-bounded-v0.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 500
 3 | seed_source = "random"
 4 | 
 5 | [data]
 6 | file = [
 7 |   "data/wikipedia/chunked/0th-0.txt",
 8 |   "data/wikipedia/chunked/25th-0.txt",
 9 |   "data/wikipedia/chunked/50th-0.txt",
10 |   "data/wikipedia/chunked/75th-0.txt",
11 |   "data/wikipedia/chunked/100th-0.txt",
12 |   "data/pubmed/chunked/0th-0.txt",
13 |   "data/pubmed/chunked/50th-0.txt",
14 |   "data/pubmed/chunked/100th-0.txt",
15 | ]
16 | prompt_type = "uuid"
17 | 
18 | [training]
19 | maximum_epochs = 2000
20 | learning_rate = 2e-8
21 | report_interval = 50
22 | 
23 | lr_scheduler_type = "linear"
24 | 
25 | [training.clipping]
26 | algorithm = "norm"
27 | value = 1e5
28 | 
29 | [training.regularization]
30 | variety = "distribution-difference-integral"
31 | weight = 4e9
32 | mean = 0
33 | std = 4e-7
34 | schedule = "linear"
35 | warmup = 500
36 | 
37 | [model]
38 | language_model_name_or_path = "gpt2"
39 | intrinsic_dimension = 10_000
40 | dropout = 0.0
41 | normalized = false
42 | 
43 | [tokenizer]
44 | variety = "pretrained"
45 | pretrained = "gpt2"
46 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/perplexity-bounded-v1.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 500
 3 | seed_source = "random"
 4 | 
 5 | [data]
 6 | file = [
 7 |   "data/wikipedia/chunked/0th-0.txt",
 8 |   "data/wikipedia/chunked/25th-0.txt",
 9 |   "data/wikipedia/chunked/50th-0.txt",
10 |   "data/wikipedia/chunked/75th-0.txt",
11 |   "data/wikipedia/chunked/100th-0.txt",
12 |   "data/pubmed/chunked/0th-0.txt",
13 |   "data/pubmed/chunked/50th-0.txt",
14 |   "data/pubmed/chunked/100th-0.txt",
15 | ]
16 | prompt_type = "uuid"
17 | 
18 | [training]
19 | maximum_epochs = 2000
20 | learning_rate = 2e-8
21 | report_interval = 50
22 | 
23 | [training.clipping]
24 | algorithm = "norm"
25 | value = 1e5
26 | 
27 | [training.regularization]
28 | variety = "distribution-difference-integral"
29 | weight = 4e9
30 | mean = 0
31 | std = 4e-7
32 | schedule = "linear"
33 | warmup = 500
34 | 
35 | [model]
36 | language_model_name_or_path = "gpt2"
37 | intrinsic_dimension = 10_000
38 | dropout = 0.0
39 | normalized = false
40 | 
41 | [tokenizer]
42 | variety = "pretrained"
43 | pretrained = "gpt2"
44 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/perplexity-bounded-v2.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 500
 3 | seed_source = "random"
 4 | 
 5 | [data]
 6 | file = [
 7 |   "data/wikipedia/chunked/0th-0.txt",
 8 |   "data/wikipedia/chunked/25th-0.txt",
 9 |   "data/wikipedia/chunked/50th-0.txt",
10 |   "data/wikipedia/chunked/75th-0.txt",
11 |   "data/wikipedia/chunked/100th-0.txt",
12 |   "data/pubmed/chunked/0th-0.txt",
13 |   "data/pubmed/chunked/50th-0.txt",
14 |   "data/pubmed/chunked/100th-0.txt",
15 | ]
16 | prompt_type = "uuid"
17 | 
18 | [training]
19 | maximum_epochs = 2000
20 | learning_rate = 2e-8
21 | report_interval = 50
22 | # learning rate scheduler
23 | lr_scheduler_type = "linear-const"
24 | warmup_epochs = 0
25 | decay_epochs = 2000
26 | decayed_proportion = 0.1
27 | 
28 | [training.clipping]
29 | algorithm = "norm"
30 | value = 1e5
31 | 
32 | [training.regularization]
33 | variety = "distribution-difference-integral"
34 | weight = 4e9
35 | mean = 0
36 | std = 4e-7
37 | schedule = "linear"
38 | warmup = 500
39 | 
40 | [model]
41 | language_model_name_or_path = "gpt2"
42 | intrinsic_dimension = 10_000
43 | dropout = 0.0
44 | normalized = false
45 | 
46 | [tokenizer]
47 | variety = "pretrained"
48 | pretrained = "gpt2"
49 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/rebuttal-v1.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 1
 3 | seed_source = "random"
 4 | tokenizer = "pretrained"
 5 | 
 6 | [data]
 7 | file = [
 8 |   "data/binary/100-tokens/dog-pic1-(0...10).txt",
 9 |   "data/binary/100-tokens/dog-pic2-(0...10).txt",
10 |   "data/binary/100-tokens/dog-pic3-(0...10).txt",
11 |   "data/binary/100-tokens/dog-pic4-(0...10).txt",
12 |   "data/binary/100-tokens/dog-pic5-(0...10).txt",
13 |   "data/binary/100-tokens/dog-video-(0...10).txt",
14 |   "data/binary/100-tokens/resume-(0...10).txt",
15 |   "data/binary/100-tokens/budget-(0...10).txt",
16 | ]
17 | prompt_type = "uuid"
18 | 
19 | [training]
20 | maximum_epochs = 10_000
21 | learning_rate = 2e-8
22 | report_interval = 10
23 | # learning rate scheduler
24 | lr_scheduler_type = "linear-const"
25 | warmup_epochs = 0
26 | decay_epochs = 2000
27 | decayed_proportion = 0.1
28 | 
29 | [training.clipping]
30 | algorithm = "norm"
31 | value = 1e5
32 | 
33 | [model]
34 | language_model_name_or_path = "gpt2"
35 | intrinsic_dimension = [ 
36 |   1_000, 
37 |   3_000, 
38 |   10_000, 
39 |   30_000, 
40 |   100_000 
41 | ]
42 | dropout = 0.0
43 | normalized = false
44 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/what-can-we-encrypt-v1.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 3
 3 | seed_source = "random"
 4 | 
 5 | [data]
 6 | file = [
 7 |   "data/pubmed/100-tokens/(0...10).txt",
 8 |   "data/pubmed/1000-tokens/(0...10).txt",
 9 |   "data/pubmed/10000-tokens/(0...10).txt",
10 |   "data/random-words/100-tokens/(0...10).txt",
11 |   "data/random-words/1000-tokens/(0...10).txt",
12 |   "data/random-words/10000-tokens/(0...10).txt",
13 |   "data/random-bytes/100-tokens/(0...10).txt",
14 |   "data/random-bytes/1000-tokens/(0...10).txt",
15 |   "data/random-bytes/10000-tokens/(0...10).txt",
16 | ]
17 | prompt_type = "uuid"
18 | 
19 | [training]
20 | maximum_epochs = 10_000
21 | learning_rate = 2e-8
22 | report_interval = 50
23 | # learning rate scheduler
24 | lr_scheduler_type = "linear-const"
25 | warmup_epochs = 0
26 | decay_epochs = 2000
27 | decayed_proportion = 0.1
28 | 
29 | [training.clipping]
30 | algorithm = "norm"
31 | value = 1e5
32 | 
33 | [model]
34 | language_model_name_or_path = "gpt2"
35 | intrinsic_dimension = [ 1_000, 3_000, 10_000, 30_000, 100_000 ]
36 | dropout = 0.0
37 | normalized = false
38 | 
39 | [tokenizer]
40 | variety = "pretrained"
41 | pretrained = "gpt2"
42 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/what-can-we-encrypt-v2.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 3
 3 | seed_source = "random"
 4 | 
 5 | [data]
 6 | file = [
 7 |   "data/pubmed/100-tokens/(0...10).txt",
 8 |   "data/pubmed/1000-tokens/(0...10).txt",
 9 |   "data/pubmed/10000-tokens/(0...10).txt",
10 |   "data/random-words/100-tokens/(0...10).txt",
11 |   "data/random-words/1000-tokens/(0...10).txt",
12 |   "data/random-words/10000-tokens/(0...10).txt",
13 |   "data/random-bytes/100-tokens/(0...10).txt",
14 |   "data/random-bytes/1000-tokens/(0...10).txt",
15 |   "data/random-bytes/10000-tokens/(0...10).txt",
16 | ]
17 | prompt_type = "uuid"
18 | 
19 | [training]
20 | maximum_epochs = 10_000
21 | learning_rate = 2e-8
22 | report_interval = 50
23 | batch_size = 2
24 | # learning rate scheduler
25 | lr_scheduler_type = "reduce-on-plateau"
26 | plateau_reduce_factor = 0.31622776602  # 1/sqrt(10)
27 | 
28 | [training.clipping]
29 | algorithm = "norm"
30 | value = 1e5
31 | 
32 | [model]
33 | language_model_name_or_path = "gpt2"
34 | intrinsic_dimension = [ 
35 |   1_000, 
36 |   3_000, 
37 |   10_000, 
38 |   30_000, 
39 |   100_000 
40 | ]
41 | dropout = 0.0
42 | normalized = false
43 | 
44 | [tokenizer]
45 | variety = "pretrained"
46 | pretrained = "gpt2"
47 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/what-can-we-encrypt-v3.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 1
 3 | seed_source = "random"
 4 | 
 5 | [data]
 6 | file = [
 7 |   "data/news/100-tokens/(0...10).txt",
 8 |   "data/pubmed/100-tokens/(0...10).txt",
 9 |   "data/random-words/100-tokens/(0...10).txt",
10 |   "data/random-bytes/100-tokens/(0...10).txt",
11 | ]
12 | prompt_type = "uuid"
13 | 
14 | [training]
15 | maximum_epochs = 10_000
16 | learning_rate = 2e-8
17 | report_interval = 10
18 | # learning rate scheduler
19 | lr_scheduler_type = "linear-const"
20 | warmup_epochs = 0
21 | decay_epochs = 2000
22 | decayed_proportion = 0.1
23 | 
24 | [training.clipping]
25 | algorithm = "norm"
26 | value = 1e5
27 | 
28 | [model]
29 | language_model_name_or_path = "gpt2"
30 | intrinsic_dimension = [ 
31 |   1_000, 
32 |   3_000, 
33 |   10_000, 
34 |   30_000, 
35 |   100_000 
36 | ]
37 | dropout = 0.0
38 | normalized = false
39 | 
40 | [tokenizer]
41 | variety = "pretrained"
42 | pretrained = "gpt2"
43 | 


--------------------------------------------------------------------------------
/experiments/templates/paper/what-can-we-encrypt-v4.toml:
--------------------------------------------------------------------------------
 1 | save_weights = true
 2 | trials = 1
 3 | seed_source = "random"
 4 | 
 5 | [data]
 6 | file = [
 7 |   "data/news/100-tokens/(0...10).txt",
 8 |   "data/pubmed/100-tokens/(0...10).txt",
 9 |   "data/random-words/100-tokens/(0...10).txt",
10 |   "data/random-bytes/100-tokens/(0...10).txt",
11 | ]
12 | prompt_type = "uuid"
13 | 
14 | [training]
15 | maximum_epochs = 10_000
16 | learning_rate = 2e-8
17 | report_interval = 10
18 | # learning rate scheduler
19 | lr_scheduler_type = "reduce-on-plateau"
20 | plateau_reduce_factor = 0.3
21 | 
22 | [training.clipping]
23 | algorithm = "norm"
24 | value = 1e5
25 | 
26 | [model]
27 | language_model_name_or_path = "gpt2"
28 | intrinsic_dimension = [
29 |   1_000,
30 |   3_000,
31 |   10_000,
32 |   30_000,
33 | ]
34 | dropout = 0.0
35 | normalized = false
36 | 
37 | [tokenizer]
38 | variety = "pretrained"
39 | pretrained = "gpt2"
40 | 
41 | 


--------------------------------------------------------------------------------
/intrinsic/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | intrinsic.egg-info/
3 | __pycache__/
4 | *.so
5 | 


--------------------------------------------------------------------------------
/intrinsic/README.md:
--------------------------------------------------------------------------------
  1 | # COMPACTER: Efficient Low-Rank Hypercomplex Adapter Layers
  2 | This repo contains the pytorch implementation of our draft [COMPACTER: Efficient Low-Rank Hypercomplex Adapter Layers](https://www.idiap.ch/~rkarimi/papers/compact_adapters.pdf).
  3 | This repo additionally contains the implementation for the recent parameter-efficient finetuning methods as well.
  4 | 
  5 | # Installation
  6 | ```
  7 | python setup.py install
  8 | ```
  9 | 
 10 | # How to run the models 
 11 | We provide the example scripts to run each model in the paper in `seq2seq/scripts`
 12 | folder with their config files in `seq2seq/configs`. To run the models, please do
 13 | `cd seq2seq` and run:
 14 |  - Full-finetuning (T5):
 15 |    ```
 16 |    bash scripts/baseline.sh
 17 |    ```  
 18 |  - AdapterDrop: 
 19 |    ```
 20 |    bash scripts/adapters_drop.sh
 21 |    ``` 
 22 |  - Adapters:
 23 |    ```
 24 |    bash scripts/adapters.sh
 25 |    ```
 26 |  - Low-rank adapters (uses rank-1 approximation for each adapter weight):
 27 |    ```
 28 |    bash scripts/low_rank_adapters.sh
 29 |    ```  
 30 |  - Pfeiffer-Adapters:
 31 |    ```
 32 |    bash scripts/pfeiffer_adapters.sh
 33 |    ``` 
 34 |  - BitFit:
 35 |    ```
 36 |    bash scripts/bitfit.sh
 37 |    ```
 38 |  - Compacter++:
 39 |    ```
 40 |    bash scripts/compacter++.sh
 41 |    ``` 
 42 |  - Compacter:
 43 |    ```
 44 |    bash scripts/compacter.sh
 45 |    ```
 46 |  - PHM-Adapters:
 47 |    ```
 48 |    bash scripts/phm_adapters.sh
 49 |    ```
 50 |  - Intrinsic-SAID:
 51 |    ```
 52 |    bash scripts/intrinsic_said.sh
 53 |    ```
 54 |  - Prompt tuning-R (Prompt tuning with random initialization): 
 55 |    ```
 56 |    bash scripts/prompt_tuning_random_init.sh
 57 |    ```
 58 |  - Prompt tuning-T (Prompt tuning with initialization from language model's vocabulary):
 59 |    ```
 60 |    bash scripts/prompt_tuning_tokens_init.sh
 61 |    ``` 
 62 | 
 63 | ## Bibliography
 64 | If you find this repo useful, please cite our work:
 65 | 
 66 | ```
 67 | @inproceedings{karimi2021parameterefficient,
 68 |   title={Parameter-efficient Multi-task Fine-tuning for Transformers via Shared Hypernetworks},
 69 |   author={Karimi Mahabadi, Rabeeh and Ruder, Sebastian and Dehghani, Mostafa and Henderson, James},
 70 |   booktitle={Annual Meeting of the Association for Computational Linguistics},
 71 |   year={2021}
 72 | }
 73 | ```
 74 | 
 75 | To implement the intrinsic-SAID method, we used the codes from the following paper. If using this 
 76 | method, please also consider citing this work:
 77 | ```
 78 | @inproceedings{aghajanyan-etal-2021-intrinsic,
 79 |     title = {Intrinsic Dimensionality Explains the Effectiveness of Language Model Fine-Tuning},
 80 |     author = {Aghajanyan, Armen  and Gupta, Sonal  and Zettlemoyer, Luke},
 81 |     publisher = {Association for Computational Linguistics},
 82 |     year = {2021}
 83 | }
 84 | ```
 85 | To implement parameterized hypercomplex layers, we use the implementation of the following work,
 86 | if using PHM-adapters/Compacter/Compacter++ please also consider citing this work:
 87 | ```
 88 | @article{le2021parameterized,
 89 |   title={Parameterized hypercomplex graph neural networks for graph classification},
 90 |   author={Le, Tuan and Bertolini, Marco and No{\'e}, Frank and Clevert, Djork-Arn{\'e}},
 91 |   journal={arXiv preprint arXiv:2103.16584},
 92 |   year={2021}
 93 | }
 94 | ```
 95 | 
 96 | ## Final words
 97 | Hope this repo is useful for your research. For any questions, please create an issue or
 98 | email rabeeh.k68@gmail.com or rkarimi@idiap.ch, and I will get back to you as soon as possible.
 99 | 
100 | 


--------------------------------------------------------------------------------
/intrinsic/docs/examples/example.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import transformers
 3 | 
 4 | import intrinsic
 5 | 
 6 | device = torch.device(0)
 7 | 
 8 | gpt2 = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
 9 | gpt2 = gpt2.to(device)
10 | 
11 | intrinsic.intrinsic_dimension_said(gpt2, 1000, "", set(), "fastfood", device=device)
12 | 
13 | tokens = torch.tensor([1024, 1025, 1026], device=device)
14 | 
15 | out = gpt2(input_ids=tokens)
16 | 
17 | loss = torch.sum(out.logits)
18 | 
19 | loss.backward()
20 | 
21 | breakpoint()
22 | 


--------------------------------------------------------------------------------
/intrinsic/docs/examples/train_nn.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | import intrinsic
 4 | 
 5 | 
 6 | class NeuralNetwork(torch.nn.Module):
 7 |     def __init__(self, layers):
 8 |         super().__init__()
 9 |         self.layers = torch.nn.ModuleList()
10 | 
11 |         for in_size, out_size in zip(layers, layers[1:]):
12 |             self.layers.append(torch.nn.Linear(in_size, out_size))
13 | 
14 |     def forward(self, x):
15 |         for layer in self.layers:
16 |             x = layer(x)
17 |             x = torch.nn.functional.relu(x)
18 | 
19 |         return x
20 | 
21 | 
22 | def main():
23 |     nn = NeuralNetwork([128, 64, 64, 2])
24 | 
25 |     int_dim = intrinsic.IntrinsicDimension(nn, 100, False, 42)
26 | 
27 |     optimizer = torch.optim.SGD(int_dim.parameters(), lr=0.01)
28 | 
29 |     for _ in range(100):
30 |         inputs = torch.tensor(range(128), dtype=torch.float32)
31 |         loss = torch.sum(int_dim(inputs))
32 |         print(loss.item())
33 |         loss.backward()
34 | 
35 |         optimizer.step()
36 |         optimizer.zero_grad()
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     main()
41 | 


--------------------------------------------------------------------------------
/intrinsic/intrinsic/fwh.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch
 3 | 
 4 | 
 5 | def fast_walsh_hadamard_transform(x, normalize: bool = True):
 6 |     orig_shape = x.size()
 7 |     h_dim = orig_shape[0]
 8 |     h_dim_exp = int(round(np.log(h_dim) / np.log(2)))
 9 |     assert h_dim == 2**h_dim_exp, (
10 |         "hadamard can only be computed over axis with size that is a power of two, but"
11 |         " chosen axis %d has size %d" % (0, h_dim)
12 |     )
13 | 
14 |     working_shape = [1] + ([2] * h_dim_exp) + [1]
15 | 
16 |     ret = x.view(working_shape)
17 | 
18 |     for ii in range(h_dim_exp):
19 |         dim = ii + 1
20 |         arrs = torch.chunk(ret, 2, dim=dim)
21 |         assert len(arrs) == 2
22 |         ret = torch.cat((arrs[0] + arrs[1], arrs[0] - arrs[1]), axis=dim)
23 | 
24 |     ret = ret.view(orig_shape)
25 | 
26 |     return ret
27 | 
28 | 
29 | def fast_nonlinear_walsh_hadamard_transform(x, scalar):
30 |     orig_shape = x.size()
31 |     h_dim = orig_shape[0]
32 |     h_dim_exp = int(round(np.log(h_dim) / np.log(2)))
33 |     assert h_dim == 2**h_dim_exp, (
34 |         "hadamard can only be computed over axis with size that is a power of two, but"
35 |         " chosen axis %d has size %d" % (0, h_dim)
36 |     )
37 | 
38 |     working_shape = [1] + ([2] * h_dim_exp) + [1]
39 | 
40 |     ret = x.view(working_shape)
41 | 
42 |     for ii in range(h_dim_exp):
43 |         dim = ii + 1
44 |         arrs = torch.chunk(ret, 2, dim=dim)
45 |         assert len(arrs) == 2
46 |         ret = torch.tanh(torch.cat((arrs[0] + arrs[1], arrs[0] - arrs[1]), axis=dim))
47 | 
48 |     return scalar * ret.view(orig_shape)
49 | 


--------------------------------------------------------------------------------
/intrinsic/intrinsic/fwh_cuda/fwh_cpp.cpp:
--------------------------------------------------------------------------------
 1 | // The codes are from Armen Aghajanyan from facebook, from paper
 2 | // Intrinsic Dimensionality Explains the Effectiveness of Language Model Fine-Tuning
 3 | // https://arxiv.org/abs/2012.13255
 4 | 
 5 | 
 6 | #include <torch/extension.h>
 7 | 
 8 | #include <vector>
 9 | 
10 | // CUDA forward declarations
11 | 
12 | void fast_walsh_hadamard_transform_cuda_kernel(const int NN, const int halfLL, torch::Tensor in, torch::Tensor out, bool normalize);
13 | 
14 | // C++ interface
15 | 
16 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
17 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
18 | #define CHECK_INPUT(x) \
19 |   CHECK_CUDA(x);       \
20 |   CHECK_CONTIGUOUS(x)
21 | 
22 | torch::Tensor fast_walsh_hadamard_transform(torch::Tensor input, bool normalize)
23 | {
24 |   CHECK_INPUT(input);
25 |   const int NN = input.numel();
26 |   torch::Tensor output_flat = input.clone();
27 |   int ll = 0;
28 |   int LL = 1;
29 |   while (LL < NN)
30 |   {
31 |     ll += 1;
32 |     LL *= 2;
33 |   }
34 |   const int halfLL = LL / 2;
35 |   fast_walsh_hadamard_transform_cuda_kernel(NN, halfLL, input, output_flat, normalize);
36 |   return output_flat;
37 | }
38 | 
39 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
40 | {
41 |   m.def("fast_walsh_hadamard_transform", &fast_walsh_hadamard_transform, "Fast Walsh Hadamard Transform (CUDA)");
42 | }
43 | 


--------------------------------------------------------------------------------
/intrinsic/intrinsic/fwh_cuda/fwh_cu.cu:
--------------------------------------------------------------------------------
 1 | // The codes are from Armen Aghajanyan from facebook, from paper
 2 | // Intrinsic Dimensionality Explains the Effectiveness of Language Model Fine-Tuning
 3 | // https://arxiv.org/abs/2012.13255
 4 | 
 5 | // https://github.com/rabeehk/compacter/tree/main/seq2seq/projections
 6 | 
 7 | #include <torch/extension.h>
 8 | 
 9 | #include <cuda.h>
10 | #include <cuda_runtime.h>
11 | 
12 | #include <vector>
13 | 
14 | template <typename scalar_t>
15 | __global__ void FastWalshHadamardKernel(const int stride, const scalar_t* in, scalar_t* out) {
16 |     const auto idx = (threadIdx.x + blockIdx.x * blockDim.x);
17 |     const auto elemIdx = (idx / stride ) * (2 * stride) + (idx % stride);
18 |     const auto tmp = in[elemIdx], tmp2 = in[elemIdx + stride];
19 |     out[elemIdx] = tmp + tmp2;
20 |     out[elemIdx + stride] = tmp - tmp2;
21 | }
22 | 
23 | template <typename scalar_t>
24 | __global__ void FastWalshHadamardSubKernel(const scalar_t scalar, scalar_t* out) {
25 |     const auto idx = (threadIdx.x + blockIdx.x * blockDim.x);
26 |     out[idx] *= scalar;
27 | }
28 | 
29 | 
30 | void fast_walsh_hadamard_transform_cuda_kernel(const int NN, const int halfLL, torch::Tensor in, torch::Tensor out, bool normalize) {
31 |     // Apply Unnormalized Fast Walsh Hadamard transform
32 |     int stride = halfLL;
33 |     float normalizer = 1.0;
34 |     float sqrt2inv = 0.70710678118654746;
35 |     
36 |     while (stride >= 1) {
37 |       if(stride == halfLL)
38 |       {
39 |         AT_DISPATCH_FLOATING_TYPES(in.scalar_type(),"fast_walsh_hadamard_transform_in", ([&] {
40 |             FastWalshHadamardKernel<scalar_t><<<max(1, halfLL/256), min(256, halfLL)>>>(stride, in.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
41 |           }));
42 |       }
43 |       else
44 |       {
45 |         AT_DISPATCH_FLOATING_TYPES(in.scalar_type(),"fast_walsh_hadamard_transform_out", ([&] {
46 |             FastWalshHadamardKernel<scalar_t><<<max(1, halfLL/256), min(256, halfLL)>>>(stride, out.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
47 |           }));
48 |       }
49 | 
50 |       stride /= 2;
51 |       normalizer *= sqrt2inv;
52 |     }
53 |     if(normalize){
54 |       AT_DISPATCH_FLOATING_TYPES(in.scalar_type(),"fast_walsh_hadamard_transform_final", ([&] {
55 |         FastWalshHadamardSubKernel<scalar_t><<<max(1, NN/256), min(256, NN)>>>(normalizer, out.data_ptr<scalar_t>());
56 |       }));
57 |     }
58 | }
59 | 


--------------------------------------------------------------------------------
/intrinsic/intrinsic/implementation.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | from typing import List, NamedTuple, Tuple
 3 | 
 4 | import numpy as np
 5 | import torch
 6 | 
 7 | from .fwh import fast_walsh_hadamard_transform  # type: ignore
 8 | 
 9 | # Utility functions
10 | 
11 | 
12 | def set_seed(seed: int) -> None:
13 |     random.seed(seed)
14 |     np.random.seed(seed)
15 |     torch.manual_seed(seed)
16 |     torch.cuda.manual_seed_all(seed)
17 | 
18 | 
19 | def send_to_device(obj, device):
20 |     if isinstance(obj, list):
21 |         return [send_to_device(t, device) for t in obj]
22 | 
23 |     if isinstance(obj, tuple):
24 |         return tuple(send_to_device(t, device) for t in obj)
25 | 
26 |     if isinstance(obj, dict):
27 |         return {
28 |             send_to_device(key, device): send_to_device(value, device)
29 |             for key, value in obj.items()
30 |         }
31 | 
32 |     if hasattr(obj, "to"):
33 |         return obj.to(device)
34 | 
35 |     return obj
36 | 
37 | 
38 | # Actual implementation
39 | 
40 | 
41 | class HiddenParam(NamedTuple):
42 |     name: str
43 |     module: torch.nn.Module
44 |     module_name: str
45 |     shape: torch.Size
46 |     numel: int
47 | 
48 | 
49 | def make_hidden_params(module) -> Tuple[List[HiddenParam], torch.Tensor]:
50 |     hidden_params = []
51 |     theta_0s = {}
52 | 
53 |     # Iterate over layers in the module
54 |     for name, param in sorted(list(module.named_parameters())):
55 |         # If param does not require update, skip it because we are not tuning it.
56 |         if not param.requires_grad:
57 |             continue
58 | 
59 |         # Saves the initial values of the initialised parameters from param.data and sets them to no grad.
60 |         theta_0s[name] = param.detach().requires_grad_(False)
61 | 
62 |         base, localname = module, name
63 |         while "." in localname:
64 |             prefix, localname = localname.split(".", 1)
65 |             base = getattr(base, prefix)
66 | 
67 |         numel = int(np.prod(param.shape))
68 |         hidden_params.append(HiddenParam(name, base, localname, param.shape, numel))
69 | 
70 |     return hidden_params, theta_0s
71 | 
72 | 
73 | class FastWalshHadamard(torch.autograd.Function):
74 |     @staticmethod
75 |     def forward(ctx, input):
76 |         return fast_walsh_hadamard_transform(input, False)
77 | 
78 |     @staticmethod
79 |     def backward(ctx, grad_output):
80 |         return fast_walsh_hadamard_transform(grad_output, False)
81 | 


--------------------------------------------------------------------------------
/intrinsic/intrinsic/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OSU-NLP-Group/SELM/bb852ca95f365000373852089f0b530c7d4d35e4/intrinsic/intrinsic/py.typed


--------------------------------------------------------------------------------
/intrinsic/intrinsic/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OSU-NLP-Group/SELM/bb852ca95f365000373852089f0b530c7d4d35e4/intrinsic/intrinsic/test/__init__.py


--------------------------------------------------------------------------------
/intrinsic/intrinsic/test/test_implementation.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import transformers
 3 | 
 4 | from .. import implementation
 5 | 
 6 | 
 7 | def device():
 8 |     if torch.cuda.is_available():
 9 |         return torch.device("cuda:0")
10 |     return torch.device("cpu")
11 | 
12 | 
13 | class NeuralNetwork(torch.nn.Module):
14 |     def __init__(self, layers):
15 |         super().__init__()
16 |         self.layers = torch.nn.ModuleList()
17 | 
18 |         for in_size, out_size in zip(layers, layers[1:]):
19 |             self.layers.append(torch.nn.Linear(in_size, out_size))
20 | 
21 |     def forward(self, x):
22 |         for layer in self.layers:
23 |             x = layer(x)
24 |             x = torch.nn.functional.relu(x)
25 | 
26 |         return x
27 | 
28 | 
29 | def test_make_hidden_params_single_layer():
30 |     model = NeuralNetwork(layers=[10, 1])
31 | 
32 |     hidden_params, theta_0 = implementation.make_hidden_params(model)
33 | 
34 |     assert theta_0.shape == (11,)  # 10 weights + 1 bias
35 |     assert len(hidden_params) == 2
36 |     assert [hp.name for hp in hidden_params] == [
37 |         name for name, param in sorted(model.named_parameters())
38 |     ]
39 | 
40 | 
41 | def test_make_hidden_params_three_layers():
42 |     model = NeuralNetwork(layers=[256, 128, 32, 10])
43 | 
44 |     hidden_params, theta_0 = implementation.make_hidden_params(model)
45 | 
46 |     assert theta_0.shape == (257 * 128 + 129 * 32 + 33 * 10,)
47 |     assert len(hidden_params) == 6
48 |     assert [hp.name for hp in hidden_params] == [
49 |         name for name, param in sorted(model.named_parameters())
50 |     ]
51 | 
52 | 
53 | def test_make_hidden_params_gpt2():
54 |     model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
55 | 
56 |     hidden_params, theta_0 = implementation.make_hidden_params(model)
57 | 
58 |     assert [hp.name for hp in hidden_params] == [
59 |         name for name, param in sorted(model.named_parameters())
60 |     ]
61 | 
62 | 
63 | def test_fast_walsh_hadamard_grad1():
64 |     in_tensor = torch.ones(2, requires_grad=True, dtype=torch.double, device=device())
65 | 
66 |     assert torch.autograd.gradcheck(
67 |         implementation.FastWalshHadamard.apply, in_tensor, eps=1e-6, atol=1e-4
68 |     )
69 | 
70 | 
71 | def test_fast_walsh_hadamard_grad2():
72 |     in_tensor = torch.randn(4, requires_grad=True, dtype=torch.double, device=device())
73 | 
74 |     assert torch.autograd.gradcheck(
75 |         implementation.FastWalshHadamard.apply, in_tensor, eps=1e-6, atol=1e-4
76 |     )
77 | 
78 | 
79 | def test_fast_walsh_hadamard_grad3():
80 |     in_tensor = torch.randn(64, requires_grad=True, dtype=torch.double, device=device())
81 | 
82 |     assert torch.autograd.gradcheck(
83 |         implementation.FastWalshHadamard.apply, in_tensor, eps=1e-6, atol=1e-4
84 |     )
85 | 
86 | 
87 | def test_fast_walsh_hadamard_forward():
88 |     in_tensor = torch.tensor(
89 |         [1, 0, 1, 0, 0, 1, 1, 0], dtype=torch.float, device=device()
90 |     )
91 | 
92 |     actual = implementation.FastWalshHadamard.apply(in_tensor)
93 | 
94 |     expected = torch.tensor(
95 |         [4, 2, 0, -2, 0, 2, 0, 2], dtype=torch.float, device=device()
96 |     )
97 | 
98 |     assert torch.allclose(expected, actual)
99 | 


--------------------------------------------------------------------------------
/intrinsic/intrinsic/utils.py:
--------------------------------------------------------------------------------
 1 | def send_to_device(obj, device):
 2 |     if isinstance(obj, list):
 3 |         return [send_to_device(t, device) for t in obj]
 4 | 
 5 |     if isinstance(obj, tuple):
 6 |         return tuple(send_to_device(t, device) for t in obj)
 7 | 
 8 |     if isinstance(obj, dict):
 9 |         return {
10 |             send_to_device(key, device): send_to_device(value, device)
11 |             for key, value in obj.items()
12 |         }
13 | 
14 |     if hasattr(obj, "to"):
15 |         return obj.to(device)
16 | 
17 |     return obj
18 | 


--------------------------------------------------------------------------------
/intrinsic/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | import torch
 3 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension
 4 | 
 5 | description = "PyTorch CUDA kernel implementation of intrinsic dimension operation."
 6 | 
 7 | 
 8 | def setup_package():
 9 |     ext_modules = []
10 |     if torch.cuda.is_available():
11 |         ext_modules = [
12 |             CUDAExtension(
13 |                 "intrinsic.fwh_cuda",
14 |                 sources=[
15 |                     "intrinsic/fwh_cuda/fwh_cpp.cpp",
16 |                     "intrinsic/fwh_cuda/fwh_cu.cu",
17 |                 ],
18 |             )
19 |         ]
20 | 
21 |     setuptools.setup(
22 |         name="intrinsic",
23 |         version="0.0.1",
24 |         description=description,
25 |         long_description=description,
26 |         long_description_content_type="text/markdown",
27 |         author="Rabeeh Karimi Mahabadi",
28 |         license="MIT License",
29 |         packages=setuptools.find_packages(
30 |             exclude=["docs", "tests", "scripts", "examples"]
31 |         ),
32 |         dependency_links=[
33 |             "https://download.pytorch.org/whl/torch_stable.html",
34 |         ],
35 |         classifiers=[
36 |             "Intended Audience :: Developers",
37 |             "Intended Audience :: Science/Research",
38 |             "License :: OSI Approved :: MIT License",
39 |             "Topic :: Scientific/Engineering :: Artificial Intelligence",
40 |             "Programming Language :: Python :: 3",
41 |             "Programming Language :: Python :: 3.9.7",
42 |         ],
43 |         keywords="text nlp machinelearning",
44 |         ext_modules=ext_modules,
45 |         cmdclass={"build_ext": BuildExtension},
46 |         install_requires=[],
47 |     )
48 | 
49 | 
50 | if __name__ == "__main__":
51 |     setup_package()
52 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.isort]
2 | profile = "black"
3 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | black
 2 | coverage==6.2
 3 | datasets
 4 | flake8
 5 | flake8-bugbear
 6 | isort
 7 | matplotlib==3.5.0
 8 | mypy
 9 | nltk==3.6.5
10 | numpy
11 | orjson
12 | pytest
13 | pytest-cov==3.0.0
14 | preface==0.1.5
15 | pytorch_lightning
16 | git+https://github.com/samuelstevens/relic.git
17 | scikit-learn
18 | scipy
19 | seaborn
20 | tabulate==0.8.9
21 | tomli==1.2.1
22 | tomli-w==1.0.0
23 | tqdm
24 | transformers
25 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [flake8]
 2 | ignore = E501,E203,E722,W503,W391
 3 | 
 4 | [pycodestyle]
 5 | ignore = E501,E203,E722,W503,W391
 6 | 
 7 | [tool:pytest]
 8 | addopts =
 9 |     -Wignore
10 |     --cov=src
11 |     --cov=intrinsic
12 | 		--cov-report=html
13 | 
14 | [tool.isort]
15 | profile = "black"
16 | 
17 | [mypy]
18 | python_version = 3.9
19 | plugins = numpy.typing.mypy_plugin
20 | 
21 | [mypy-seaborn.*]
22 | ignore_missing_imports = True
23 | 
24 | [mypy-scipy.*]
25 | ignore_missing_imports = True
26 | 
27 | [mypy-tqdm.*]
28 | ignore_missing_imports = True
29 | 
30 | [mypy-matplotlib.*]
31 | ignore_missing_imports = True
32 | 
33 | [mypy-nltk.*]
34 | ignore_missing_imports = True
35 | 
36 | [mypy-line_profiler.*]
37 | ignore_missing_imports = True
38 | 
39 | [mypy-transformers.*]
40 | ignore_missing_imports = True
41 | 
42 | [mypy-sklearn.*]
43 | ignore_missing_imports = True
44 | 
45 | [mypy-intrinsic.fwh_cuda]
46 | ignore_missing_imports = True
47 | 


--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OSU-NLP-Group/SELM/bb852ca95f365000373852089f0b530c7d4d35e4/src/__init__.py


--------------------------------------------------------------------------------
/src/attacking/__init__.py:
--------------------------------------------------------------------------------
 1 | from . import data, ffnn, gradboost, knn, lda, semantic_security, svm
 2 | 
 3 | __all__ = [
 4 |     "gradboost",
 5 |     "data",
 6 |     "ffnn",
 7 |     "lda",
 8 |     "knn",
 9 |     "semantic_security",
10 |     "svm",
11 | ]
12 | 


--------------------------------------------------------------------------------
/src/attacking/adaboost.py:
--------------------------------------------------------------------------------
 1 | import scipy.stats
 2 | import sklearn.ensemble
 3 | import sklearn.model_selection
 4 | import sklearn.pipeline
 5 | import sklearn.preprocessing
 6 | 
 7 | from . import semantic_security
 8 | 
 9 | 
10 | def init_model() -> semantic_security.Model:
11 |     return sklearn.model_selection.RandomizedSearchCV(
12 |         sklearn.ensemble.AdaBoostClassifier(n_estimators=100),
13 |         {
14 |             "learning_rate": scipy.stats.loguniform(a=1e-2, b=1e1),
15 |         },
16 |         n_jobs=-1,
17 |         n_iter=100,
18 |     )
19 | 


--------------------------------------------------------------------------------
/src/attacking/gradboost.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Tries scikit learn's default gradient boosting classifier.
 3 | """
 4 | 
 5 | import sklearn.decomposition
 6 | import sklearn.model_selection
 7 | import sklearn.pipeline
 8 | import sklearn.preprocessing
 9 | 
10 | from . import semantic_security
11 | 
12 | 
13 | def init_model(seed) -> semantic_security.Model:
14 |     return sklearn.pipeline.make_pipeline(
15 |         sklearn.preprocessing.StandardScaler(),
16 |         sklearn.ensemble.GradientBoostingClassifier(random_state=seed),
17 |     )
18 | 


--------------------------------------------------------------------------------
/src/attacking/helpers.py:
--------------------------------------------------------------------------------
 1 | class InsistedError(Exception):
 2 |     condition: object
 3 |     message: object
 4 | 
 5 |     def __init__(self, condition: object, message: object):
 6 |         self.condition = condition
 7 |         self.message = message
 8 | 
 9 |     def __str__(self):
10 |         return f"Internal consistency error: {self.message}"
11 | 
12 | 
13 | def insist(condition: object, message: object) -> None:
14 |     if not condition:
15 |         raise InsistedError(condition, message)
16 | 


--------------------------------------------------------------------------------
/src/attacking/knn.py:
--------------------------------------------------------------------------------
 1 | import sklearn.ensemble
 2 | import sklearn.model_selection
 3 | import sklearn.pipeline
 4 | import sklearn.preprocessing
 5 | 
 6 | from . import semantic_security
 7 | 
 8 | 
 9 | def init_model() -> semantic_security.Model:
10 |     return sklearn.model_selection.GridSearchCV(
11 |         sklearn.neighbors.KNeighborsClassifier(algorithm="auto", n_jobs=-1),
12 |         {"n_neighbors": [5, 25, 100]},
13 |         n_jobs=-1,
14 |     )
15 | 


--------------------------------------------------------------------------------
/src/attacking/lda.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Applies linear discriminant analysis to hand crafted features on ciphertexts.
 3 | """
 4 | 
 5 | import sklearn.discriminant_analysis
 6 | import sklearn.pipeline
 7 | import sklearn.preprocessing
 8 | 
 9 | from . import semantic_security
10 | 
11 | 
12 | def init_model() -> semantic_security.Model:
13 |     return sklearn.pipeline.make_pipeline(
14 |         sklearn.preprocessing.StandardScaler(),
15 |         sklearn.discriminant_analysis.LinearDiscriminantAnalysis(),
16 |     )
17 | 


--------------------------------------------------------------------------------
/src/attacking/random_forest.py:
--------------------------------------------------------------------------------
 1 | import sklearn.ensemble
 2 | import sklearn.model_selection
 3 | import sklearn.pipeline
 4 | import sklearn.preprocessing
 5 | 
 6 | from . import semantic_security
 7 | 
 8 | 
 9 | def init_model() -> semantic_security.Model:
10 |     return sklearn.model_selection.GridSearchCV(
11 |         sklearn.ensemble.RandomForestClassifier(max_features=None),
12 |         {
13 |             "max_features": [1.0, 0.3, "sqrt", "log2"],
14 |             "max_depth": [None],
15 |             "min_samples_split": [2],
16 |         },
17 |         n_jobs=-1,
18 |     )
19 | 


--------------------------------------------------------------------------------
/src/attacking/semantic_security.py:
--------------------------------------------------------------------------------
  1 | import dataclasses
  2 | from typing import Any, Callable, Dict, Iterator, Optional, Protocol
  3 | 
  4 | import numpy as np
  5 | import scipy.stats
  6 | 
  7 | from .. import logging
  8 | from . import data
  9 | 
 10 | logger = logging.init(__name__, date=False, verbose=True)
 11 | 
 12 | 
 13 | class Model(Protocol):
 14 |     def fit(self, X: np.ndarray, y: np.ndarray) -> None:
 15 |         ...
 16 | 
 17 |     def score(self, X: np.ndarray, y: np.ndarray) -> float:
 18 |         ...
 19 | 
 20 |     def predict(self, X: np.ndarray) -> np.ndarray:
 21 |         ...
 22 | 
 23 | 
 24 | TrainingCallback = Callable[[Model, data.PairedDataset], None]
 25 | 
 26 | 
 27 | @dataclasses.dataclass
 28 | class SemanticSecurityConfig:
 29 |     plaintext_a: str
 30 |     plaintext_b: str
 31 |     params: Dict[str, Any]
 32 | 
 33 | 
 34 | def confidence_interval(n_test, confidence=0.95):
 35 |     percentages = (1 - confidence) / 2, confidence + (1 - confidence) / 2
 36 |     logger.debug(
 37 |         "Calculated percentages. [confidence: %.3g, lower: %.3g, upper: %.3g]",
 38 |         confidence,
 39 |         *percentages,
 40 |     )
 41 |     # Coin flip
 42 |     p = 0.5
 43 |     # Binomial distribution
 44 |     lower, upper = scipy.stats.binom.ppf(percentages, n_test, p) / n_test
 45 |     return lower, upper
 46 | 
 47 | 
 48 | def play(
 49 |     datasets: Iterator[data.SingleDataset],
 50 |     model_fn: Callable[[], Model],
 51 |     model_name: str,
 52 |     seed: int,
 53 |     trained_model_callback: Optional[TrainingCallback] = None,
 54 |     quiet: bool = False,
 55 | ) -> bool:
 56 |     logger = logging.init(model_name)
 57 |     passed = True
 58 | 
 59 |     for pair in data.make_paired_datasets(
 60 |         datasets, seed, left="data/news/100-tokens/0.txt"
 61 |     ):
 62 |         model = model_fn()
 63 | 
 64 |         (train_x, train_y), (test_x, test_y) = pair.splits
 65 | 
 66 |         if not quiet:
 67 |             logger.info("Starting model.fit")
 68 | 
 69 |         model.fit(train_x, train_y)
 70 | 
 71 |         if callable(trained_model_callback):
 72 |             trained_model_callback(model, pair)
 73 | 
 74 |         train_score = model.score(train_x, train_y)
 75 |         test_score = model.score(test_x, test_y)
 76 | 
 77 |         params = getattr(model, "best_params_", {})
 78 |         if not quiet:
 79 |             logger.info(
 80 |                 "Fitted. [pair: %s, train acc: %.2f, test acc: %.2f, params: %s]",
 81 |                 pair.name,
 82 |                 train_score,
 83 |                 test_score,
 84 |                 params,
 85 |             )
 86 | 
 87 |         n_test = len(test_y)
 88 |         lower, upper = confidence_interval(n_test)
 89 | 
 90 |         n_correct = 0
 91 |         for label, prediction in zip(test_y, model.predict(test_x)):
 92 |             if label == prediction:
 93 |                 n_correct += 1
 94 | 
 95 |         test = scipy.stats.binomtest(n_correct, n_test, p=0.5, alternative="greater")
 96 | 
 97 |         if test.pvalue < 0.05:  # if failed
 98 |             logger.warn(
 99 |                 "Reject null. [pair: %s, test acc: %.3f, p: %.3g]",
100 |                 pair.name,
101 |                 test_score,
102 |                 test.pvalue,
103 |             )
104 |             passed = False
105 | 
106 |         if test_score > upper and not quiet:
107 |             logger.warn(
108 |                 "Outside confidence interval. [pair: %s, test acc: %.3f, upper: %.3f]",
109 |                 pair.name,
110 |                 test_score,
111 |                 upper,
112 |             )
113 |             passed = False
114 | 
115 |         if test.pvalue > 0.05 and test_score < upper:
116 |             logger.info(
117 |                 "Fail to reject. [pair: %s, test acc: %.3f, upper: %.3f, p: %.3g]",
118 |                 pair.name,
119 |                 test_score,
120 |                 upper,
121 |                 test.pvalue,
122 |             )
123 | 
124 |     return passed
125 | 


--------------------------------------------------------------------------------
/src/attacking/svm.py:
--------------------------------------------------------------------------------
 1 | import scipy.stats
 2 | import sklearn.model_selection
 3 | import sklearn.pipeline
 4 | import sklearn.preprocessing
 5 | import sklearn.svm
 6 | 
 7 | from . import semantic_security
 8 | 
 9 | 
10 | def init_model() -> semantic_security.Model:
11 |     return sklearn.model_selection.RandomizedSearchCV(
12 |         sklearn.pipeline.make_pipeline(
13 |             sklearn.preprocessing.StandardScaler(),
14 |             sklearn.svm.SVC(C=1.0, kernel="rbf"),
15 |         ),
16 |         {
17 |             "svc__C": scipy.stats.loguniform(a=1e-3, b=1e1),
18 |             "svc__kernel": ["rbf", "linear", "sigmoid", "poly"],
19 |             "svc__gamma": scipy.stats.loguniform(a=1e-4, b=1e-3),
20 |         },
21 |         n_iter=100,
22 |         n_jobs=-1,
23 |         random_state=42,
24 |     )
25 | 


--------------------------------------------------------------------------------
/src/blog/histograms.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Generates data for plotly.js histograms showing the distribution of an invidual feature for one or more pair of plaintexts.
 3 | """
 4 | 
 5 | import argparse
 6 | import json
 7 | 
 8 | import numpy as np
 9 | import pandas as pd
10 | 
11 | from .. import attacking
12 | from ..paper import helpers, security
13 | 
14 | files = ["News ($m1$)", "Rand. Bytes ($m2$)"]
15 | 
16 | 
17 | def init_parser() -> argparse.ArgumentParser:
18 |     parser = argparse.ArgumentParser()
19 |     parser.add_argument(
20 |         "group",
21 |         choices=[
22 |             "original",
23 |             "l2-norm-reg",
24 |             "distribution-reg",
25 |         ],
26 |         help="Which ciphertext groups to use.",
27 |     )
28 | 
29 |     return parser
30 | 
31 | 
32 | def make_dataframe(ciphertexts):
33 |     headers = ["file", *attacking.data.FEATURE_FUNCTIONS.keys()]
34 | 
35 |     rows = []
36 | 
37 |     for file, matrix in ciphertexts.items():
38 |         features = {}
39 | 
40 |         for name, func in attacking.data.FEATURE_FUNCTIONS.items():
41 |             features[name] = func(matrix)
42 | 
43 |         # features[X] are all the same length
44 |         features["file"] = [
45 |             helpers.translate_filename(file) for _ in range(len(features[name]))
46 |         ]
47 | 
48 |         file_rows = tuple(
49 |             [features[key][i] for key in headers] for i in range(len(features[name]))
50 |         )
51 | 
52 |         rows.extend(file_rows)
53 | 
54 |     return pd.DataFrame.from_records(data=rows, columns=headers)
55 | 
56 | 
57 | def main():
58 |     parser = init_parser()
59 |     args = parser.parse_args()
60 | 
61 |     ciphertexts = security.load_ciphertexts(args.group, 400)
62 |     df = make_dataframe(ciphertexts)
63 | 
64 |     data = []
65 |     for file in files:
66 |         data.append(df[df.file == file]["l2-norm"].tolist())
67 | 
68 |     with open(f"docs/blog/data/{args.group}-histograms.json", "w") as fd:
69 |         json.dump(data, fd)
70 | 
71 | 
72 | if __name__ == "__main__":
73 |     main()
74 | 


--------------------------------------------------------------------------------
/src/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OSU-NLP-Group/SELM/bb852ca95f365000373852089f0b530c7d4d35e4/src/data/__init__.py


--------------------------------------------------------------------------------
/src/data/news.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | import pathlib
 3 | from typing import Iterator
 4 | 
 5 | import datasets
 6 | 
 7 | from .. import util
 8 | from . import shared
 9 | 
10 | 
11 | def load_news_articles() -> Iterator[str]:
12 |     dataset = datasets.load_dataset(
13 |         "xsum", cache_dir=util.HUGGINGFACE_CACHE_DIR, streaming=True, split="validation"
14 |     ).shuffle(seed=42)
15 |     for example in dataset:
16 |         document = example["document"]
17 |         if not document:
18 |             continue
19 |         yield document
20 | 
21 | 
22 | def preprocess(output_dir):
23 |     output_dir = pathlib.Path(output_dir)
24 |     output_dir.mkdir(exist_ok=True)
25 | 
26 |     articles = load_news_articles()
27 | 
28 |     example_count = 10
29 |     token_lengths = (100, 300, 1000, 3000)
30 | 
31 |     for length, loc in itertools.product(token_lengths, range(example_count)):
32 |         tokens = []
33 |         text = ""
34 |         while len(tokens) < length:
35 |             text += next(articles)
36 |             tokens = shared.tokenizer(text)["input_ids"]
37 | 
38 |         tokens = tokens[:length]
39 |         shared.assert_invertible(tokens)
40 | 
41 |         text = shared.tokenizer.decode(tokens)
42 | 
43 |         length_dir = output_dir / f"{length}-tokens"
44 |         length_dir.mkdir(exist_ok=True)
45 | 
46 |         with open(length_dir / f"{loc}.txt", "w") as file:
47 |             file.write(text)
48 | 
49 |     char_lengths = (500, 2500, 5000, 25000)
50 | 
51 |     for length, loc in itertools.product(char_lengths, range(example_count)):
52 |         text = ""
53 |         while len(text) < length:
54 |             text += next(articles)
55 |         text = text[:length]
56 | 
57 |         tokens = shared.tokenizer(text)["input_ids"]
58 |         shared.assert_invertible(tokens)
59 |         text = shared.tokenizer.decode(tokens)
60 | 
61 |         length_dir = output_dir / f"{length}-chars"
62 |         length_dir.mkdir(exist_ok=True)
63 | 
64 |         with open(length_dir / f"{loc}.txt", "w") as file:
65 |             file.write(text)
66 | 


--------------------------------------------------------------------------------
/src/data/openwebtext.py:
--------------------------------------------------------------------------------
 1 | import pathlib
 2 | 
 3 | import datasets
 4 | 
 5 | from .. import util
 6 | 
 7 | 
 8 | def preprocess(output_dir):
 9 |     output_dir = pathlib.Path(output_dir)
10 |     output_dir.mkdir(exist_ok=True)
11 | 
12 |     openwebtext = datasets.load_dataset(
13 |         "stas/openwebtext-10k", split="train", cache_dir=util.HUGGINGFACE_CACHE_DIR
14 |     ).shuffle(seed=42)
15 | 
16 |     for i in range(10):
17 |         article = openwebtext[i]
18 |         with open(output_dir / f"{i}.txt", "w") as file:
19 |             file.write(article["text"].strip() + "\n")
20 | 


--------------------------------------------------------------------------------
/src/data/pubmed.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | import pathlib
 3 | from typing import Iterator
 4 | 
 5 | import datasets
 6 | 
 7 | from .. import util
 8 | from . import shared
 9 | 
10 | 
11 | def take(n, iterable):
12 |     "Return first n items of the iterable"
13 |     return itertools.islice(iterable, n)
14 | 
15 | 
16 | def load_pubmed_abstracts() -> Iterator[str]:
17 |     dataset = datasets.load_dataset(
18 |         "pubmed", cache_dir=util.HUGGINGFACE_CACHE_DIR, streaming=True, split="train"
19 |     ).shuffle(seed=42)
20 |     for example in dataset:
21 |         abstract = example["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]
22 |         if not abstract:
23 |             continue
24 |         yield abstract
25 | 
26 | 
27 | def preprocess(output_dir):
28 |     output_dir = pathlib.Path(output_dir)
29 |     output_dir.mkdir(exist_ok=True)
30 | 
31 |     abstracts = load_pubmed_abstracts()
32 | 
33 |     example_count = 10
34 |     lengths = (100, 1_000, 10_000)
35 | 
36 |     for length, loc in itertools.product(lengths, range(example_count)):
37 |         tokens = []
38 |         text = ""
39 |         while len(tokens) < length:
40 |             text += next(abstracts)
41 |             tokens = shared.tokenizer(text)["input_ids"]
42 | 
43 |         tokens = tokens[:length]
44 |         shared.assert_invertible(tokens)
45 | 
46 |         text = shared.tokenizer.decode(tokens)
47 | 
48 |         length_dir = output_dir / f"{length}-tokens"
49 |         length_dir.mkdir(exist_ok=True)
50 | 
51 |         with open(length_dir / f"{loc}.txt", "w") as file:
52 |             file.write(text)
53 | 


--------------------------------------------------------------------------------
/src/data/reddit.py:
--------------------------------------------------------------------------------
 1 | import pathlib
 2 | 
 3 | import datasets
 4 | 
 5 | from .. import util
 6 | 
 7 | 
 8 | def preprocess(output_dir):
 9 |     output_dir = pathlib.Path(output_dir)
10 |     output_dir.mkdir(exist_ok=True)
11 | 
12 |     dataset = datasets.load_dataset(
13 |         "reddit", cache_dir=util.HUGGINGFACE_CACHE_DIR
14 |     ).shuffle(seed=42)["train"]["content"]
15 | 
16 |     indices = list(range(20))
17 | 
18 |     # 3 had a &nbsp;
19 |     indices = [0, 1, 2, 4, 5, 6, 7, 8, 9, 10]
20 | 
21 |     posts = []
22 |     for i in indices:
23 |         posts.append(dataset[i])
24 | 
25 |     for i, post in enumerate(posts):
26 |         with open(output_dir / f"{i}.txt", "w") as file:
27 |             file.write(post.strip() + "\n")
28 | 


--------------------------------------------------------------------------------
/src/data/shared.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Sequence
 2 | 
 3 | from .. import config, tokenizing
 4 | 
 5 | exp_cfg = config.ExperimentConfig(
 6 |     model=config.ModelConfig(language_model_name_or_path="gpt2"),
 7 |     tokenizer="pretrained",
 8 |     data=config.DataConfig(__file__),
 9 |     training=config.TrainingConfig(),
10 | )
11 | 
12 | tokenizer = tokenizing.new(exp_cfg)
13 | 
14 | 
15 | def chunk_length(sequence: str) -> int:
16 |     """
17 |     Gets the length of a sequence in chunks using a GPT2 tokenizer.
18 |     """
19 |     if not sequence:
20 |         return 0
21 | 
22 |     chunks = tokenizing.load_chunks(sequence, exp_cfg.data, tokenizer)
23 | 
24 |     return len(chunks)
25 | 
26 | 
27 | def tokenize(sequence: str) -> List[int]:
28 |     if not sequence:
29 |         return []
30 | 
31 |     return tokenizer(sequence)["input_ids"]
32 | 
33 | 
34 | def untokenize(tokens: Sequence[int]) -> str:
35 |     return tokenizer.decode(tokens)
36 | 
37 | 
38 | def token_length(sequence: str) -> int:
39 |     """
40 |     Gets the length of a sequence in tokens using a GPT2 tokenizer.
41 |     """
42 | 
43 |     return len(tokenize(sequence))
44 | 
45 | 
46 | def assert_invertible(tokens: List[int]):
47 |     roundtrip_tokens = tokenize(untokenize(tokens))
48 |     if tokens == roundtrip_tokens:
49 |         return
50 | 
51 |     if untokenize(roundtrip_tokens) == untokenize(tokens):
52 |         return
53 | 
54 |     print(untokenize(tokens))
55 |     print(untokenize(roundtrip_tokens))
56 | 
57 |     for i, (t, rt) in enumerate(zip(tokens, roundtrip_tokens)):
58 |         if t == rt:
59 |             continue
60 | 
61 |         breakpoint()
62 | 


--------------------------------------------------------------------------------
/src/data/twitter.py:
--------------------------------------------------------------------------------
 1 | import pathlib
 2 | import random
 3 | 
 4 | import datasets
 5 | 
 6 | from .. import util
 7 | 
 8 | 
 9 | def preprocess(output_dir):
10 |     output_dir = pathlib.Path(output_dir)
11 |     output_dir.mkdir(exist_ok=True)
12 | 
13 |     # load all tweets
14 |     all_tweets = []
15 |     for subset in ["emotion", "sentiment"]:
16 |         dataset = datasets.load_dataset(
17 |             "tweet_eval", subset, cache_dir=util.HUGGINGFACE_CACHE_DIR
18 |         )
19 |         for tweet in dataset["train"]["text"]:
20 |             all_tweets.append(tweet)
21 | 
22 |     # pick ten tweets.
23 |     tweets = random.choices(all_tweets, k=10)
24 | 
25 |     # write them to disk
26 |     for i, tweet in enumerate(tweets):
27 |         with open(output_dir / f"{i}.txt", "w") as file:
28 |             file.write(tweet + "\n")
29 | 


--------------------------------------------------------------------------------
/src/data/wikipedia.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | import pathlib
 3 | 
 4 | import datasets
 5 | from tqdm.auto import tqdm
 6 | 
 7 | from .. import util
 8 | from . import shared
 9 | 
10 | 
11 | def preprocess(output_dir):
12 |     output_dir = pathlib.Path(output_dir)
13 |     output_dir.mkdir(exist_ok=True)
14 | 
15 |     wikipedia = datasets.load_dataset(
16 |         "wikipedia", "20200501.en", split="train", cache_dir=util.HUGGINGFACE_CACHE_DIR
17 |     ).shuffle(seed=42)
18 | 
19 |     lengths = (100, 200, 500, 996)
20 | 
21 |     articles = iter(wikipedia)
22 | 
23 |     for length, loc in tqdm(itertools.product(lengths, range(10))):
24 |         tokens = []
25 |         while len(tokens) < length:
26 |             article = next(articles)
27 |             tokens = shared.tokenizer(article["text"])["input_ids"]
28 | 
29 |         tokens = tokens[:length]
30 |         shared.assert_invertible(tokens)
31 | 
32 |         text = shared.tokenizer.decode(tokens)
33 |         assert shared.chunk_length(text) == 1
34 | 
35 |         length_dir = output_dir / f"{length}-tokens"
36 |         length_dir.mkdir(exist_ok=True)
37 | 
38 |         with open(length_dir / f"{loc}.txt", "w") as file:
39 |             file.write(text)
40 | 


--------------------------------------------------------------------------------
/src/experiments/__init__.py:
--------------------------------------------------------------------------------
1 | from .lib import experiment_from_config, find_experiments
2 | 
3 | __all__ = ["experiment_from_config", "find_experiments"]
4 | 


--------------------------------------------------------------------------------
/src/experiments/check.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Checks for experiments that have not run.
 3 | """
 4 | import argparse
 5 | 
 6 | from .. import config, util
 7 | from . import lib
 8 | 
 9 | 
10 | def parse_args() -> argparse.Namespace:
11 |     # check for finished experiments
12 |     parser = argparse.ArgumentParser(
13 |         description="Check which experiments still need to run. This will dirty your relics directory in git. You most likely want to make sure your relics directory is clean, then run this command, then run `git clean -f relics`.",
14 |     )
15 |     parser.add_argument(
16 |         "experiments",
17 |         nargs="+",
18 |         type=str,
19 |         help="Config .toml files or directories containing config .toml files.",
20 |     )
21 |     parser.add_argument(
22 |         "--regex",
23 |         action="store_true",
24 |         help="Whether to use regular expression matching on [experiments] argument",
25 |         default=False,
26 |     )
27 | 
28 |     return parser.parse_args()
29 | 
30 | 
31 | def check(args: argparse.Namespace) -> None:
32 |     if args.regex:
33 |         iterator = util.files_with_match(args.experiments)
34 |     else:
35 |         iterator = util.files_with_extension(args.experiments, ".toml")
36 | 
37 |     for experiment_toml in iterator:
38 |         # If there are any configs that haven't run, print the file name.
39 |         for experiment_config in config.load_configs(experiment_toml):
40 |             experiment = lib.experiment_from_config(experiment_config)
41 |             finished_trials = sum(
42 |                 1 for t in experiment if "finished" in t and t["finished"]
43 |             )
44 |             if finished_trials < experiment_config.trials:
45 |                 print(experiment_toml)
46 |                 break
47 | 
48 | 
49 | def main():
50 |     args = parse_args()
51 |     check(args)
52 | 
53 | 
54 | if __name__ == "__main__":
55 |     main()
56 | 


--------------------------------------------------------------------------------
/src/experiments/generate.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | import pathlib
  4 | 
  5 | import tomli
  6 | import tomli_w
  7 | from tqdm.auto import tqdm
  8 | 
  9 | from .. import config, logging, templating, util
 10 | 
 11 | logger = logging.init("experiments.generate")
 12 | 
 13 | 
 14 | def parse_args() -> argparse.Namespace:
 15 |     parser = argparse.ArgumentParser(
 16 |         description="General .toml files from template .toml files. I kept all my templates in experiments/templates and my generated experiment configs in experiments/generated, which I then removed from version control.",
 17 |     )
 18 |     parser.add_argument(
 19 |         "--strategy",
 20 |         type=str,
 21 |         help="Strategy to use to combine multiple lists in a template.",
 22 |         default="grid",
 23 |         choices=["grid", "paired", "random"],
 24 |     )
 25 |     parser.add_argument(
 26 |         "--count",
 27 |         type=int,
 28 |         help="Number of configs to generate when using --strategy random. Required.",
 29 |         default=-1,
 30 |     )
 31 |     parser.add_argument(
 32 |         "--no-expand",
 33 |         type=str,
 34 |         nargs="+",
 35 |         default=[],
 36 |         help=".-separated fields to not expand",
 37 |     )
 38 |     parser.add_argument(
 39 |         "--prefix",
 40 |         type=str,
 41 |         default="generated-",
 42 |         help="Prefix to add to generated templates",
 43 |     )
 44 |     parser.add_argument(
 45 |         "templates",
 46 |         nargs="+",
 47 |         type=str,
 48 |         help="Template .toml files or directories containing template .toml files.",
 49 |     )
 50 |     parser.add_argument(
 51 |         "output",
 52 |         type=str,
 53 |         help="Output directory to write the generated .toml files to.",
 54 |     )
 55 |     return parser.parse_args()
 56 | 
 57 | 
 58 | def generate(args: argparse.Namespace) -> None:
 59 |     strategy = templating.Strategy.new(args.strategy)
 60 | 
 61 |     count = args.count
 62 |     if strategy is templating.Strategy.random:
 63 |         assert count > 0, "Need to include --count!"
 64 | 
 65 |     for template_toml in util.files_with_extension(args.templates, ".toml"):
 66 |         with open(template_toml, "rb") as template_file:
 67 |             try:
 68 |                 template_dict = tomli.load(template_file)
 69 |             except tomli.TOMLDecodeError as err:
 70 |                 logger.warning(
 71 |                     "Error parsing template file. [file: %s, err: %s]",
 72 |                     template_toml,
 73 |                     err,
 74 |                 )
 75 |                 continue
 76 | 
 77 |         template_name = pathlib.Path(template_toml).stem
 78 | 
 79 |         logger.info("Opened template file. [file: %s]", template_toml)
 80 | 
 81 |         experiment_dicts = templating.generate(
 82 |             template_dict, strategy, count=count, no_expand=set(args.no_expand)
 83 |         )
 84 | 
 85 |         logger.info(
 86 |             "Loaded experiment dictionaries. [count: %s]", len(experiment_dicts)
 87 |         )
 88 | 
 89 |         for i, experiment_dict in enumerate(tqdm(experiment_dicts)):
 90 |             filename = f"{args.prefix}{template_name}-{i}.toml"
 91 |             filepath = os.path.join(args.output, filename)
 92 |             with open(filepath, "wb") as file:
 93 |                 tomli_w.dump(experiment_dict, file)
 94 | 
 95 |             # Verifies that the configs are correctly loaded.
 96 |             list(config.load_configs(filepath))
 97 | 
 98 | 
 99 | def main() -> None:
100 |     args = parse_args()
101 |     generate(args)
102 | 
103 | 
104 | if __name__ == "__main__":
105 |     main()
106 | 


--------------------------------------------------------------------------------
/src/experiments/lib.py:
--------------------------------------------------------------------------------
 1 | import dataclasses
 2 | import logging
 3 | import pathlib
 4 | from typing import Any, Dict, Iterator
 5 | 
 6 | import relic
 7 | 
 8 | from .. import config, util
 9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
13 | def find_experiments(paths) -> Iterator[config.ExperimentConfig]:
14 |     """
15 |     Arguments:
16 |     * args (list[str]): list of strings that are either directories containing files or config files themselves.
17 |     """
18 |     if not isinstance(paths, list):
19 |         paths = [paths]
20 | 
21 |     for config_file in util.files_with_extension(paths, ".toml"):
22 |         yield from config.load_configs(config_file)
23 | 
24 | 
25 | def make_relic_config(experiment_config: config.ExperimentConfig) -> Dict[str, Any]:
26 |     relic_config = dataclasses.asdict(experiment_config)
27 | 
28 |     # don't want to include these parameters in the relic repository.
29 |     relic_config.pop("trials")
30 |     relic_config.pop("save_weights")
31 |     relic_config["training"].pop("maximum_epochs")
32 |     relic_config["training"].pop("snapshot_interval")
33 |     relic_config["training"].pop("report_interval")
34 | 
35 |     return relic_config
36 | 
37 | 
38 | def experiment_from_config(
39 |     experiment_config: config.ExperimentConfig,
40 | ) -> relic.Experiment:
41 |     """
42 |     Create a relic experiment from an ExperimentConfig. This method removes some fields from ExperimentConfig that shouldn't matter when considering results (whether the model was saved, how many trials were run, etc.).
43 |     """
44 |     relic_exp = relic.new_experiment(
45 |         make_relic_config(experiment_config), pathlib.Path("relics")
46 |     )
47 | 
48 |     return relic_exp
49 | 


--------------------------------------------------------------------------------
/src/intrinsic_utils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Thin wrapper around intrinsic module to provide a intrinsic_dimension_vector property on the module.
 3 | """
 4 | import torch
 5 | 
 6 | import intrinsic
 7 | 
 8 | from . import accelerate, config, modeling_utils, util
 9 | 
10 | 
11 | class IntrinsicDimension(
12 |     intrinsic.IntrinsicDimension,
13 |     modeling_utils.IntrinsicDimension,
14 |     modeling_utils.KnowsBatchSize,
15 |     modeling_utils.Saveable,
16 | ):
17 |     @property
18 |     def get_intrinsic_dimension_vector(self):
19 |         return self.intrinsic_vector.detach().cpu()
20 | 
21 |     def set_intrinsic_dimension_vector(self, vec: torch.Tensor) -> None:
22 |         self.intrinsic_vector.copy_(vec)
23 | 
24 |     def save(self, path) -> None:
25 |         data = {
26 |             "fastfood_seed": self.seed,
27 |             "theta_d": self.get_intrinsic_dimension_vector.detach(),
28 |         }
29 | 
30 |         torch.save(data, path)
31 | 
32 |     def batch_size(self, training_config: config.TrainingConfig) -> int:
33 |         accelerate._set_environment(self)
34 |         self.logger.info("TODO: Take model.context_window into account.")
35 |         if torch.cuda.device_count() < 1:
36 |             self.logger.warn("On CPU; use as big a batch size as you want!")
37 |             return training_config.batch_size
38 | 
39 |         mb = torch.cuda.get_device_properties(0).total_memory / 1024 / 1024
40 | 
41 |         self.logger.info(
42 |             "[available memory: %s, rtx2080ti estimate: %s, v100 estimate: %s]",
43 |             mb,
44 |             util.rtx2080ti,
45 |             util.v100,
46 |         )
47 | 
48 |         model_size = "gpt2"
49 |         try:
50 |             layer_count = len(self.hidden.transformer.h)
51 |             if layer_count == 36:
52 |                 model_size = "gpt2-large"
53 |             elif layer_count == 24:
54 |                 model_size = "gpt2-medium"
55 |             else:
56 |                 assert layer_count == 12
57 |         except AttributeError:
58 |             pass
59 | 
60 |         if model_size == "gpt2":
61 |             if mb <= util.rtx2080ti:
62 |                 # max on rtx2080ti is 2
63 |                 assert (
64 |                     accelerate._ENVIRONMENT is accelerate.TrainingType.MODEL_PARALLELISM
65 |                 )
66 |                 return min(2, training_config.batch_size)
67 |             elif util.rtx2080ti < mb <= util.v100:
68 |                 assert accelerate._ENVIRONMENT is accelerate.TrainingType.SINGLE_GPU
69 |                 return min(2, training_config.batch_size)
70 |             elif (
71 |                 util.v100 < mb <= util.v100 * 2
72 |             ):  # some of the pitzer clusters have 2 NVLINKed v100s.
73 |                 assert accelerate._ENVIRONMENT is accelerate.TrainingType.SINGLE_GPU
74 |                 return min(4, training_config.batch_size)
75 |             else:
76 |                 assert mb > 2 * util.v100
77 |                 # deal with this when the time comes
78 |                 return training_config.batch_size
79 |         elif model_size == "gpt2-medium":
80 |             if mb < util.a6000:
81 |                 assert accelerate._ENVIRONMENT is accelerate.TrainingType.SINGLE_GPU
82 |                 return min(2, training_config.batch_size)
83 |         elif model_size == "gpt2-large":
84 |             if mb < util.a6000:
85 |                 assert accelerate._ENVIRONMENT is accelerate.TrainingType.SINGLE_GPU
86 |                 return 1
87 | 
88 |         raise ValueError(mb, model_size)
89 | 


--------------------------------------------------------------------------------
/src/logging.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | 
 4 | def init(name: str, verbose: bool = False, date=True) -> logging.Logger:
 5 |     if date:
 6 |         log_format = "[%(asctime)s] [%(levelname)s] [%(name)s] %(message)s"
 7 |     else:
 8 |         log_format = "[%(levelname)s] [%(name)s] %(message)s"
 9 | 
10 |     if not verbose:
11 |         logging.basicConfig(level=logging.INFO, format=log_format)
12 |     else:
13 |         logging.basicConfig(level=logging.DEBUG, format=log_format)
14 | 
15 |     return logging.getLogger(name)
16 | 


--------------------------------------------------------------------------------
/src/make_tokenizers.py:
--------------------------------------------------------------------------------
 1 | import tokenizers
 2 | 
 3 | 
 4 | def main():
 5 |     alphabet = sorted(tokenizers.pre_tokenizers.ByteLevel.alphabet())
 6 | 
 7 |     # 1-byte.json
 8 |     tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())
 9 |     tokenizer.decoder = tokenizers.decoders.ByteLevel()
10 |     tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(
11 |         add_prefix_space=False, trim_offsets=True, use_regex=False
12 |     )
13 | 
14 |     trainer = tokenizers.trainers.BpeTrainer(
15 |         special_tokens=["<|endoftext|>"],
16 |         initial_alphabet=alphabet,
17 |         vocab_size=len(alphabet),
18 |     )
19 |     tokenizer.train([], trainer)
20 |     tokenizer.save("src/tokenizers/1-byte.json")
21 | 
22 |     # 2-byte tokenizer
23 |     tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())
24 |     tokenizer.decoder = tokenizers.decoders.ByteLevel()
25 |     tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(
26 |         add_prefix_space=False, trim_offsets=True, use_regex=False
27 |     )
28 | 
29 |     trainer = tokenizers.trainers.BpeTrainer(
30 |         special_tokens=["<|endoftext|>"],
31 |         initial_alphabet=alphabet,
32 |         vocab_size=len(alphabet) * (len(alphabet) + 1) + 1,
33 |     )
34 | 
35 |     data = [i + j for i in alphabet for j in alphabet]
36 | 
37 |     tokenizer.train_from_iterator(data, trainer)
38 |     tokenizer.save("src/tokenizers/2-byte.json")
39 | 
40 | 
41 | if __name__ == "__main__":
42 |     main()
43 | 


--------------------------------------------------------------------------------
/src/modeling_utils.py:
--------------------------------------------------------------------------------
  1 | import abc
  2 | 
  3 | import numpy as np
  4 | import torch
  5 | 
  6 | import intrinsic.fwh
  7 | 
  8 | from . import config
  9 | 
 10 | 
 11 | class Saveable(abc.ABC):
 12 |     @abc.abstractmethod
 13 |     def save(self, path):
 14 |         ...
 15 | 
 16 | 
 17 | class IntrinsicDimension(abc.ABC):
 18 |     @abc.abstractproperty
 19 |     def get_intrinsic_dimension_vector(self) -> torch.Tensor:
 20 |         ...
 21 | 
 22 |     @abc.abstractmethod
 23 |     def set_intrinsic_dimension_vector(self, vec: torch.Tensor) -> None:
 24 |         ...
 25 | 
 26 | 
 27 | class KnowsBatchSize(abc.ABC):
 28 |     @abc.abstractmethod
 29 |     def batch_size(self, training_config: config.TrainingConfig) -> int:
 30 |         ...
 31 | 
 32 | 
 33 | class Cos(torch.nn.Module):
 34 |     def forward(self, x):
 35 |         return torch.cos(x)
 36 | 
 37 | 
 38 | class Sine(torch.nn.Module):
 39 |     def forward(self, x):
 40 |         return torch.sin(x)
 41 | 
 42 | 
 43 | class LayerNorm(torch.nn.Module):
 44 |     def forward(self, x):
 45 |         std, mean = torch.std_mean(x)
 46 |         return (x - mean) / (std + 1e-8)
 47 | 
 48 | 
 49 | class GroupNorm(torch.nn.Module):
 50 |     """
 51 |     Applies LayerNorm to multiple groups, so each group is normalized by its own mean and std deviation.
 52 |     """
 53 | 
 54 |     groups: int
 55 | 
 56 |     def __init__(self, groups: int):
 57 |         super().__init__()
 58 |         self.groups = groups
 59 | 
 60 |     def apply_norm(self, x):
 61 |         std, mean = torch.std_mean(x)
 62 |         return (x - mean) / (std + 1e-8)
 63 | 
 64 |     def forward(self, x):
 65 |         assert (
 66 |             np.prod(x.shape) % self.groups == 0
 67 |         ), f"Group count {self.groups} must be an divisor of x.shape {x.shape} -> {np.prod(x.shape)}"
 68 | 
 69 |         tensors = torch.chunk(x, self.groups)
 70 | 
 71 |         tensors = [self.apply_norm(t) for t in tensors]
 72 | 
 73 |         return torch.cat(tensors)
 74 | 
 75 | 
 76 | class InverseFn(torch.nn.Module):
 77 |     def forward(self, x):
 78 |         return 1 / (x + 1e-8)
 79 | 
 80 | 
 81 | class NonlinearWHT(torch.nn.Module):
 82 |     def forward(self, x):
 83 |         return intrinsic.fwh.fast_nonlinear_walsh_hadamard_transform(x, 5 / 3)
 84 | 
 85 | 
 86 | def estimate_memory_requirements(
 87 |     model: torch.nn.Module, intrinsic_dimension: int = 0, efficient: bool = True
 88 | ):
 89 |     """
 90 |     Try to calculate the required memory based on the following assumptions:
 91 |     * Floats are 4 bytes.
 92 |     * We are using an optimizer that maintains 2 floats per parameter.
 93 |     """
 94 | 
 95 |     def floats_for(tensor):
 96 |         numel = np.prod(tensor.shape)
 97 | 
 98 |         if intrinsic_dimension > 0 and efficient:
 99 |             numel += 2 ** np.ceil(np.log2(numel))
100 | 
101 |         # inputs + activations, one copy for gradients, two copies for adam optimizer states.
102 |         return numel * 8
103 | 
104 |     bytes_per_float = 4
105 |     total = 0
106 | 
107 |     for tensor in model.parameters():
108 |         total += floats_for(tensor)
109 | 
110 |     for buffer in model.buffers():
111 |         total += floats_for(buffer)
112 | 
113 |     if intrinsic_dimension > 0 and not efficient:
114 |         total_size = 0
115 |         for tensor in model.parameters():
116 |             total_size += np.prod(tensor.shape)
117 |         for buffer in model.buffers():
118 |             total_size += np.prod(buffer.shape)
119 | 
120 |         total += (2 ** np.ceil(np.log2(total_size))) * 8
121 | 
122 |     return total * bytes_per_float
123 | 


--------------------------------------------------------------------------------
/src/paper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OSU-NLP-Group/SELM/bb852ca95f365000373852089f0b530c7d4d35e4/src/paper/__init__.py


--------------------------------------------------------------------------------
/src/paper/ciphertext_dist_histograms.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Demonstrates that ciphertexts have approximately normal distributions
 3 | """
 4 | 
 5 | import argparse
 6 | 
 7 | import matplotlib.pyplot as plt
 8 | import numpy as np
 9 | import seaborn as sns
10 | 
11 | sns.set_theme()
12 | 
13 | from . import security
14 | 
15 | sns.set_style("whitegrid", {"axes.grid": False})
16 | sns.set_context("paper", font_scale=0.7)
17 | sns.set_palette("Dark2")
18 | 
19 | 
20 | def init_parser() -> argparse.ArgumentParser:
21 |     parser = argparse.ArgumentParser()
22 |     parser.add_argument("filename", type=str, help="Filepath to save file to.")
23 | 
24 |     return parser
25 | 
26 | 
27 | def main():
28 |     parser = init_parser()
29 |     args = parser.parse_args()
30 | 
31 |     ciphertexts = security.load_ciphertexts("original", count=2)
32 | 
33 |     bound = 5e-7
34 | 
35 |     fig, axes = plt.subplots(nrows=5, ncols=2, sharex=True, sharey=True)
36 |     bins = np.linspace(start=-bound, stop=bound, num=30)
37 | 
38 |     axes[0][0].hist(ciphertexts["data/news/100-tokens/0.txt"][0], bins=bins)
39 |     axes[0][1].hist(ciphertexts["data/news/100-tokens/0.txt"][1], bins=bins)
40 |     axes[0][0].set_ylabel("News (0)")
41 | 
42 |     axes[1][0].hist(ciphertexts["data/news/100-tokens/1.txt"][0], bins=bins)
43 |     axes[1][1].hist(ciphertexts["data/news/100-tokens/1.txt"][1], bins=bins)
44 |     axes[1][0].set_ylabel("News (1)")
45 | 
46 |     axes[2][0].hist(ciphertexts["data/pubmed/100-tokens/0.txt"][0], bins=bins)
47 |     axes[2][1].hist(ciphertexts["data/pubmed/100-tokens/0.txt"][1], bins=bins)
48 |     axes[2][0].set_ylabel("PubMed")
49 | 
50 |     axes[3][0].hist(ciphertexts["data/random-words/100-tokens/0.txt"][0], bins=bins)
51 |     axes[3][1].hist(ciphertexts["data/random-words/100-tokens/0.txt"][1], bins=bins)
52 |     axes[3][0].set_ylabel("Rand. Words")
53 | 
54 |     axes[4][0].hist(ciphertexts["data/random-bytes/100-tokens/0.txt"][0], bins=bins)
55 |     axes[4][1].hist(ciphertexts["data/random-bytes/100-tokens/0.txt"][1], bins=bins)
56 |     axes[4][0].set_ylabel("Rand. Bytes")
57 | 
58 |     axes[0][0].set(yticklabels=[])
59 | 
60 |     fig.tight_layout()
61 |     fig.savefig(args.filename, bbox_inches="tight")
62 | 
63 | 
64 | if __name__ == "__main__":
65 |     main()
66 | 


--------------------------------------------------------------------------------
/src/paper/embeddings.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Visualizes distributions of encrypted plaintexts using low-dimensional embedding techniques.
  3 | """
  4 | 
  5 | import argparse
  6 | import warnings
  7 | 
  8 | import numpy as np
  9 | import pandas as pd
 10 | import seaborn as sns
 11 | import sklearn.manifold
 12 | import sklearn.preprocessing
 13 | from tqdm.auto import tqdm
 14 | 
 15 | from . import helpers, security
 16 | 
 17 | sns.set_style("whitegrid", {"axes.grid": False})
 18 | sns.set_context("notebook", font_scale=1.3)
 19 | sns.set_palette("Dark2")
 20 | 
 21 | 
 22 | def init_parser() -> argparse.ArgumentParser:
 23 |     parser = argparse.ArgumentParser()
 24 |     parser.add_argument(
 25 |         "group",
 26 |         choices=[
 27 |             "original",
 28 |             "l2-norm-reg",
 29 |             "distribution-reg",
 30 |         ],
 31 |         help="Which ciphertext groups to use.",
 32 |     )
 33 |     parser.add_argument(
 34 |         "count", type=int, help="How many experiments from each group to use"
 35 |     )
 36 |     parser.add_argument("filename", type=str, help="Filepath to save file to.")
 37 | 
 38 |     return parser
 39 | 
 40 | 
 41 | def plot_ciphertexts(ciphertexts):
 42 |     # Get point/file arrays
 43 |     # keys = sorted(ciphertexts.keys())
 44 |     keys = ["data/news/100-tokens/0.txt", "data/random-bytes/100-tokens/0.txt"]
 45 | 
 46 |     points = np.concatenate([ciphertexts[k] for k in keys])
 47 |     files = np.concatenate(
 48 |         [[helpers.translate_filename(k)] * len(ciphertexts[k]) for k in keys]
 49 |     )
 50 | 
 51 |     # Do dimension-reduction
 52 |     perplexity = 50
 53 |     learning_rate = 50
 54 |     trials = 5
 55 | 
 56 |     best_embedded = None
 57 |     best_divergence = np.inf
 58 | 
 59 |     # Try 10 random seeds
 60 |     for i in tqdm(range(trials)):
 61 |         # Ignore FutureWarning
 62 |         with warnings.catch_warnings():
 63 |             warnings.simplefilter("ignore", category=FutureWarning)
 64 |             scaler = sklearn.preprocessing.StandardScaler()
 65 |             tsne = sklearn.manifold.TSNE(
 66 |                 n_components=2,
 67 |                 perplexity=perplexity,
 68 |                 random_state=i,
 69 |                 init="pca",
 70 |                 learning_rate=learning_rate,
 71 |                 n_iter=5000,
 72 |             )
 73 |             embedded = tsne.fit_transform(scaler.fit_transform(points))
 74 |             if tsne.kl_divergence_ < best_divergence:
 75 |                 best_divergence = tsne.kl_divergence_
 76 |                 best_embedded = embedded
 77 | 
 78 |     # Convert to dataframe
 79 |     rows = [
 80 |         (best_embedded[i][0], best_embedded[i][1], file) for i, file in enumerate(files)
 81 |     ]
 82 |     df = pd.DataFrame(rows, columns=["x", "y", "File"])
 83 | 
 84 |     order = ["News (N0)", "Rand. Bytes (RB)"]
 85 | 
 86 |     fig = sns.relplot(
 87 |         df,
 88 |         x="x",
 89 |         y="y",
 90 |         style="File",
 91 |         style_order=order,
 92 |         hue="File",
 93 |         hue_order=order,
 94 |         kind="scatter",
 95 |         facet_kws=dict(legend_out=False),
 96 |     )
 97 |     fig.set(xlabel=None, ylabel=None, xticks=[], yticks=[])
 98 |     fig.despine(right=True, top=True, bottom=True, left=True)
 99 |     fig.legend.set_title(None)
100 |     sns.move_legend(fig, "upper right")
101 | 
102 |     return fig
103 | 
104 | 
105 | def main():
106 |     parser = init_parser()
107 |     args = parser.parse_args()
108 | 
109 |     ciphertexts = security.load_ciphertexts(args.group, args.count)
110 | 
111 |     fig = plot_ciphertexts(ciphertexts)
112 | 
113 |     fig.savefig(args.filename, bbox_inches="tight")
114 | 
115 | 
116 | if __name__ == "__main__":
117 |     main()
118 | 


--------------------------------------------------------------------------------
/src/paper/feature_importance.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This script measure the feature importance for each proposed variant of the encryption algorithm.
  3 | """
  4 | 
  5 | import argparse
  6 | 
  7 | import matplotlib.pyplot as plt
  8 | import numpy as np
  9 | import seaborn as sns
 10 | import sklearn.feature_selection
 11 | 
 12 | from .. import attacking, logging
 13 | from . import helpers, security
 14 | 
 15 | sns.set_style("whitegrid")
 16 | sns.set_context("paper", font_scale=2)
 17 | sns.set_palette("Dark2")
 18 | 
 19 | logger = logging.init("feature-importance")
 20 | 
 21 | 
 22 | def init_parser() -> argparse.ArgumentParser:
 23 |     parser = argparse.ArgumentParser()
 24 |     parser.add_argument(
 25 |         "--relics",
 26 |         help="Path to relics/ directory",
 27 |     )
 28 |     parser.add_argument(
 29 |         "count", type=int, help="How many experiments from each group to use"
 30 |     )
 31 |     parser.add_argument("filename", type=str, help="Filepath to save file to.")
 32 | 
 33 |     return parser
 34 | 
 35 | 
 36 | def measure_mutual_information(datasets):
 37 |     files = ("data/news/100-tokens/0.txt", "data/random-bytes/100-tokens/0.txt")
 38 |     datasets = [dataset for dataset in datasets if dataset.name in files]
 39 |     # Arrange the datasets into a single X, y multiclass classification problem.
 40 |     x = np.concatenate([dataset.splits[0] for dataset in datasets], axis=0)
 41 |     y = np.zeros(x.shape[0])
 42 |     start = 0
 43 |     end = 0
 44 |     for i, dataset in enumerate(datasets):
 45 |         start = end
 46 |         end += dataset.splits[0].shape[0]
 47 |         y[start:end] = i
 48 | 
 49 |     # Measure mutual information
 50 |     mi = sklearn.feature_selection.mutual_info_classif(
 51 |         x,
 52 |         y,
 53 |         discrete_features=False,
 54 |         n_neighbors=3,
 55 |         copy=True,
 56 |         random_state=42,
 57 |     )
 58 | 
 59 |     return mi
 60 | 
 61 | 
 62 | def load_datasets(group, *, count):
 63 |     ciphertexts = security.load_ciphertexts(group, count=count)
 64 |     return list(
 65 |         attacking.data.make_single_datasets(
 66 |             ciphertexts, attacking.data.preprocess, ratio=1.0
 67 |         )
 68 |     )
 69 | 
 70 | 
 71 | def plot_mi(original, l2_norm, dist, keys):
 72 |     fig, ax = plt.subplots(subplot_kw={"aspect": 9})
 73 | 
 74 |     x = np.arange(len(keys))
 75 |     width = 0.3
 76 | 
 77 |     ax.bar(x - width, original, width, label="Original")
 78 |     ax.bar(x, l2_norm, width, label="L2-Norm Reg.")
 79 |     ax.bar(x + width, dist, width, label="Dist. Reg.")
 80 |     ax.set_xticks(x)
 81 |     ax.set_xticklabels([helpers.translate_feature(k) for k in keys])
 82 |     ax.set_ylabel("Mutual Information")
 83 |     ax.set_xlabel("Feature")
 84 |     ax.legend()
 85 | 
 86 |     return fig
 87 | 
 88 | 
 89 | def main():
 90 |     parser = init_parser()
 91 |     args = parser.parse_args()
 92 | 
 93 |     # Load ciphertexts from experiments
 94 |     original_mi = measure_mutual_information(
 95 |         load_datasets("original", count=args.count)
 96 |     )
 97 |     l2_norm_mi = measure_mutual_information(
 98 |         load_datasets("l2-norm-reg", count=args.count)
 99 |     )
100 |     dist_mi = measure_mutual_information(
101 |         load_datasets("distribution-reg", count=args.count)
102 |     )
103 |     keys = sorted(attacking.data.FEATURE_FUNCTIONS.keys())
104 | 
105 |     fig = plot_mi(original_mi, l2_norm_mi, dist_mi, keys)
106 |     fig.tight_layout()
107 |     fig.savefig(args.filename, bbox_inches="tight")
108 | 
109 | 
110 | if __name__ == "__main__":
111 |     main()
112 | 


--------------------------------------------------------------------------------
/src/paper/helpers.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | 
 4 | def translate_domain(domain):
 5 |     if domain == "news":
 6 |         return "News"
 7 |     elif domain == "pubmed":
 8 |         return "PubMed"
 9 |     elif domain == "random-words":
10 |         return "Random Words"
11 |     elif domain == "random-bytes":
12 |         return "Random Bytes"
13 |     elif domain == "binary":
14 |         return "Multimedia"
15 |     else:
16 |         raise ValueError(domain)
17 | 
18 | 
19 | def translate_model(model_name, pretrained=True):
20 |     if model_name == "gpt2" and pretrained:
21 |         return "GPT-2"
22 |     if model_name == "gpt2" and not pretrained:
23 |         return "GPT-2 (rand)"
24 |     elif model_name == "gpt2-medium":
25 |         return "335M"
26 |     elif model_name == "EleutherAI/pythia-70m":
27 |         return "Pythia 70M"
28 |     elif model_name == "EleutherAI/pythia-70m-deduped":
29 |         return "Pythia 70M, deduped"
30 |     elif model_name == "EleutherAI/pythia-160m":
31 |         return "Pythia"
32 |     elif model_name == "EleutherAI/pythia-160m-deduped":
33 |         return "Pythia 160M, deduped"
34 |     elif model_name == "cerebras/Cerebras-GPT-111M":
35 |         return "Cerebras"
36 |     else:
37 |         raise ValueError(model_name)
38 | 
39 | 
40 | def translate_feature(feature):
41 |     if feature == "l2-norm":
42 |         return "L2"
43 |     elif feature == "l1-norm":
44 |         return "L1"
45 |     elif feature == "std":
46 |         return "Std"
47 |     elif feature == "mean":
48 |         return "Mean"
49 |     elif feature == "max":
50 |         return "Max"
51 |     elif feature == "min":
52 |         return "Min"
53 |     else:
54 |         raise ValueError(feature)
55 | 
56 | 
57 | def translate_filename(filename):
58 |     if filename == "data/pubmed/100-tokens/0.txt":
59 |         return "PubMed (PM)"
60 |     elif filename == "data/news/100-tokens/0.txt":
61 |         return "News ($m1$)"
62 |         # return "News (N0)"
63 |     elif filename == "data/news/100-tokens/1.txt":
64 |         return "News (N1)"
65 |     elif filename == "data/random-bytes/100-tokens/0.txt":
66 |         return "Rand. Bytes ($m2$)"
67 |         # return "Rand. Words (RW)"
68 |     elif filename == "data/random-words/100-tokens/0.txt":
69 |         return "Rand. Words ($m2$)"
70 |         # return "Rand. Words (RW)"
71 |     else:
72 |         raise ValueError(filename)
73 | 
74 | 
75 | def parse_length(filename: str) -> int:
76 |     # data/random-words/100-tokens/5.txt -> 100
77 |     pattern = re.compile(r"data/[a-z\-]+/(\d+)-tokens/\d\.txt")
78 |     match = pattern.match(filename)
79 | 
80 |     return int(match.group(1))
81 | 
82 | 
83 | def parse_domain(filename: str) -> str:
84 |     # data/random-words/100-tokens/5.txt -> "random-words"
85 |     pattern = re.compile(r"data/([a-z\-]+)/\d+-tokens/\d\.txt")
86 |     match = pattern.match(filename)
87 | 
88 |     return match.group(1)
89 | 


--------------------------------------------------------------------------------
/src/paper/prefix_table.py:
--------------------------------------------------------------------------------
 1 | import collections
 2 | import statistics
 3 | 
 4 | from .. import relic_helpers
 5 | 
 6 | 
 7 | def load_experiments():
 8 |     filters = [
 9 |         "(~ data.file '300-tokens')",
10 |         "(== model.intrinsic_dimension 3000)",
11 |         "(== model.language_model_name_or_path 'gpt2')",
12 |         "(not (== data.prompt_length None))",
13 |         "(not (== data.prompt_type 'n-tokens'))",
14 |         "(not (== data.prompt_type 'chunk-n'))",
15 |     ]
16 | 
17 |     return relic_helpers.load_experiments(filters, show_cmd=True)
18 | 
19 | 
20 | def print_table(experiments):
21 |     # Dict[prefix type, int]
22 |     counts = collections.defaultdict(list)
23 | 
24 |     for experiment in experiments:
25 |         prefix = experiment.config["data"]["prompt_type"]
26 | 
27 |         counts[prefix].append(experiment[0]["epochs"])
28 | 
29 |     results = {}
30 |     for prefix, epochs in counts.items():
31 |         results[prefix] = (statistics.mean(epochs), statistics.stdev(epochs))
32 | 
33 |     mean, std = results["token"]
34 |     print(f"New Token & \\num{{1}} & ${mean:.0f}\pm{std:.1f}$ \\\\")
35 |     mean, std = results["vocab"]
36 |     print(f"Vocab & \\num{{1}} & ${mean:.0f}\pm{std:.1f}$ \\\\")
37 |     mean, std = results["natural-n"]
38 |     print(f"Natural Prompt & \\num{{4}} & ${mean:.0f}\pm{std:.1f}$ \\\\")
39 |     mean, std = results["uuid"]
40 |     print(f"UUID & \\num{{27}} & ${mean:.0f}\pm{std:.1f}$ \\\\")
41 |     mean, std = results["2x-uuid"]
42 |     print(f"$2\\times$ UUID & \\num{{54}} & ${mean:.0f}\pm{std:.1f}$ \\\\")
43 |     mean, std = results["3x-uuid"]
44 |     print(f"$3\\times$ UUID & \\num{{76}} & ${mean:.0f}\pm{std:.1f}$ \\\\")
45 | 
46 | 
47 | def main():
48 |     experiments = load_experiments()
49 | 
50 |     print_table(experiments)
51 | 
52 | 
53 | if __name__ == "__main__":
54 |     main()
55 | 


--------------------------------------------------------------------------------
/src/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OSU-NLP-Group/SELM/bb852ca95f365000373852089f0b530c7d4d35e4/src/test/__init__.py


--------------------------------------------------------------------------------
/src/test/attacking/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OSU-NLP-Group/SELM/bb852ca95f365000373852089f0b530c7d4d35e4/src/test/attacking/__init__.py


--------------------------------------------------------------------------------
/src/test/attacking/test_pipeline.py:
--------------------------------------------------------------------------------
 1 | from ...attacking import pipeline
 2 | 
 3 | 
 4 | def test_calc_worst_score_initial():
 5 |     actual = pipeline.calc_worst_score(0.5, 0.3, 0.7, 0.8)
 6 |     assert actual == 0.8
 7 | 
 8 | 
 9 | def test_calc_worst_score_inside_bounds():
10 |     actual = pipeline.calc_worst_score(0.5, 0.4, 0.6, 0.55)
11 |     assert actual == 0.55
12 | 
13 | 
14 | def test_calc_worst_score_better_score():
15 |     actual = pipeline.calc_worst_score(0.55, 0.4, 0.6, 0.5)
16 |     assert actual == 0.55
17 | 


--------------------------------------------------------------------------------
/src/test/test_modeling.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | 
  3 | import intrinsic
  4 | 
  5 | from .. import config, modeling
  6 | 
  7 | 
  8 | def test_imports():
  9 |     pass
 10 | 
 11 | 
 12 | def check_fft_equality(a, b):
 13 |     return a.d == b.d and a.D == b.D
 14 | 
 15 | 
 16 | def assert_sequential(actual, expected):
 17 |     assert len(actual) == len(expected)
 18 | 
 19 |     for a, e in zip(actual, expected):
 20 |         if isinstance(e, intrinsic.FastfoodTransform):
 21 |             assert check_fft_equality(a, e)
 22 |         else:
 23 |             assert type(a) == type(e)
 24 | 
 25 | 
 26 | def test_project_factory_empty():
 27 |     model_config = config.ModelConfig(
 28 |         language_model_name_or_path="gpt2",
 29 |         projection=config.ProjectionConfig(layers=[]),
 30 |     )
 31 | 
 32 |     int_dim, D = 100, 1000
 33 | 
 34 |     factory = modeling.new_projection_factory(model_config, seed=0)
 35 |     projection = factory(int_dim, D)
 36 | 
 37 |     expected = torch.nn.Sequential(intrinsic.FastfoodTransform(int_dim, D))
 38 | 
 39 |     assert_sequential(projection, expected), str(projection)
 40 | 
 41 | 
 42 | def test_project_factory_nonlinearity():
 43 |     model_config = config.ModelConfig(
 44 |         language_model_name_or_path="gpt2",
 45 |         projection=config.ProjectionConfig(layers=["output", "sigmoid"]),
 46 |     )
 47 | 
 48 |     int_dim, D = 100, 1000
 49 | 
 50 |     factory = modeling.new_projection_factory(model_config, seed=0)
 51 |     projection = factory(int_dim, D)
 52 | 
 53 |     expected = torch.nn.Sequential(
 54 |         intrinsic.FastfoodTransform(int_dim, D), torch.nn.Sigmoid()
 55 |     )
 56 | 
 57 |     assert_sequential(projection, expected)
 58 | 
 59 | 
 60 | def test_project_factory_two_projection():
 61 |     model_config = config.ModelConfig(
 62 |         language_model_name_or_path="gpt2",
 63 |         projection=config.ProjectionConfig(layers=[500, "sigmoid", "output"]),
 64 |     )
 65 | 
 66 |     int_dim, D = 100, 1000
 67 | 
 68 |     factory = modeling.new_projection_factory(model_config, seed=0)
 69 |     projection = factory(int_dim, D)
 70 | 
 71 |     expected = torch.nn.Sequential(
 72 |         intrinsic.FastfoodTransform(int_dim, 500),
 73 |         torch.nn.Sigmoid(),
 74 |         intrinsic.FastfoodTransform(500, 1000),
 75 |     )
 76 | 
 77 |     assert_sequential(projection, expected)
 78 | 
 79 | 
 80 | def test_project_factory_neuralnetwork():
 81 |     model_config = config.ModelConfig(
 82 |         language_model_name_or_path="gpt2",
 83 |         projection=config.ProjectionConfig(
 84 |             layers=[500, "sigmoid", "output", "sigmoid"]
 85 |         ),
 86 |     )
 87 | 
 88 |     int_dim, D = 100, 1000
 89 | 
 90 |     factory = modeling.new_projection_factory(model_config, seed=0)
 91 |     projection = factory(int_dim, D)
 92 | 
 93 |     expected = torch.nn.Sequential(
 94 |         intrinsic.FastfoodTransform(int_dim, 500),
 95 |         torch.nn.Sigmoid(),
 96 |         intrinsic.FastfoodTransform(500, 1000),
 97 |         torch.nn.Sigmoid(),
 98 |     )
 99 | 
100 |     assert_sequential(projection, expected)
101 | 
102 | 
103 | def test_kolmogorov_smirnov_empirical_cdf_simple():
104 |     ks = modeling.KolmogorovSmirnovLoss(None, None, mean=0, std=1)
105 | 
106 |     observations = torch.tensor([0, 0.3, 0.4, 0.8, 1.5])
107 | 
108 |     assert ks.statistic(observations) == 0.5
109 | 


--------------------------------------------------------------------------------
/src/test/test_tokenizing.py:
--------------------------------------------------------------------------------
 1 | import tempfile
 2 | 
 3 | from .. import config, tokenizing
 4 | 
 5 | 
 6 | class DummyTokenizer:
 7 |     eos_token = ord("~")
 8 | 
 9 |     def __init__(self, model_max_length):
10 |         self.model_max_length = model_max_length
11 | 
12 |     def __call__(self, text):
13 |         return {"input_ids": [ord(c) for c in text]}
14 | 
15 |     def decode(self, ids, **kwargs):
16 |         return "".join(chr(i) for i in ids)
17 | 
18 | 
19 | def test_load_chunks_smoke():
20 |     text = "hello world!"
21 |     with tempfile.NamedTemporaryFile() as data_file:
22 |         data_file.write(text.encode())
23 |         data_config = config.DataConfig(data_file.name)
24 | 
25 |     tokenizer = DummyTokenizer(100)
26 |     actual = tokenizing.load_chunks(text, data_config, tokenizer)
27 |     expected = [
28 |         tokenizing.Chunk(str(tokenizing.DEFAULT_PROMPT), text, tokenizer.eos_token)
29 |     ]
30 | 
31 |     assert actual == expected
32 | 


--------------------------------------------------------------------------------
/src/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OSU-NLP-Group/SELM/bb852ca95f365000373852089f0b530c7d4d35e4/src/tools/__init__.py


--------------------------------------------------------------------------------
/src/tools/verify_encryption.py:
--------------------------------------------------------------------------------
 1 | """
 2 | 
 3 | """
 4 | import argparse
 5 | import logging
 6 | 
 7 | import relic
 8 | import torch
 9 | from tqdm.auto import tqdm
10 | 
11 | log_format = "[%(levelname)s] [%(name)s] %(message)s"
12 | logging.basicConfig(level=logging.WARNING, format=log_format)
13 | logger = logging.getLogger("verify-enc")
14 | 
15 | from .. import accelerate, config, evaluating, modeling, relic_helpers, tokenizing
16 | 
17 | 
18 | def init_parser() -> argparse.ArgumentParser:
19 |     parser = argparse.ArgumentParser()
20 |     parser.add_argument(
21 |         "-e",
22 |         "--experiments",
23 |         nargs="+",
24 |         help="Filter experiments based on results. Example: '(all (< epochs 1000))'",
25 |         default=[],
26 |     )
27 |     parser.add_argument(
28 |         "--relics",
29 |         help="Path to relics/ directory",
30 |     )
31 | 
32 |     return parser
33 | 
34 | 
35 | def check_trial_succeeded(exp: relic.Experiment, trial: int) -> bool:
36 |     if trial > len(exp):
37 |         logger.warning("Trial missing. [exp: %s, trial: %d]", exp.hash, trial)
38 |         return False
39 | 
40 |     if not exp[trial]["finished"]:
41 |         logger.warning("Trial not finished. [exp: %s, trial: %d]", exp.hash, trial)
42 |         return False
43 | 
44 |     if not exp[trial]["succeeded"]:
45 |         logger.warning("Trial failed. [exp %s, trial: %d]", exp.hash, trial)
46 |         return False
47 | 
48 |     if not exp.model_exists(trial):
49 |         logger.warning("Model missing. [exp: %s, trial: %d]", exp.hash, trial)
50 |         return False
51 | 
52 |     return True
53 | 
54 | 
55 | def verify_trial(exp: relic.Experiment, trial: int) -> bool:
56 |     saved = torch.load(exp.model_path(trial))
57 |     seed = saved["fastfood_seed"]
58 |     theta_d = saved["theta_d"]
59 | 
60 |     experiment_config = config.ExperimentConfig.from_dict(exp.config)
61 | 
62 |     tokenizer = tokenizing.new(experiment_config.tokenizer)
63 | 
64 |     model = modeling.new(
65 |         experiment_config.model,
66 |         vocab=len(tokenizer),
67 |         seed=seed,
68 |     )
69 | 
70 |     accelerate.prepare(model)
71 | 
72 |     with torch.no_grad():
73 |         model.intrinsic_vector.copy_(theta_d)
74 |         model.set_module_weights()
75 | 
76 |     return evaluating.passes(model, tokenizer, experiment_config, exp[trial]["epochs"])
77 | 
78 | 
79 | def main():
80 |     parser = init_parser()
81 |     args = parser.parse_args()
82 | 
83 |     experiments = relic_helpers.load_experiments(args.experiments, args.relics)
84 | 
85 |     for exp in experiments:
86 |         for trial, _ in enumerate(tqdm(exp)):
87 |             if not check_trial_succeeded(exp, trial):
88 |                 continue
89 | 
90 |             if not verify_trial(exp, trial):
91 |                 print(f"{exp.hash[:8]} {trial}")
92 | 
93 | 
94 | if __name__ == "__main__":
95 |     main()
96 | 


--------------------------------------------------------------------------------
/src/training_utils.py:
--------------------------------------------------------------------------------
 1 | from typing import Callable
 2 | 
 3 | 
 4 | def make_linear_reg_scheduler(warmup: int) -> Callable[[int], float]:
 5 |     def scheduler_fn(step: int) -> float:
 6 |         if step > warmup:
 7 |             return 1.0
 8 | 
 9 |         return step / warmup
10 | 
11 |     return scheduler_fn
12 | 


--------------------------------------------------------------------------------
/src/types.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OSU-NLP-Group/SELM/bb852ca95f365000373852089f0b530c7d4d35e4/src/types.py


--------------------------------------------------------------------------------