├── .gitignore ├── .python-version ├── LICENSE ├── README.md ├── data ├── examples │ ├── advil.txt │ ├── peace.txt │ └── transformers.txt ├── news │ ├── 100-tokens │ │ ├── 0.txt │ │ ├── 1.txt │ │ ├── 2.txt │ │ ├── 3.txt │ │ ├── 4.txt │ │ ├── 5.txt │ │ ├── 6.txt │ │ ├── 7.txt │ │ ├── 8.txt │ │ └── 9.txt │ ├── 1000-tokens │ │ ├── 0.txt │ │ ├── 1.txt │ │ ├── 2.txt │ │ ├── 3.txt │ │ ├── 4.txt │ │ ├── 5.txt │ │ ├── 6.txt │ │ ├── 7.txt │ │ ├── 8.txt │ │ └── 9.txt │ ├── 300-tokens │ │ ├── 0.txt │ │ ├── 1.txt │ │ ├── 2.txt │ │ ├── 3.txt │ │ ├── 4.txt │ │ ├── 5.txt │ │ ├── 6.txt │ │ ├── 7.txt │ │ ├── 8.txt │ │ └── 9.txt │ └── 3000-tokens │ │ ├── 0.txt │ │ ├── 1.txt │ │ ├── 2.txt │ │ ├── 3.txt │ │ ├── 4.txt │ │ ├── 5.txt │ │ ├── 6.txt │ │ ├── 7.txt │ │ ├── 8.txt │ │ └── 9.txt ├── pubmed │ ├── 100-tokens │ │ ├── 0.txt │ │ ├── 1.txt │ │ ├── 2.txt │ │ ├── 3.txt │ │ ├── 4.txt │ │ ├── 5.txt │ │ ├── 6.txt │ │ ├── 7.txt │ │ ├── 8.txt │ │ └── 9.txt │ └── 1000-tokens │ │ ├── 0.txt │ │ ├── 1.txt │ │ ├── 2.txt │ │ ├── 3.txt │ │ ├── 4.txt │ │ ├── 5.txt │ │ ├── 6.txt │ │ ├── 7.txt │ │ ├── 8.txt │ │ └── 9.txt ├── quotes │ ├── albert-einstein.txt │ ├── bernard-m-baruch.txt │ ├── dr-seuss.txt │ ├── frank-zappa.txt │ ├── mae-west.txt │ ├── mahatma-gandhi.txt │ ├── marcus-tullius-cicero.txt │ ├── marilyn-monroe.txt │ ├── oscar-wilde.txt │ └── william-w-purkey.txt ├── random-bytes │ ├── 100-tokens │ │ ├── 0.txt │ │ ├── 1.txt │ │ ├── 2.txt │ │ ├── 3.txt │ │ ├── 4.txt │ │ ├── 5.txt │ │ ├── 6.txt │ │ ├── 7.txt │ │ ├── 8.txt │ │ └── 9.txt │ └── 1000-tokens │ │ ├── 0.txt │ │ ├── 1.txt │ │ ├── 2.txt │ │ ├── 3.txt │ │ ├── 4.txt │ │ ├── 5.txt │ │ ├── 6.txt │ │ ├── 7.txt │ │ ├── 8.txt │ │ └── 9.txt ├── random-words │ ├── 100-tokens │ │ ├── 0.txt │ │ ├── 1.txt │ │ ├── 2.txt │ │ ├── 3.txt │ │ ├── 4.txt │ │ ├── 5.txt │ │ ├── 6.txt │ │ ├── 7.txt │ │ ├── 8.txt │ │ └── 9.txt │ └── 1000-tokens │ │ ├── 0.txt │ │ ├── 1.txt │ │ ├── 2.txt │ │ ├── 3.txt │ │ ├── 4.txt │ │ ├── 5.txt │ │ ├── 6.txt │ │ ├── 7.txt │ │ ├── 8.txt │ │ └── 9.txt ├── reddit │ ├── 0.txt │ ├── 1.txt │ ├── 2.txt │ ├── 3.txt │ ├── 4.txt │ ├── 5.txt │ ├── 6.txt │ ├── 7.txt │ ├── 8.txt │ └── 9.txt ├── scientific │ ├── 0.txt │ ├── 1.txt │ ├── 2.txt │ ├── 3.txt │ ├── 4.txt │ ├── 5.txt │ ├── 6.txt │ ├── 7.txt │ ├── 8.txt │ └── 9.txt ├── supercomputer-traditional.stats ├── twitter │ ├── 0.txt │ ├── 1.txt │ ├── 2.txt │ ├── 3.txt │ ├── 4.txt │ ├── 5.txt │ ├── 6.txt │ ├── 7.txt │ ├── 8.txt │ └── 9.txt └── wikipedia │ └── 100-tokens │ ├── 0.txt │ ├── 1.txt │ ├── 2.txt │ ├── 3.txt │ ├── 4.txt │ ├── 5.txt │ ├── 6.txt │ ├── 7.txt │ ├── 8.txt │ └── 9.txt ├── decrypt.py ├── docs └── REPRODUCE.md ├── encrypt.py ├── experiments └── templates │ └── paper │ ├── different-models-v1.toml │ ├── distribution-regularization-v1.toml │ ├── distribution-regularization-v2.toml │ ├── distribution-regularization-v3.toml │ ├── distribution-regularization-v4.toml │ ├── effects-of-length-v1.toml │ ├── effects-of-size-v1.toml │ ├── l2-norm-v1.toml │ ├── l2-norm-v2.toml │ ├── l2-norm-v3.toml │ ├── original-algorithm-v1.toml │ ├── original-algorithm-v2.toml │ ├── original-algorithm-v3.toml │ ├── perplexity-bounded-v0.toml │ ├── perplexity-bounded-v1.toml │ ├── perplexity-bounded-v2.toml │ ├── rebuttal-v1.toml │ ├── what-can-we-encrypt-v1.toml │ ├── what-can-we-encrypt-v2.toml │ ├── what-can-we-encrypt-v3.toml │ └── what-can-we-encrypt-v4.toml ├── intrinsic ├── .gitignore ├── README.md ├── docs │ └── examples │ │ ├── example.py │ │ └── train_nn.py ├── intrinsic │ ├── __init__.py │ ├── fwh.py │ ├── fwh_cuda 
│ │ ├── fwh_cpp.cpp │ │ └── fwh_cu.cu │ ├── implementation.py │ ├── py.typed │ ├── test │ │ ├── __init__.py │ │ └── test_implementation.py │ └── utils.py └── setup.py ├── pyproject.toml ├── requirements.txt ├── setup.cfg └── src ├── __init__.py ├── accelerate.py ├── attacking ├── __init__.py ├── adaboost.py ├── avalanche.py ├── data.py ├── ffnn.py ├── gradboost.py ├── helpers.py ├── knn.py ├── lda.py ├── random_forest.py ├── semantic_security.py └── svm.py ├── blog └── histograms.py ├── config.py ├── data ├── __init__.py ├── __main__.py ├── news.py ├── openwebtext.py ├── pubmed.py ├── random_sequences.py ├── reddit.py ├── shared.py ├── twitter.py └── wikipedia.py ├── dense.py ├── evaluating.py ├── experiments ├── __init__.py ├── check.py ├── generate.py ├── lib.py └── run.py ├── halton.py ├── intrinsic_utils.py ├── logging.py ├── make_tokenizers.py ├── modeling.py ├── modeling_utils.py ├── paper ├── __init__.py ├── ciphertext_dist_histograms.py ├── comparison.py ├── embeddings.py ├── feature_importance.py ├── helpers.py ├── performance.py ├── prefix_table.py ├── security.py ├── security_histograms.py ├── tables.py └── what_can_we_encrypt.py ├── profiling.py ├── relic_helpers.py ├── templating.py ├── test ├── __init__.py ├── attacking │ ├── __init__.py │ └── test_pipeline.py ├── test_modeling.py ├── test_templating.py ├── test_tokenizing.py └── test_training.py ├── tokenizers ├── 1-byte.json └── 2-byte.json ├── tokenizing.py ├── tools ├── __init__.py └── verify_encryption.py ├── training.py ├── training_utils.py ├── types.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | # editor/python 2 | *.swp 3 | *.swo 4 | __pycache__/ 5 | 6 | # data 7 | data/unversioned/ 8 | venv/ 9 | .venv/ 10 | experiments/generated/ 11 | relics/ 12 | data/cached 13 | 14 | .DS_Store 15 | 16 | # writing & latex 17 | *.graffle 18 | *.log 19 | *.bbl 20 | *.bcf 21 | *.out 22 | *.run.xml 23 | *.blg 24 | *.fls 25 | *.aux 26 | *.fdb_latexmk 27 | *.pdf 28 | notebooks 29 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.9.7 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SELM 2 | 3 | Code and data for the [SELM](https://samuelstevens.me/research/encryption) research project. 4 | 5 | ![teaser-gif-cropped](https://github.com/OSU-NLP-Group/SELM/assets/26638161/b7484c1f-84da-45a9-ba69-0c921c5d87cf) 6 | 7 | ## Table of Contents 8 | 9 | 1. Introduction 10 | 2. Installation 11 | 3. Encrypt Something 12 | 4. Decrypt Something 13 | 5. Experiments 14 | 6. Cryptanalysis 15 | 16 | ## Installation 17 | 18 | Install torch (CUDA): 19 | 20 | ```sh 21 | pip install torch==1.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html 22 | ``` 23 | 24 | Install torch before the packages in `requirements.txt`; otherwise the requirements file will pull in a CPU-only build of torch. 25 | 26 | Install packages: 27 | 28 | ```sh 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | Install the intrinsic package, used for efficient intrinsic dimension operations: 33 | 34 | ```sh 35 | cd intrinsic 36 | python setup.py develop 37 | cd ..
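# Optional sanity check (not in the original instructions; assumes pytest is
# installed): run the intrinsic package's tests to confirm the extension built.
python -m pytest intrinsic/intrinsic/test -q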
38 | ``` 39 | 40 | Initialize `relics/` (the experiment directory): 41 | 42 | ```sh 43 | relic init 44 | ``` 45 | 46 | ## Encrypt Something 47 | 48 | Get a key: 49 | 50 | ```sh 51 | python -c 'import secrets; print(secrets.randbits(32))' 52 | ``` 53 | 54 | Encrypt with your key: 55 | 56 | ```sh 57 | python encrypt.py --key KEY --int-dim 10000 data/examples/advil.txt 58 | ``` 59 | 60 | ## Decrypt Something 61 | 62 | Use the key to decrypt: 63 | 64 | ```sh 65 | python decrypt.py --key KEY advil.bin 66 | ``` 67 | 68 | ## Experiments 69 | 70 | To run a new experiment, define a new `.toml` file in `experiments/` with whatever configuration options you want. `src/config.py` shows all the different options that can be changed. 71 | 72 | `.toml` files can contain lists of values for parameters; when they do, an experiment is generated for each value in the list. For example, `experiments/gpt2/wikipedia/0-4-concat.toml` has two lists: one for `learning_rate` and one for `intrinsic_dimension`. This means the file actually describes 20 experiments: 2 learning rates × 10 intrinsic dimensions. 73 | 74 | To run the experiments: 75 | 76 | ```sh 77 | python -m src.experiments.run experiments/templates/paper/what-can-we-encrypt-v4.toml 78 | ``` 79 | 80 | If you are running out of GPU memory, you can use model parallelism to split the Fastfood transform and the GPT2 model onto separate GPUs: 81 | 82 | ```sh 83 | CUDA_VISIBLE_DEVICES=0,2 MODEL_PARALLELISM=1 python -m src.experiments.run experiments/gpt2/examples/medium.toml 84 | ``` 85 | 86 | You can pass entire directories or just individual `.toml` files to `src.experiments.run`. Results will be saved to `relics/`. 87 | 88 | **If you stop an experiment and run it again, any trials that are finished in `relics/` will not be run again.** 89 | 90 | ## Cryptanalysis 91 | 92 | Unzip the provided data: 93 | 94 | ```sh 95 | unzip relics.zip 96 | ``` 97 | 98 | Play the security game on the original algorithm with an SVM: 99 | 100 | ```sh 101 | python -m src.paper.security svm original feature-fn 500 --ratio 0.8 --quiet 102 | ``` 103 | 104 | Play the security game on the distribution-regularized variant with an SVM: 105 | 106 | ```sh 107 | python -m src.paper.security svm distribution-reg feature-fn 500 --ratio 0.8 --quiet 108 | ``` 109 | 110 | Try to implement stronger attacks! 111 | Look in `src/attacking/` for the model files and add your own. 112 | -------------------------------------------------------------------------------- /data/examples/advil.txt: -------------------------------------------------------------------------------- 1 | Causes of Back Pain 2 | 3 | Lower back pain can occur in people who are overweight, in poor physical shape, have poor posture, or are compelled to sit or stand for long periods of time. Muscle strain is another cause of a troubled back, either from lifting something that is too heavy or by lifting objects incorrectly. Many pregnant women develop lower back pain due to the extra weight they support during pregnancy. 4 | 5 | Another common cause is osteoarthritis (a ”wear-and-tear” condition), fractured vertebrae (the literal "broken back"), and the “slipped” or herniated discs are all serious medical conditions that must be treated by a qualified physician. 6 | -------------------------------------------------------------------------------- /data/examples/peace.txt: -------------------------------------------------------------------------------- 1 | Ethical technology takes us to a world of peace and plenty.
2 | -------------------------------------------------------------------------------- /data/examples/transformers.txt: -------------------------------------------------------------------------------- 1 | A transformer is a deep learning model that adopts the mechanism of attention, differentially weighing the significance of each part of the input data. It is used primarily in the field of natural language processing (NLP)[1] and in computer vision (CV).[2] 2 | 3 | Like recurrent neural networks (RNNs), transformers are designed to handle sequential input data, such as natural language, for tasks such as translation and text summarization. However, unlike RNNs, transformers do not necessarily process the data in order. Rather, the attention mechanism provides context for any position in the input sequence. For example, if the input data is a natural language sentence, the transformer does not need to process the beginning of the sentence before the end. Rather, it identifies the context that confers meaning to each word in the sentence. This feature allows for more parallelization than RNNs and therefore reduces training times. 4 | 5 | Transformers are the model of choice for NLP problems,[3] replacing RNN models such as long short-term memory (LSTM). The additional training parallelization allows training on larger datasets than was once possible. This led to the development of pretrained systems such as BERT (Bidirectional Encoder Representations from Transformers) and GPT (Generative Pre-trained Transformer), which were trained with large language datasets, such as Wikipedia Corpus and Common Crawl, and can be fine-tuned for specific tasks. 6 | -------------------------------------------------------------------------------- /data/news/100-tokens/0.txt: -------------------------------------------------------------------------------- 1 | Goals from Zlatko Junuzovic, Florian Grillitsch and Florian Kainz condemned second-placed Leipzig to their second successive league defeat. 2 | Bayern can extend their 10-point lead when they travel to Borussia Monchengladbach on Sunday. 3 | Borussia Dortmund closed the gap on Leipzig to three points with Friday's 1-0 win at Ingolstadt. 4 | Hoffenheim are a point further back, and boosted their chances -------------------------------------------------------------------------------- /data/news/100-tokens/1.txt: -------------------------------------------------------------------------------- 1 | Polish national Daria Pionko, 21, was found seriously injured in Springwell Road, Holbeck, on 23 December and died later in hospital. 2 | Det Supt Simon Atkinson said Miss Pionko had been the victim of a "sustained and vicious attack". 3 | A 38-year-old man arrested in connection with the incident was released without charge. 4 | A post mortem examination found Miss Pionko died as a result of head and facial injuries. 5 | She was -------------------------------------------------------------------------------- /data/news/100-tokens/2.txt: -------------------------------------------------------------------------------- 1 | "After much soul-searching, it is clearly time for us to live by Michael's words about love not war," wrote Jermaine, in a statement. 2 | Jermaine also withdrew his support for a leaked letter which calls on executors of the estate to resign. 3 | On Thursday his mother, Katherine, was reinstated as guardian of Michael's children along with his cousin, TJ. 
4 | Days earlier, TJ became a temporary guardian for the three children amid reports Katherine, 82, was -------------------------------------------------------------------------------- /data/news/100-tokens/3.txt: -------------------------------------------------------------------------------- 1 | At a news conference with Turkish Foreign Minister Ahmet Davutoglu, Mr Kerry said the two Nato allies shared a common goal - to end the suffering of innocent civilians in Syria. 2 | Turkey and the US both oppose Syrian President Bashar al-Assad, but differ on how best to support the opposition. 3 | The visit has been overshadowed by the Turkish PM's remarks about Zionism. 4 | Recep Tayyip Erdogan earlier this week called Zionism a "crime against humanity" - remarks that have been widely condemned, -------------------------------------------------------------------------------- /data/news/100-tokens/4.txt: -------------------------------------------------------------------------------- 1 | 20 January 2016 Last updated at 12:03 GMT 2 | It had been stuck on the side of the mountain in Snowdonia, Wales, for days and couldn't move from its dangerous position on the ledge. 3 | A rescue team from the RSPCA had to lower themselves down the mountain using ropes to reach the sheep, who was then lowered down with them, to the bottom of the cliff. 4 | They said that the sheep wasn't injured but was very hungry after its cliff-side adventure. -------------------------------------------------------------------------------- /data/news/100-tokens/5.txt: -------------------------------------------------------------------------------- 1 | The first bomb was found during an alert at Ramoan Drive on Saturday morning. 2 | A device found in the Glencolin Walk area of west Belfast later on Saturday has also been declared viable. There is another security alert in Ballygally in County Antrim. 3 | Both alerts in west Belfast have now ended. 4 | The alert in Glencolin Walk followed the discovery of a suspicious object. 5 | The Glen Road was closed between the junctions of Shaw's Road and Suffolk Road. 6 | -------------------------------------------------------------------------------- /data/news/100-tokens/6.txt: -------------------------------------------------------------------------------- 1 | Mr Duterte clarified that he had "nothing against gays", saying several of his relatives were homosexual. 2 | The controversial politician had previously appeared supportive of LGBTQ rights, saying in 2015 that same-sex marriage was "good". 3 | But he is otherwise known for his conservative views especially on crime. 4 | He has waged a much-criticised war on drug users and dealers leading to thousands of extra-judicial killings. 5 | His latest remarks were made on Sunday night to Filipino expatriates in the Burm -------------------------------------------------------------------------------- /data/news/100-tokens/7.txt: -------------------------------------------------------------------------------- 1 | Police said the incident happened at the corner of Jamaica Street and Argyle Street at about 09:45. 2 | Emergency services attended and the man was taken by ambulance to Glasgow Royal Infirmary. There is currently no information on his condition. 3 | Jamaica Street has been closed while police carry out investigations into the circumstances of the incident.Media playback is not supported on this device 4 | Wednesday's match in Lyon will be Wales' first in the last four of a major tournament. 
5 | Coleman -------------------------------------------------------------------------------- /data/news/100-tokens/8.txt: -------------------------------------------------------------------------------- 1 | The proposals are the brain child of Swiss businessman turned politician Thomas Minder, who runs a small family company producing natural cosmetics. 2 | Mr Minder wants shareholders to have a veto over managers' salaries, and to ban golden handshakes altogether. 3 | The "fat cat initiative", as it has come to be called, would, if approved, be written into the Swiss constitution, and would apply to all Swiss companies listed on Switzerland's stock exchange. 4 | Mr Minder, an outspoken man, says -------------------------------------------------------------------------------- /data/news/100-tokens/9.txt: -------------------------------------------------------------------------------- 1 | The woman was hit by the car on Emma Street at about 09:40 on Monday and became trapped between the pavement and the vehicle. 2 | Firefighters used chocks and blocks to stabilise the car and free the casualty. 3 | Police and the ambulance service also attended, and the woman was treated by paramedics for a leg injury.In a letter urging congregations to vote on 7 May, the House of Bishops does not endorse a political party but encourages debate on issues such as nuclear defence and the -------------------------------------------------------------------------------- /data/news/300-tokens/0.txt: -------------------------------------------------------------------------------- 1 | The Premier League team, who are on a two-game post-season tour of the United States and Canada, are scheduled to play Houston on Friday. 2 | Several people have died and dozens have been injured following record rainfall over the weekend. 3 | On Monday, hundreds of basketball fans were trapped inside an arena after an NBA basketball game. 4 | Supporters were advised to stay in their seats overnight following Houston Rockets' win against Golden State Warriors in the NBA Western Conference Finals. 5 | Many spent almost 11 hours at the Toyota Center, until the early hours of Tuesday morning. 6 | Manchester City are currently in Canada, ahead of their match against Toronto FC on Thursday.Wiggins, 36, had hinted the race in the city of his birth could be his last, but afterwards said he was "not sure yet" what his plans are. 7 | "I don't know, I've still got really good legs," he said. 8 | "This might not be my last race. This for sure is my last ever race with Mark Cavendish, though." 9 | The 2012 Tour de France winner added that he "just wants to enjoy this moment". 10 | Wiggins' admission follows his comments after last month's London Six Day, when he hinted he could be tempted to race there again next year. 11 | Wiggins and Cavendish claimed overall victory in Ghent after winning the final madison event. 12 | The pair also contested the Ghent Six in 2007 and won madison gold together at the World Track Championships in London -------------------------------------------------------------------------------- /data/news/300-tokens/1.txt: -------------------------------------------------------------------------------- 1 | Daniel James' late penalty gave Wales a first win in the competition after an opening draw against hosts France. 2 | Page's side face Ivory Coast in their final Group B game on Monday. 3 | "It's always a good sign when you win games of football and you haven't been at your best," said former Wales defender Page. 
4 | "We probably played better against France and didn't win. 5 | "If someone would have said before the tournament that we'd be two games in and with four points going into the third game I'd snap their hand off. 6 | "It's our first time in this tournament and we've been up against excellent opposition against France and a different challenge against Bahrain." 7 | Wales are second after their win over Bahrain with next opponents Ivory Coast top of Group B after they beat France.Save Fenton Town Hall say they are protesting because the Ministry of Justice (MoJ) is selling the building. 8 | They say a World War One memorial is at risk of being destroyed if the building is sold. 9 | However, the government has said "a legal covenant" means any buyer would have a duty to preserve the memorial. 10 | The group said they wanted to see the building preserved for community use and were prepared to stay there all night. 11 | The magistrates' court, which was built in 1886 as a town hall, closed in December 2012 as part of government plans to shut 93 courts in England and Wales in a bid to save £41m. 12 | The -------------------------------------------------------------------------------- /data/news/300-tokens/2.txt: -------------------------------------------------------------------------------- 1 | Morocco start their Group C campaign against DR Congo on Monday, then face Togo on 20 January and take on Ivory Coast four days later. 2 | Rherras, 23, and Cameroon midfielder Arnaud Djoum, 27, will miss Hearts' Scottish Cup meeting with Raith Rovers on 22 January. 3 | Cameroon are in Group A with Burkina Faso, Guinea-Bissau and hosts Gabon. 4 | Djoum has five international caps while Rherras made his Morocco debut in August. 5 | Scottish Premiership clubs are currently on their winter break and Hearts' next league fixture is away to Celtic on 29 January.Local photographer Ron Strathdee captured the phenomenon on Monday at about 23:30 BST. 6 | The glow is usually best seen from northern latitudes like Norway, Alaska, Iceland and northern Scotland. 7 | Mr Strathdee said seeing the Northrn Lights from Manx latitudes was "fairly unusual." 8 | They happen when incoming solar radiation hits the earth's upper atmosphere and excites atoms to a new energy state, emitting energy in the form of light. 9 | The photographer said: "I needed a place that faced north so went to Peel Hill and tried some shots over the castle which worked but half the fishing boats in the Irish Sea were discharging fish at the breakwater with enough floodlights to cover a football match! 10 | "Going round the front of the castle it was pitch dark and it looks straight north -------------------------------------------------------------------------------- /data/news/300-tokens/3.txt: -------------------------------------------------------------------------------- 1 | Media playback is not supported on this device 2 | Ainslie, 35, admits it would have been "difficult to top" the feeling of winning at his home Olympics and is keen to move on with new challenges. 3 | "It was a tough decision," Ainslie told BBC Sport. 4 | "I've had a fantastic Olympic career but I want to make it clear that the focus is now on the Americas Cup." 5 | Ainslie won a silver medal at Atlanta 1996, with golds in Sydney, Athens, Beijing and London. He has won more medals than any other sailor, ahead of Denmark's Paul Elvstrom, who has four golds. 
6 | He is fourth in Britain's all-time individual medallist's list, behind Sir Chris Hoy (cycling; six gold, one silver), Sir Steve Redgrave (rowing; five gold, one bronze) and Bradley Wiggins (cycling; four gold, one silver, two bronze). 7 | Media playback is not supported on this device 8 | Ainslie added: "I considered all of the factors - my fitness and the issues with my back, the venue for the next Olympics and the type of boats, but what it really came down to was this opportunity with the Americas Cup." 9 | The sailor envisages skippering his Ben Ainslie Racing AC45 catamaran to glory in the historic competition will be one of the "biggest tests" of his career. 10 | "It's always been a dream -------------------------------------------------------------------------------- /data/news/300-tokens/4.txt: -------------------------------------------------------------------------------- 1 | Kenneth Gibson and his wife Patricia - the MP for North Ayrshire and Arran - lost their baby towards the end of her pregnancy in 2009. 2 | Staff at the Southern General hospital in Glasgow had failed to spot that Mrs Gibson had pre-eclampsia. 3 | A review of baby deaths at Crosshouse Hospital was announced on Tuesday. 4 | It came after a BBC Scotland investigation revealed that there had been six so-called "unnecessary" deaths of babies at the hospital since 2008. 5 | "Unnecessary" or "avoidable" deaths are referred to as those where harm was caused to a healthy baby during childbirth - usually resulting in them being deprived of oxygen. 6 | Reports into some of the deaths referred to failings in monitoring of the child's heartbeat during childbirth. 7 | Speaking in the Holyrood chamber, Mr Gibson urged Health Secretary Shona Robison to make the review "wider and deeper" so it could look at similar cases in other hospitals across the country. 8 | His wife has previously opened up in the House of Commons about the "devastating" effect of losing her baby. 9 | Mr Gibson said: "On her due date in 2009, my wife Patricia, having been sent home and been physically sick, was finally admitted to the Southern General maternity unit despite their protests. 10 | "A consultant junior doctor and two midwives examined her that day. Despite being 41, a first-time mother and in extreme pain from head to toe, no-one picked up -------------------------------------------------------------------------------- /data/news/300-tokens/5.txt: -------------------------------------------------------------------------------- 1 | A large tipper truck was observed delivering the tyres under cover of darkness on Wednesday night. 2 | A council spokesperson said the bonfire was on land owned by the Housing Executive (NIHE) in Ballybeen. 3 | "The council has been in touch with the NIHE to raise the issue of the tyres at the site and to request the matter is investigated," they said. 4 | "The Northern Ireland Environment Agency has also been contacted by the council regarding the nature and volume of the bonfire material." 5 | Asked about the burning of tyres on bonfires on the Radio Ulster's Nolan programme, community worker Jim Wilson there was "a very, very small minority we are talking about where we have problems". 6 | "You're talking about businesses making money out of it, and that comes down to the PSNI to deal with it." 7 | Meanwhile, Belfast City Council said it had received reports that tyres had been collected at a bonfire at Avoneil in the east of the city. 
8 | "We have been engaging with the local community at this site to have the tyres removed and will continue in our efforts to manage the negative impacts of the bonfire which includes the burning of tyres," a spokesperson said. 9 | Ulster Unionist Cllr Jim Rodgers said the inclusion of tyres in bonfires was "a worrying development". 10 | He said that councillors had met the police and asked them to be "more pro-active" regarding bonfires.DUP leader Arlene Foster said the "wide -------------------------------------------------------------------------------- /data/news/300-tokens/6.txt: -------------------------------------------------------------------------------- 1 | The patents include one that relates to the front face of the iPhone and one for touch-screen technology. 2 | It is another win for Apple, after it was awarded $1.05bn (£652m) in damages by a jury in a separate case in August. 3 | The ITC can block the import of products into the US. 4 | The judge's ruling will go in front of a full commission, which is scheduled to conclude its investigation in February. 5 | Judge Thomas Pender agreed that Samsung violated four of Apple's patents, but was not in violation of two others listed by Apple in the complaint. 6 | Three of the patents are related to software features, while one covers Apple's hardware. 7 | However, the Samsung products in this case do not include its latest devices, limiting the impact of a potential import ban into the US. 8 | Samsung has repeatedly argued that any sales ban would limit choice and raise prices for consumers in the US. 9 | Apple and Samsung have bought legal cases against each other in more than 10 countries, each accusing the other of violating patents, as the two battle for market share in the hugely lucrative mobile industry.Offences for which lower compensation was awarded included ones involving drink, drugs or violence. 10 | The Criminal Injuries Compensation Authority said statutory guidance obliged it to reduce or refuse awards if victims had unspent convictions. 11 | A leading child abuse lawyer called for a review of the "scandalous" approach. 12 | Alan Collins said civil case judges increasingly took the opposite -------------------------------------------------------------------------------- /data/news/300-tokens/7.txt: -------------------------------------------------------------------------------- 1 | The six-year-old victor, ridden by Noel Fehily, was cut to about 16-1 from 66-1 for the Cheltenham Gold Cup in March. 2 | Alary, trained by Colin Tizzard, had been considered a Gold Cup hope but was pulled up before the third last fence. 3 | Bristol De Mai sealed a Haydock double for Nigel Twiston-Davies after The New One became the first horse to win the Champion Hurdle Trial three times. 4 | The nine-year-old, ridden by Sam Twiston-Davies for his trainer father, produced a gutsy display to edge past runner-up Clyne. 5 | Unbeaten Neon Wolf ran out a nine-length victor of the novices' hurdle, while 2014 Champion Hurdle winner Jezki returned from a 632-day absence with a comfortable success at Navan. 6 | Earlier, Ascot's Grade One meeting and Taunton's card on Saturday were called off because of frozen ground. 7 | The Ascot fixture was due to feature the Clarence House Chase, which has been rescheduled to take place at Cheltenham on Festival Trials Day on 28 January. 8 | Cheltenham will now have a nine-race card next week, with racing starting at midday. 
9 | Cornelius Lysaght, BBC horse racing correspondent 10 | Abandoned Ascot was billed as the day's top fixture, but had it been on -------------------------------------------------------------------------------- /data/news/300-tokens/8.txt: -------------------------------------------------------------------------------- 1 | Dubbed Nuit Debout (Up All Night), it is a self-styled "popular assembly" in which participants share views about politics and the state of the world. 2 | As night descends, the speakers stand patiently in line and, turn by turn, take the microphone for their allotted five minutes. 3 | Before them, sitting in twos and threes on paving stones, the young audience responds with the occasional cheer or boo. 4 | Not that there is a huge amount to react to. The speeches are rambling and platitudinous. 5 | One orator says the essence behind society should be "values" - but she does not say which. 6 | Another urges an end to hierarchy - "no more pride, no more ego - just ideas". 7 | A third wants to speak of human rights abuses in the Democratic Republic of Congo. 8 | One theme that recurs is the need to tolerate divergences of opinion. This is significant. Two nights previously, one of France's best-known philosophers - a man who a generation ago would have himself been at the mike - was spat on and told to leave. 9 | Both speakers and listeners appear to be mainly students - an impression confirmed by a tour of the various "stands". 10 | The feminists are in a large huddle, and I am asked not to take photographs. Elsewhere, a screen shows a laborious film made by a woman who took a job distributing junk mail and wants to expose the exploitation. 11 | There -------------------------------------------------------------------------------- /data/news/300-tokens/9.txt: -------------------------------------------------------------------------------- 1 | Officers are investigating whether one of the men fell from a 12th floor balcony of the 19-storey Donside Court, in the Tillydrone area of the city. 2 | Emergency services were called to the building at about 20:15 on Tuesday evening. 3 | A witness has described hearing a woman screaming "help me, help me" before police arrived. 4 | Police Scotland said it believed the incident was contained and there was not a threat to the wider community. 5 | A blue forensics tent had been erected inside a police cordon at the foot of the building, close to the main entrance. 6 | An eyewitness said they saw a man fall from the 12th floor of the building, which police confirmed they were pursuing as a line of inquiry. 7 | Another witness, Toni Dey, 19, who lives a short distance away in Gordon's Mills Road, said she heard screaming from the block. 8 | The mother of two said: "I heard some girl screaming 'Help me, help me', then about 10 minutes after I heard loads of screaming and shouting. 9 | "I didn't call the police as I thought it was kids messing around, then I heard loads of police. 10 | "It's very scary to think that something like that had happened. All I kept thinking was 'Why was that poor girl shouting help me?', and about a two-minute walk out my back door. 11 | "I just moved in here in May and it's been so quiet. 
I was -------------------------------------------------------------------------------- /data/pubmed/100-tokens/0.txt: -------------------------------------------------------------------------------- 1 | The pharmacokinetics of Casodex have been investigated in patients with prostatic carcinoma following single oral doses of 10 mg, 30 mg and 50 mg and during daily administration at these dose levels. Casodex displays prolonged absorption following a single dose, with peak plasma concentrations observed at up to 8 h for doses of 10 mg and 30 mg and up to 48 h for the 50 mg dose. The area under the plasma concentration-time curve increased linearly with dose, and Casodex was -------------------------------------------------------------------------------- /data/pubmed/100-tokens/1.txt: -------------------------------------------------------------------------------- 1 | Most psychoanalytic literature dealing with incest holds the premise that the act took place between a parent and a child of opposite sex. Incidentally, most of these cases involve a father-daughter incest (e.g. research by Julien Bigras). However, this is only one of four mathematically possible combinations. For instance, we tend to underestimate the occurrences and, consequently, the repercussions of mother-daughter incest relationships. The biological and psychological importance of the mother in the child's -------------------------------------------------------------------------------- /data/pubmed/100-tokens/2.txt: -------------------------------------------------------------------------------- 1 | Temporomandibular joint arthoscopy is a new diagnostic and therapeutic modality and its development is in its infancy. The purpose of this article is to describe arthroscopic pathologic findings of the superior joint space.Thirty seven cardiac transplants have taken place at the National Cardiac Centre in Ireland since 1985. Data is presented on three still-surviving male patients aged 19 to 42 who received cardiac transplants in 1985 and 1986. Circulating levels of blood cyclospor -------------------------------------------------------------------------------- /data/pubmed/100-tokens/3.txt: -------------------------------------------------------------------------------- 1 | The structure, thermodynamic, and kinetic features of polyunsaturated fatty acids derivatives as the main substratum of lipid peroxidation (POL) have been considered. The heats of key POL reactions have been estimated. Kinetic consequences of these estimations have been analyzed. The dependence of POL rate on O2 concentration have been considered in detail both in the absence and in the presence of antioxidants. The essential features of POL processes in lipid bilayers resulting from the specific structure and molecular dynamics have been discussed -------------------------------------------------------------------------------- /data/pubmed/100-tokens/4.txt: -------------------------------------------------------------------------------- 1 | In eighteen adult patients scheduled for cardiac and vascular surgery, shed blood was treated with the Haemonetics Cell Saver Haemolite. On average by patient, the autologous blood volume restored was 471.94 +/- 235.7 ml. The haemoglobin level was 16.88 g.dl-1 and haematocrit level was 49.31 +/- 7.2%. Thirteen by eighteen patients did not require any homologous blood transfusion. 
The Cell S -------------------------------------------------------------------------------- /data/pubmed/100-tokens/5.txt: -------------------------------------------------------------------------------- 1 | In vitro and in vivo studies were performed to clarify the nature of some interactions between Plasmodium berghei sporozoites and rodent host cells. Videomicroscopic observations were made on in vitro interactions between sporozoites and cultured host cells (rodent peritoneal macrophages, W138 human lung fibroblasts, and HepG2 human hepatoma cells). The results showed a diversity of dynamic interactions and sporozoite activities, including active sporozoite penetration, -------------------------------------------------------------------------------- /data/pubmed/100-tokens/6.txt: -------------------------------------------------------------------------------- 1 | The volume of isoflow (VisoV) in 29 asymptomatic smokers with normal radiographic and pulmonary function studies was studied, with the purpose of investigating the presence of early alterations of the transitional airways in a group of 256 chronic smokers. The VisoV was measured using a body test plethysmograph, and two flow-volume curves with air and with helium were performed. 13 out of the 29 subjects (45%) showed an abnormal VisoV of between 17 -------------------------------------------------------------------------------- /data/pubmed/100-tokens/7.txt: -------------------------------------------------------------------------------- 1 | Our previous experiments (14) (15) dealt with the dynamic of the fluid-coagulant equilibrium in rats after exposure to confinement hypokinesia (CHK) at different time intervals, 1, 3, 7, 21, 28 and 35 days. We found a decreased Quiq and Howell time (QT and HT) in some relation with the time of exposure, with a tendency to normalization after 21 days. The same evolution was shown by the number of thromb -------------------------------------------------------------------------------- /data/pubmed/100-tokens/8.txt: -------------------------------------------------------------------------------- 1 | Our series includes the first 19 consecutive patients with pulmonary valvar stenosis who underwent balloon valvuloplasty in our hospital. All of them have been evaluated by an echo-Doppler study at different follow-up times, and in 10 patients the study was also performed prior to the dilatation. The ages ranged between 0.4 and 10 years (mean 3.5 years; standard deviation [SD] 2.53 years). The prevalvuloplasty degree of pulmon -------------------------------------------------------------------------------- /data/pubmed/100-tokens/9.txt: -------------------------------------------------------------------------------- 1 | The technique of ABR testing was applied to 25 infants with neonatal hyperbilirubinemia at levels exceeding that for exchange transfusion, in an attempt to study potential influence of bilirubin toxicity on the auditory brainstem pathway. The test was performed at a mean conceptional age of 40.4 +/- 0.6 weeks just after discharge. Twenty normal term neonates of comparable birth weights and conceptional ages, who had no hyperbilirubinemia, were also examined for -------------------------------------------------------------------------------- /data/quotes/albert-einstein.txt: -------------------------------------------------------------------------------- 1 | "Two things are infinite: the universe and human stupidity; and I'm not sure about the universe." 
2 | -------------------------------------------------------------------------------- /data/quotes/bernard-m-baruch.txt: -------------------------------------------------------------------------------- 1 | "Be who you are and say what you feel, because those who mind don't matter, and those who matter don't mind." 2 | -------------------------------------------------------------------------------- /data/quotes/dr-seuss.txt: -------------------------------------------------------------------------------- 1 | "You know you're in love when you can't fall asleep because reality is finally better than your dreams." 2 | -------------------------------------------------------------------------------- /data/quotes/frank-zappa.txt: -------------------------------------------------------------------------------- 1 | "So many books, so little time." 2 | -------------------------------------------------------------------------------- /data/quotes/mae-west.txt: -------------------------------------------------------------------------------- 1 | "You only live once, but if you do it right, once is enough." 2 | -------------------------------------------------------------------------------- /data/quotes/mahatma-gandhi.txt: -------------------------------------------------------------------------------- 1 | "Be the change that you wish to see in the world." 2 | -------------------------------------------------------------------------------- /data/quotes/marcus-tullius-cicero.txt: -------------------------------------------------------------------------------- 1 | "A room without books is like a body without a soul." 2 | -------------------------------------------------------------------------------- /data/quotes/marilyn-monroe.txt: -------------------------------------------------------------------------------- 1 | "I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best." 2 | -------------------------------------------------------------------------------- /data/quotes/oscar-wilde.txt: -------------------------------------------------------------------------------- 1 | "Be yourself; everyone else is already taken." 2 | -------------------------------------------------------------------------------- /data/quotes/william-w-purkey.txt: -------------------------------------------------------------------------------- 1 | "You've gotta dance like there's nobody watching, 2 | Love like you'll never be hurt, 3 | Sing like there's nobody listening, 4 | And live like it's heaven on earth." 
5 | -------------------------------------------------------------------------------- /data/random-bytes/100-tokens/0.txt: -------------------------------------------------------------------------------- 1 | –«¨ÅW¥~3=^¾ú¢«ÑQ=U¥&ÎG/@ÄÜû$J˜ŠZ>íþ«eý¾géº~Œº&¦mºÒ¿ÞÌt"”Ò÷݄!Œ‡>� -------------------------------------------------------------------------------- /data/random-bytes/100-tokens/1.txt: -------------------------------------------------------------------------------- 1 | pW û-ÕÀ¢ññªµ:®ÜlX{SXl³´jhÏ|¸”ØÎàìvZ‡DeP¤My²PsŒíñŒn§BFº~õÁ0ہ"ÎæI´Å6\,•� -------------------------------------------------------------------------------- /data/random-bytes/100-tokens/2.txt: -------------------------------------------------------------------------------- 1 | N.3Õ°ß6çÚ 2VɸêM„™Þã-÷*ðÓ){͒ÛþcþMüÀ•DpX҇¤ *ÎÍ3&]|á¹þW¥TÈYBõ£ƒs…(i- -------------------------------------------------------------------------------- /data/random-bytes/100-tokens/3.txt: -------------------------------------------------------------------------------- 1 | pºˆÜv߇Æ[5UEŸ?%#)^ò[w)¾ Pc;¦ïä|.óñºÊØZß]`^»„¡»K^ï58¹é’J³ùìGw 1vLsÇwйŅí -------------------------------------------------------------------------------- /data/random-bytes/100-tokens/4.txt: -------------------------------------------------------------------------------- 1 | î'p=­½Ö¨âŸÄÖ`K¤ä.ß[«|꽝>Ö±iò[ĸæAsÇÉp'¼é[Ô²Xx‹êRQ¢•;Š\Ñz6¸`<<Šm0ìû¼� -------------------------------------------------------------------------------- /data/random-bytes/100-tokens/5.txt: -------------------------------------------------------------------------------- 1 | ‰ù(úg8ÚgsK#Qbõ:A^ˆGîÇÖ܂abJϰ÷Òvó\wXÄR>Å|÷Èý› ˆ‹“S`´RÚCǜ~´d—·þ ´5h� -------------------------------------------------------------------------------- /data/random-bytes/100-tokens/6.txt: -------------------------------------------------------------------------------- 1 | Ë^­}ƒ–xC3 §`u¨|šîåFÌL ëî}ƒS½ªÕ(ò\“bÅ·ƒB52~:\ËV®û„ÕhÝl ͈ÍÒ¯ÛH ÜÇe< -------------------------------------------------------------------------------- /data/random-bytes/100-tokens/7.txt: -------------------------------------------------------------------------------- 1 | µÁ׆§£U™Gâ;4ãI?PUã’<X©N(Ôl?ßé»wt~ûb«®ìh,šQ•sۍ)²_ìí{ ñÔ°*odÁÔã ñ·AâÊovm;iµ4d� -------------------------------------------------------------------------------- /data/random-bytes/100-tokens/8.txt: -------------------------------------------------------------------------------- 1 | ëVÒꊜàÑÓÑØ|z…HØ> ^·ù<$YؤmuΕJ’Ôô§ÌíQðÖ!ëÿJ8±Y)alo<Àï m·ÞpEKOÜÛ3.w -------------------------------------------------------------------------------- /data/random-bytes/100-tokens/9.txt: -------------------------------------------------------------------------------- 1 | ­`?Ç` ÁšXہ™­Vë:£J%^¡µ¨§iå>Ÿj‰ãÛs;…#¨€Õ‰KƒG–âŠ45Á6µPë4H¤K<îé·dtà"¸ƒJ¦w -------------------------------------------------------------------------------- /data/random-bytes/1000-tokens/0.txt: -------------------------------------------------------------------------------- 1 | åøÂÿ— Íðv 2 | {ÏïՓgñ/Æÿ'9{›|³ô!ýg…)ºÉM/` µÉo– 3 | p8Ýʓƒ5Ü_;̄kÒõLà­Ù­tyÌb¾ˆ”vùÌ>†qÖãy˜KlâWìXঠ4 | LmÙùR6iÅÒh2b¹‹ÔÔxÑÔÛí¬Jh@áFjë³ –—þ)0¢€ø’Ó=p ªøÀ?ô YîŒ[£Z¥ìŽ…}ªWÛêâj–Õur:‰! 
5 | '¢S%ìÞt³K6Ëõþíar“J$a5û*âQýYDê´>?w[əéºÎmZ”ôŸ}BÙ ¨,¬ O½Ê)øùãÙµð^£AFìP§ÉÐôá§¢ñ{P›¾€âmŒèp€‹c`Ú8vDx¯âh¶8¦OÌÔ´a_]Æþ͔y—>„ 6 | g¦–µïGkWq6°CÛ¡7íÈúY€f «É\ È«×ÂÇ.ãöÔ ÍíÉY=1†»cg _Z‘]`Bª­ŸŽ ‹ÌôòUªCžëß3d<õ¯Ù‰({œôôgGÏj¢w½¼ª” ¢¥0f¾½5T¤”ñKÄlkœ²þFàMf,ôbÁÉ°MRÿNͶ ´Ê±ö)ÄQ␀WëÁ0r ©‚áé3ç~Âݾ™+h¸Ìؘkqiµ+ d{@¢“ÉšW<s’5ô-Æ n·ß¶u˜§Ê­Ò̝#{"®¦ô7š½˜5xƒóècÓäZù‚ iðý/[2~W¦…£§™zmè½õ°«aª35QŽIe¸FÍºÐâªCã„Ó¹B2ä¹ ¡q5?Äè*m¿äói[‘éH)¿¯ô¢?77k ûżTÆ ¦ª•\Kÿ“ ùnØÚ /?Õ3ûÝ'ŽK 7 | з¿B6x±=—[ñ_Ïb)‰?^æ³IT­¿? ÷s²¦‹tenp¥ÌÄ)ĹFÊpH€U@ýú2®ǵa°Ia@|q–)¿½ œSDž ö� -------------------------------------------------------------------------------- /data/random-bytes/1000-tokens/1.txt: -------------------------------------------------------------------------------- 1 | á¥zˇ:¡>ˆG¹AXMS»oÁwAèDÜj-iu×h-Mñ4‡'ªà¢ý›@öØŠ“½ª`€«DÕ[éS×çju–¦þ¼tPȔ ưmp_ÿÁÊÏͯØ"‹¡qƒ{ô‘¬&'ϰΠ2 | áÇnáN‡çÀñ&5;рm}d&/›ë,ÝT|I/ð¸\eU9'»4ä§Z'žÞäâQȗ4;Ëc;TÜôæºdgÙCrª½ÿÁs8_ЇÝSA„¸…# ŒÀ÷ác8ðrâ$ş踥±ÎõÖ+ìW^ˆÍ¯ỹ÷’ðúށM’ØÛˆaé[>h°sÔºÎC¸ 䔈LÆÃ‘Ú 3 | öñM¿YB¤äü/Ê»¦µ¶‹QRiôÿŒ”ùxùŒe_` fB‰uQ ½[[[ –›•KêMýHœ×Y-wŒ_'iìJ mC¦ó\><äÛ"ø\ÒgM4Sa`£b¼Ðlð^ò"Z8tôä_Á¥j:i:Z.ÁœZ?¥‚c6™>y,UœŒjVîš§$txQi·àÂß®ÓÑ:w ›¬Òí¢4âë݌-2õü4Ûj0ƒJøØ5&ÁŠ>¦a±K3f¶_Ö֓Ûü$¦ùWÃÜÂÂtÀލ¡xúx¢_7 #+Iio¯š&Á¬? ¤gãõvïHôæáÇ,b_ë&°ˆ¢õÃU¿vÍE=Ô̰-_üÂÓp"‘«‘8i5»S¤9†hQRÇMÄ®µü @7³µÎµÖA÷×£ 4 | exU”f?ųR#e’^ƒÒyªµv4Üã±4u6ðü¦¾j§nµ º›úQ5@Døúê[ɆWG¾~õm:d8#ïbyoÝå³óumª‘bP_¹ræ6¤¢ÿTŸÚ}€d?òmDJšó´Û>9ûöZÑ1¡âU"a=§ÙRù&`T„sލålö!ð¡–èoڞäù/ó/T+}1‰6Ø»$©KIª{†dFôcø]t:]¥6ÃY -------------------------------------------------------------------------------- /data/random-bytes/1000-tokens/2.txt: -------------------------------------------------------------------------------- 1 | öhÌ6ìs7AãÙ3ç|ÏêN;騑ãÚ1þ ´÷4xÝë÷qËOi;¬«Û‹+n‚ûP³§ì³³±È4égZLw`¢²[èʉÐ~s¯RÎõÉ-‹MC·*Ʋ ßýIÈuòÌêâØ ¶p¯õèurš:ÅжÉñêÚº#ÄÓÊ“ßwŠ—V1Qà±Ñ1¼ 2 | åÍÒ#ëÝ­͉ßÊÕ,~ xdïÉÿhX;â}ßÂy ãìø 3 | b!l&JkÅ+ öǞuç&-A Í+– ߬ëOÊ|Љ™b&-¶EÂ7Eø-zÛñ9Îü¦>ªE±î4ԁv}å,ñ÷FUDÑBm„ÌC$£ú²K(œ{9úב{‡ïî’J[ª–áJ“}¯· àÀVT2,«K8wz|4ÊК5›ÿöõÌþ· 4 | ê‘ãHŒKøÂ:@ñ£€ö/.Äb)7 ·GÓåž*”s€ù6Î,%8­Ôݙ£¨ïÇHw13˜tºhð9³ÝÒ»ä7ðÄÏ …§û’Ù–ìçŝ 5 | OŠ—Œ¶ZÏ}Üõ†9RC|=Wúz=!›•În°yÔ®~Ã7dǪŸJ9xÐÇß´*~Á©ýre¤[dUCvÅÑñ8·§òý#:G€P½2üÂþƒñÆeŒ¾"ûs9f+MBÙ?wŠLa£‘tVRTY/%.Žš;.7Þ$..È~~Ĩ´M,aß›ýÙ¬ÿRyÛ&¶0º>4Ô´HÞ/Yk³ö‡[¿ËkÓz¼ùj>eÍ`5ë®9í<¼T;C`@óÌÙöp]d)§(¨¼dNkóór¯YDz2ا$×ã‡M#Z«d)cÙó“7ÙÎâÉõö1Šs–×Y`¿eu§¡w }§ú*Å¢¡½[—±ZÿL“RÕˀ¡ª&ˆW{Z¦O‡D_úÜÃølʱ´Bðàœ˜í&nm;.!:t¹ù3u~}…gÄ®:#@¿t1ÁäÅ7…€ÜöO,"¢„#àì‚ü;HIü'ÐvþÏØéÁNtÍÀ§kyØÞ€Ä¯Ðâ"r/bÀ±¾Pwbęn˜Z?`44sžÃhrZ»1<,'a6.°ŽpcÍíZºà° f‹€$öoÌ<sbªp#Öñǹ®².à–'Ê={c†Ä6’ýÆêü˜ÿNCy×û‘ ÒPÀôL%¨úk}`4€º]oª¹4/‰}ɛMÐh“üà Ø¥¦ñÙ^{(2k³oĤ°Þ:Á¿ÏV–~—[¼¥î¼–h‚ 7ï—Ÿ¦ uë\]ã†7¢ãÿ[–œ*ÈÐwÚ7OЂx{…'ŸoSÒæúÐ?(nî–~ÚQ¼¯ò ‡Ÿf?°¬ÛPxú¾ʞçVŒMk 3 | ­ž6ÜZ¼„oob§×P:…ݝ9¢Ù²‡G½ªÔÛ%[²Àº©ë8 Gš‘Hà¸O[ªwòÊ‹ÑÕg ò$Ʃ蟅„è)‘dãšæ¥¯0ƒ *ƑlyûC+¼2֌ ±@F>á^¬Ÿ?Ô[“TùžýL3°à’¦òŒ\TeÈ bìèRÒ­o¥Gëòå4&Ü~¿Ó€‹·Æ¤¸ãV¼pm[?ˆæç§ƒâ`T³g{%âý±ñüêÌcx2WóSR•C -------------------------------------------------------------------------------- /data/random-bytes/1000-tokens/6.txt: -------------------------------------------------------------------------------- 1 | Lœu¤Ï\r#ªÇ?‰[}ÑÅã¸S.L£H¹Ò×E"mӜ̼|Ò9r†²i“E/âÂh苋¨v]åkÔd‘¶08î¹s<ûl×ÕZ'‹^¸9K0Êeîßÿ´1Äw°±ê¦X‹>jœ­©m Ê3늫½•Ægõ/æ.³ºïËVÎ-„Fe¸?oí-©]úÈj(ÙLÕÈT‡bzW©p>'6$ÒdL 2 | R£6.=ê•SZs’¶—ÅxÛYÁ*Ǽϟñ:°mΌT¹”νdûGŒb§ÓØ2&$©¥jëûj"O0&¢· C3³÷öó‡ùêä^×~^`Pûtbd( ×vúŒ7Üm꘽ðʟ¿V1b4©ÓÐüæúøÌLÞñeÅãüï<§ŽÌy­$€hjóWZÿß3®¸òW3Mé½+¯µb=xoHä[wÈÇޝ"R\æ­Ë]Ó(Ç"nC Èµòt¶Eµ ©3¢Ÿ;…É}ô‰v5ÉþӋ(§¡³©!¤ß–»¸a…È㊠ZsÑSP+§9ÄÝ|ÚI‰šÎbàJ#PÐØˆpI¶+ñ}’LcÃÔoWpÏe :uË?¾'#u'8Õ¬±j` {3 ë,ëêðÎxŠïÐpÊÝÄ @DÐA‰CÚ,Ô 4 | dYQù‡ìÐÁ ÈÙoÚÆ™„Oéqå{X ³¯£•w1´\8¡À¹%ÄEå¡ÇæÑ -------------------------------------------------------------------------------- /data/random-bytes/1000-tokens/7.txt: 
-------------------------------------------------------------------------------- 1 | …¡üŠ%0´> H5ˆ¼P°C2ÇpVh¡HM 2 | ÷*HGCSÁ¨«¬`0S÷ ƕpW€ªÌ—fW&T¸4=JÐS†¬ ¡&+Ù³ö°b_JmÜyé˜qˆO“äò8WAӗ™£›Æ‰>–opçÍ-ÿÑÏí3Hš3ä(ŸÞyóL™ "Î £Ù4·— 3 | Ýè1·ä¢ˆÜýITÛ i'ŸNÃÿ[éwQzjïžPuѲ()Ö`:"Ÿ(ù¤Üµ»$úŠxZ¯e‘ 4 | Žkœ¸7Ò=Iƒ‘;ý,HiÃÅlo€±ië´ZþN:¥ŒJ`¡‰Ö°ÉÑØ ‰V2Áì±_†·©í@öè¹1C|ړŸ»¤øØ£¥é#s“ì÷‰0GsPµŽ1 «º÷eü#7Ä5g¸4æ,LÒ~®§9†”ksz£6Nô ¦ø9Œ³ïÒ*g.}Îõ&†„5ë 5 | {ò¬ÂH‹>Bz}~¶3=nªú¾és¨Öêl‘¤Wíì›á®R¾ÅÌ- xÕçœg2Ϧ«ØLÜJ4f+±¹` x‘²7¡Ý)ANܐ5‡OmH9Î'‡ 6 | ‹²ô±øû$«—/¶œ¬Ô㖑Ec–t\6FÝíˆI·ÕÅÎæ? {«çþÙ߽ʔëT%;¡Ùsn3ñEÝa‹Z+ǚޟÐ ª€#;¶¤¥é>9€š®ÃÿÔÞg!Mƒ:I1¡˜Enáwãì…ŝvZâû8ÑÃLèôîPd-3Ѽ›Šá:húIOј 7 | ˜UGò€1~>É ç/Oe_ÊðÙvýõñŠÿ.ª’}턅[í@Ê`~,ÿ³óNñBñí$_-çDÈ.Š®´›ÕÖ3ÖÇ)о³c‚¯}lœŒ\¿Í90"yg0ðX˜_ÅÓì‚OYu+ÍÄ<µ—¶ófÒnµa.ʈÎ#×ðØ|:PÉ¢KåKã€a2煈 ä’4°òÔwN|§þc«Ÿ.ŠŴ"ü7Ɲj…ȁS -------------------------------------------------------------------------------- /data/random-bytes/1000-tokens/8.txt: -------------------------------------------------------------------------------- 1 | 2 | i¯°N£­$rí«5¡/[ ~@¼ûT?ÂQü¤å°ÿžÊPó) 3 | ›3¸HœÙP¦mšq;­2z»°#™–SŸFuKÖVáITqR›Î\֘+Te)Ö| <`A1Åu]”{?ªr¶ßL¹˜ÌoŽ_E!œº‚ ¦IäOééf9 ¤Öƒ˜¯þL¦æ|ÿ·ŽjíbrqñÆb)5YƒEÉ ëÕTœŠÊW2’ÙÄðéƒdÿH‡t{ÒÇj¥'¦ïÈwDXƒTµGñíOW 4 | Ód—òk¸™—t»EÀÇqÈÍ3†ÔC¹÷Heʄ–ÙÊòê}ÛðB]–üÚ÷OAÈpÆòa`;¶}ãAC)OÉ5¦üž]_—ã\iî6W[oq–ÏuY†Q$Z‘}†½Zs­x‘{B¥ D ;¡ñéQÝE1˜r¨ð鮋ñºëi!€Y˜Šf‘däŸÌž—#xè%’tØÀͦVòPʎ¿¥"3 ³¸ËƒYŽtléí“X¤ò‹~£3íÎ8aàB꥔ьNŸ8>%eÖïÈ-N¸K»ßwҙƒ6¥"í}•u|þ^¾ÁÌÓSå 5 | ®ƒc›ã„,¼ÝLj»!=ñ1<¦M`(:qd¨~eú3pB ê7 Ê¦ÜB¿Õ2YĶ!|îÉ­Ë*G¸† ð5ýt ¤7O‡žêO‡JƒQKQvñ@¶±ä‘ é$˜/YG¡äKO‘z%‰`Ôò¹p©Q| ŠCÒÝtTÞà >.iAâ¬Ú‰º°Ö‚Íè/ë-Þ1Pͱ$Ù«J"s@>,`YŒGÂÚøÝQ‹¥á›Šûž‹xŸmS5I¾i#êúƒ°Ê1wh±Çéß#¬KÀ3Ù| ¸ð>F:%îf5õ´÷²fˆ£ÙVÜ.ÚåáN@‘§%øñJfCdû‚Þ\2‡î=r…»n£8÷üíú$Xƒ†Óy³Ž'°’™ 6 | þî8¦7p÷ç{íú2Ãoú‹j¬(œ -------------------------------------------------------------------------------- /data/random-bytes/1000-tokens/9.txt: -------------------------------------------------------------------------------- 1 | %œkîW™„ДÝWvÐpUîw¥ÄBÊóÔ͕F[µá;úâÜê;)J4¼Ÿ‘¯fÃø{ ,›†db"Äw¸Cgm&"j"ëüKãG1ú~ N‹8{?`fÉ)5¨á׬ˆÂ i‰°R-~+=as²m Ä ÔÑ^’œ€u©,[ôÅR¨/ûÆ·Õ„õ™­Ÿ6ì|¨¢E™À–9#OG­*  WÖÆšÂê%®Iê§§*ˆp:GÖ啭† võrslïöõW£H“h‘O¬ÔlP´‘wÀvù >0@Â~QŠôöåyš:”F÷öiÙLO/yæp%y}Ý©PIÎÄÞÿâØõ¨¾«»<š=¹±ÊMß?¾á1 ¿ÊÓ®ÙÄ׀£@-_øØÿ9|úÃâŸZSkîIráCf)ä©ÔµzÚ?ÿå”4ù›ˆJ8õ]ŒÙFÄ.J#¶¨÷ëÉ(oÍ·Nùþã,6q¡¹&ãq]Ö>Ôdt­%¤øØYt'ôUP,²¯ª/|îÞG¨0ÌGN]¤ÏI ’ 2 | È*ýÐü÷Ããmè,JÊQۗg¡Þaæ‡çîe©ûÌø™×ì8£Å±’é2 ñ:ouÛÂYìoÐ Ë;mÛ 3 | …·MüœVJ¡ö‡Žjƒ[´ðÐÎ)ÝôðÁ%´]®Y]K^GŸVÖôg¤Ã¦¾\ý<}!“+çèI&û³Kºè“Kië4äÑ.1à6_V£u 5 | ©öºepÀ dKF#á ‚À$OxõhÎLDZÖqFW9/Ľ=X©d}šÒoe%¥r`\ÀÉéÛ 6 | 7 | 8 | #include 9 | 10 | // CUDA forward declarations 11 | 12 | void fast_walsh_hadamard_transform_cuda_kernel(const int NN, const int halfLL, torch::Tensor in, torch::Tensor out, bool normalize); 13 | 14 | // C++ interface 15 | 16 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") 17 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 18 | #define CHECK_INPUT(x) \ 19 | CHECK_CUDA(x); \ 20 | CHECK_CONTIGUOUS(x) 21 | 22 | torch::Tensor fast_walsh_hadamard_transform(torch::Tensor input, bool normalize) 23 | { 24 | CHECK_INPUT(input); 25 | const int NN = input.numel(); 26 | torch::Tensor output_flat = input.clone(); 27 | int ll = 0; 28 | int LL = 1; 29 | while (LL < NN) 30 | { 31 | ll += 1; 32 | LL *= 2; 33 | } 34 | const int halfLL = LL / 2; 35 | fast_walsh_hadamard_transform_cuda_kernel(NN, halfLL, input, output_flat, normalize); 36 | return output_flat; 37 | } 38 | 39 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 40 | { 41 | m.def("fast_walsh_hadamard_transform", &fast_walsh_hadamard_transform, "Fast Walsh Hadamard Transform (CUDA)"); 42 | } 
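// Note on the binding above: fast_walsh_hadamard_transform assumes
// input.numel() is a power of two. LL is only used to derive halfLL and the
// output buffer is never padded, so a non-power-of-two input would make the
// kernels read and write out of bounds; callers are expected to pad first.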
43 | -------------------------------------------------------------------------------- /intrinsic/intrinsic/fwh_cuda/fwh_cu.cu: -------------------------------------------------------------------------------- 1 | // The codes are from Armen Aghajanyan from facebook, from paper 2 | // Intrinsic Dimensionality Explains the Effectiveness of Language Model Fine-Tuning 3 | // https://arxiv.org/abs/2012.13255 4 | 5 | // https://github.com/rabeehk/compacter/tree/main/seq2seq/projections 6 | 7 | #include <torch/extension.h> 8 | 9 | #include <cuda.h> 10 | #include <cuda_runtime.h> 11 | 12 | #include <vector> 13 | 14 | template <typename scalar_t> 15 | __global__ void FastWalshHadamardKernel(const int stride, const scalar_t* in, scalar_t* out) { 16 | const auto idx = (threadIdx.x + blockIdx.x * blockDim.x); 17 | const auto elemIdx = (idx / stride) * (2 * stride) + (idx % stride); 18 | const auto tmp = in[elemIdx], tmp2 = in[elemIdx + stride]; 19 | out[elemIdx] = tmp + tmp2; 20 | out[elemIdx + stride] = tmp - tmp2; 21 | } 22 | 23 | template <typename scalar_t> 24 | __global__ void FastWalshHadamardSubKernel(const scalar_t scalar, scalar_t* out) { 25 | const auto idx = (threadIdx.x + blockIdx.x * blockDim.x); 26 | out[idx] *= scalar; 27 | } 28 | 29 | // NOTE: the <<<blocks, threads>>> launch arguments below were lost when this file was extracted; they are reconstructed here (one thread per butterfly pair, up to 256 threads per block) and should be checked against the upstream source. 30 | void fast_walsh_hadamard_transform_cuda_kernel(const int NN, const int halfLL, torch::Tensor in, torch::Tensor out, bool normalize) { 31 | // Apply Unnormalized Fast Walsh Hadamard transform 32 | int stride = halfLL; 33 | float normalizer = 1.0; 34 | float sqrt2inv = 0.70710678118654746; 35 | 36 | while (stride >= 1) { 37 | if(stride == halfLL) 38 | { 39 | AT_DISPATCH_FLOATING_TYPES(in.scalar_type(),"fast_walsh_hadamard_transform_in", ([&] { 40 | FastWalshHadamardKernel<scalar_t><<<max(1, NN / 2 / 256), min(NN / 2, 256)>>>(stride, in.data_ptr<scalar_t>(), out.data_ptr<scalar_t>()); 41 | })); 42 | } 43 | else 44 | { 45 | AT_DISPATCH_FLOATING_TYPES(in.scalar_type(),"fast_walsh_hadamard_transform_out", ([&] { 46 | FastWalshHadamardKernel<scalar_t><<<max(1, NN / 2 / 256), min(NN / 2, 256)>>>(stride, out.data_ptr<scalar_t>(), out.data_ptr<scalar_t>()); 47 | })); 48 | } 49 | 50 | stride /= 2; 51 | normalizer *= sqrt2inv; 52 | } 53 | if(normalize){ 54 | AT_DISPATCH_FLOATING_TYPES(in.scalar_type(),"fast_walsh_hadamard_transform_final", ([&] { 55 | FastWalshHadamardSubKernel<scalar_t><<<max(1, NN / 256), min(NN, 256)>>>(normalizer, out.data_ptr<scalar_t>()); 56 | })); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /intrinsic/intrinsic/implementation.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import List, NamedTuple, Tuple 3 | 4 | import numpy as np 5 | import torch 6 | 7 | from .fwh import fast_walsh_hadamard_transform # type: ignore 8 | 9 | # Utility functions 10 | 11 | 12 | def set_seed(seed: int) -> None: 13 | random.seed(seed) 14 | np.random.seed(seed) 15 | torch.manual_seed(seed) 16 | torch.cuda.manual_seed_all(seed) 17 | 18 | 19 | def send_to_device(obj, device): 20 | if isinstance(obj, list): 21 | return [send_to_device(t, device) for t in obj] 22 | 23 | if isinstance(obj, tuple): 24 | return tuple(send_to_device(t, device) for t in obj) 25 | 26 | if isinstance(obj, dict): 27 | return { 28 | send_to_device(key, device): send_to_device(value, device) 29 | for key, value in obj.items() 30 | } 31 | 32 | if hasattr(obj, "to"): 33 | return obj.to(device) 34 | 35 | return obj 36 | 37 | 38 | # Actual implementation 39 | 40 | 41 | class HiddenParam(NamedTuple): 42 | name: str 43 | module: torch.nn.Module 44 | module_name: str 45 | shape: torch.Size 46 | numel: int 47 | 48 | 49 | def make_hidden_params(module) -> Tuple[List[HiddenParam], torch.Tensor]: 50 | hidden_params = [] 51 | theta_0s = {} 52 | 53 | # Iterate
over the module's parameters, sorted by name
54 | for name, param in sorted(list(module.named_parameters())):
55 | # If the parameter does not require gradients, skip it: we are not tuning it.
56 | if not param.requires_grad:
57 | continue
58 | 
59 | # Save the initial value of each tuned parameter and mark it as not requiring gradients.
60 | theta_0s[name] = param.detach().requires_grad_(False)
61 | 
62 | base, localname = module, name
63 | while "." in localname:
64 | prefix, localname = localname.split(".", 1)
65 | base = getattr(base, prefix)
66 | 
67 | numel = int(np.prod(param.shape))
68 | hidden_params.append(HiddenParam(name, base, localname, param.shape, numel))
69 | 
70 | # Flatten and concatenate the saved initial values into one vector; this matches the declared torch.Tensor return type and the shape checks in the tests (the original returned the dict itself).
71 | return hidden_params, torch.cat([theta_0s[name].flatten() for name in sorted(theta_0s)])
72 | 
73 | 
74 | class FastWalshHadamard(torch.autograd.Function):
75 | @staticmethod
76 | def forward(ctx, input):
77 | return fast_walsh_hadamard_transform(input, False)
78 | 
79 | @staticmethod
80 | def backward(ctx, grad_output):
81 | # The Hadamard matrix is symmetric, so the backward pass applies the same (unnormalized) transform.
82 | return fast_walsh_hadamard_transform(grad_output, False)
83 | 
-------------------------------------------------------------------------------- /intrinsic/intrinsic/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OSU-NLP-Group/SELM/bb852ca95f365000373852089f0b530c7d4d35e4/intrinsic/intrinsic/py.typed
-------------------------------------------------------------------------------- /intrinsic/intrinsic/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OSU-NLP-Group/SELM/bb852ca95f365000373852089f0b530c7d4d35e4/intrinsic/intrinsic/test/__init__.py
-------------------------------------------------------------------------------- /intrinsic/intrinsic/test/test_implementation.py: --------------------------------------------------------------------------------
1 | import torch
2 | import transformers
3 | 
4 | from ..
import implementation 5 | 6 | 7 | def device(): 8 | if torch.cuda.is_available(): 9 | return torch.device("cuda:0") 10 | return torch.device("cpu") 11 | 12 | 13 | class NeuralNetwork(torch.nn.Module): 14 | def __init__(self, layers): 15 | super().__init__() 16 | self.layers = torch.nn.ModuleList() 17 | 18 | for in_size, out_size in zip(layers, layers[1:]): 19 | self.layers.append(torch.nn.Linear(in_size, out_size)) 20 | 21 | def forward(self, x): 22 | for layer in self.layers: 23 | x = layer(x) 24 | x = torch.nn.functional.relu(x) 25 | 26 | return x 27 | 28 | 29 | def test_make_hidden_params_single_layer(): 30 | model = NeuralNetwork(layers=[10, 1]) 31 | 32 | hidden_params, theta_0 = implementation.make_hidden_params(model) 33 | 34 | assert theta_0.shape == (11,) # 10 weights + 1 bias 35 | assert len(hidden_params) == 2 36 | assert [hp.name for hp in hidden_params] == [ 37 | name for name, param in sorted(model.named_parameters()) 38 | ] 39 | 40 | 41 | def test_make_hidden_params_three_layers(): 42 | model = NeuralNetwork(layers=[256, 128, 32, 10]) 43 | 44 | hidden_params, theta_0 = implementation.make_hidden_params(model) 45 | 46 | assert theta_0.shape == (257 * 128 + 129 * 32 + 33 * 10,) 47 | assert len(hidden_params) == 6 48 | assert [hp.name for hp in hidden_params] == [ 49 | name for name, param in sorted(model.named_parameters()) 50 | ] 51 | 52 | 53 | def test_make_hidden_params_gpt2(): 54 | model = transformers.AutoModelForCausalLM.from_pretrained("gpt2") 55 | 56 | hidden_params, theta_0 = implementation.make_hidden_params(model) 57 | 58 | assert [hp.name for hp in hidden_params] == [ 59 | name for name, param in sorted(model.named_parameters()) 60 | ] 61 | 62 | 63 | def test_fast_walsh_hadamard_grad1(): 64 | in_tensor = torch.ones(2, requires_grad=True, dtype=torch.double, device=device()) 65 | 66 | assert torch.autograd.gradcheck( 67 | implementation.FastWalshHadamard.apply, in_tensor, eps=1e-6, atol=1e-4 68 | ) 69 | 70 | 71 | def test_fast_walsh_hadamard_grad2(): 72 | in_tensor = torch.randn(4, requires_grad=True, dtype=torch.double, device=device()) 73 | 74 | assert torch.autograd.gradcheck( 75 | implementation.FastWalshHadamard.apply, in_tensor, eps=1e-6, atol=1e-4 76 | ) 77 | 78 | 79 | def test_fast_walsh_hadamard_grad3(): 80 | in_tensor = torch.randn(64, requires_grad=True, dtype=torch.double, device=device()) 81 | 82 | assert torch.autograd.gradcheck( 83 | implementation.FastWalshHadamard.apply, in_tensor, eps=1e-6, atol=1e-4 84 | ) 85 | 86 | 87 | def test_fast_walsh_hadamard_forward(): 88 | in_tensor = torch.tensor( 89 | [1, 0, 1, 0, 0, 1, 1, 0], dtype=torch.float, device=device() 90 | ) 91 | 92 | actual = implementation.FastWalshHadamard.apply(in_tensor) 93 | 94 | expected = torch.tensor( 95 | [4, 2, 0, -2, 0, 2, 0, 2], dtype=torch.float, device=device() 96 | ) 97 | 98 | assert torch.allclose(expected, actual) 99 | -------------------------------------------------------------------------------- /intrinsic/intrinsic/utils.py: -------------------------------------------------------------------------------- 1 | def send_to_device(obj, device): 2 | if isinstance(obj, list): 3 | return [send_to_device(t, device) for t in obj] 4 | 5 | if isinstance(obj, tuple): 6 | return tuple(send_to_device(t, device) for t in obj) 7 | 8 | if isinstance(obj, dict): 9 | return { 10 | send_to_device(key, device): send_to_device(value, device) 11 | for key, value in obj.items() 12 | } 13 | 14 | if hasattr(obj, "to"): 15 | return obj.to(device) 16 | 17 | return obj 18 | 
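
Taken together, make_hidden_params and FastWalshHadamard exist to support the intrinsic-dimension reparameterization theta = theta_0 + P(theta_d): every trainable parameter is replaced by its frozen initial value plus a slice of a projection of one small trainable vector. The sketch below shows that wiring under stated assumptions: it uses a dense random matrix where the package uses the Fastfood/Walsh-Hadamard operators, and reparameterize/materialize are illustrative names rather than this package's API.

import torch

from intrinsic import implementation  # import path assumed


def reparameterize(module: torch.nn.Module, intrinsic_dim: int):
    # Record every trainable parameter and its frozen initial values theta_0.
    hidden_params, theta_0 = implementation.make_hidden_params(module)
    D = int(theta_0.numel())

    # The only trainable object left: the d-dimensional vector theta_d.
    theta_d = torch.zeros(intrinsic_dim, requires_grad=True)
    # Dense stand-in for the structured Fastfood projection P: R^d -> R^D.
    projection = torch.randn(D, intrinsic_dim) / D**0.5

    def materialize():
        # theta = theta_0 + P(theta_d), written back parameter by parameter.
        flat = theta_0 + projection @ theta_d
        offset = 0
        for hp in hidden_params:
            chunk = flat[offset : offset + hp.numel].view(hp.shape)
            # Swap the nn.Parameter for a plain tensor so gradients flow back
            # to theta_d through the projection.
            delattr(hp.module, hp.module_name)
            setattr(hp.module, hp.module_name, chunk)
            offset += hp.numel

    return theta_d, materialize

Calling materialize() before each forward pass rebuilds the computation graph, so optimizing only theta_d moves the full parameter vector within a random d-dimensional subspace around theta_0.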
-------------------------------------------------------------------------------- /intrinsic/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | import torch 3 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 4 | 5 | description = "PyTorch CUDA kernel implementation of intrinsic dimension operation." 6 | 7 | 8 | def setup_package(): 9 | ext_modules = [] 10 | if torch.cuda.is_available(): 11 | ext_modules = [ 12 | CUDAExtension( 13 | "intrinsic.fwh_cuda", 14 | sources=[ 15 | "intrinsic/fwh_cuda/fwh_cpp.cpp", 16 | "intrinsic/fwh_cuda/fwh_cu.cu", 17 | ], 18 | ) 19 | ] 20 | 21 | setuptools.setup( 22 | name="intrinsic", 23 | version="0.0.1", 24 | description=description, 25 | long_description=description, 26 | long_description_content_type="text/markdown", 27 | author="Rabeeh Karimi Mahabadi", 28 | license="MIT License", 29 | packages=setuptools.find_packages( 30 | exclude=["docs", "tests", "scripts", "examples"] 31 | ), 32 | dependency_links=[ 33 | "https://download.pytorch.org/whl/torch_stable.html", 34 | ], 35 | classifiers=[ 36 | "Intended Audience :: Developers", 37 | "Intended Audience :: Science/Research", 38 | "License :: OSI Approved :: MIT License", 39 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 40 | "Programming Language :: Python :: 3", 41 | "Programming Language :: Python :: 3.9.7", 42 | ], 43 | keywords="text nlp machinelearning", 44 | ext_modules=ext_modules, 45 | cmdclass={"build_ext": BuildExtension}, 46 | install_requires=[], 47 | ) 48 | 49 | 50 | if __name__ == "__main__": 51 | setup_package() 52 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.isort] 2 | profile = "black" 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | black 2 | coverage==6.2 3 | datasets 4 | flake8 5 | flake8-bugbear 6 | isort 7 | matplotlib==3.5.0 8 | mypy 9 | nltk==3.6.5 10 | numpy 11 | orjson 12 | pytest 13 | pytest-cov==3.0.0 14 | preface==0.1.5 15 | pytorch_lightning 16 | git+https://github.com/samuelstevens/relic.git 17 | scikit-learn 18 | scipy 19 | seaborn 20 | tabulate==0.8.9 21 | tomli==1.2.1 22 | tomli-w==1.0.0 23 | tqdm 24 | transformers 25 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E501,E203,E722,W503,W391 3 | 4 | [pycodestyle] 5 | ignore = E501,E203,E722,W503,W391 6 | 7 | [tool:pytest] 8 | addopts = 9 | -Wignore 10 | --cov=src 11 | --cov=intrinsic 12 | --cov-report=html 13 | 14 | [tool.isort] 15 | profile = "black" 16 | 17 | [mypy] 18 | python_version = 3.9 19 | plugins = numpy.typing.mypy_plugin 20 | 21 | [mypy-seaborn.*] 22 | ignore_missing_imports = True 23 | 24 | [mypy-scipy.*] 25 | ignore_missing_imports = True 26 | 27 | [mypy-tqdm.*] 28 | ignore_missing_imports = True 29 | 30 | [mypy-matplotlib.*] 31 | ignore_missing_imports = True 32 | 33 | [mypy-nltk.*] 34 | ignore_missing_imports = True 35 | 36 | [mypy-line_profiler.*] 37 | ignore_missing_imports = True 38 | 39 | [mypy-transformers.*] 40 | ignore_missing_imports = True 41 | 42 | [mypy-sklearn.*] 43 | ignore_missing_imports = True 44 | 45 | [mypy-intrinsic.fwh_cuda] 46 | ignore_missing_imports = 
True
47 | 
-------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OSU-NLP-Group/SELM/bb852ca95f365000373852089f0b530c7d4d35e4/src/__init__.py
-------------------------------------------------------------------------------- /src/attacking/__init__.py: --------------------------------------------------------------------------------
1 | from . import data, ffnn, gradboost, knn, lda, semantic_security, svm
2 | 
3 | __all__ = [
4 | "gradboost",
5 | "data",
6 | "ffnn",
7 | "lda",
8 | "knn",
9 | "semantic_security",
10 | "svm",
11 | ]
12 | 
-------------------------------------------------------------------------------- /src/attacking/adaboost.py: --------------------------------------------------------------------------------
1 | import scipy.stats
2 | import sklearn.ensemble
3 | import sklearn.model_selection
4 | import sklearn.pipeline
5 | import sklearn.preprocessing
6 | 
7 | from . import semantic_security
8 | 
9 | 
10 | def init_model() -> semantic_security.Model:
11 | return sklearn.model_selection.RandomizedSearchCV(
12 | sklearn.ensemble.AdaBoostClassifier(n_estimators=100),
13 | {
14 | "learning_rate": scipy.stats.loguniform(a=1e-2, b=1e1),
15 | },
16 | n_jobs=-1,
17 | n_iter=100,
18 | )
19 | 
-------------------------------------------------------------------------------- /src/attacking/gradboost.py: --------------------------------------------------------------------------------
1 | """
2 | Tries scikit-learn's default gradient boosting classifier.
3 | """
4 | 
5 | import sklearn.ensemble # was `import sklearn.decomposition`, which is unused; sklearn.ensemble is what init_model actually needs
6 | import sklearn.model_selection
7 | import sklearn.pipeline
8 | import sklearn.preprocessing
9 | 
10 | from . import semantic_security
11 | 
12 | 
13 | def init_model(seed) -> semantic_security.Model:
14 | return sklearn.pipeline.make_pipeline(
15 | sklearn.preprocessing.StandardScaler(),
16 | sklearn.ensemble.GradientBoostingClassifier(random_state=seed),
17 | )
18 | 
-------------------------------------------------------------------------------- /src/attacking/helpers.py: --------------------------------------------------------------------------------
1 | class InsistedError(Exception):
2 | condition: object
3 | message: object
4 | 
5 | def __init__(self, condition: object, message: object):
6 | self.condition = condition
7 | self.message = message
8 | 
9 | def __str__(self):
10 | return f"Internal consistency error: {self.message}"
11 | 
12 | 
13 | def insist(condition: object, message: object) -> None:
14 | if not condition:
15 | raise InsistedError(condition, message)
16 | 
-------------------------------------------------------------------------------- /src/attacking/knn.py: --------------------------------------------------------------------------------
1 | import sklearn.neighbors # was `import sklearn.ensemble`, which is unused; sklearn.neighbors is what init_model actually needs
2 | import sklearn.model_selection
3 | import sklearn.pipeline
4 | import sklearn.preprocessing
5 | 
6 | from . import semantic_security
7 | 
8 | 
9 | def init_model() -> semantic_security.Model:
10 | return sklearn.model_selection.GridSearchCV(
11 | sklearn.neighbors.KNeighborsClassifier(algorithm="auto", n_jobs=-1),
12 | {"n_neighbors": [5, 25, 100]},
13 | n_jobs=-1,
14 | )
15 | 
-------------------------------------------------------------------------------- /src/attacking/lda.py: --------------------------------------------------------------------------------
1 | """
2 | Applies linear discriminant analysis to hand-crafted features on ciphertexts.
3 | """ 4 | 5 | import sklearn.discriminant_analysis 6 | import sklearn.pipeline 7 | import sklearn.preprocessing 8 | 9 | from . import semantic_security 10 | 11 | 12 | def init_model() -> semantic_security.Model: 13 | return sklearn.pipeline.make_pipeline( 14 | sklearn.preprocessing.StandardScaler(), 15 | sklearn.discriminant_analysis.LinearDiscriminantAnalysis(), 16 | ) 17 | -------------------------------------------------------------------------------- /src/attacking/random_forest.py: -------------------------------------------------------------------------------- 1 | import sklearn.ensemble 2 | import sklearn.model_selection 3 | import sklearn.pipeline 4 | import sklearn.preprocessing 5 | 6 | from . import semantic_security 7 | 8 | 9 | def init_model() -> semantic_security.Model: 10 | return sklearn.model_selection.GridSearchCV( 11 | sklearn.ensemble.RandomForestClassifier(max_features=None), 12 | { 13 | "max_features": [1.0, 0.3, "sqrt", "log2"], 14 | "max_depth": [None], 15 | "min_samples_split": [2], 16 | }, 17 | n_jobs=-1, 18 | ) 19 | -------------------------------------------------------------------------------- /src/attacking/semantic_security.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | from typing import Any, Callable, Dict, Iterator, Optional, Protocol 3 | 4 | import numpy as np 5 | import scipy.stats 6 | 7 | from .. import logging 8 | from . import data 9 | 10 | logger = logging.init(__name__, date=False, verbose=True) 11 | 12 | 13 | class Model(Protocol): 14 | def fit(self, X: np.ndarray, y: np.ndarray) -> None: 15 | ... 16 | 17 | def score(self, X: np.ndarray, y: np.ndarray) -> float: 18 | ... 19 | 20 | def predict(self, X: np.ndarray) -> np.ndarray: 21 | ... 22 | 23 | 24 | TrainingCallback = Callable[[Model, data.PairedDataset], None] 25 | 26 | 27 | @dataclasses.dataclass 28 | class SemanticSecurityConfig: 29 | plaintext_a: str 30 | plaintext_b: str 31 | params: Dict[str, Any] 32 | 33 | 34 | def confidence_interval(n_test, confidence=0.95): 35 | percentages = (1 - confidence) / 2, confidence + (1 - confidence) / 2 36 | logger.debug( 37 | "Calculated percentages. [confidence: %.3g, lower: %.3g, upper: %.3g]", 38 | confidence, 39 | *percentages, 40 | ) 41 | # Coin flip 42 | p = 0.5 43 | # Binomial distribution 44 | lower, upper = scipy.stats.binom.ppf(percentages, n_test, p) / n_test 45 | return lower, upper 46 | 47 | 48 | def play( 49 | datasets: Iterator[data.SingleDataset], 50 | model_fn: Callable[[], Model], 51 | model_name: str, 52 | seed: int, 53 | trained_model_callback: Optional[TrainingCallback] = None, 54 | quiet: bool = False, 55 | ) -> bool: 56 | logger = logging.init(model_name) 57 | passed = True 58 | 59 | for pair in data.make_paired_datasets( 60 | datasets, seed, left="data/news/100-tokens/0.txt" 61 | ): 62 | model = model_fn() 63 | 64 | (train_x, train_y), (test_x, test_y) = pair.splits 65 | 66 | if not quiet: 67 | logger.info("Starting model.fit") 68 | 69 | model.fit(train_x, train_y) 70 | 71 | if callable(trained_model_callback): 72 | trained_model_callback(model, pair) 73 | 74 | train_score = model.score(train_x, train_y) 75 | test_score = model.score(test_x, test_y) 76 | 77 | params = getattr(model, "best_params_", {}) 78 | if not quiet: 79 | logger.info( 80 | "Fitted. 
[pair: %s, train acc: %.2f, test acc: %.2f, params: %s]",
81 | pair.name,
82 | train_score,
83 | test_score,
84 | params,
85 | )
86 | 
87 | n_test = len(test_y)
88 | lower, upper = confidence_interval(n_test)
89 | 
90 | n_correct = 0
91 | for label, prediction in zip(test_y, model.predict(test_x)):
92 | if label == prediction:
93 | n_correct += 1
94 | 
95 | test = scipy.stats.binomtest(n_correct, n_test, p=0.5, alternative="greater")
96 | 
97 | if test.pvalue < 0.05: # the classifier is significantly better than chance, so the semantic security game is failed
98 | logger.warning(
99 | "Reject null. [pair: %s, test acc: %.3f, p: %.3g]",
100 | pair.name,
101 | test_score,
102 | test.pvalue,
103 | )
104 | passed = False
105 | 
106 | if test_score > upper and not quiet:
107 | logger.warning(
108 | "Outside confidence interval. [pair: %s, test acc: %.3f, upper: %.3f]",
109 | pair.name,
110 | test_score,
111 | upper,
112 | )
113 | passed = False
114 | 
115 | if test.pvalue > 0.05 and test_score < upper:
116 | logger.info(
117 | "Fail to reject. [pair: %s, test acc: %.3f, upper: %.3f, p: %.3g]",
118 | pair.name,
119 | test_score,
120 | upper,
121 | test.pvalue,
122 | )
123 | 
124 | return passed
125 | 
-------------------------------------------------------------------------------- /src/attacking/svm.py: --------------------------------------------------------------------------------
1 | import scipy.stats
2 | import sklearn.model_selection
3 | import sklearn.pipeline
4 | import sklearn.preprocessing
5 | import sklearn.svm
6 | 
7 | from . import semantic_security
8 | 
9 | 
10 | def init_model() -> semantic_security.Model:
11 | return sklearn.model_selection.RandomizedSearchCV(
12 | sklearn.pipeline.make_pipeline(
13 | sklearn.preprocessing.StandardScaler(),
14 | sklearn.svm.SVC(C=1.0, kernel="rbf"),
15 | ),
16 | {
17 | "svc__C": scipy.stats.loguniform(a=1e-3, b=1e1),
18 | "svc__kernel": ["rbf", "linear", "sigmoid", "poly"],
19 | "svc__gamma": scipy.stats.loguniform(a=1e-4, b=1e-3),
20 | },
21 | n_iter=100,
22 | n_jobs=-1,
23 | random_state=42,
24 | )
25 | 
-------------------------------------------------------------------------------- /src/blog/histograms.py: --------------------------------------------------------------------------------
1 | """
2 | Generates data for plotly.js histograms showing the distribution of an individual feature for one or more pairs of plaintexts.
3 | """
4 | 
5 | import argparse
6 | import json
7 | 
8 | import numpy as np
9 | import pandas as pd
10 | 
11 | from .. import attacking
12 | from ..paper import helpers, security
13 | 
14 | files = ["News ($m1$)", "Rand.
Bytes ($m2$)"] 15 | 16 | 17 | def init_parser() -> argparse.ArgumentParser: 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument( 20 | "group", 21 | choices=[ 22 | "original", 23 | "l2-norm-reg", 24 | "distribution-reg", 25 | ], 26 | help="Which ciphertext groups to use.", 27 | ) 28 | 29 | return parser 30 | 31 | 32 | def make_dataframe(ciphertexts): 33 | headers = ["file", *attacking.data.FEATURE_FUNCTIONS.keys()] 34 | 35 | rows = [] 36 | 37 | for file, matrix in ciphertexts.items(): 38 | features = {} 39 | 40 | for name, func in attacking.data.FEATURE_FUNCTIONS.items(): 41 | features[name] = func(matrix) 42 | 43 | # features[X] are all the same length 44 | features["file"] = [ 45 | helpers.translate_filename(file) for _ in range(len(features[name])) 46 | ] 47 | 48 | file_rows = tuple( 49 | [features[key][i] for key in headers] for i in range(len(features[name])) 50 | ) 51 | 52 | rows.extend(file_rows) 53 | 54 | return pd.DataFrame.from_records(data=rows, columns=headers) 55 | 56 | 57 | def main(): 58 | parser = init_parser() 59 | args = parser.parse_args() 60 | 61 | ciphertexts = security.load_ciphertexts(args.group, 400) 62 | df = make_dataframe(ciphertexts) 63 | 64 | data = [] 65 | for file in files: 66 | data.append(df[df.file == file]["l2-norm"].tolist()) 67 | 68 | with open(f"docs/blog/data/{args.group}-histograms.json", "w") as fd: 69 | json.dump(data, fd) 70 | 71 | 72 | if __name__ == "__main__": 73 | main() 74 | -------------------------------------------------------------------------------- /src/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OSU-NLP-Group/SELM/bb852ca95f365000373852089f0b530c7d4d35e4/src/data/__init__.py -------------------------------------------------------------------------------- /src/data/news.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import pathlib 3 | from typing import Iterator 4 | 5 | import datasets 6 | 7 | from .. import util 8 | from . 
import shared 9 | 10 | 11 | def load_news_articles() -> Iterator[str]: 12 | dataset = datasets.load_dataset( 13 | "xsum", cache_dir=util.HUGGINGFACE_CACHE_DIR, streaming=True, split="validation" 14 | ).shuffle(seed=42) 15 | for example in dataset: 16 | document = example["document"] 17 | if not document: 18 | continue 19 | yield document 20 | 21 | 22 | def preprocess(output_dir): 23 | output_dir = pathlib.Path(output_dir) 24 | output_dir.mkdir(exist_ok=True) 25 | 26 | articles = load_news_articles() 27 | 28 | example_count = 10 29 | token_lengths = (100, 300, 1000, 3000) 30 | 31 | for length, loc in itertools.product(token_lengths, range(example_count)): 32 | tokens = [] 33 | text = "" 34 | while len(tokens) < length: 35 | text += next(articles) 36 | tokens = shared.tokenizer(text)["input_ids"] 37 | 38 | tokens = tokens[:length] 39 | shared.assert_invertible(tokens) 40 | 41 | text = shared.tokenizer.decode(tokens) 42 | 43 | length_dir = output_dir / f"{length}-tokens" 44 | length_dir.mkdir(exist_ok=True) 45 | 46 | with open(length_dir / f"{loc}.txt", "w") as file: 47 | file.write(text) 48 | 49 | char_lengths = (500, 2500, 5000, 25000) 50 | 51 | for length, loc in itertools.product(char_lengths, range(example_count)): 52 | text = "" 53 | while len(text) < length: 54 | text += next(articles) 55 | text = text[:length] 56 | 57 | tokens = shared.tokenizer(text)["input_ids"] 58 | shared.assert_invertible(tokens) 59 | text = shared.tokenizer.decode(tokens) 60 | 61 | length_dir = output_dir / f"{length}-chars" 62 | length_dir.mkdir(exist_ok=True) 63 | 64 | with open(length_dir / f"{loc}.txt", "w") as file: 65 | file.write(text) 66 | -------------------------------------------------------------------------------- /src/data/openwebtext.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import datasets 4 | 5 | from .. import util 6 | 7 | 8 | def preprocess(output_dir): 9 | output_dir = pathlib.Path(output_dir) 10 | output_dir.mkdir(exist_ok=True) 11 | 12 | openwebtext = datasets.load_dataset( 13 | "stas/openwebtext-10k", split="train", cache_dir=util.HUGGINGFACE_CACHE_DIR 14 | ).shuffle(seed=42) 15 | 16 | for i in range(10): 17 | article = openwebtext[i] 18 | with open(output_dir / f"{i}.txt", "w") as file: 19 | file.write(article["text"].strip() + "\n") 20 | -------------------------------------------------------------------------------- /src/data/pubmed.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import pathlib 3 | from typing import Iterator 4 | 5 | import datasets 6 | 7 | from .. import util 8 | from . 
import shared
9 | 
10 | 
11 | def take(n, iterable):
12 | "Return first n items of the iterable"
13 | return itertools.islice(iterable, n)
14 | 
15 | 
16 | def load_pubmed_abstracts() -> Iterator[str]:
17 | dataset = datasets.load_dataset(
18 | "pubmed", cache_dir=util.HUGGINGFACE_CACHE_DIR, streaming=True, split="train"
19 | ).shuffle(seed=42)
20 | for example in dataset:
21 | abstract = example["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]
22 | if not abstract:
23 | continue
24 | yield abstract
25 | 
26 | 
27 | def preprocess(output_dir):
28 | output_dir = pathlib.Path(output_dir)
29 | output_dir.mkdir(exist_ok=True)
30 | 
31 | abstracts = load_pubmed_abstracts()
32 | 
33 | example_count = 10
34 | lengths = (100, 1_000, 10_000)
35 | 
36 | for length, loc in itertools.product(lengths, range(example_count)):
37 | tokens = []
38 | text = ""
39 | while len(tokens) < length:
40 | text += next(abstracts)
41 | tokens = shared.tokenizer(text)["input_ids"]
42 | 
43 | tokens = tokens[:length]
44 | shared.assert_invertible(tokens)
45 | 
46 | text = shared.tokenizer.decode(tokens)
47 | 
48 | length_dir = output_dir / f"{length}-tokens"
49 | length_dir.mkdir(exist_ok=True)
50 | 
51 | with open(length_dir / f"{loc}.txt", "w") as file:
52 | file.write(text)
53 | 
-------------------------------------------------------------------------------- /src/data/reddit.py: --------------------------------------------------------------------------------
1 | import pathlib
2 | 
3 | import datasets
4 | 
5 | from .. import util
6 | 
7 | 
8 | def preprocess(output_dir):
9 | output_dir = pathlib.Path(output_dir)
10 | output_dir.mkdir(exist_ok=True)
11 | 
12 | dataset = datasets.load_dataset(
13 | "reddit", cache_dir=util.HUGGINGFACE_CACHE_DIR
14 | ).shuffle(seed=42)["train"]["content"]
15 | 
16 | indices = list(range(20))
17 | 
18 | # Document 3 contained a non-breaking space, so the hand-picked list below skips index 3.
19 | indices = [0, 1, 2, 4, 5, 6, 7, 8, 9, 10]
20 | 
21 | posts = []
22 | for i in indices:
23 | posts.append(dataset[i])
24 | 
25 | for i, post in enumerate(posts):
26 | with open(output_dir / f"{i}.txt", "w") as file:
27 | file.write(post.strip() + "\n")
28 | 
-------------------------------------------------------------------------------- /src/data/shared.py: --------------------------------------------------------------------------------
1 | from typing import List, Sequence
2 | 
3 | from .. import config, tokenizing
4 | 
5 | exp_cfg = config.ExperimentConfig(
6 | model=config.ModelConfig(language_model_name_or_path="gpt2"),
7 | tokenizer="pretrained",
8 | data=config.DataConfig(__file__),
9 | training=config.TrainingConfig(),
10 | )
11 | 
12 | tokenizer = tokenizing.new(exp_cfg)
13 | 
14 | 
15 | def chunk_length(sequence: str) -> int:
16 | """
17 | Gets the length of a sequence in chunks using a GPT2 tokenizer.
18 | """
19 | if not sequence:
20 | return 0
21 | 
22 | chunks = tokenizing.load_chunks(sequence, exp_cfg.data, tokenizer)
23 | 
24 | return len(chunks)
25 | 
26 | 
27 | def tokenize(sequence: str) -> List[int]:
28 | if not sequence:
29 | return []
30 | 
31 | return tokenizer(sequence)["input_ids"]
32 | 
33 | 
34 | def untokenize(tokens: Sequence[int]) -> str:
35 | return tokenizer.decode(tokens)
36 | 
37 | 
38 | def token_length(sequence: str) -> int:
39 | """
40 | Gets the length of a sequence in tokens using a GPT2 tokenizer.
41 | """
42 | 
43 | return len(tokenize(sequence))
44 | 
45 | 
46 | def assert_invertible(tokens: List[int]):
47 | roundtrip_tokens = tokenize(untokenize(tokens))
48 | if tokens == roundtrip_tokens:
49 | return
50 | 
51 | if untokenize(roundtrip_tokens) == untokenize(tokens):
52 | return
53 | 
54 | # The round trip failed: raise at the first mismatching token instead of
55 | # the debugging leftovers (print(...) and breakpoint()) that were here.
56 | for i, (t, rt) in enumerate(zip(tokens, roundtrip_tokens)):
57 | if t == rt:
58 | continue
59 | raise AssertionError(f"Tokenization is not invertible at token {i}: {t} != {rt}")
60 | 
61 | raise AssertionError("Tokenization is not invertible: the round trip changed the token count")
62 | 
-------------------------------------------------------------------------------- /src/data/twitter.py: --------------------------------------------------------------------------------
1 | import pathlib
2 | import random
3 | 
4 | import datasets
5 | 
6 | from .. import util
7 | 
8 | 
9 | def preprocess(output_dir):
10 | output_dir = pathlib.Path(output_dir)
11 | output_dir.mkdir(exist_ok=True)
12 | 
13 | # load all tweets
14 | all_tweets = []
15 | for subset in ["emotion", "sentiment"]:
16 | dataset = datasets.load_dataset(
17 | "tweet_eval", subset, cache_dir=util.HUGGINGFACE_CACHE_DIR
18 | )
19 | for tweet in dataset["train"]["text"]:
20 | all_tweets.append(tweet)
21 | 
22 | # pick ten tweets.
23 | tweets = random.sample(all_tweets, k=10) # sample without replacement so the ten tweets are distinct (random.choices could repeat a tweet)
24 | 
25 | # write them to disk
26 | for i, tweet in enumerate(tweets):
27 | with open(output_dir / f"{i}.txt", "w") as file:
28 | file.write(tweet + "\n")
29 | 
-------------------------------------------------------------------------------- /src/data/wikipedia.py: --------------------------------------------------------------------------------
1 | import itertools
2 | import pathlib
3 | 
4 | import datasets
5 | from tqdm.auto import tqdm
6 | 
7 | from .. import util
8 | from . import shared
9 | 
10 | 
11 | def preprocess(output_dir):
12 | output_dir = pathlib.Path(output_dir)
13 | output_dir.mkdir(exist_ok=True)
14 | 
15 | wikipedia = datasets.load_dataset(
16 | "wikipedia", "20200501.en", split="train", cache_dir=util.HUGGINGFACE_CACHE_DIR
17 | ).shuffle(seed=42)
18 | 
19 | lengths = (100, 200, 500, 996)
20 | 
21 | articles = iter(wikipedia)
22 | 
23 | for length, loc in tqdm(itertools.product(lengths, range(10))):
24 | tokens = []
25 | while len(tokens) < length:
26 | article = next(articles)
27 | tokens = shared.tokenizer(article["text"])["input_ids"]
28 | 
29 | tokens = tokens[:length]
30 | shared.assert_invertible(tokens)
31 | 
32 | text = shared.tokenizer.decode(tokens)
33 | assert shared.chunk_length(text) == 1
34 | 
35 | length_dir = output_dir / f"{length}-tokens"
36 | length_dir.mkdir(exist_ok=True)
37 | 
38 | with open(length_dir / f"{loc}.txt", "w") as file:
39 | file.write(text)
40 | 
-------------------------------------------------------------------------------- /src/experiments/__init__.py: --------------------------------------------------------------------------------
1 | from .lib import experiment_from_config, find_experiments
2 | 
3 | __all__ = ["experiment_from_config", "find_experiments"]
4 | 
-------------------------------------------------------------------------------- /src/experiments/check.py: --------------------------------------------------------------------------------
1 | """
2 | Checks for experiments that have not run.
3 | """
4 | import argparse
5 | 
6 | from .. import config, util
7 | 
8 | 
9 | def parse_args() -> argparse.Namespace:
10 | # check for finished experiments
11 | parser = argparse.ArgumentParser(
12 | description="Check which experiments still need to run. This will dirty your relics directory in git.
You most likely want to make sure your relics directory is clean, then run this command, then run `git clean -f relics`.",
13 | )
14 | parser.add_argument(
15 | "experiments",
16 | nargs="+",
17 | type=str,
18 | help="Config .toml files or directories containing config .toml files.",
19 | )
20 | parser.add_argument(
21 | "--regex",
22 | action="store_true",
23 | help="Whether to use regular expression matching on [experiments] argument",
24 | default=False,
25 | )
26 | 
27 | return parser.parse_args()
28 | 
29 | 
30 | def check(args: argparse.Namespace) -> None:
31 | if args.regex:
32 | iterator = util.files_with_match(args.experiments)
33 | else:
34 | iterator = util.files_with_extension(args.experiments, ".toml")
35 | 
36 | for experiment_toml in iterator:
37 | # If there are any configs that haven't run, print the file name.
38 | for experiment_config in config.load_configs(experiment_toml):
39 | experiment = lib.experiment_from_config(experiment_config)
40 | finished_trials = sum(
41 | 1 for t in experiment if "finished" in t and t["finished"]
42 | )
43 | if finished_trials < experiment_config.trials:
44 | print(experiment_toml)
45 | break
46 | 
47 | 
48 | def main():
49 | args = parse_args()
50 | check(args)
51 | 
52 | 
53 | if __name__ == "__main__":
54 | main()
55 | 
-------------------------------------------------------------------------------- /src/experiments/generate.py: --------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import pathlib
4 | 
5 | import tomli
6 | import tomli_w
7 | from tqdm.auto import tqdm
8 | 
9 | from .. import config, logging, templating, util
10 | 
11 | logger = logging.init("experiments.generate")
12 | 
13 | 
14 | def parse_args() -> argparse.Namespace:
15 | parser = argparse.ArgumentParser(
16 | description="Generate .toml files from template .toml files. I kept all my templates in experiments/templates and my generated experiment configs in experiments/generated, which I then removed from version control.",
17 | )
18 | parser.add_argument(
19 | "--strategy",
20 | type=str,
21 | help="Strategy to use to combine multiple lists in a template.",
22 | default="grid",
23 | choices=["grid", "paired", "random"],
24 | )
25 | parser.add_argument(
26 | "--count",
27 | type=int,
28 | help="Number of configs to generate. Required when using --strategy random.",
29 | default=-1,
30 | )
31 | parser.add_argument(
32 | "--no-expand",
33 | type=str,
34 | nargs="+",
35 | default=[],
36 | help=".-separated fields to not expand",
37 | )
38 | parser.add_argument(
39 | "--prefix",
40 | type=str,
41 | default="generated-",
42 | help="Prefix to add to generated templates",
43 | )
44 | parser.add_argument(
45 | "templates",
46 | nargs="+",
47 | type=str,
48 | help="Template .toml files or directories containing template .toml files.",
49 | )
50 | parser.add_argument(
51 | "output",
52 | type=str,
53 | help="Output directory to write the generated .toml files to.",
54 | )
55 | return parser.parse_args()
56 | 
57 | 
58 | def generate(args: argparse.Namespace) -> None:
59 | strategy = templating.Strategy.new(args.strategy)
60 | 
61 | count = args.count
62 | if strategy is templating.Strategy.random:
63 | assert count > 0, "Need to include --count!"
64 | 
65 | for template_toml in util.files_with_extension(args.templates, ".toml"):
66 | with open(template_toml, "rb") as template_file:
67 | try:
68 | template_dict = tomli.load(template_file)
69 | except tomli.TOMLDecodeError as err:
70 | logger.warning(
71 | "Error parsing template file. [file: %s, err: %s]",
72 | template_toml,
73 | err,
74 | )
75 | continue
76 | 
77 | template_name = pathlib.Path(template_toml).stem
78 | 
79 | logger.info("Opened template file. [file: %s]", template_toml)
80 | 
81 | experiment_dicts = templating.generate(
82 | template_dict, strategy, count=count, no_expand=set(args.no_expand)
83 | )
84 | 
85 | logger.info(
86 | "Loaded experiment dictionaries. [count: %s]", len(experiment_dicts)
87 | )
88 | 
89 | for i, experiment_dict in enumerate(tqdm(experiment_dicts)):
90 | filename = f"{args.prefix}{template_name}-{i}.toml"
91 | filepath = os.path.join(args.output, filename)
92 | with open(filepath, "wb") as file:
93 | tomli_w.dump(experiment_dict, file)
94 | 
95 | # Verifies that the configs are correctly loaded.
96 | list(config.load_configs(filepath))
97 | 
98 | 
99 | def main() -> None:
100 | args = parse_args()
101 | generate(args)
102 | 
103 | 
104 | if __name__ == "__main__":
105 | main()
106 | 
-------------------------------------------------------------------------------- /src/experiments/lib.py: --------------------------------------------------------------------------------
1 | import dataclasses
2 | import logging
3 | import pathlib
4 | from typing import Any, Dict, Iterator
5 | 
6 | import relic
7 | 
8 | from .. import config, util
9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
13 | def find_experiments(paths) -> Iterator[config.ExperimentConfig]:
14 | """
15 | Arguments:
16 | * paths (list[str]): list of strings that are either directories containing files or config files themselves.
17 | """
18 | if not isinstance(paths, list):
19 | paths = [paths]
20 | 
21 | for config_file in util.files_with_extension(paths, ".toml"):
22 | yield from config.load_configs(config_file)
23 | 
24 | 
25 | def make_relic_config(experiment_config: config.ExperimentConfig) -> Dict[str, Any]:
26 | relic_config = dataclasses.asdict(experiment_config)
27 | 
28 | # don't want to include these parameters in the relic repository.
29 | relic_config.pop("trials")
30 | relic_config.pop("save_weights")
31 | relic_config["training"].pop("maximum_epochs")
32 | relic_config["training"].pop("snapshot_interval")
33 | relic_config["training"].pop("report_interval")
34 | 
35 | return relic_config
36 | 
37 | 
38 | def experiment_from_config(
39 | experiment_config: config.ExperimentConfig,
40 | ) -> relic.Experiment:
41 | """
42 | Create a relic experiment from an ExperimentConfig. This method removes some fields from ExperimentConfig that shouldn't matter when considering results (whether the model was saved, how many trials were run, etc.).
43 | """
44 | relic_exp = relic.new_experiment(
45 | make_relic_config(experiment_config), pathlib.Path("relics")
46 | )
47 | 
48 | return relic_exp
49 | 
-------------------------------------------------------------------------------- /src/intrinsic_utils.py: --------------------------------------------------------------------------------
1 | """
2 | Thin wrapper around the intrinsic module to provide an intrinsic_dimension_vector property on the module.
3 | """
4 | import torch
5 | 
6 | import intrinsic
7 | 
8 | from .
import accelerate, config, modeling_utils, util
9 | 
10 | 
11 | class IntrinsicDimension(
12 | intrinsic.IntrinsicDimension,
13 | modeling_utils.IntrinsicDimension,
14 | modeling_utils.KnowsBatchSize,
15 | modeling_utils.Saveable,
16 | ):
17 | @property
18 | def get_intrinsic_dimension_vector(self):
19 | return self.intrinsic_vector.detach().cpu()
20 | 
21 | def set_intrinsic_dimension_vector(self, vec: torch.Tensor) -> None:
22 | self.intrinsic_vector.copy_(vec)
23 | 
24 | def save(self, path) -> None:
25 | data = {
26 | "fastfood_seed": self.seed,
27 | "theta_d": self.get_intrinsic_dimension_vector.detach(),
28 | }
29 | 
30 | torch.save(data, path)
31 | 
32 | def batch_size(self, training_config: config.TrainingConfig) -> int:
33 | accelerate._set_environment(self)
34 | self.logger.info("TODO: Take model.context_window into account.")
35 | if torch.cuda.device_count() < 1:
36 | self.logger.warning("On CPU; use as big a batch size as you want!")
37 | return training_config.batch_size
38 | 
39 | mb = torch.cuda.get_device_properties(0).total_memory / 1024 / 1024
40 | 
41 | self.logger.info(
42 | "[available memory: %s, rtx2080ti estimate: %s, v100 estimate: %s]",
43 | mb,
44 | util.rtx2080ti,
45 | util.v100,
46 | )
47 | 
48 | model_size = "gpt2"
49 | try:
50 | layer_count = len(self.hidden.transformer.h)
51 | if layer_count == 36:
52 | model_size = "gpt2-large"
53 | elif layer_count == 24:
54 | model_size = "gpt2-medium"
55 | else:
56 | assert layer_count == 12
57 | except AttributeError:
58 | pass
59 | 
60 | if model_size == "gpt2":
61 | if mb <= util.rtx2080ti:
62 | # max on rtx2080ti is 2
63 | assert (
64 | accelerate._ENVIRONMENT is accelerate.TrainingType.MODEL_PARALLELISM
65 | )
66 | return min(2, training_config.batch_size)
67 | elif util.rtx2080ti < mb <= util.v100:
68 | assert accelerate._ENVIRONMENT is accelerate.TrainingType.SINGLE_GPU
69 | return min(2, training_config.batch_size)
70 | elif (
71 | util.v100 < mb <= util.v100 * 2
72 | ): # some of the pitzer clusters have 2 NVLINKed v100s.
73 | assert accelerate._ENVIRONMENT is accelerate.TrainingType.SINGLE_GPU 74 | return min(4, training_config.batch_size) 75 | else: 76 | assert mb > 2 * util.v100 77 | # deal with this when the time comes 78 | return training_config.batch_size 79 | elif model_size == "gpt2-medium": 80 | if mb < util.a6000: 81 | assert accelerate._ENVIRONMENT is accelerate.TrainingType.SINGLE_GPU 82 | return min(2, training_config.batch_size) 83 | elif model_size == "gpt2-large": 84 | if mb < util.a6000: 85 | assert accelerate._ENVIRONMENT is accelerate.TrainingType.SINGLE_GPU 86 | return 1 87 | 88 | raise ValueError(mb, model_size) 89 | -------------------------------------------------------------------------------- /src/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def init(name: str, verbose: bool = False, date=True) -> logging.Logger: 5 | if date: 6 | log_format = "[%(asctime)s] [%(levelname)s] [%(name)s] %(message)s" 7 | else: 8 | log_format = "[%(levelname)s] [%(name)s] %(message)s" 9 | 10 | if not verbose: 11 | logging.basicConfig(level=logging.INFO, format=log_format) 12 | else: 13 | logging.basicConfig(level=logging.DEBUG, format=log_format) 14 | 15 | return logging.getLogger(name) 16 | -------------------------------------------------------------------------------- /src/make_tokenizers.py: -------------------------------------------------------------------------------- 1 | import tokenizers 2 | 3 | 4 | def main(): 5 | alphabet = sorted(tokenizers.pre_tokenizers.ByteLevel.alphabet()) 6 | 7 | # 1-byte.json 8 | tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE()) 9 | tokenizer.decoder = tokenizers.decoders.ByteLevel() 10 | tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel( 11 | add_prefix_space=False, trim_offsets=True, use_regex=False 12 | ) 13 | 14 | trainer = tokenizers.trainers.BpeTrainer( 15 | special_tokens=["<|endoftext|>"], 16 | initial_alphabet=alphabet, 17 | vocab_size=len(alphabet), 18 | ) 19 | tokenizer.train([], trainer) 20 | tokenizer.save("src/tokenizers/1-byte.json") 21 | 22 | # 2-byte tokenizer 23 | tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE()) 24 | tokenizer.decoder = tokenizers.decoders.ByteLevel() 25 | tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel( 26 | add_prefix_space=False, trim_offsets=True, use_regex=False 27 | ) 28 | 29 | trainer = tokenizers.trainers.BpeTrainer( 30 | special_tokens=["<|endoftext|>"], 31 | initial_alphabet=alphabet, 32 | vocab_size=len(alphabet) * (len(alphabet) + 1) + 1, 33 | ) 34 | 35 | data = [i + j for i in alphabet for j in alphabet] 36 | 37 | tokenizer.train_from_iterator(data, trainer) 38 | tokenizer.save("src/tokenizers/2-byte.json") 39 | 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /src/modeling_utils.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | import numpy as np 4 | import torch 5 | 6 | import intrinsic.fwh 7 | 8 | from . import config 9 | 10 | 11 | class Saveable(abc.ABC): 12 | @abc.abstractmethod 13 | def save(self, path): 14 | ... 15 | 16 | 17 | class IntrinsicDimension(abc.ABC): 18 | @abc.abstractproperty 19 | def get_intrinsic_dimension_vector(self) -> torch.Tensor: 20 | ... 21 | 22 | @abc.abstractmethod 23 | def set_intrinsic_dimension_vector(self, vec: torch.Tensor) -> None: 24 | ... 
25 | 
26 | 
27 | class KnowsBatchSize(abc.ABC):
28 | @abc.abstractmethod
29 | def batch_size(self, training_config: config.TrainingConfig) -> int:
30 | ...
31 | 
32 | 
33 | class Cos(torch.nn.Module):
34 | def forward(self, x):
35 | return torch.cos(x)
36 | 
37 | 
38 | class Sine(torch.nn.Module):
39 | def forward(self, x):
40 | return torch.sin(x)
41 | 
42 | 
43 | class LayerNorm(torch.nn.Module):
44 | def forward(self, x):
45 | std, mean = torch.std_mean(x)
46 | return (x - mean) / (std + 1e-8)
47 | 
48 | 
49 | class GroupNorm(torch.nn.Module):
50 | """
51 | Applies LayerNorm to multiple groups, so each group is normalized by its own mean and std deviation.
52 | """
53 | 
54 | groups: int
55 | 
56 | def __init__(self, groups: int):
57 | super().__init__()
58 | self.groups = groups
59 | 
60 | def apply_norm(self, x):
61 | std, mean = torch.std_mean(x)
62 | return (x - mean) / (std + 1e-8)
63 | 
64 | def forward(self, x):
65 | assert (
66 | np.prod(x.shape) % self.groups == 0
67 | ), f"Group count {self.groups} must be a divisor of x.shape {x.shape} -> {np.prod(x.shape)}"
68 | 
69 | tensors = torch.chunk(x, self.groups)
70 | 
71 | tensors = [self.apply_norm(t) for t in tensors]
72 | 
73 | return torch.cat(tensors)
74 | 
75 | 
76 | class InverseFn(torch.nn.Module):
77 | def forward(self, x):
78 | return 1 / (x + 1e-8)
79 | 
80 | 
81 | class NonlinearWHT(torch.nn.Module):
82 | def forward(self, x):
83 | return intrinsic.fwh.fast_nonlinear_walsh_hadamard_transform(x, 5 / 3)
84 | 
85 | 
86 | def estimate_memory_requirements(
87 | model: torch.nn.Module, intrinsic_dimension: int = 0, efficient: bool = True
88 | ):
89 | """
90 | Try to calculate the required memory based on the following assumptions:
91 | * Floats are 4 bytes.
92 | * We are using an optimizer that maintains 2 floats per parameter.
93 | """
94 | 
95 | def floats_for(tensor):
96 | numel = np.prod(tensor.shape)
97 | 
98 | if intrinsic_dimension > 0 and efficient:
99 | numel += 2 ** np.ceil(np.log2(numel))
100 | 
101 | # inputs + activations, one copy for gradients, two copies for adam optimizer states.
102 | return numel * 8
103 | 
104 | bytes_per_float = 4
105 | total = 0
106 | 
107 | for tensor in model.parameters():
108 | total += floats_for(tensor)
109 | 
110 | for buffer in model.buffers():
111 | total += floats_for(buffer)
112 | 
113 | if intrinsic_dimension > 0 and not efficient:
114 | total_size = 0
115 | for tensor in model.parameters():
116 | total_size += np.prod(tensor.shape)
117 | for buffer in model.buffers():
118 | total_size += np.prod(buffer.shape)
119 | 
120 | total += (2 ** np.ceil(np.log2(total_size))) * 8
121 | 
122 | return total * bytes_per_float
123 | 
-------------------------------------------------------------------------------- /src/paper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OSU-NLP-Group/SELM/bb852ca95f365000373852089f0b530c7d4d35e4/src/paper/__init__.py
-------------------------------------------------------------------------------- /src/paper/ciphertext_dist_histograms.py: --------------------------------------------------------------------------------
1 | """
2 | Demonstrates that ciphertexts have approximately normal distributions.
3 | """
4 | 
5 | import argparse
6 | 
7 | import matplotlib.pyplot as plt
8 | import numpy as np
9 | import seaborn as sns
10 | 
11 | sns.set_theme()
12 | 
13 | from .
import security 14 | 15 | sns.set_style("whitegrid", {"axes.grid": False}) 16 | sns.set_context("paper", font_scale=0.7) 17 | sns.set_palette("Dark2") 18 | 19 | 20 | def init_parser() -> argparse.ArgumentParser: 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument("filename", type=str, help="Filepath to save file to.") 23 | 24 | return parser 25 | 26 | 27 | def main(): 28 | parser = init_parser() 29 | args = parser.parse_args() 30 | 31 | ciphertexts = security.load_ciphertexts("original", count=2) 32 | 33 | bound = 5e-7 34 | 35 | fig, axes = plt.subplots(nrows=5, ncols=2, sharex=True, sharey=True) 36 | bins = np.linspace(start=-bound, stop=bound, num=30) 37 | 38 | axes[0][0].hist(ciphertexts["data/news/100-tokens/0.txt"][0], bins=bins) 39 | axes[0][1].hist(ciphertexts["data/news/100-tokens/0.txt"][1], bins=bins) 40 | axes[0][0].set_ylabel("News (0)") 41 | 42 | axes[1][0].hist(ciphertexts["data/news/100-tokens/1.txt"][0], bins=bins) 43 | axes[1][1].hist(ciphertexts["data/news/100-tokens/1.txt"][1], bins=bins) 44 | axes[1][0].set_ylabel("News (1)") 45 | 46 | axes[2][0].hist(ciphertexts["data/pubmed/100-tokens/0.txt"][0], bins=bins) 47 | axes[2][1].hist(ciphertexts["data/pubmed/100-tokens/0.txt"][1], bins=bins) 48 | axes[2][0].set_ylabel("PubMed") 49 | 50 | axes[3][0].hist(ciphertexts["data/random-words/100-tokens/0.txt"][0], bins=bins) 51 | axes[3][1].hist(ciphertexts["data/random-words/100-tokens/0.txt"][1], bins=bins) 52 | axes[3][0].set_ylabel("Rand. Words") 53 | 54 | axes[4][0].hist(ciphertexts["data/random-bytes/100-tokens/0.txt"][0], bins=bins) 55 | axes[4][1].hist(ciphertexts["data/random-bytes/100-tokens/0.txt"][1], bins=bins) 56 | axes[4][0].set_ylabel("Rand. Bytes") 57 | 58 | axes[0][0].set(yticklabels=[]) 59 | 60 | fig.tight_layout() 61 | fig.savefig(args.filename, bbox_inches="tight") 62 | 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /src/paper/embeddings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Visualizes distributions of encrypted plaintexts using low-dimensional embedding techniques. 3 | """ 4 | 5 | import argparse 6 | import warnings 7 | 8 | import numpy as np 9 | import pandas as pd 10 | import seaborn as sns 11 | import sklearn.manifold 12 | import sklearn.preprocessing 13 | from tqdm.auto import tqdm 14 | 15 | from . 
import helpers, security
16 | 
17 | sns.set_style("whitegrid", {"axes.grid": False})
18 | sns.set_context("notebook", font_scale=1.3)
19 | sns.set_palette("Dark2")
20 | 
21 | 
22 | def init_parser() -> argparse.ArgumentParser:
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument(
25 | "group",
26 | choices=[
27 | "original",
28 | "l2-norm-reg",
29 | "distribution-reg",
30 | ],
31 | help="Which ciphertext groups to use.",
32 | )
33 | parser.add_argument(
34 | "count", type=int, help="How many experiments from each group to use"
35 | )
36 | parser.add_argument("filename", type=str, help="Filepath to save file to.")
37 | 
38 | return parser
39 | 
40 | 
41 | def plot_ciphertexts(ciphertexts):
42 | # Get point/file arrays
43 | # keys = sorted(ciphertexts.keys())
44 | keys = ["data/news/100-tokens/0.txt", "data/random-bytes/100-tokens/0.txt"]
45 | 
46 | points = np.concatenate([ciphertexts[k] for k in keys])
47 | files = np.concatenate(
48 | [[helpers.translate_filename(k)] * len(ciphertexts[k]) for k in keys]
49 | )
50 | 
51 | # Do dimension-reduction
52 | perplexity = 50
53 | learning_rate = 50
54 | trials = 5
55 | 
56 | best_embedded = None
57 | best_divergence = np.inf
58 | 
59 | # Try a few random seeds and keep the embedding with the lowest KL divergence
60 | for i in tqdm(range(trials)):
61 | # Ignore FutureWarning
62 | with warnings.catch_warnings():
63 | warnings.simplefilter("ignore", category=FutureWarning)
64 | scaler = sklearn.preprocessing.StandardScaler()
65 | tsne = sklearn.manifold.TSNE(
66 | n_components=2,
67 | perplexity=perplexity,
68 | random_state=i,
69 | init="pca",
70 | learning_rate=learning_rate,
71 | n_iter=5000,
72 | )
73 | embedded = tsne.fit_transform(scaler.fit_transform(points))
74 | if tsne.kl_divergence_ < best_divergence:
75 | best_divergence = tsne.kl_divergence_
76 | best_embedded = embedded
77 | 
78 | # Convert to dataframe
79 | rows = [
80 | (best_embedded[i][0], best_embedded[i][1], file) for i, file in enumerate(files)
81 | ]
82 | df = pd.DataFrame(rows, columns=["x", "y", "File"])
83 | 
84 | order = ["News (N0)", "Rand. Bytes (RB)"]
85 | 
86 | fig = sns.relplot(
87 | df,
88 | x="x",
89 | y="y",
90 | style="File",
91 | style_order=order,
92 | hue="File",
93 | hue_order=order,
94 | kind="scatter",
95 | facet_kws=dict(legend_out=False),
96 | )
97 | fig.set(xlabel=None, ylabel=None, xticks=[], yticks=[])
98 | fig.despine(right=True, top=True, bottom=True, left=True)
99 | fig.legend.set_title(None)
100 | sns.move_legend(fig, "upper right")
101 | 
102 | return fig
103 | 
104 | 
105 | def main():
106 | parser = init_parser()
107 | args = parser.parse_args()
108 | 
109 | ciphertexts = security.load_ciphertexts(args.group, args.count)
110 | 
111 | fig = plot_ciphertexts(ciphertexts)
112 | 
113 | fig.savefig(args.filename, bbox_inches="tight")
114 | 
115 | 
116 | if __name__ == "__main__":
117 | main()
118 | 
-------------------------------------------------------------------------------- /src/paper/feature_importance.py: --------------------------------------------------------------------------------
1 | """
2 | This script measures the feature importance for each proposed variant of the encryption algorithm.
3 | """
4 | 
5 | import argparse
6 | 
7 | import matplotlib.pyplot as plt
8 | import numpy as np
9 | import seaborn as sns
10 | import sklearn.feature_selection
11 | 
12 | from .. import attacking, logging
13 | from .
import helpers, security 14 | 15 | sns.set_style("whitegrid") 16 | sns.set_context("paper", font_scale=2) 17 | sns.set_palette("Dark2") 18 | 19 | logger = logging.init("feature-importance") 20 | 21 | 22 | def init_parser() -> argparse.ArgumentParser: 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument( 25 | "--relics", 26 | help="Path to relics/ directory", 27 | ) 28 | parser.add_argument( 29 | "count", type=int, help="How many experiments from each group to use" 30 | ) 31 | parser.add_argument("filename", type=str, help="Filepath to save file to.") 32 | 33 | return parser 34 | 35 | 36 | def measure_mutual_information(datasets): 37 | files = ("data/news/100-tokens/0.txt", "data/random-bytes/100-tokens/0.txt") 38 | datasets = [dataset for dataset in datasets if dataset.name in files] 39 | # Arrange the datasets into a single X, y multiclass classification problem. 40 | x = np.concatenate([dataset.splits[0] for dataset in datasets], axis=0) 41 | y = np.zeros(x.shape[0]) 42 | start = 0 43 | end = 0 44 | for i, dataset in enumerate(datasets): 45 | start = end 46 | end += dataset.splits[0].shape[0] 47 | y[start:end] = i 48 | 49 | # Measure mutual information 50 | mi = sklearn.feature_selection.mutual_info_classif( 51 | x, 52 | y, 53 | discrete_features=False, 54 | n_neighbors=3, 55 | copy=True, 56 | random_state=42, 57 | ) 58 | 59 | return mi 60 | 61 | 62 | def load_datasets(group, *, count): 63 | ciphertexts = security.load_ciphertexts(group, count=count) 64 | return list( 65 | attacking.data.make_single_datasets( 66 | ciphertexts, attacking.data.preprocess, ratio=1.0 67 | ) 68 | ) 69 | 70 | 71 | def plot_mi(original, l2_norm, dist, keys): 72 | fig, ax = plt.subplots(subplot_kw={"aspect": 9}) 73 | 74 | x = np.arange(len(keys)) 75 | width = 0.3 76 | 77 | ax.bar(x - width, original, width, label="Original") 78 | ax.bar(x, l2_norm, width, label="L2-Norm Reg.") 79 | ax.bar(x + width, dist, width, label="Dist. 
Reg.") 80 | ax.set_xticks(x) 81 | ax.set_xticklabels([helpers.translate_feature(k) for k in keys]) 82 | ax.set_ylabel("Mutual Information") 83 | ax.set_xlabel("Feature") 84 | ax.legend() 85 | 86 | return fig 87 | 88 | 89 | def main(): 90 | parser = init_parser() 91 | args = parser.parse_args() 92 | 93 | # Load ciphertexts from experiments 94 | original_mi = measure_mutual_information( 95 | load_datasets("original", count=args.count) 96 | ) 97 | l2_norm_mi = measure_mutual_information( 98 | load_datasets("l2-norm-reg", count=args.count) 99 | ) 100 | dist_mi = measure_mutual_information( 101 | load_datasets("distribution-reg", count=args.count) 102 | ) 103 | keys = sorted(attacking.data.FEATURE_FUNCTIONS.keys()) 104 | 105 | fig = plot_mi(original_mi, l2_norm_mi, dist_mi, keys) 106 | fig.tight_layout() 107 | fig.savefig(args.filename, bbox_inches="tight") 108 | 109 | 110 | if __name__ == "__main__": 111 | main() 112 | -------------------------------------------------------------------------------- /src/paper/helpers.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def translate_domain(domain): 5 | if domain == "news": 6 | return "News" 7 | elif domain == "pubmed": 8 | return "PubMed" 9 | elif domain == "random-words": 10 | return "Random Words" 11 | elif domain == "random-bytes": 12 | return "Random Bytes" 13 | elif domain == "binary": 14 | return "Multimedia" 15 | else: 16 | raise ValueError(domain) 17 | 18 | 19 | def translate_model(model_name, pretrained=True): 20 | if model_name == "gpt2" and pretrained: 21 | return "GPT-2" 22 | if model_name == "gpt2" and not pretrained: 23 | return "GPT-2 (rand)" 24 | elif model_name == "gpt2-medium": 25 | return "335M" 26 | elif model_name == "EleutherAI/pythia-70m": 27 | return "Pythia 70M" 28 | elif model_name == "EleutherAI/pythia-70m-deduped": 29 | return "Pythia 70M, deduped" 30 | elif model_name == "EleutherAI/pythia-160m": 31 | return "Pythia" 32 | elif model_name == "EleutherAI/pythia-160m-deduped": 33 | return "Pythia 160M, deduped" 34 | elif model_name == "cerebras/Cerebras-GPT-111M": 35 | return "Cerebras" 36 | else: 37 | raise ValueError(model_name) 38 | 39 | 40 | def translate_feature(feature): 41 | if feature == "l2-norm": 42 | return "L2" 43 | elif feature == "l1-norm": 44 | return "L1" 45 | elif feature == "std": 46 | return "Std" 47 | elif feature == "mean": 48 | return "Mean" 49 | elif feature == "max": 50 | return "Max" 51 | elif feature == "min": 52 | return "Min" 53 | else: 54 | raise ValueError(feature) 55 | 56 | 57 | def translate_filename(filename): 58 | if filename == "data/pubmed/100-tokens/0.txt": 59 | return "PubMed (PM)" 60 | elif filename == "data/news/100-tokens/0.txt": 61 | return "News ($m1$)" 62 | # return "News (N0)" 63 | elif filename == "data/news/100-tokens/1.txt": 64 | return "News (N1)" 65 | elif filename == "data/random-bytes/100-tokens/0.txt": 66 | return "Rand. Bytes ($m2$)" 67 | # return "Rand. Words (RW)" 68 | elif filename == "data/random-words/100-tokens/0.txt": 69 | return "Rand. Words ($m2$)" 70 | # return "Rand. 
Words (RW)"
71 | else:
72 | raise ValueError(filename)
73 | 
74 | 
75 | def parse_length(filename: str) -> int:
76 | # data/random-words/100-tokens/5.txt -> 100
77 | pattern = re.compile(r"data/[a-z\-]+/(\d+)-tokens/\d\.txt")
78 | match = pattern.match(filename)
79 | 
80 | return int(match.group(1))
81 | 
82 | 
83 | def parse_domain(filename: str) -> str:
84 | # data/random-words/100-tokens/5.txt -> "random-words"
85 | pattern = re.compile(r"data/([a-z\-]+)/\d+-tokens/\d\.txt")
86 | match = pattern.match(filename)
87 | 
88 | return match.group(1)
89 | 
-------------------------------------------------------------------------------- /src/paper/prefix_table.py: --------------------------------------------------------------------------------
1 | import collections
2 | import statistics
3 | 
4 | from .. import relic_helpers
5 | 
6 | 
7 | def load_experiments():
8 | filters = [
9 | "(~ data.file '300-tokens')",
10 | "(== model.intrinsic_dimension 3000)",
11 | "(== model.language_model_name_or_path 'gpt2')",
12 | "(not (== data.prompt_length None))",
13 | "(not (== data.prompt_type 'n-tokens'))",
14 | "(not (== data.prompt_type 'chunk-n'))",
15 | ]
16 | 
17 | return relic_helpers.load_experiments(filters, show_cmd=True)
18 | 
19 | 
20 | def print_table(experiments):
21 | # Dict[prefix type, int]
22 | counts = collections.defaultdict(list)
23 | 
24 | for experiment in experiments:
25 | prefix = experiment.config["data"]["prompt_type"]
26 | 
27 | counts[prefix].append(experiment[0]["epochs"])
28 | 
29 | results = {}
30 | for prefix, epochs in counts.items():
31 | results[prefix] = (statistics.mean(epochs), statistics.stdev(epochs))
32 | 
33 | mean, std = results["token"]
34 | print(f"New Token & \\num{{1}} & ${mean:.0f}\\pm{std:.1f}$ \\\\")
35 | mean, std = results["vocab"]
36 | print(f"Vocab & \\num{{1}} & ${mean:.0f}\\pm{std:.1f}$ \\\\")
37 | mean, std = results["natural-n"]
38 | print(f"Natural Prompt & \\num{{4}} & ${mean:.0f}\\pm{std:.1f}$ \\\\")
39 | mean, std = results["uuid"]
40 | print(f"UUID & \\num{{27}} & ${mean:.0f}\\pm{std:.1f}$ \\\\")
41 | mean, std = results["2x-uuid"]
42 | print(f"$2\\times$ UUID & \\num{{54}} & ${mean:.0f}\\pm{std:.1f}$ \\\\")
43 | mean, std = results["3x-uuid"]
44 | print(f"$3\\times$ UUID & \\num{{76}} & ${mean:.0f}\\pm{std:.1f}$ \\\\")
45 | 
46 | 
47 | def main():
48 | experiments = load_experiments()
49 | 
50 | print_table(experiments)
51 | 
52 | 
53 | if __name__ == "__main__":
54 | main()
55 | 
-------------------------------------------------------------------------------- /src/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OSU-NLP-Group/SELM/bb852ca95f365000373852089f0b530c7d4d35e4/src/test/__init__.py
-------------------------------------------------------------------------------- /src/test/attacking/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OSU-NLP-Group/SELM/bb852ca95f365000373852089f0b530c7d4d35e4/src/test/attacking/__init__.py
-------------------------------------------------------------------------------- /src/test/attacking/test_pipeline.py: --------------------------------------------------------------------------------
1 | from ...attacking import pipeline
2 | 
3 | 
4 | def test_calc_worst_score_initial():
5 | actual = pipeline.calc_worst_score(0.5, 0.3, 0.7, 0.8)
6 | assert actual == 0.8
7 | 
8 | 
9 | def test_calc_worst_score_inside_bounds():
10 | actual = pipeline.calc_worst_score(0.5, 0.4, 0.6, 0.55)
11 | 
--------------------------------------------------------------------------------
/src/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OSU-NLP-Group/SELM/bb852ca95f365000373852089f0b530c7d4d35e4/src/test/__init__.py
--------------------------------------------------------------------------------
/src/test/attacking/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OSU-NLP-Group/SELM/bb852ca95f365000373852089f0b530c7d4d35e4/src/test/attacking/__init__.py
--------------------------------------------------------------------------------
/src/test/attacking/test_pipeline.py:
--------------------------------------------------------------------------------
from ...attacking import pipeline


def test_calc_worst_score_initial():
    actual = pipeline.calc_worst_score(0.5, 0.3, 0.7, 0.8)
    assert actual == 0.8


def test_calc_worst_score_inside_bounds():
    actual = pipeline.calc_worst_score(0.5, 0.4, 0.6, 0.55)
    assert actual == 0.55


def test_calc_worst_score_better_score():
    actual = pipeline.calc_worst_score(0.55, 0.4, 0.6, 0.5)
    assert actual == 0.55
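pipeline.calc_worst_score itself is not included in this dump, but the three tests above constrain it fairly tightly. A minimal sketch that passes all three, assuming the argument order (worst, lower, upper, score) and that higher scores are "worse"; the real implementation in src/attacking/pipeline.py may differ:

    def calc_worst_score(worst, lower, upper, score):
        # If the current worst lies outside the (lower, upper) bounds,
        # adopt the new score unconditionally; otherwise keep whichever
        # of the two scores is worse (larger).
        if not (lower < worst < upper):
            return score
        return max(worst, score)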
--------------------------------------------------------------------------------
/src/test/test_modeling.py:
--------------------------------------------------------------------------------
import torch

import intrinsic

from .. import config, modeling


def test_imports():
    pass


def check_fft_equality(a, b):
    return a.d == b.d and a.D == b.D


def assert_sequential(actual, expected):
    assert len(actual) == len(expected)

    for a, e in zip(actual, expected):
        if isinstance(e, intrinsic.FastfoodTransform):
            assert check_fft_equality(a, e)
        else:
            assert type(a) is type(e)


def test_project_factory_empty():
    model_config = config.ModelConfig(
        language_model_name_or_path="gpt2",
        projection=config.ProjectionConfig(layers=[]),
    )

    int_dim, D = 100, 1000

    factory = modeling.new_projection_factory(model_config, seed=0)
    projection = factory(int_dim, D)

    expected = torch.nn.Sequential(intrinsic.FastfoodTransform(int_dim, D))

    assert_sequential(projection, expected)


def test_project_factory_nonlinearity():
    model_config = config.ModelConfig(
        language_model_name_or_path="gpt2",
        projection=config.ProjectionConfig(layers=["output", "sigmoid"]),
    )

    int_dim, D = 100, 1000

    factory = modeling.new_projection_factory(model_config, seed=0)
    projection = factory(int_dim, D)

    expected = torch.nn.Sequential(
        intrinsic.FastfoodTransform(int_dim, D), torch.nn.Sigmoid()
    )

    assert_sequential(projection, expected)


def test_project_factory_two_projection():
    model_config = config.ModelConfig(
        language_model_name_or_path="gpt2",
        projection=config.ProjectionConfig(layers=[500, "sigmoid", "output"]),
    )

    int_dim, D = 100, 1000

    factory = modeling.new_projection_factory(model_config, seed=0)
    projection = factory(int_dim, D)

    expected = torch.nn.Sequential(
        intrinsic.FastfoodTransform(int_dim, 500),
        torch.nn.Sigmoid(),
        intrinsic.FastfoodTransform(500, 1000),
    )

    assert_sequential(projection, expected)


def test_project_factory_neuralnetwork():
    model_config = config.ModelConfig(
        language_model_name_or_path="gpt2",
        projection=config.ProjectionConfig(
            layers=[500, "sigmoid", "output", "sigmoid"]
        ),
    )

    int_dim, D = 100, 1000

    factory = modeling.new_projection_factory(model_config, seed=0)
    projection = factory(int_dim, D)

    expected = torch.nn.Sequential(
        intrinsic.FastfoodTransform(int_dim, 500),
        torch.nn.Sigmoid(),
        intrinsic.FastfoodTransform(500, 1000),
        torch.nn.Sigmoid(),
    )

    assert_sequential(projection, expected)


def test_kolmogorov_smirnov_empirical_cdf_simple():
    ks = modeling.KolmogorovSmirnovLoss(None, None, mean=0, std=1)

    observations = torch.tensor([0, 0.3, 0.4, 0.8, 1.5])

    assert ks.statistic(observations) == 0.5
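The expected value in the last test can be checked by hand. Below is a quick recomputation of the two-sided Kolmogorov-Smirnov statistic against N(0, 1) using only the standard library, assuming KolmogorovSmirnovLoss.statistic computes the usual KS distance; the gap of 0.5 comes from the jump at x = 0, where the empirical CDF is still 0 but Phi(0) = 0.5:

    from statistics import NormalDist

    obs = sorted([0.0, 0.3, 0.4, 0.8, 1.5])
    n = len(obs)
    cdf = NormalDist(mu=0, sigma=1).cdf

    # Compare the model CDF with the empirical CDF just before and just
    # after each observation; the KS statistic is the largest gap.
    stat = max(
        max(abs(cdf(x) - i / n), abs(cdf(x) - (i + 1) / n))
        for i, x in enumerate(obs)
    )
    assert stat == 0.5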
--------------------------------------------------------------------------------
/src/test/test_tokenizing.py:
--------------------------------------------------------------------------------
import tempfile

from .. import config, tokenizing


class DummyTokenizer:
    eos_token = ord("~")

    def __init__(self, model_max_length):
        self.model_max_length = model_max_length

    def __call__(self, text):
        return {"input_ids": [ord(c) for c in text]}

    def decode(self, ids, **kwargs):
        return "".join(chr(i) for i in ids)


def test_load_chunks_smoke():
    text = "hello world!"
    with tempfile.NamedTemporaryFile() as data_file:
        data_file.write(text.encode())
        data_file.flush()  # make the bytes visible to anything reading the path
        data_config = config.DataConfig(data_file.name)

        tokenizer = DummyTokenizer(100)
        actual = tokenizing.load_chunks(text, data_config, tokenizer)
        expected = [
            tokenizing.Chunk(str(tokenizing.DEFAULT_PROMPT), text, tokenizer.eos_token)
        ]

        assert actual == expected
--------------------------------------------------------------------------------
/src/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OSU-NLP-Group/SELM/bb852ca95f365000373852089f0b530c7d4d35e4/src/tools/__init__.py
--------------------------------------------------------------------------------
/src/tools/verify_encryption.py:
--------------------------------------------------------------------------------
"""
Verifies finished trials: reloads each trial's saved intrinsic vector,
rebuilds the model, and checks that it still reproduces its training data.
"""
import argparse
import logging

import relic
import torch
from tqdm.auto import tqdm

log_format = "[%(levelname)s] [%(name)s] %(message)s"
logging.basicConfig(level=logging.WARNING, format=log_format)
logger = logging.getLogger("verify-enc")

# Project imports come after logging is configured so their module-level
# loggers pick up the format above.
from .. import accelerate, config, evaluating, modeling, relic_helpers, tokenizing


def init_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-e",
        "--experiments",
        nargs="+",
        help="Filter experiments based on results. Example: '(all (< epochs 1000))'",
        default=[],
    )
    parser.add_argument(
        "--relics",
        help="Path to relics/ directory",
    )

    return parser


def check_trial_succeeded(exp: relic.Experiment, trial: int) -> bool:
    if trial >= len(exp):
        logger.warning("Trial missing. [exp: %s, trial: %d]", exp.hash, trial)
        return False

    if not exp[trial]["finished"]:
        logger.warning("Trial not finished. [exp: %s, trial: %d]", exp.hash, trial)
        return False

    if not exp[trial]["succeeded"]:
        logger.warning("Trial failed. [exp: %s, trial: %d]", exp.hash, trial)
        return False

    if not exp.model_exists(trial):
        logger.warning("Model missing. [exp: %s, trial: %d]", exp.hash, trial)
        return False

    return True


def verify_trial(exp: relic.Experiment, trial: int) -> bool:
    saved = torch.load(exp.model_path(trial))
    seed = saved["fastfood_seed"]
    theta_d = saved["theta_d"]

    experiment_config = config.ExperimentConfig.from_dict(exp.config)

    tokenizer = tokenizing.new(experiment_config.tokenizer)

    model = modeling.new(
        experiment_config.model,
        vocab=len(tokenizer),
        seed=seed,
    )

    accelerate.prepare(model)

    with torch.no_grad():
        model.intrinsic_vector.copy_(theta_d)
        model.set_module_weights()

    return evaluating.passes(model, tokenizer, experiment_config, exp[trial]["epochs"])


def main():
    parser = init_parser()
    args = parser.parse_args()

    experiments = relic_helpers.load_experiments(args.experiments, args.relics)

    for exp in experiments:
        for trial, _ in enumerate(tqdm(exp)):
            if not check_trial_succeeded(exp, trial):
                continue

            if not verify_trial(exp, trial):
                print(f"{exp.hash[:8]} {trial}")


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/src/training_utils.py:
--------------------------------------------------------------------------------
from typing import Callable


def make_linear_reg_scheduler(warmup: int) -> Callable[[int], float]:
    # Scales a regularization weight linearly from 0 to 1 over `warmup`
    # steps, then holds it at 1.
    def scheduler_fn(step: int) -> float:
        if step > warmup:
            return 1.0

        return step / warmup

    return scheduler_fn
--------------------------------------------------------------------------------
/src/types.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OSU-NLP-Group/SELM/bb852ca95f365000373852089f0b530c7d4d35e4/src/types.py
--------------------------------------------------------------------------------
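A short usage sketch for make_linear_reg_scheduler from src/training_utils.py above, assuming the package is importable as src: the returned multiplier ramps linearly from 0 to 1 over the warmup window and then stays at 1.

    from src.training_utils import make_linear_reg_scheduler

    scheduler = make_linear_reg_scheduler(warmup=100)
    assert scheduler(0) == 0.0    # regularization disabled at step 0
    assert scheduler(50) == 0.5   # halfway through warmup
    assert scheduler(100) == 1.0  # 100/100: warmup just completed
    assert scheduler(500) == 1.0  # full strength thereafter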