├── Dockerfile
├── LICENSE
├── README.md
├── openwebtext_tiny
    ├── train.jsonl
    └── val.jsonl
├── requirements.txt
├── requirements_docker.txt
├── scripts
    ├── cl-evaluate.sh
    ├── cl-preprocess.sh
    └── cl-train.sh
├── src
    ├── create_wordlength_eval_data.py
    ├── evaluate.sh
    ├── evaluate_wordlength_model.py
    ├── preprocess.sh
    ├── process_data.py
    ├── run_clm.py
    ├── tokenize_data.py
    └── train.sh
└── wordlength_eval_data
    ├── wordlength_eval_data_0.json
    ├── wordlength_eval_data_1.json
    ├── wordlength_eval_data_2.json
    ├── wordlength_eval_data_3.json
    └── wordlength_eval_data_4.json


/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM codalab/default-gpu
 2 | SHELL ["/bin/bash", "-c"]
 3 | LABEL maintainer="xie@cs.stanford.edu"
 4 | 
 5 | RUN apt-get update && apt-get upgrade -y
 6 | RUN apt-get install -y --no-install-recommends make build-essential libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev
 7 | 
 8 | RUN git clone git://github.com/yyuu/pyenv.git .pyenv
 9 | 
10 | ENV HOME /
11 | ENV PYENV_ROOT $HOME/.pyenv
12 | ENV PATH $PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
13 | 
14 | RUN pyenv install 3.8.5
15 | RUN pyenv global 3.8.5
16 | RUN pyenv rehash
17 | RUN pip install --upgrade pip
18 | 
19 | RUN pip install -U --no-cache-dir \
20 |       numpy \
21 |       scipy \
22 |       matplotlib \
23 |       pandas \
24 |       sympy \
25 |       nose \
26 |       spacy \
27 |       tqdm \
28 |       wheel \
29 |       scikit-learn \
30 |       nltk \
31 |       tensorboard
32 | RUN python -m spacy download en_core_web_sm
33 | 
34 | RUN pip install --no-cache-dir \
35 |       torch==1.8.1+cu111 \
36 |       -f https://download.pytorch.org/whl/torch_stable.html
37 | 
38 | # Install apex
39 | WORKDIR /tmp/apex_installation
40 | RUN git clone https://github.com/NVIDIA/apex.git && cd apex && git checkout 0c2c6eea6556b208d1a8711197efc94899e754e1
41 | WORKDIR /tmp/apex_installation/apex
42 | RUN pip install --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
43 | WORKDIR /
44 | 
45 | ADD ./requirements_docker.txt requirements.txt
46 | RUN pip install -r requirements.txt
47 | RUN python -c "import nltk; nltk.download('punkt')"
48 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Sang Michael Xie
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Project 2 (Building Large Language Models) for Stanford CS324: Understanding and Developing Large Language Models (Winter 2022)
 2 | 
 3 | This repository contains the code for project 2 on training language models in the Winter 2022 iteration of [CS324: Understanding and Developing Large Language Models](https://stanford-cs324.github.io/winter2022/) at Stanford. This code was developed by Sang Michael Xie and Percy Liang.
 4 | - In the first part of the project, we use fine-tuning/continued-pretraining to instill length controllability into GPT-2 small, where we can control the word length of the model's generated response by prepending metadata about length. The data, preprocessing code, and training code are provided, but some hyperparameters need to be tuned.
 5 | - In the second part of the project, we aim to instill a new capability of choice into a language model. The project code is built on top of HuggingFace, and the data for the first part is based on OpenWebText. By default, we provide scripts for running the project on CodaLab, a platform for reproducibile research, but the scripts and code can be adapted to run locally or on other platforms as well.
 6 | 
 7 | Useful links:
 8 | - [Project document and starter guide](https://stanford-cs324.github.io/winter2022/projects/CS324_P2.pdf)
 9 | - [Data assets](https://worksheets.codalab.org/worksheets/0x21906fed417147e4a24ef486086b9178)
10 | 
11 | Setup and quickstart for CodaLab:
12 | - [CodaLab guide](https://docs.google.com/document/d/1rWgWyYJc36Vow2eU8oZ0pfVU7SzFRg3Eb1dB1XHZzvE/edit#heading=h.3oa3f2ydysz2)
13 | 


--------------------------------------------------------------------------------
/openwebtext_tiny/train.jsonl:
--------------------------------------------------------------------------------
 1 | {"text": "Transcript for Inside the fight to take down online prostitution review boards\n\nWe're about to go inside an undercover infiltration, police infiltrating and arresting a group of men, including some high-power tech executives, for arranging prostitutes on internet message boards. Many women are trapped in a world of sex trafficking by debt and fear. Tonight we explore the struggle to protect them from exploitation by holding the buyers accountable. Here's my \"Nightline\" coanchor juju Chang. Reporter: It may seem like an ordinary guy easy out, complete with nachos and beer. But this is an undercover operation. A Seattle police officer infiltrating an exclusive group that calls themselves the league of extraordinary gentlemen. Police say their common interest, Korean prostitutes. Also known as k-girls. You're about to witness an unprecedented prostitution sting. Everybody put your hands on top of your head. Put your hands on top of your head, do it now. Reporter: One that uncovered a secret network of online review boards. Websites for rating and recommending local prostitutes. What district attorney Val Ritchie calls -- Yelp for prostitution. She was taller than the 5'3\" advertised, really 5'7\", 120 pounds. Wonderful, natural C cups and full hips -- Reporter: Website users caught in the drag net, tech executives and white collar professionals from an Amazon software develop tore a high-level Microsoft director. We calculated they were spending $30,000 to $50,000 a year on this. It's a simple to use website. You can look at the girl. Okay, that girl looks interesting. You can go to her reviews, see if she is providing the services that you want. Tattoos, piercings, no scars, moles. Reporter: Prostitution review websites are a prolific corner of the illicit online sex trade. Tna review, the erollic review, cover just about every city in America, complete with customer reviews and often escort ads. But not all of these online communities ever end up meeting in real life. What surprised you most about the guys that you were meeting up with? I guess the fact that they were just -- they were like the people living next door. Reporter: The undercover detective who infiltrated this group has asked us not to show his full face. Everybody would order drinks, eat food, talk about, you know -- the latest prostitute that they've had sex with and you know, trends going on. Reporter: Parts of these conversations so sexually graphic, we can't air them. There were waitresses that were -- seemed a little offended. Reporter: The police investigation resulted in the shutdown of three websites. The review board and its two korean-focused offshoots, the league and k-girl delights, as well as controversial charges for some of the men caught in the dragnet. We have charged the people who were putting the content on the website with a felony charge of promoting prostitution. And so in what way does that promote prostitution? In what way is that pimping? The men post reviews and share information. So it goes beyond just a review. And then on to a recommendation. You have to go see this person, she's amazing, go see her before she leaves. Creating demand? Exactly. Reporter: Felony pimping charges for posting reviews on a website has never been attempted before in the U.S. For valiant Ritchie -- valiant is his real name -- it's a focus on demand, not supply. Our understanding of people in prostitution based on years of experience is that most people in prostitution are exploited. And so criminalizing them doesn't work from a criminal justice perspective. At the same time we understand that the exploitation that is driven through sex buying is caused by the buyer. The buyer needs to be held be theable. Reporter: There are many who strongly disagree, including unsurprisingly the men involved in the sites. But all declined to speak with us on camera. The argument though being you sacrifice these men's lives in order to further your career, to get the headlines, to get the national attention. The main focus for us is on dismantling these networks of buyers who are creating this vociferous demand for exploited women. Reporter: We've come to Bellevue, Seattle's wealthiest suburb. Hermes, Prada. To learn more about the Korean prostitution ring here and why some say this case could change the way sex buyers are prosecuted across the country. The idea broth they wills were being operated out of this upscale, suburban apartment building, lots of fancy people in the area who have no idea that sex is being sold right next door. There were several Korean brothels raided in this case. The 12 women found inside release the without charges. Described by police as trafficked. We tried to track them down but they seem to have scattered to the wind. It is really hard to get a genuine glimpse inside one of these Korean brothels. No one will talk to you, no one will admit to anything, because there's too much fear. Tiers too much shame. There's too much criminality. We went to a guy who ran one of these brothels. He actually agreed to talk to us. Michael durnel managed -- He's opening the door, hi. Reporter: -- Two of the Bellevue broth they willed. When they were busted he served 45 days in prison. How do you go from a guy with a wife and a kid to suddenly running a brothel? I was a client. To me it was that excuse, you know. I'm paying for it, it's not cheating. Reporter: Durnel says he fell in love with a Korean prostitute who told him she was in debt back home. So he says he decided to help her. He left his family and job and they opened up a brothel together. The one to the right? Yes. The big gleaming skyscraper, that's the brothel? That's one of them, yeah. You were booking clients to sleep with prostitutes? Correct. That fits the definition of a pimp, doesn't it? I guess it does. My understanding is that you heard a lot of stories that you say broke your heart. There's a lot of different stories on why people come over and do that, that job. Some of them just want to start a business. Some of them, their families will get hurt or death. One girl, I asked her why she doesn't just run, and she says she tried. And they find them. Reporter: Durnel repeatedly told us he deeply regrets his choices, because he saw firsthand the damage it did. You feel like the piece of the soul for the girl is gone. Reporter: While he says he was never personally involved with the review board or any site, he says many of his clients were avid users. I would ask my clients if you liked her, review her. Reporter: When it comes to the police sting that took down those sites and his brothels with it, durnel has his criticisms. They said they rescued all these women. They said that people were holding them captive. This came out of law enforcement's mouth. Who did they save? Every girl they let go, every girl's back in the business. They just go someplace else. You think these women are doing it voluntarily? Well -- it depends on what you're looking at as voluntarily. I mean, they're moving on their own. Nobody's holding their passports. Yet isn't the fear and the debt that you just described to me holding them cap sniff. Correct, yes. But again, it's not all these girls are being held like that either. How do you know which ones are and aren't? You don't. Reporter: Brad miles of the anti-trafficking group polaris says debt bondage is a common factor in the Korean networks. Debt bondage is different from debt. It's a particular type of predatory, manipulative debt where fees and interest rates and other hidden costs and all these other things kick into gear so that the person, the debt grows faster than the person can pay it off. Then you have these customers who think that the women are there voluntarily but the customers don't understand the full picture. Reporter: We did meet with seattle-based sex worker who's speaking out in support of the review sites. The work room is in the back. This is where you see clients? This is where I see clients. Usually the light is more like -- this. Reporter: Magny Mcneil, a popular sex work blogger, says the sites provide a valuable safety tool. Why does it make it safer to book somebody online? People are afraid to sell their pianos online, why would selling sex be safe? Because of reputation. Ebay, if somebody's got 98% positive reviews on Ebay as a buyer or seller, that lets you have confidence in dealing with that person. Reporter: Alisa Bernard, who calls herself a prostitution survivor turned activist, says she knows firsthand review boards don't make things any safer. I had been raped multiple times. Held against my will at least once. I was strangled. These were all by review board guys. You know. Again, your line keeps getting pushed further and further and further to get those good reviews. Because you're afraid of a bad review? A negative review could get you kicked off the board. Reporter: While the debate continues, prosecutor Val Ritchie is showing no signs of stop. In fact, nearly a dozen more men have been arraigned as part of the review board investigation. Ritchie's mission, to end sex buying altogether. This is the oldest profession. How can you possibly hope to eliminate it? I would argue that it's the oldest oppression. And that the one way we can try to eliminate it is by helping men realize that this isn't serving them either. Reporter: Ritchie travels across the U.S., telling other states' attorneys about demand-based prosecution. His hope, that they too will take on similar cases. It would be a great step forward in reducing nationwide exploitation if those review boards did not exist. Reporter: For \"Nightline,\" I'm juju Chang in Bellevue, Washington. Our thanks to Ju Ju for that report. This transcript has been automatically generated and may not be 100% accurate."}
 2 | {"text": "In the wee morning hours of May 12, 2016, Ratu, a rare Sumatran rhino, gave birth to a healthy female calf at the Sumatran Rhino Sanctuary in Indonesia\u2019s Way Kambas National Park. This is the second successful Sumatran rhino birth at the Sanctuary, with Ratu having her first calf, Andatu, in 2012. His father, Andalas, was born at the Cincinnati Zoo in 2001. The birth was attended by Sumatran Rhino Sanctuary veterinarians, keepers and a handful of international advisors. \u201cWe are overjoyed that Ratu delivered a healthy calf and are cautiously optimistic that the calf will continue to thrive,\u201d said Dr. Susie Ellis, executive director of the International Rhino Foundation. \u201cShe\u2019s absolutely adorable, and we haven\u2019t stopped smiling since the moment we were sure she was alive and healthy. While one birth does not save the species, it\u2019s one more Sumatran rhino on Earth.\u201d\n\nThe International Rhino Foundation established the Sumatran Rhino Sanctuary in 1997, and this second birth shows that the expertise exists in Indonesia to contribute to the Sumatran rhino population\u2019s growth. Sumatran rhinos are critically endangered, with less than 100 individuals still surviving in small, fragmented pockets of tropical forest. Their habitat continues to be destroyed by palm oil production, and like all rhino species, Sumatran rhinos are under constant threat of being killed for their horns. The International Rhino Foundation established the Sumatran Rhino Sanctuary in 1997, and operates Rhino Protection Units in two of the four remaining Sumatran rhino strongholds. Ratu\u2019s pregnancy was first announced on World Rhino Day 2015. Source: International Rhino Foundation\n\nComments\n\ncomments"}
 3 | {"text": "For the past year and a half the Home Office has been locking up EU citizens en masse, indefinitely and without charge. If this attack on civil liberties has escaped your attention, it\u2019s probably because the people affected are homeless. Home Office policy to deport EU rough sleepers ruled unlawful Read more\n\nSince the Brexit vote, the pro-remain media has been full of trepidation about how life will change once Britain leaves the EU in 2019 \u2013 and the impact on EU residents living in the UK. But the nightmare scenario \u2013 Brexit as a bonfire of social and legal protections \u2013 has already been a reality for many EU migrants in the UK. Since May 2016, immigration enforcement teams have been targeting migrants who sleep rough using intelligence provided by local councils, the police, and, most shockingly, some homelessness charities, including St Mungo\u2019s and Thames Reach. That has had a real impact on some homeless people. One Polish man caught up in a raid in north London told us how he felt that some homeless charities were not to be trusted. \u201cThey pretend to help people,\u201d he said, \u201cbut actually they gather information for the Home Office.\u201d\n\nIt\u2019s not charities\u2019 job to aid deportations | Lola Okolosie Read more\n\nRough sleepers from the EU have had their documents confiscated and told they have 28 days to \u201cgo home\u201d. Others have been held in detention centres pending administrative removal. Britain is the only country in the EU that allows indefinite immigration detention, so some homeless people have been incarcerated for months while they try to prove their right of residence. The high court\u2019s ruling on 14 December that the Home Office\u2019s policy of detaining and deporting rough sleepers from the European Economic Area is unlawful is the culmination of a year of campaigning by the Public Interest Law Unit at Lambeth Law Centre and the organisation I work with, North East London Migrant Action. It feels like a depressing sign of the times that it took a court order to put a stop to such an obviously callous policy. Anyone who thinks sleeping rough is a \u201cchoice\u201d should spend a month at the sharp end of the UK labour market\n\nThe Home Office said the policy targeted migrants who came to the UK \u201cwith the intention of sleeping rough\u201d and were thus abusing their right to free movement. This version of events has been rejected by the European Commission and the charity Crisis. Most of the homeless Europeans we support have lived in Britain for years. Some have British children. They are often skilled workers who have fallen on hard times. Anyone who thinks sleeping rough is a \u201cchoice\u201d should spend a month at the sharp end of the UK labour market. Zero-hours contracts, rising rents and radical welfare cuts have not just created a new class of working poor; they\u2019ve created a class of working people who become homeless. One man was detained while camping in a tent, despite showing the Home Office proof of employment. On zero-hours contracts, he and his partner simply couldn\u2019t earn enough to rent a flat. Homelessness charities that share information with the Home Office claim they are acting in the \u201cbest interests\u201d of rough sleepers by helping them return home. The testimonies we\u2019ve gathered from EU rough sleepers speak against such careless paternalism. Spending months in Yarl\u2019s Wood is not in anyone\u2019s best interests. The man detained for sleeping rough spent 26 days in detention. \u201cI saw women fainting and falling down,\u201d he said. \u201cI had money on my credit card and wanted to buy food and cigarettes, but they wouldn\u2019t let me. We had to live on 71p a day. I felt like killing myself.\u201d\n\nThe High Court\u2019s ruling is not just a judgment on the hostile environment created by the Home Office. It\u2019s an indictment of a political class that views incarceration and deportation as a solution to the social problems caused by rampant inequality. If someone is sleeping rough they need help \u2013 not deportation | Dawn Foster Read more\n\nRight-wing media coverage of our challenge argued that the Home Office\u2019s policy reduced rough sleeping. Such logic is chilling. Reducing homelessness by deporting the homeless \u201cworks\u201d only in the same way that reducing the benefits bill by leaving disabled people destitute \u201cworks\u201d. Anecdotal evidence suggests some rough sleepers removed from the UK have later died on the streets in their country of origin. Being homeless is not a crime. Freedom of movement is not just for the rich. Such patent truths shouldn\u2019t need repeating, but \u2014 now more than ever \u2014 they do. David Jones is an activist with North East London Migrant Action. Sign up for your free Guardian Housing network newsletter with comment and sector views sent direct to you on the last Friday of the month. Follow us:@GuardianHousing\n\nLooking for a housing job, or need to recruit housing staff? Take a look at Guardian Jobs"}
 4 | {"text": "Image copyright Carlos Spottorno Image caption If you're Swiss, a regular share of this cash could soon be yours\n\nSwitzerland, one of the world's wealthiest countries, is engaged in an intense process of soul searching - about money. This year alone there have been two nationwide referendums on executive pay, one of which approved strict limits on bonuses and banned golden handshakes. Now two more votes are on the way, the first on the introduction of a minimum wage, and the second, and most controversial, on a guaranteed basic income for all legal residents, whether they work or not. A universal basic income sounds very radical, but it is not a new idea - Thomas More proposed it in his work Utopia in the 16th Century. On the left, universal basic income is thought to be fairer, while on the right it is seen as the policy that would make welfare payments obsolete. There will be no incentive for young people to learn a job or study Rudolf Strahm, Swiss economist\n\nFor Enno Schmidt, a key supporter of universal basic income, Switzerland is the perfect place, and 2013 the perfect time, to launch a campaign to introduce it. \"Switzerland is the only place in Europe, and maybe in the world, where the people have the right to make something real, [through] direct democracy,\" he says. That system of direct democracy means the Swiss could vote for free beer if they wanted to. To hold a nationwide referendum, all citizens have to do is gather 100,000 signatures calling for a vote, and the ballot must be held - the result is binding. 'Happy land'\n\nMedia playback is unsupported on your device Media caption Campaigners for a universal basic income dump eight million coins outside the Swiss parliament\n\nThe anger among many Swiss voters at the news that some of their biggest banks, such as UBS, had continued paying top executives huge bonuses while also reporting huge losses, has led to a heated debate about salaries, and more widely, about fairness. In that context, it was easy to gather the 100,000 signatures to hold the vote on universal income, and the government is expected to name a date for the referendum soon. Swiss business leaders have reacted with dismay, one calling it a \"happy land\" proposal, the product of a younger generation that has never experienced a major economic recession or widespread unemployment. Many have also suggested it could provide a major disincentive to working at all, something that could pose problems for Swiss companies already finding it hard to recruit skilled workers. Mr Schmidt denies this, saying the proposed amount for Switzerland, 2,500 Swiss francs ($2,800; \u00a31,750) a month is scarcely enough to survive on, and that anyway a society in which people work only because they have to have money is \"no better than slavery\". Image caption Referendum results are binding in Switzerland\n\nInstead Mr Schmidt argues that universal income would allow people more freedom to decide what they really want to do. \"The thought is not that people will work less, the people are free to decide - more, or less,\" he says. That argument has found some enthusiastic supporters among young Swiss voters. They have adopted a rather clever campaign technique, borrowing eight million five-centime pieces and displaying them around the country as a symbol that Switzerland can afford to pay its eight million inhabitants a universal income. 'A risky move'\n\nChe Wagner is one of the campaigners. He is 25, studying for a master's degree at Zurich university and working for a pizza delivery company. \"I have a daughter,\" he says, \"and so of course I am there for my daughter, I look after her.\" \"But it is also a struggle - I have to work, so we can live. \"I think with a basic income I would still have to work, but I could\u2026 maybe [also] say, 'OK let's spend a week with my daughter.'\" And, when Che and his colleagues dumped their eight million coins outside the Swiss parliament, the politicians inside did not dismiss the campaign out of hand. The idea goes to the personal question - what are you doing in your life, is it actually what you want do? Che Wagner, Universal income campaigner\n\n\"The idea makes sense in a certain way,\" says Luzi Stamm, member of parliament for the right-wing Swiss People's Party. But Mr Stamm adds, it would be a risky move for Switzerland to take as long as it remains inside Europe's free movement of people agreement. \"It certainly does not work in a country like Switzerland. In a country which is wealthy, and has open borders it is suicide.\" Meanwhile on the left, economist and former social democrat member of parliament Rudolf Strahm backs a minimum wage but is against a universal income, believing it would undermine the famous Swiss work ethic. \"There will be no incentive for young people to learn a job or study,\" he says. 64,000 franc question\n\nSo how much exactly would such a scheme cost? No-one is offering precise figures, although there is surprisingly little debate about whether Switzerland could afford it - the consensus seems to be that, financially, the scheme would be doable. Image copyright Thinkstock Image caption We need to think more about our work-life balance, say campaigners\n\nIncome tax would not necessarily rise, but value added tax - on what people buy rather than what they earn - could rise to 20% or even 30%. In the long run, supporters say, money might actually be saved because a basic universal income would replace means tested welfare payments. But the main motivation behind the campaign is not economic but cultural, a bid to make people think more carefully about the nature of life and work. Mr Wagner points out that the whole debate can make people uncomfortable, presenting them with choices that so far have been unimaginable. \"The idea goes to the personal question - what are you doing in your life, is it actually what you want to do?\""}
 5 | {"text": "Russian President Vladimir Putin says he supports the idea of sending a delegation from the country's parliament to Washington to urge members of the U.S. Congress not to support military intervention in Syria. Valentina Matviyenko, who chairs the Federation Council, the Russian parliament's upper chamber, outlined the proposal to send a parliamentary delegation to the U.S. on Monday during a meeting with Putin. The meeting also was attended by Sergei Naryshkin, the speaker of the State Duma, the lower parliamentary house. Matviyenko said she hopes for an \"active dialogue\" with U.S. lawmakers about possible military intervention in Syria. Putin called the idea \"very timely and correct.\" Matviyenko told the Russian president she and Naryshkin \"completely\" share his stance on what she called the \"impermissibility\" of military intervention in Syria without United Nations Security Council approval. On Saturday, U.S. President Barack Obama delayed an expected military strike against Syria, instead telling Americans he will seek congressional approval to punish the Syrian government for its alleged use of chemical weapons. That same day Putin said it would be \"utter nonsense\" for the Syrian government to use chemical weapons because its forces are \"advancing\" against rebels. VOA News Subscribe"}
 6 | {"text": "The biggest non-issue issue in New Zealand politics is immigration, which is amazing when you consider how much of an issue it is overseas. In the USA, Mexicans and Muslims are the two hot words in the presidential primaries; they\u2019re helping to fuel Donald Trump\u2019s meteoric rise from reality TV star to Republican front-runner. In Europe, the \u2018refugee problem\u2019 is threatening to displace German Chancellor Angela Merkel from office and is dividing the EU along ideological \u2013 and geographical \u2013 borders. In the UK, immigrants are a key source of discontent among anti-EU voters. I barely need to mention Australia, where anti-immigrant rhetoric is easy to find on Facebook feeds and Sydney trains. \u201cAustralia for Australians!\u201d I saw on a bumper sticker in Queensland. The Australian disdain for immigrants has encouraged the draconian crackdown on non-Australian residents with criminal records. It also led to a mind-boggling sting last year in which Melbourne police planned to stop \u2018foreign-looking\u2019 people and ask for \u2018their papers\u2019. The action was cancelled only after students swarmed police stations and saluted like Nazis. Over here, immigration barely rates a mention. According to a rolling poll by Roy Morgan, less than one percent of New Zealanders consider immigration as an important issue. The economy, poverty and the environment dominate our list of worries. In the 2014 election, pundits expected immigration and its twin sister, Auckland housing, to be a defining issue. Some even predicted the demise of the National Party in key Auckland electorates. In fact, the government strengthened its lead in central Auckland and National had its best result since 1951. By contrast, voters punished Labour when it clumsily identified Chinese immigrants as the source of the housing problem. What makes it especially remarkable is that migration is at record highs. Just last week, Statistics NZ released figures showing that we reached a record of (seasonally unadjusted) 68,100 migrants in the April 2016 year. \"This is the 21st month in a row that the annual net gain in migrants has set a new record. Before this period, the record was a net gain of 42,500 migrants in the year ended May 2003,\" it said. And, get this, it was more than four times greater than the average net gain recorded over the past 20 years. Four times! The numbers keep defying predictions. Treasury Secretary Gabriel Makhlouf is typical of most economists when he said in February: \"If you'd asked me six months ago, I would have said we were forecasting a fall in net migration, and now we're postponing [the fall]. It\u2019s just carrying on far longer than we expected.\u201d\n\nThat\u2019s a lot of Chinese, Indian and Aussie accents without so much as a \u2018bloody foreigners\u2019 bumper sticker in sight. Why? Why aren\u2019t political parties making more hay from this sun-shower of opportunity? Why don\u2019t we have an anti-immigration rump in Maoridom? Why hasn\u2019t someone credibly made the connection between immigrants and Auckland housing? Why don\u2019t we have a \u2018they\u2019re taking our jobs\u2019 campaign going on? The idealistic part of me thinks that New Zealanders are simply better than all that. The outcry about Labour's aforementioned \u2018Chinese buyers\u2019 stunt gladdened my heart. So does the ratepayer-funded Diwali festival and the Chinese characters on my local ATM. I love that I see every shade of skin at the Mt Albert Pak n\u2019 Save and that Lydia Ko is as much \u2018ours\u2019 as Richie McCaw. I suspect, though, that the real reason is in the numbers. Three actually. 750. That\u2019s how many refugees we take each year. Flap-all by European or US standards. And there\u2019s no evidence boat-loads of starving masses are heading our way soon. We simply don\u2019t have a refugee problem. 1700. That\u2019s the net gain of migrants from Australia. This year is the first in many years that more Kiwis came home than left for the Lucky Country. Ausssie is the country that really matters here. Total number of Ausie mirgants in the last year was 25,000 compared to China (11,700), Philippines (5,500) and South Africa (3,000). $3.7 billion. That\u2019s our annual trade deficit. It\u2019s at the highest level in six years and will grow as dairy prices slide. It\u2019s symptomatic of an economy that\u2019s borrowing more than it makes \u2013 so we need immigrants to prop us up. They bring money for houses and business. They create demand for services and new sofas. They increase the tax take and they\u2019re filling the coffers of Auckland Council with parking fines. Foreign students (part of that net gain figure) are paying for our universities and fund the grocery bills of their homestays. We congratulate ourselves on our innovation and creativity but without immigrants our economy would be screwed. That\u2019s the issue. This article has been updated since its publication in paper and ink to reflect new data."}
 7 | {"text": "Looking for news you can trust? Subscribe to our free newsletters. Late last month, the Federal Communications Commission announced that it would propose new rules allowing companies like Netflix or Google to pay internet service providers (ISPs) like Verizon or Comcast for faster data lanes to deliver video and other content to their customers. In other words, the FCC was proposing to replace net neutrality\u2014the egalitarian internet that we all know\u2014with a pay-to-play platform designed to favor the biggest and richest players. The backlash online was so huge, swift, and predictable that one might wonder what the hell the FCC bureaucrats were thinking. Could a handful of powerful companies really matter more to the commission than pretty much everybody else who uses the internet? The charts below show how a few wealthy special interests wield huge sway within the FCC, particularly with regard to the net neutrality debate. But first, a quick refresher on what net neutrality means:\n\nProponents of net neutrality, also known as the open internet, fear that allowing a fast lane on the web would hurt startups, nonprofits, activists, and anyone else who couldn\u2019t afford to pay the toll. Bigger tech companies such as Google also tend to favor net neutrality, though sometimes more for the sake of public relations than principle. But, you might ask, since the internet is already quite fast today compared with a few years ago, is a few seconds\u2019 difference in the time needed to load a web page really all that important? Actually, yes, it is. Here\u2019s why:\n\nThis might be one reason Barack Obama visited the Googleplex during his first presidential campaign and painted himself as one of net neutrality\u2019s staunchest defenders. Obama\u2019s first pick to lead the FCC, Julius Genachowski, was initially a strong proponent of net neutrality. Genachowski made a video explaining why he wanted to reclassify ISPs as \u201ctelecommunications services,\u201d a legally bulletproof way of preserving an open internet that had long been favored by consumer groups. But he ultimately backed off in the face of an onslaught of lobbying by ISPs. By then their main trade group, the National Cable and Telecommunications Association (NCTA), was spending about 95 times more money lobbying the FCC than the Internet Association, which represents the tech companies that favor net neutrality. Last May, two months after Genachowski stepped down, Obama replaced him with Tom Wheeler, a veteran telecommunications lobbyist who\u2019d served as president of the NCTA before taking the helm of the Cellular Telecommunications and Internet Association (CITA), the lobbying arm of the wireless industry. Obama called him \u201cthe Bo Jackson of telecom.\u201d The New Yorker\u2018s John Cassidy suggested that a more apt sports metaphor might have been \u201cto compare him to one of the lawyers who helped finagle a lucrative anti-trust exemption for professional football and baseball.\u201d\n\nDid Obama like that Wheeler represented two of the most powerful groups that oppose net neutrality, or could he have picked him for some other reason? See below. It\u2019s too early to say whether Wheeler\u2019s new net neutrality rules will be the nail in the coffin for an open internet. The FCC won\u2019t officially reveal them until May 15 (or later), and even then, a lot will depend on the FCC\u2019s discretion. Wheeler has said that the commission won\u2019t allow ISPs to \u201cact in a commercially unreasonable manner to harm the internet,\u201d but what, exactly, does that mean? Is it commercially unreasonable to price the little guys out of faster internet service, or to effectively force people to pay more to watch House of Cards? Who knows? The only certainty is that Wheeler\u2019s former employers, the ISPs and wireless carriers, will flood the zone with lobbyists. With a few notable exceptions, you can assume that tech companies, consumer groups, and content producers favor net neutrality, while ISPs oppose it. Which is to say, if the lobbyists have their way, the future clearly lies in net discrimination."}
 8 | {"text": "Friedrich Wilhelm, Duke of Schleswig-Holstein-Sonderburg-Gl\u00fccksburg (German: Friedrich Wilhelm Paul Leopold; 4 January 1785 \u2013 17 February 1831)[1][2] inherited the title of Duke of Schleswig-Holstein-Sonderburg-Beck in 1816. He subsequently changed his title to Duke of Schleswig-Holstein-Sonderburg-Gl\u00fccksburg in 1825 and founded a line that includes the Royal Houses of Denmark, Greece, Norway, and the Commonwealth realms. Early life [ edit ]\n\nFriedrich Wilhelm was born in Lindenau, East Prussia, to Friedrich Karl Ludwig, Duke of Schleswig-Holstein-Sonderburg-Beck (20 August 1757 \u2013 24 April 1816) and Countess Friederike of Schlieben (28 February 1757 \u2013 17 December 1827). [1][2] He was the third and youngest child of the couple, and the only son. [1][2] In 1804, he was sent to Denmark, where he was an officer of the Danish army during the Napoleonic Wars. Marriage and issue [ edit ]\n\nOn 26 January 1810, Friedrich Wilhelm married Princess Louise Caroline of Hesse-Kassel (28 September 1789 \u2013 13 March 1867),[1][2] a granddaughter of Frederik V of Denmark through her mother, Princess Louise of Denmark. Friedrich Wilhelm and Louise Caroline had ten children:[1][2]\n\nLater life [ edit ]\n\nOn 25 March 1816, Friedrich Wilhelm succeeded his father as Duke of Schleswig-Holstein-Sonderburg-Beck. On 6 July 1825, he became Duke of Gl\u00fccksburg and changed his title to Duke of Schleswig-Holstein-Sonderburg-Gl\u00fccksburg, after the elder Gl\u00fccksburg line became extinct in 1779. Friedrich Wilhelm died on 17 February 1831 at Gottorp. [1][2]\n\nHis grandchildren include among others Frederick VIII of Denmark, Queen Alexandra of the United Kingdom, George I of Greece, Empress Maria Feodorovna of Russia, Crown Princess Thyra of Hanover, Duchess of Cumberland and Teviotdale, and Friedrich Ferdinand, Duke of Schleswig-Holstein. Titles and styles [ edit ]\n\n4 January 1785 \u2013 25 March 1816 : His Serene Highness The Hereditary Prince of Schleswig-Holstein-Sonderburg-Beck\n\n: The Hereditary Prince of Schleswig-Holstein-Sonderburg-Beck 25 March 1816 \u2013 6 July 1825 : His Serene Highness The Duke of Schleswig-Holstein-Sonderburg-Beck\n\n: The Duke of Schleswig-Holstein-Sonderburg-Beck 6 July 1825 \u2013 17 February 1831: His Serene Highness The Duke of Schleswig-Holstein-Sonderburg-Gl\u00fccksburg\n\nAncestry [ edit ]"}
 9 | {"text": "Here is a Sneak Peek at a joint development between Battle Arms Development and CMT Tactical, the Integral PDW STOCK assembly which is not an add-on option but a monolithic design completely machined into the lower receiver. Coming from someone who owns PDW stock from another manufacturer, I can tell you that this system will pretty much fix all of the issues associated with other PDW style stocks on the market. The production PDW STOCK Length will come in at under 4.5\u2033 (prototype shown is 5.75\u2033). This Integral PDW STOCK assembly uses a standard bolt carrier and retains the ability to shotgun the upper which is typically not possible with PDW style stocks. It also features some additional improvements such as ultra smooth guid rods with bushing, stainless steel ball bearing rolling and locking mechanism, strong dimpled rod locking positions, stock is lockable in both directions, adjustable/tune able buffer up to H2 weight, built in QD positions, ability to lock back BCG, etc. One of the improvements that I like the most is the fact that it has a cheek rest. I have a PDW style stock WITHOUT a cheek rest and its not what I would call an ideal setup. As you can imagine, this setup has Multiple PATENTS PENDING! Unique Features/Improvements of this new PDW STOCK/RECEIVER:\n\n1. Lightweight but strong monolithic design from the ground up for the PDW style stock. Less bulk vs add-on versions. 2. Use standard Bolt Carrier, no proprietary carrier (when combined with the new Battle Arms PDW Buffer & Spring System)\n\n3. Ability to shotgun the upper for field service. 4. Stronger & Lightweight dimpled rail design, no large slot cuts in the rail that weakens the stock. 5. Smooth, stainless steel ball bearing like action\u2026wait, it is on ball bearings! 6. Continuous cheek weld! 7. Lockable in both directions\n\n8. Easy single motion latch release, no need to fumble with two hands to get the stock released. 9. Compact, 4.5\u2033 overall length collapsed\u2026about 1\u2033 shorter than the prototype. 10. Ability to lock back BCG. 11. Ability to \u201cTune\u201d the rifle with different \u201creciprocating weight\u201d buffers, up to the equivalent of a H2. Normally you have only one choice for proprietary buffer. 12. Quiet locking mechanism for stealth. They are hoping to release the Integral PDW Lower Receiver and Stock some time this summer so start getting ready now. No pricing information has been released but you can kind of come up with an idea of a number if you understand that you will be buying a lower receiver and PDW stock assembly together. If you would like to keep informed as this product progresses through the final design changes, click on the links below and follow their respective Facebook and Instagram pages closely. Battle Arms Development Facebook Page\n\nBattle Arms Development Instagram\n\nCMT Tactical Facebook Page\n\nCMT Tactical Instagram"}
10 | {"text": "Saudi Arabia never, ever disappoints when it comes to barbarism and backwardness. The latest example is a supposed threat to sue a Twitter user for making the obvious and accurate comparison of the regime to ISIS, something I myself have done on countless occasions. The Washington Post reports:\n\nAuthorities in Saudi Arabia have long been annoyed that everyone keeps suggesting they are anything like the Islamic State. Sure, they say, perhaps some of the laws on the books may look similar to the punishments in the extremist organization, but the Saudi kingdom is a sovereign state that abides by the rule of law and uses these punishments with discretion. Now, reports in the Saudi press suggest that authorities have a new tactic for those who compare them to the Islamic State: taking them to court. According to a report in pro-government newspaper Al Riyadh, the Saudi justice ministry is planning to sue a Twitter user who suggested that a death sentence recently handed out to a Palestinian artist for apostasy was \u201cISIS-like.\u201d \u201cQuestioning the fairness of the courts is to question the justice of the Kingdom and its judicial system based on Islamic law, which guarantees rights and ensures human dignity,\u201d a source in the justice ministry told the newspaper, according to a translation by Reuters. The ministry would not hesitate to sue \u201cany media that slandered the religious judiciary of the Kingdom,\u201d the source added. I suppose it\u2019s not apparent to the Saudis how the only thing this pathetic threat does is further confirm the worst suspicions of its rapidly growing population of critics from around the world. Of course, it\u2019s not the first time these authoritarian monarchs have expressed a panic attack about Twitter. Recall the following post from 2013: Saudi Religious Police Chief Goes on the Attack\u2026Against Twitter. The best part about this latest Saudi stupidity is the fact that it\u2019s backfiring spectacularly, with a meme created on Twitter where users are begging the Saudis to sue them via the hashtag #SueMeSaudi. Simply brilliant. As the Huffington Post reports:\n\nPeople are inviting Saudi Arabia to sue them after the state reportedly threatened legal action against someone who compared its imminent execution of a Palestinian poet to the actions of Islamic State. The country\u2019s justice ministry would \u201csue the person who described \u2026 the sentencing of a man to death for apostasy as being \u2018ISIS-like\u2019,\u201d a source within the ministry told the pro-government newspaperAl-Riyadh. Here are a couple examples:\n\nSaudi Arabia, please sue me.I will be in Leb next week for extradition. \u201cSaudi Arabia to sue anyone who compares their justice syst to Isis\u201d \u2014 NassimNicholasTaleb (@nntaleb) November 26, 2015\n\nFree speech and a choice of faith is a human right. #SueMeSaudi pic.twitter.com/scTwbqmvXT \u2014 Ajanta Deb Roy (@ajantadebroy) November 28, 2015\n\nThugs till the end. For related Saudi ISIS-like behavior, see:\n\nSaudi Arabia Sentences Poet to Death for \u201cRenouncing Islam\u201d\n\nSo Who\u2019s Really Sponsoring ISIS? Turkey, Saudi Arabia, and Other U.S. \u201cAllies\u201d\n\nSaudi Arabia Bombs Second Yemeni Wedding in a Week \u2013 At Least 23 Dead\n\nSaudi Arabia Bombs Second Yemeni Wedding in a Week \u2013 At Least 23 Dead\n\nSaudi Arabia Forces the UN to Drop Humanitarian Inquiry Into Yemen Atrocities\n\nNot a Joke \u2013 Saudi Arabia Chosen to Head UN Human Rights Panel\n\nSaudi Arabia Prepares to Execute Teenager via \u201cCrucifixion\u201d for Political Dissent\n\nIn Liberty,\n\nMichael Krieger\n\nDonate bitcoins: Like this post?Donate bitcoins: 3J7D9dqSMo9HnxVeyHou7HJQGihamjYQMN\n\nFollow me on Twitter."}
11 | 


--------------------------------------------------------------------------------
/openwebtext_tiny/val.jsonl:
--------------------------------------------------------------------------------
 1 | {"text": "The tragic incident took place on Sunday evening, ending the life of a much-loved and respected national team's No.1\n\nSouth Africa and Orlando Pirates goalkeeper and captain Senzo Meyiwa was shot dead on Sunday in Vosloorus, South Africa. The exact details of the incident are yet to transpire, but the South African Police Service (SAPS) have provided confirmation of the South African international's tragic death. The 27-year-old was declared dead upon arrival at hospital, according to the authorities, who appealed for calm as people rushed to the hospital and the house where he was shot to grieve and pay their respects. MEYIWA STATEMENT: The @Orlando_Pirates family has learned with sadness of the untimely death of our number 1 keeper & captain Senzo Meyiwa. \u2014 Orlando Pirates FC (@Orlando_Pirates) October 26, 2014\n\nMEYIWA STATEMENT: \"This is a sad loss to Senzo's family especially his children, to Orlando Pirates & the nation\", said Chairman Dr Khoza. \u2014 Orlando Pirates FC (@Orlando_Pirates) October 26, 2014\n\nMEYIWA STATEMENT: This statement is being issued whilst Club officials are up & about dealing with matters arising from the unsettling news \u2014 Orlando Pirates FC (@Orlando_Pirates) October 26, 2014\n\nThe Orlando Pirates star was in action as recently as Saturday, in his side's 4-1 win over Ajax Cape Town in the quarter-final of the Telekom Knockout tournament. Meyiwa captained Bafana during the recent 2015 Africa Cup of Nations qualifying matches against Congo, Nigeria and Sudan, taking over the skipper's armband in the absence of Kaizer Chiefs' Itumeleng Khune. Meyiwa's contribution to the national team has been immense in the Afcon qualifiers, with Bafana yet to concede a goal after four games while topping their table ahead of Sudan, defending champions Nigeria and Congo. Pirates Chairman Dr Irvin Khoza tweeted, \"This is a sad loss to Senzo's family especially his children, to Orlando Pirates & the nation.\" The club have announced that they will hold a press conference on Monday."}
 2 | {"text": "Ijen Volcano Crater is one of the most attractive and dangerous places in the world. Active volcano, constantly spewing sulfur smoke clubs, the world\u2019s largest acid lake Kawah Ijen, incredible in its beauty blue fire and extremely hard working conditions in the sulfur mine. We went down into the crater to see everything with our own eyes. The Ijen volcanic complex consists of several stratovolcanoes and cinder cones and a 20 km wide caldera. Ijen caldera is the largest in Java. The size of the crater is about 960 m x 600 m. The primary dangers at Ijen are pyroclastic flows, lahars, and lava flows. But tourists are interested in the acidic crater lake shore which is a natural large deposits of sulfur. Way to Kawah Ijen\n\nIn order to reach the famous lake, you must first climb to the top and then descend into the crater. The way from the ticket office to the top takes about 3 km, the altitude difference is 500 m. If you do not hurry up, the road is rather simple. It\u2019s steeper at the beginning and almost flat after a cafe halfway. Typically an entrance to the crater opens at 1am. But in case of a high activity of the volcano this time can be shifted to the morning or even prohibited for tourists. On the day when we climbed, the entrance opened about 3.45 am. This meant that we had less than an hour to arrive to the crater lake, if we want to see blue fire. We got to the top in about 45 minutes, and another 15 minutes spent on the descent. To be honest, it wasn\u2019t easy for me to move with such speed. And, admittedly, we arrived to late. On the top of Ijen crater we saw a few flashes of blue fire, but when we descent to the lake after 5 am, it was already light. So I could make some pictures with just one blue fire flash on photo. It was too late for a super photo filled with blue light. The road back is very beautiful. Mountain peaks of volcanic complex, a fog and clouds shrouding slopes, scorched trees and dense vegetation of forests and fields are on the way. Is It Dangerous to Visit Ijen Crater? It is impossible to stay a long time close to a dense acid gas without a mask. Olivier Grunewald\n\nYes for sure! Acrid sulfur gas (sulfur dioxide) is very dangerous to health. High temperature near the pipes from which sulfur flows is a life-threatening. For good pictures, I went to the epicenter of heavy whitish-yellow smoke. Despite the mask, it was very difficult to breathe, my eyes filled with tears, workers shouted \u201cDangerous!\u201d after each next step on the slope above the hot liquid sulfur. All day long I \u201cfelt my lungs\u201d. I think that just one such adventure is not terrible, but it\u2019s better do not risk once again. It\u2019s better to be careful when you are going down into the crater. First, be sure to wear a respirator or a protective mask. The more effective it filters the air the better. There are casualties among tourists, who neglected safety rules. Second, keep in mind that the road goes across the rocks. Comfortable shoes and gloves will help you overcome the way. Crater Stunning View\n\nBeing on the slopes of Ijen crater you remember Divine Comedy and 9 circles of hell. Lifeless slopes, flowing red-hot sulfur, clubs of acrid yellow gas rushing from the earth and emerald deadly acid lake with sulfur plumes on the surface. Kawah Ijen Acid Lake\n\nKawah Ijen Crater Lake, at the top of the volcano, is the world\u2019s largest such body of water filled with hydrochloric and sulfuric acid. The volcano emitted hydrogen chloride gas, which reacted with the water and formed a highly condensed hydrochloric acid with a pH of almost 0. In fact, it\u2019s the hydrochlori acid that makes the water turquoise-blue. The lake is a deadly dangerous, however it can be touched by hand. A temperature on the surface is 50-60\u00b0\u0421, and in the depths \u2013 over 200\u00b0\u0421. The depth of the lake is about 200 meters. Blue Fire\n\nThe blue fire is ignited sulphuric gas, which emerges from cracks with temperatures up to 600\u00b0\u0421. The glow is quite weak, so it can be seen only at night. Lava flows burning with blue flame can be seen on the Ijen extremely rare\n\nSometimes sulfur is ignited by the workers. To speed up the formation of the mineral, a mining company installed ceramic pipes on an active vent near the edge of the lake. The pipes route the sulfur gases down the vent\u2019s sloping mound. When the gases cool, they condense into liquid sulfur, which then flows or drips from the pipes and solidifies into hard sulfur mats. Workers sell these stalactites to tourists as souvenirs. Lava flows burning with blue flame can be seen on the Ijen extremely rare. Unfortunately, many websites show pictures of Olivier Grunewald and give the impression, that it happens every night. Don\u2019t believe it! Usually you can see burning sulfur dioxide gas, no lava at all. Sulfur Mines in the Crater\n\nMiners have been extracting sulfur here for more than 40 years. This is a very difficult and dangerous job. Without protective clothing, and without masks, miners cut sulfur pieces and put them into a basket. Then they carry these baskets 200 meters to the top of the crater, and then descend 3 km to the foot of the volcano to a village, where they are rewarded for their work. The weight of such baskets is 60-80 kg, some people manage to lift up to 110 kg. Typically workers do such journey twice a day. For 1 kg of sulfur they get 900-1000 IDR, which means about $ 5 per basket or $ 10 per day. By local standards it\u2019s a highly paid and prestigious job. Java is a very high populated island, a lot of people unemployed. Sulfur miners are a kind of working elite. The best thing you can do for workers is to give a respirator. However, this doesn\u2019t help them live long. Sulfur gas is so dangerous to the health, that young guys look like old men, and the average life expectancy of about 47 years. Despite the extremely hard working conditions, miners are amazingly friendly and cheerful people. I experienced culture shock when a worker with a basket, the weight of which exceeds his own, had step side to let me go. All the time they show tourists a better way across rocks and gladly posed for photographers. The best thing you can do for the workers is to give them a respirator or even a face mask. They have no money and no opportunity to even change the filters. Many workers don\u2019t know that the air they breathe is dangerous. All miners smoke. They say that it helps them to reduce smell of sulfur a little bit. Where to Stay? Usually travellers stay as close to Ijen crater as possible. You can look at Asparin Homestay 1* or Ijen Resort and Villas 3* located near by the volcano. How to Get to Kawah Ijen Volcano from Bali? It\u2019s not difficult to Ijen volcano from Bali. You need to get to the ferry terminal on the west of Bali island, take a ferry to Java island and drive about 2 hours to the foot of the volcano. It\u2019s better to start climbing at night, so plan to spend a night near by the volcano. Most of people combine a trip to Ijen volcano with a trip to the Bromo Tengger Semeru National Park and climbing to the top of Bromo crater. In this case, the possible alternative is to arrive in Jogjakarta (Yogyakarta), Surabaya or other town in Central / East Java, and then drive east to the volcanous. But please take into account that the way from Jogjakarta (Yogyakarta) to Bromo may take up to 12 hours. General Information:\n\nDate of the trip : 22 of February, 2015\n\n: 22 of February, 2015 Place : Ijen Crater (Kawah Ijen)\n\n: Ijen Crater (Kawah Ijen) Location : Ijen, East Java, Indonesia\n\n: Ijen, East Java, Indonesia Ticket price : 15,000 IDR\n\n: 15,000 IDR Time for visiting: 4-6 hours\n\nKind request If the post was useful for you, please share it on Facebook, Twitter, Reddit or any other social network. You can find all social buttons below. It's just one click! Thank you very much! Nearby Attractions:"}
 3 | {"text": "You don't have to go far to find examples of everyday discrimination in Europe. There are Croatian landlords who refuse apartments to lesbians, Belgian cafes that won't serve women in headscarves to \"maintain security,\" French banks who refuse loans to people on disability benefits, German swimming pools that deny access to blind women, and Slovakian clinics that refuse certain treatments for multiple sclerosis patients older than 45. All these examples come from a recent report by Equinet, an umbrella organization for European equality bodies, one of more than 40 organizations that on Thursday issued a fresh call for the European Union to adopt a single anti-discrimination law. \"In many countries, people with disabilities can be refused access to offices, gays and lesbians can be refused hotel rooms, people can be refused rental cars because of their age, and people can be denied rental contracts because of their Jewish or Muslim religious beliefs,\" the organizations said a statement. \"Such differing levels of protection should no longer exist in Europe.\" Disability access is a particular concern across Europe\n\nDifferent standards\n\n\"Europe is a community of values where human rights are upheld, so it shouldn't be that people in different countries get different human rights protection,\" said Maja Liebing, discrimination specialist at the German branch of Amnesty International, which also signed the appeal. \"There is relatively good protection against discrimination on the basis of ethnic background, and I think no one would question that it is important in other areas.\" A draft law has been on the table since 2008 - seven years during which Germany has been holding up negotiations. \"For a long time Germany wasn't the only country to do it, there were others who expressed reservations - but that's changed,\" said Liebing. \"Now Germany really is the only country at the EU level still blocking.\" That is ironic, because Germany has relatively comprehensive discrimination protection laws in place. \"We don't understand why Germany is stopping this,\" Liebing said. \"It's something that affects German citizens too, when they travel abroad in Europe. If a gay German couple went to an eastern European country, for example, they could find themselves being refused a hotel room. Even in Austria, two lesbians were thrown out of a cafe for kissing.\" Who is holding things up? Germany\n\nAll three of Chancellor Angela Merkel's successive governments have failed to even offer an explanation for their position. \"The government has just not faced the issue,\" said Liebing. \"There is no official position, and the issue doesn't come up in the coalition contract at all.\" Amnesty thinks the Chancellery is still harboring reservations\n\nWhen DW asked the government for a statement, Family Ministry spokeswoman Verena Herb said: \"The consultations within the government have not yet been completed. That is why we have not yet taken a position on the matter in Brussels.\" The junior partner in the current coalition government, the Social Democratic Party, has said it is in favor of the directive, which suggests the Chancellery still harbors reservations - though what they are is anyone's guess as it has not said anything on the topic. \"A first step would be for them to explain why they have a problem with this issue,\" said Liebing. Should the law be passed in Europe, EU member states would have to adopt the legislation at a national level. Different countries would have to make different adjustments - but broadening disability access would likely be one of the largest areas. Currently there is some movement. The European Commission has recently made the issue a priority, again, and wants to finally put a bill before the European Parliament - if Merkel's government will let that happen."}
 4 | {"text": "SEATTLE, WA, November 21, 2017 /24-7PressRelease/ -- The extinction of salmon is a real threat that must be stopped. The construction of dams and water diversions has done irreversible damage, blocking the natural passage of the salmon. Activities like gravel mining, logging and urban development have also seen the returns on salmon dip sharply. Dam operators are spending billions and are frustrated with the current direction, with minimal results towards the threat. This is where Northwest Fisheries Enhancement would make a perfect fit, and hopefully dam operators will back NWFE. This is why NWFE is launching this all out crowdfunding campaign to save the salmon. The Seattle Times argues that budget cuts proposed by President Donald Trump will essentially put an end to funding for salmon recovery, leaving state and local resources strained, and endangering the species even further. With an increasing reliance on imported fish, and shrinking jobs in hatcheries and fisheries every day, it is clear a salmon extinction will be devastating. Salmon are an important way of life. Rivers full of salmon mean a healthy ecosystem. Alive, the salmon check the levels of algae and smaller fish. When they die, the nutrients from their bodies nurture forests and the ecosystem at large. It goes full circle. Not only does their movement bring nutrients to our rivers and shores, they are a big pull for tourists and they create jobs for locals. They are a cornerstone of culture, diet, economy and our environment. Northwest Fisheries Enhancement seeks to actively play a role in stopping the extinction of salmon. It is a non-profit organization whose approach is geared towards salmon recovery. It will do this through:\n\n- Developing partnerships that offer hatchery owners competitive management options without relying too heavily on the government. - Strengthening and building Community Fisheries Organizations (CFOs) to help improve Salmon and Steelhead returns to their rivers. - Meeting responsible and modern practices for Hatchery options and Wild/ Natural Origin Salmon with New and Proven models. You can join the efforts to save salmon from extinction today. It is a solid, reasonable investment, as shown by the Seattle Times in March 2017. By restoring watersheds and other projects, investing in the recovery of salmon created more than 16 jobs for every 1 Million dollars. The money remains in the local county too, meaning more can be done with it. It is the the intention of NWFE to help Community Fisheries Organizations (CFO) with Salmon and Steelhead egg, fry and smolt plantings and re-introduction into their rivers and streams by the millions. There are hundreds if not thousands of River and streams where CFO's can make a difference in saving and returning salmon and Steelhead runs. NWFE will provide hatchery management for hatchery owners, improving their Salmon and Steelhead smolt releases by the millions, decreasing management costs and improving efficiency. By donating to the cause now, you are saving the jobs of the people who work daily with the salmon. This is the surest way to eliminate the loss of salmon through mismanagement. Take matters into your own hands; join Northwest Fisheries Enhancement in creating real change. For information on how to donate so \"Save The Salmon\" crowdfunding, visit: https://www.generosity.com/animal-pet-fundraising/save-the-salmon\n\n# # #"}
 5 | {"text": "From Artificial Intelligence and connected objects to bots, Bluetooth and RFIDs \u2013 the demand for IoT technology is increasing more than ever in Canada, according to a report from IDC. Beta Kit reported that with the costs of connected devices going down, around 50% of mid to large Canadian businesses adopted or intend to adopt at least one IoT solution between 2016 and 2017. And even though the United States seems to be leading the pack, there are many factors indicating that Canada is right behind it. According to last year\u2019s report from Brookfield Institute for Innovation and Entrepreneurship:\n\nThe tech sector was directly responsible for $117 billion or 7.1 percent of Canada\u2019s economic output, greater than that of the finance and insurance industry. 864,000 Canadians were employed in the tech sector, which made up 5.6 percent of Canada\u2019s total employment. At over $9.1 billion, the tech sector was by far the largest private sector investor in research and development. The tech sector was also comprised of nearly 71,000 firms that year, representing 6.1 percent of all Canadian businesses. Tech sector employees earned approximately $67,000 a year, compared to the national average of nearly $48,000. The scenario is definitely promising. Whether you\u2019re a startup looking to disrupt an industry or an established company looking into generating more revenue, there are endless reasons to invest in the Internet of Things (IoT), especially in mobile. Canada is already seen as a leader in technologies such as Artificial Intelligence, and companies like Google and Uber are already aware of the opportunities: as we mentioned in our past blog post, they\u2019re heavily investing in their own AI hubs in Toronto. According to Jennifer Sewell, Product Marketing Manager at mnubo, \u201cIoT and AI have positioned Canada as an innovative hot bed. But more importantly, it is opening the door for entrepreneurs. With access to resources and skilled workers, entrepreneurs are encouraged to keep their business local.\u201d\n\nAs a mobile app development agency, we like to stay informed and keep our eyes open to emerging technologies. That\u2019s why we love benchmark and made a list of the Canadian Internet of Things (IoT) app startups we should all keep an eye on. Ready to get inspired? Toronto IoT App Startup #1: EcoBee\n\nAccording to the team:\n\n\u201cThe Toronto-based company introduced the world\u2019s first Wi-Fi enabled smart thermostat across North America in 2007. Since then, ecobee has enabled hundreds of thousands of consumers to control their home comfort anytime, anywhere using their smartphone, tablet or computer. Because half of a home\u2019s energy use comes from heating and cooling, ecobee is one of the best investments a homeowner can make. It has proven to deliver a return on investment within approx. 1 year, saving 23% (avg.) on your heating and cooling bills, without sacrificing comfort.\u201d\n\nCity: Toronto, ON\n\nSize: 201-500 employees\n\nWebsite | Angel List Profile\n\nToronto IoT App Startup #2: Flybits\n\nAccording to the team: \u201cFlybits is changing the mobile paradigm by creating context-aware software that can sense and adapt to who you are, where you are, and what you\u2019re doing. Our platform enables consumer-focused enterprises to deploy and manage mobile experiences that deliver targeted content and services. While we are focused on mobile today, our vision extends as far as wearables, smart devices, embedded technology and emerging Internet of Everything trends.\u201d\n\nCity: Toronto\n\nSize: 11- 50\n\nWebsite | Angel List Profile\n\nToronto IoT App Startup #3: ModGarden\n\nAccording to the team: \u201cModern modular approach to \u2018hyperlocal micro-farming\u2019 (TM) for the urban dweller. Modgarden is a vegetable growing unit that allows users to grow a variety of plants from leafy greens, herbs, microgreens and some root vegetables. The system uses a soil based approach to maintain the organic-ness of the produce. This approach significantly expands the universe of vegetables that can grow in Modgarden, given the variant planter depths. Further versatility is accomplished with adjustable shelves and stackability of units. The solution is meant to be pragmatic to achieve a significant portion of the dinner plate increasingly supplanting the grocery store. Feature wise the system has App functionality for monitoring, guidance and maintenance with a community forum where users interact compare (and trade with those within \u201cTinder\u201d radius). There are cloud and IoT aspects that will grow as the product takes hold.\u201d\n\nCity: Toronto\n\nSize: 11- 50\n\nWebsite | Angel List Profile\n\nToronto IoT Startup #4: Open Channel\n\nAccording to the team: \u201cSimple Ecosystem Management \u2013 From API to UI, OpenChannel provides everything you need to create and manage your partner directory or plugin marketplaces on the app store. Our API and customizable UI templates give you full control over your developer and end user experience. Create your own \u201capp types\u201d and custom workflows across web, mobile, desktop, wearables, IoT and even virtual reality! By eliminating internal development, unexpected delays, ongoing support and maintenance, your marketplace will be and running in days or weeks, not months or years \u2013 getting you to market 10X faster and 90% more cost-effectively. Our focus on the marketplace, developer and ecosystem needs means you can focus on building a world-class ecosystem. Improving documentation, technical support and program design are the front lines for you and your team, and now you can be there. City: Toronto\n\nSize: 11- 50\n\nWebsite | Angel List Profile\n\nGreater Toronto Area IoT App Startup #5: Alert Labs\n\nAccording to the team: \u201cAlert Labs\u2019 Insight series protects our customers\u2019 properties by gathering and analyzing real-time data using attractive, reliable, and easy-to-install IoT hardware and software. Sensors can be installed by anyone in less than 2 minutes, and then they are on the alert for water leaks, floods, power issues, and more. Alert Labs monitors your water usage, power, heating/cooling, temperature, humidity, sump pump, and water softener. Customers receive real-time alerts to their smartphones for emergency events (e.g. a flood) and gain insight with long-term big data analytics (i.e. escalating water use or inefficient equipment) in order to avoid damage to their properties and to save money, hassle, and natural resources. Sensors are cellular, so no Wi-Fi is required. Battery back-up ensures sensors continue to monitor during a power outage. Our target audiences are property managers, landlords, cottage owners, and homeowners.\u201d\n\nCity: Kitchener (Greater Toronto Area)\n\nSize: 11- 50\n\nWebsite | Angel List Profile\n\nToronto IoT App Startup #6: Knitt Labs\n\nAccording to the team: Knitt is an Internet of Things (IoT) hardware and software platform that tracks, measures and monitors electricity consumption at an outlet and switch level. Knitt\u2019s hardware is Wi-Fi enabled and can be used as an outlet of any kind, a switch, at the breaker level and even as a power bar. Knitt\u2019s software is where the data is accessed and can be viewed historically and in real-time. The software also enables households and businesses to remotely control, schedule and set smart rules for when things turn on or off. Smart rules are trigger-based rules like ambient temperature, humidity, light, sound level, and much more. This is made possible by our environment sensor that we created to integrate with the Knitt platform. Knitt is also smart, it has machine learning algorithms built into it that learns the way we consume electricity. It helps us automatically manage our usage better, which reduces energy waste, in turn saving us money and helping the environment. City: Toronto\n\nSize: \u2013\n\nWebsite | Angel List Profile\n\nToronto IoT App Startup #7: Ubiquilux\n\nAccording to the team: \u201cCustomers have been using simple switches on a daily basis all their lives. We augment these ubiquitous interfaces in a way which keeps the \u201cold\u201d way of using the switch but adds patented features enabled by the GestureSense\u2122 and ActiveSaver technology. Our first product (shipping now) is an \u201caugmented\u201d gesture controlled and actively energy saving dimmer switch, followed by a family of smart lighting products including connected switches. Our e-Motion switch can be described as:\n\nCapable \u2014 As good or better than high end dimmers, works with LED lights\n\nConvenient \u2014 Use with or without having to touch it\n\nSanitary \u2014 Touchless feature stops spread of dirt and germs\n\nNovel \u2014 Unique aesthetic design\n\nSmart \u2014 Reacts to its environment\u201d\n\nCity: Toronto\n\nSize: 1 \u2013 10\n\nWebsite | Angel List Profile\n\nDid we miss any? Did you agree with this list? Don\u2019t hesitate to reach out and let us know what you think!"}
 6 | {"text": "Management Info\n\nPreventative measures:\n\nRisk assessment models by the Bureau of Rural Sciences, Australia, classifies the common myna in the highest threat category (Bomford 2003). The common myna is prohibited in Western Australia (Massam 2001). A Pest Animal Risk Assessment using a numerical risk assessment system developed by Bomford (2006) was carried out by the State of Queensland, Department of Employment, Economic Development and Innovation, in 2009. Indian mynas in Queensland were assessed as an \u2018extreme\u2019 threat species. See Markula et al 2009\n\nPhysical: Foraging traps are very useful for the control of small myna populations if poisoning is not an option. The Tindall Trap and the Tidemann Trap have been used successfully in New Zealand and Australia, respectively. The Decoy Trap, Kadavu Trap, Larsen Trap, Rat snap-trap and other foraging traps have also been used for trapping myna birds with less success. Please follow this link to view a plan of Peter Green's starling, sparrow and myna trap is available . Mynas roost in large concentrations and netting operations and nest trapping may be appropriate for control. Mynas should be provided with food, shelter and perches in cages a few days prior to trapping. Mynas should be killed humanely by euthanasia with carbon dioxide (Thomas 2004). For more information on humane trapping and disposal of birds please see the Tidemann, C. 2007f. Common Indian Myna Website > Trapping Mynas and Tidemann, C. 2007g. Common Indian Myna Website > Humane Disposal. Chemical : Starlacide DRC1339 has been used against mynas and is effective where there are no non-target species issues. Alphachloralose paste is used for temporary local control of mynas in cooler climates. For more information on the use of these toxins please see NZFSA. Undated. DRC 1339 For Bird Control and Nelson. 1994. Bird Control in New Zealand Using Alpha-Chloralose and DRC1339.\\r\n\nIntegrated Pest Management : As invasive bird species are frequently associated with human modified environments IPM is an appropriate strategy (Lim Sodhi Brook and Soh 2003). Long term management practices may include habitat modification, resource limitation and public education. Restricting food available to the myna is difficult as it has such a variable diet (Thomas 2004). The need to raise public awareness is important part of IPM. Envirotalk Australia has a forum discussion on the myna topic. The Minimising Myna Website aids public education on the issue. The Canberra Indian Myna Action Group is a community group that has developed a number of strategies, including public education and a trapping program, to tackle the common myna. CIMAG's trapping program has been highly successful and has humanely removed over 12 000 Mynas from around Canberra in around 18 months (CIMAG Undated)."}
 7 | {"text": "Buy Photo Navajo Nation Council Delegate Leonard Tsosie speaks during a presentation before the start of the spring session on April 18 in Window Rock, Ariz. (Photo: Noel Lyn Smith/The Daily Times)Buy Photo\n\nFARMINGTON \u2014 A new bill is asking Navajo lawmakers to place a referendum before tribal voters, asking them to approve consolidating the nation's 110 chapters into a regional government structure. The bill, which was posted July 9 on the Navajo Nation Council's website, asks the council to place the referendum on the Nov. 8 general election ballot. The referendum would require a simple majority of votes in order to pass, according to the bill. If voters approved the measure, it would consolidate the current 110 chapters into 24 regional governments. It would also eliminate the positions of chapter president, vice president and secretary-treasurer and replace those positions with elected commissioners. The proposed ballot language explains that chapters with fewer than 1,000 registered voters would be represented by one commissioner, and chapters with more than 1,000 registered voters would be represented by two commissioners. There are 11 chapters in the Northern Agency that have more than a 1,000 registered voters, according to the Navajo Election Administration office in Shiprock. The chapters would remain as nongovernmental communities, and the transition would occur within four years, the bill states. Delegate Leonard Tsosie, who represents the Baca-Prewitt, Casamero Lake, Counselor, Littlewater, Ojo Encino, Pueblo Pintado, Torreon and Whitehorse Lake chapters, is sponsoring the bill. The bill was assigned to the Resources and Development and Naa'bik'\u00edy\u00e1ti' committees and to the council, where final authority rests. In a telephone interview Thursday, Tsosie said the referendum was developed based on a recommendation from the Title 26 Codification Task Force Committee. The task force was formed by the Resources and Development Committee in May 2015, and membership includes personnel from the Office of Navajo Government Development, the Land Department, the Department of Justice, the Office of the Auditor General and other entities. If regional governments were established, the amount the tribe allocates to chapters for administrative and operational costs could be reduced and funds to provide services at the local level increased, Tsosie said. The total amount of funding distributed in the fiscal year budget is divided among the 110 chapters, according to the Office of Management and Budget. For the proposed fiscal year 2017 budget, the Budget and Finance Committee recommends approximately $11.8 million for nonadministrative costs, approximately $3.8 million for capital infrastructure and $13.7 million for operational costs, according to figures provided by the OMB. Chapters also receive sales tax revenue, which is also divided between the 110 chapters, according to the OMB. Earlier in the year, the Office of Navajo Government Development held public education sessions to present information about amending the chapter government system. Tsosie said he attended a number of sessions and heard several comments opposing the change. \"It's an idea, and I've been telling chapters to give it a chance. \u2026We hope the people will give it a serious evaluation,\" he said. After the standing committees consider the bill, it could go before the council during a special session in August, Tsosie added. Noel Lyn Smith covers the Navajo Nation for The Daily Times. She can be reached at 505-564-4636. Read or Share this story: https://www.daily-times.com/story/news/local/navajo-nation/2016/07/15/bill-proposes-referendum-tribal-voters/87144380/"}
 8 | {"text": "Police have released photos of a man suspected of sexually assaulting a woman in the Edgewater neighborhood on the North Side. The assault happened about 5:40 a.m. on Oct. 26 in an alley on the 5600 block of North Kenmore Avenue, police said. The woman was walking in the alley when a man approached her and struck her numerous times in the head, police said. The man robbed her and tried to sexually assault her. The man was described as black, between the ages of 18 to 25, about 6 feet tall and weighing about 175 pounds, police said. The man has a dark complexion and was last seen wearing a white jacket with a red stripe on each shoulder, police said. Anyone with information can contact the Area North Detective Bureau at (312) 744-8261.\n\ndawilliams@tribune.com\n\nTwitter: @neacynewslady"}
 9 | {"text": "You may remember last year Callin\u2019 Oates was setup as a wonderful hotline to hear your favorite Hall & Oates songs. This year they have added 8 more Hall & Oates songs as well as a hotline for Bob Dylan, Neil Diamond, Phil Collins and Chicago. We highly recommend adding these to your phone directory in case of emergency. Here\u2019s the info via twilio:\n\nNew: Bob Dialin\u2019 (Bob Dylan Helpline) \u2013 +1 (615) DYLAN \u2013 26\n\nDid you forget how if feels to be on your own, with no direction home? Do you need shelter from the Bob Dylan-less storm? Don\u2019t think twice, call Bob Dialin\u2019 to hear your favorite Bob Dylan tunes. Press #1 for \u201cLike A Rolling Stone\u201d\n\nPress #2 for \u201cDon\u2019t Think Twice It\u2019s Alright\u201d\n\nPress #3 for \u201cShelter From The Storm\u201d\n\nPress #4 for \u201cI\u2019ll Be Your Baby Tonight\u201d\n\nNew: Diamond Rings (Neil Diamond Helpline) \u2013 +1 (424) 543 \u2013 NEIL\n\nHankering for some Neil? Play it now, play it now, play it now my baby\u2026 Here\u2019s what Diamond Rings has to offer. Press #1 for \u201cCracklin\u2019 Rosie\u201d\n\nPress #2 for \u201cSweet Caroline\u201d\n\nPress #3 for \u201cCrunchy Granola Suite\u201d\n\nPress #4 for \u201cBrother Love\u2019s Traveling Salvation Show\u201d\n\nNew: Phil Call-ins (Phil Collins Helpline) \u2013 +1 (424) 888 \u2013 PHIL\n\nWhether you need to hear that iconic drum fill on \u201cIn The Air Tonight\u201d in a pinch, or just figure out what exactly Phil Collins is saying on \u201cSussudio,\u201d you can count on Phil Call-ins. Press #1 for \u201cEasy Lover\u201d\n\nPress #2 for \u201cAgainst All Odds\u201d\n\nPress #3 for \u201cIn The Air Tonight\u201d\n\nPress #4 for \u201cSussudio\u201d\n\nNew: If You Ring Me Now (Chicago Helpline) \u2013 +1 (34724) 25 (or) 624\n\nSaving If You Ring Me Now in your contacts is almost as good as having Peter Cetera\u2019s cell number. Just give it a call to hear your favorite Chicago songs. Press #1 for \u201c25 or 6 to 4\u201d\n\nPress #2 for \u201cSaturday In The Park\u201d\n\nPress #3 for \u201cOld Days\u201d\n\nPress #4 for \u201cBaby, What A Big Surprise\u201d\n\nUpdated: Callin\u2019 Oates (Hall and Oates Helpline) \u2013 (719)-26-OATES\n\nDial in to the hotline that started the whole viral sensation and let Hall & Oates take you away to a \u201870s to early \u201880s dreamland. With all new classics and even a B-side or two. Press #1 for \u201cWhen The Morning Comes\u201d\n\nPress #2 for \u201cYou Make My Dreams\u201d\n\nPress #3 for \u201cEverytime You Go Away\u201d\n\nPress #4 for \u201cSay It Isn\u2019t So\u201d\n\nPress #5 for \u201cHad I Known You Better Then\u201d\n\nPress #6 for \u201cAdult Education\u201d\n\nPress #7 for \u201cOut Of Touch\u201d\n\nPress #8 for \u201cGo Solo\u201d"}
10 | {"text": "CNN - The CIA's News Network? By Dave McGowan\n\nMay 2000\n\nI n February of this year, a story that had appeared in the European press was reported by Alexander Cockburn - co-editor of Counterpunch - concerning the employment by CNN of military psychological warfare specialists. Other than Cockburn's piece, and the issuance of an 'Action Alert' by the media-watchdog group FAIR (Fairness and Accuracy in Reporting), the report was ignored by the American press. As originally reported by Abe de Vries in the Dutch periodical Trouw, the story went something like this: \"For a short time last year, CNN employed military specialists in 'psychological operations' (psyops). This was confirmed to Trouw by a spokesman of the U.S. Army. The military could have influenced CNN's news reports about the crisis in Kosovo.\" (1)\n\nCould have? The word 'duh' would seem to apply here. In fact, here's a news flash: the military influenced the news reports of all the media outlets that covered the Kosovo bombardment. The only news coming from the area was coming from NATO and the Pentagon. When you are the sole source of information, you tend to have a lot of influence. But that's not the issue here. The concern here is with CNN hiring military personnel to package for viewers the information provided as 'news' by other military personnel. This is said to be a most disturbing development, and I suppose it would be were it not for the fact that the U.S. media - as a whole - is infested with so many intelligence assets that it is hard to see how a few more in the mix could make much of a difference. Of course, most of them are posing as reporters, editors, news anchors, analysts, producers, publishers, etc. The difference here is that these particular spooks were employed openly at CNN, without journalistic cover. As Major Thomas Collins, of the U.S. Army Information Service acknowledged:\n\n\"Psyops personnel, soldiers and officers, have been working in CNN's headquarters in Atlanta through our programme 'Training With Industry'. They worked as regular employees of CNN. Conceivably, they would have worked on stories during the Kosovo war. They helped in the production of news.\" (1)\n\nThe phrase \"production of news\" is notably ambiguous when used in this context. It could easily be defined as the manufacture of news. Manufacturing news is, in fact, exactly what psychological warfare specialists do. As de Vries notes:\n\n\"The military CNN personnel belonged to the airmobile Fourth Psychological Operations Group, stationed at Fort Bragg, North Carolina. One of the main tasks of this group of almost 1200 soldiers and officers is to spread 'selected information'. [We should pause here, briefly, to note that in this context, the phrase 'selected information' generally means vicious distortions and outright lies.] \"American psyops troops try with a variety of techniques to influence media and public opinion in armed conflicts in which American state interests are said to be at stake. [We need to pause again to note that 'American state interests' generally means the financial interests of U.S. monopoly capitalists.] The propaganda group was involved in the Gulf war, the Bosnian war and the crisis in Kosovo.\" (1)\n\nIn other words, they did during the war in Kosovo what they have always done. This time, however, they did it more openly. This could have proven to be a major blunder for CNN, with scores of competitors airing this story to embarrass and discredit a rival. But that would require that we have some actual semblance of a free press. Instead, what happened was that the story got a couple of brief mentions in the alternative press that were easily overlooked and ignored. And this was only after the translated article began appearing on internet sites, most notably on the Emperor's Clothes. Had this not been the case, the story likely would not have surfaced at all on these shores. Nor would a follow-up article by de Vries in the same publication a few days later. De Vries refers to the Commander of the Fourth Psychological Operations Group, Colonel Cristopher St. John, who described the cooperation with CNN as \"a textbook example of the kind of ties the American army wants to have with the media.\" (2)\n\nThe kind of ties that will allow it \"to spread handpicked 'information' and keep other news quiet, ... to control the internet, to wage electronic warfare against disobedient media, and to control commercial satellites.\" (2) Most of which, it should be noted, the intelligence community already does to varying degrees. Still, the control is not yet complete enough. De Vries reports that the psyops personnel were not completely satisfied with the Kosovo operation: \"In their opinion, too much information about the unplanned results of the bombings has come to the surface. [We must pause yet again to note that 'unplanned results of the bombings' refers to the entirely foreseeable civilian carnage.] Rear Admiral Thomas Steffens of the U.S. Special Operations Command (SOCOM) reportedly would like to have the capacity to bring down an 'informational cone of silence' over areas where special operations are in place. What that can mean in reality was shown by the bombing of the Serbian state television RTS in Belgrade.\" (2)\n\nIndeed. And speaking of the bombing of the Serbian television station, there was another story that ran in the European press concerning that particular incident which also happened to cast CNN in a particularly bad light. Considerably more so than the story told in the Dutch publication, in fact. Significantly, this story was not aired at all in the United States. It did appear, however, in the U.K., in an article by corespondent Robert Fisk in The Independent. The report reveals that:\n\n\"Two days before NATO bombed the Serb Television headquarters in Belgrade, CNN received a tip from its Atlanta headquarters that the building was to be destroyed. They were told to remove their facilities from the premises at once, which they did.\" (3)\n\nApparently it helps to have those psyops specialists on board. Fisk goes on to recount that the next day, Aleksander Vucic, the Serbian Information Minister, received an invitation to appear on the Larry King Live show, ostensibly to give Larry's audience the Serbian view of the conflict via satellite. There were two rather serious problems with this invitation, however. First, the notion that CNN would invite a Serbian official on the air to give the Serb point of view is rather far-fetched, to say the least. More importantly, the studio to which Vucic had been invited was now deserted. Nevertheless, he was asked to arrive for makeup at 2:00AM for a 2:30AM appearance. \"Vucic was late - which was just as well for him since NATO missiles slammed into the building at six minutes past two. The first one exploded in the make-up room where the young Serb assistant was burned to death. CNN calls this all a coincidence, saying that the Larry King show, put out by the entertainment division, did not know of the news department's instruction to its men to leave the Belgrade building.\" (3)\n\nCNN's explanation is, of course, preposterous. In fact, the notion that there is some kind of distinction between CNN's 'entertainment division' and its 'news department' is rather preposterous as well. The truth appears to be that CNN was directly complicit in the attempted commission of a war crime. And this action was, to be sure, a war crime. The deliberate targeting of a foreign dignitary for assassination - even in time of war - is definitely an international war crime. So it appears that our media have crossed the line from complicity in the covering-up of U.S. war crimes - which has been a mainstay of the press for decades - to complicity in the actual commission of war crimes. A rather serious transgression, one would think, yet one which has been politely overlooked by the rest of the American media outlets. This is quite likely due to the fact that the intelligence community and corporate America pretty much controls all the media. That is why even when stories such as the CNN/Psyops reports emerge in the 'progressive' media, albeit in a very limited way, they are accompanied by amusing commentary and analysis intended to downplay the significance of the incident. For example, Cockburn wonders if: \"It could be that CNN was the target of a psyops penetration and is still too na\u00efve to figure out what was going on.\" (4) To the contrary, it appears that CNN was well aware of - and actively participating in - \"what was going on.\" Similarly, for FAIR what is \"especially troubling is the fact that the network allowed the Army's covert propagandists to work in its headquarters, where they learned the ins and outs of CNN's operations. Even if the psyops officers working in the newsroom did not influence news reporting, did the network allow the military to conduct an intelligence-gathering mission against CNN itself?\" (5)\n\nOr, more likely, is CNN itself an \"intelligence gathering mission,\" and has it been from its inception? It was CNN, it will be recalled, that pioneered the concept of military conflict as mini-series - complete with theme music and title graphics - during the Gulf War. That is, of course, the blueprint that has been followed by the media at large for all coverage of U.S. military actions since then. One of the specific purposes for which CNN seems to have been born is the packaging of imperialist military conquests as humanitarian missions. In other words, \"to spread 'selected information'\" in order to \"influence media and public opinion in armed conflicts in which American state interests are said to be at stake.\" Glorification of U.S. high-tech weaponry, vilification of America's enemy of the moment, canonization of genocidal military leaders and advisers, rote reporting of the NATO/Pentagon/State Department line, deliberate avoidance of reporting clear-cut cases of American brutality and war crimes - all of these are indicative of a psyops program, not an allegedly independent news agency. As the group FAIR noted: \"CNN has always maintained a close relationship with the Pentagon. Getting access to top military officials is a necessity for a network that stakes its reputation on being first on the ground during wars and other military operations.\" (5)\n\nBeing first on the ground during military operations is, to be sure, a good place to be if one is a reporter. It is also a good place to be, it should be noted, if one is a member of the spook community. Whether CNN was born as an intelligence front is probably now largely an irrelevant issue, as the cable titan has since the Kosovo war announced that it is to become a part of the AOL family. And AOL is, as was noted in a recent Spin Cycle article (Sony's Magic Cameras), doing a pretty damn good job of masquerading as an intelligence front itself. So if CNN was not originally conceived as a psychological warfare entity (which appears to be the case, despite its purported status as the brainchild of Ted Turner, husband of Jane Fonda), it has certainly evolved into one. And by the way, does anyone remember when Jane was supposed to be one of the good guys? Just checking."}
11 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | deepspeed==0.5.8
 2 | datasets==1.17.0
 3 | torch==1.8.1
 4 | transformers==4.12.5
 5 | numpy
 6 | scipy
 7 | matplotlib
 8 | pandas
 9 | sympy
10 | nose
11 | spacy
12 | tqdm
13 | wheel
14 | scikit-learn
15 | nltk
16 | tensorboard
17 | 
18 | 


--------------------------------------------------------------------------------
/requirements_docker.txt:
--------------------------------------------------------------------------------
1 | deepspeed==0.5.8
2 | datasets==1.17.0
3 | 


--------------------------------------------------------------------------------
/scripts/cl-evaluate.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Run `./evaluate.sh` on CodaLab.
 4 | # Feel free to modify this script.
 5 | 
 6 | set -x
 7 | 
 8 | dataset_name=openwebtext_wordlength
 9 | eval_data=wordlength_eval_data
10 | seed=1111
11 | 
12 | # Evaluate the GPT-2 model
13 | run_name="eval_${dataset_name}_seed${seed}_step0"
14 | cl run \
15 |   --name $run_name \
16 |   --request-docker-image sangxie513/cs324-p2-codalab-gpu \
17 |   --request-memory 32g \
18 |   :src \
19 |   :$eval_data \
20 |   "bash src/evaluate.sh gpt2 wordlength_eval_data"
21 | 
22 | # Evaluate on checkpoints of the continued-pretraining model.
23 | # Change the step numbers below to reflect the checkpoints you saved.
24 | for step in 2000 4000 6000 8000 10000; do
25 |   model_save_dir=${dataset_name}_seed${seed}
26 |   run_name=eval_${model_save_dir}_step${step}
27 |   train_dir=train_${model_save_dir}
28 |   model_dir=${model_save_dir}/checkpoint-${step}
29 |   cl run \
30 |     --name $run_name \
31 |     --request-docker-image sangxie513/cs324-p2-codalab-gpu \
32 |     --request-memory 32g \
33 |     :src \
34 |     :$eval_data \
35 |     ${model_save_dir}:$train_dir/${model_save_dir} \
36 |     "bash src/evaluate.sh ${model_dir} ${eval_data}"
37 | done
38 | 


--------------------------------------------------------------------------------
/scripts/cl-preprocess.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Run `./preprocess.sh` on CodaLab.
 4 | # Feel free to modify this script.
 5 | 
 6 | set -x
 7 | 
 8 | input_dataset_name=openwebtext
 9 | output_dataset_name=openwebtext_wordlength
10 | tokenizer_name=gpt2
11 | run_name=preprocess_${output_dataset_name}
12 | 
13 | cl run \
14 |   --name $run_name \
15 |   --request-docker-image sangxie513/cs324-p2-codalab-gpu \
16 |   --request-memory 128g \
17 |   --request-cpus 10 \
18 |   :src \
19 |   :$input_dataset_name \
20 |   "bash src/preprocess.sh ${input_dataset_name} ${output_dataset_name} ${tokenizer_name}"
21 | 


--------------------------------------------------------------------------------
/scripts/cl-train.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Run `./train.sh` on CodaLab.
 4 | # Feel free to modify this script.
 5 | 
 6 | set -x
 7 | 
 8 | dataset_name=openwebtext_wordlength
 9 | seed=1111 # random seed
10 | run_name=train_${dataset_name}_seed${seed}
11 | 
12 | # TODO: change these hyperparameters.
13 | # Note: make sure that `gradient_accumulation_steps * max_steps <= 100000`
14 | model_name_or_path=gpt2
15 | per_device_train_batch_size=4
16 | gradient_accumulation_steps=1
17 | max_steps=10000
18 | learning_rate=1e-6
19 | warmup_steps=0
20 | save_steps=2000
21 | 
22 | cl run \
23 |   --name $run_name \
24 |   --request-docker-image sangxie513/cs324-p2-codalab-gpu \
25 |   --request-memory 80g \
26 |   --request-gpus 1 \
27 |   --request-cpus 8 \
28 |   ${dataset_name}:preprocess_$dataset_name/${dataset_name} \
29 |   :src \
30 |   "bash src/train.sh $dataset_name $seed --model_name_or_path ${model_name_or_path} --per_device_train_batch_size ${per_device_train_batch_size} --gradient_accumulation_steps ${gradient_accumulation_steps} --max_steps ${max_steps} --learning_rate ${learning_rate} --warmup_steps ${warmup_steps} --save_steps ${save_steps} --fp16"
31 | 


--------------------------------------------------------------------------------
/src/create_wordlength_eval_data.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | import numpy as np
 3 | import json
 4 | 
 5 | 
 6 | if __name__ == "__main__":
 7 |     num_examples = 50
 8 |     max_words = 30
 9 | 
10 |     for seed in range(5):
11 |         np.random.seed(seed)
12 |         num_words_ls = np.random.randint(low=2, high=max_words+1, size=num_examples)
13 |         with open(f'wordlength_eval_data/wordlength_eval_data_{seed}.json', 'w') as f:
14 |             for i in range(num_examples):
15 |                 num_words = num_words_ls[i]
16 | 
17 |                 prefix = f"<len> {num_words} <text>"
18 |                 line = json.dumps({'text': prefix, 'num_words': str(num_words)})
19 |                 if i < num_examples:
20 |                     line += '\n'
21 |                 f.write(line)
22 | 


--------------------------------------------------------------------------------
/src/evaluate.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [[ $# -lt 2 ]]; then
 4 |   echo "Main entry point for evaluation."
 5 |   echo "Usage:"
 6 |   echo
 7 |   echo "    $0 <model_dir (e.g., train_openwebtext_wordlength_seed1111/checkpoint-10000)> <eval_data_dir (e.g., wordlength_eval_data)> [additional arguments]"
 8 |   exit 1
 9 | fi
10 | 
11 | model_dir=$1; shift
12 | eval_data_dir=$1; shift
13 | 
14 | python src/evaluate_wordlength_model.py \
15 |   --model_dir ${model_dir} \
16 |   --eval_data_dir ${eval_data_dir} \
17 |   "$@"
18 | 


--------------------------------------------------------------------------------
/src/evaluate_wordlength_model.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from pathlib import Path
  3 | import nltk
  4 | nltk.download('punkt')
  5 | from nltk.tokenize import sent_tokenize, word_tokenize
  6 | import json
  7 | from transformers import (
  8 |         AutoTokenizer,
  9 |         AutoModelForCausalLM
 10 | )
 11 | from tqdm import tqdm
 12 | 
 13 | 
 14 | CONFIG_NAME = "gpt2"
 15 | 
 16 | config_kwargs = {
 17 |         "cache_dir": None,
 18 |         "revision": "main",
 19 |         "use_auth_token": None,
 20 |     }
 21 | 
 22 | tokenizer_kwargs = {
 23 |         "cache_dir": None,
 24 |         "use_fast": True,
 25 |         "revision": "main",
 26 |         "use_auth_token": None,
 27 |     }
 28 | 
 29 | if __name__ == "__main__":
 30 |     import argparse
 31 |     parser = argparse.ArgumentParser(description='Evaluate the trained wordlength model')
 32 |     parser.add_argument('--model_dir', type=str, help="where the trained model is saved. Can also input a huggingface model name like gpt2 to get the default pretrained model.")
 33 |     parser.add_argument('--eval_data_dir', type=str, help="directory of the evaluation data")
 34 |     parser.add_argument('--eval_num_examples', type=int, default=None, help="number of examples to evaluate on")
 35 |     parser.add_argument('--cuda', action='store_true', help="number of examples to evaluate on")
 36 |     args = parser.parse_args()
 37 | 
 38 |     # load the model, tokenizer
 39 |     tokenizer = AutoTokenizer.from_pretrained(args.model_dir, **tokenizer_kwargs)
 40 | 
 41 |     model = AutoModelForCausalLM.from_pretrained(
 42 |                 args.model_dir,
 43 |                 from_tf=False,
 44 |                 cache_dir=None,
 45 |                 revision="main",
 46 |                 use_auth_token=None,
 47 |             )
 48 |     if args.cuda:
 49 |         model.cuda()
 50 | 
 51 |     eval_data_dir = Path(args.eval_data_dir)
 52 | 
 53 |     metrics = []
 54 |     # calculate metrics over all seeds of the evaluation dataset
 55 |     for eval_data_path in eval_data_dir.iterdir():
 56 |         generations = []
 57 |         labels = []
 58 |         predicted_labels = []
 59 |         with open(eval_data_path, 'r') as fin:
 60 |             for i, line in tqdm(enumerate(fin)):
 61 |                 if args.eval_num_examples and i >= args.eval_num_examples:
 62 |                     break
 63 |                 row = json.loads(line)
 64 |                 prefix = row['text']
 65 |                 num_words = int(row['num_words'])
 66 |                 labels.append(num_words)
 67 | 
 68 |                 # tokenize
 69 |                 if args.cuda:
 70 |                     input_ids = tokenizer.encode(prefix, return_tensors="pt").cuda()
 71 |                 else:
 72 |                     input_ids = tokenizer.encode(prefix, return_tensors="pt")
 73 | 
 74 |                 sample_output = model.generate(input_ids,
 75 |                                    do_sample=True,
 76 |                                    max_length=200,
 77 |                                    top_k=5)
 78 |                 # un-tokenize
 79 |                 decoded_output = tokenizer.decode(sample_output[0], skip_special_tokens=False)
 80 | 
 81 |                 # parse the output
 82 |                 decoded_output = decoded_output.split('<text>')[1]
 83 |                 decoded_output = decoded_output.split('<len>')[0].strip()
 84 |                 generations.append(decoded_output)
 85 | 
 86 |                 # count the number of words
 87 |                 generated_num_words = len(word_tokenize(decoded_output))
 88 |                 predicted_labels.append(generated_num_words)
 89 | 
 90 |         # compute the metric
 91 |         labels = np.asarray(labels)
 92 |         predicted_labels = np.asarray(predicted_labels)
 93 |         generations = np.asarray(generations)
 94 |         print(f"Model: {args.model_dir}")
 95 |         print("========= Some generations ======")
 96 |         idxs = np.arange(len(generations))
 97 |         np.random.shuffle(idxs)
 98 | 
 99 |         # print some generations
100 |         for num_words, generated_num_words, generation in zip(labels[idxs[:10]], predicted_labels[idxs[:10]], generations[idxs[:10]]):
101 |             print(f"TARGET NUM_WORDS: {num_words}, GENERATED_NUM_WORDS: {generated_num_words}, GENERATION: {generation}")
102 | 
103 |         print("=================================")
104 |         metric = np.mean(np.abs(labels - predicted_labels))
105 |         print(f"Metric for wordlength: {metric:.4f}")
106 |         metrics.append(metric)
107 |     print("============================")
108 |     print("============================")
109 | 
110 |     print(f"Mean: {np.mean(metrics)}")
111 |     print(f"Std: {np.std(metrics)}")
112 | 
113 |     results = {i: metrics[i] for i in range(len(metrics))}
114 |     with open('results.json', 'w') as f:
115 |         f.write(json.dumps(results))
116 | 


--------------------------------------------------------------------------------
/src/preprocess.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [[ $# -lt 1 ]]; then
 4 |   echo "Main entry point for preprocessing data."
 5 |   echo "Usage:"
 6 |   echo
 7 |   echo "    $0 <input_dataset_name (e.g., openwebtext)> <output_dataset_name (e.g., openwebtext_wordlength)> <tokenizer_name (e.g., gpt2)> <num_total_chunks (e.g., 64)> <num_chunks (e.g., 10)>"
 8 |   echo
 9 |   exit 1
10 | fi
11 | 
12 | CACHE=./cache
13 | mkdir -p $CACHE
14 | 
15 | input_dataset_name=$1
16 | output_dataset_name=$2
17 | tokenizer_name=$3
18 | num_total_chunks=${4:-64}
19 | num_chunks=${5:-10} # only process a subset of the total chunks. Set to num_total_chunks to process full dataset
20 | 
21 | mkdir -p $output_dataset_name
22 | 
23 | # Process the chunks in parallel
24 | for ((CHUNK_IDX=0; CHUNK_IDX < $num_chunks; CHUNK_IDX++)); do
25 |   python src/process_data.py \
26 |     --data_dir $input_dataset_name \
27 |     --output_dir $output_dataset_name \
28 |     --total_chunks $num_total_chunks \
29 |     --chunk_idx $CHUNK_IDX &
30 | done
31 | 
32 | wait
33 | 
34 | # Tokenize the data in chunks without merging
35 | python src/tokenize_data.py \
36 |   --data_dir $output_dataset_name \
37 |   --cache_dir ${CACHE} \
38 |   --model_name ${tokenizer_name} \
39 | 


--------------------------------------------------------------------------------
/src/process_data.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Process the openwebtext dataset in chunks. Can parallelize the execution of
  3 | this script across chunks and then call this script with the --reduce_step flag
  4 | to merge the chunks (if needed). Typically, HuggingFace will accept the data in
  5 | chunks (as a list of data paths).
  6 | 
  7 | Call this script with an incomplete number of chunk indices to process only
  8 | part of the dataset.
  9 | 
 10 | The script currently tags each sentence with its length.
 11 | '''
 12 | from pathlib import Path
 13 | from tqdm import tqdm
 14 | import subprocess
 15 | import json
 16 | import nltk
 17 | nltk.download('punkt')
 18 | 
 19 | from nltk.tokenize import sent_tokenize, word_tokenize
 20 | 
 21 | 
 22 | def linecount(filename):
 23 |     count = 0
 24 |     with open(filename, 'r') as f:
 25 |         for line in f:
 26 |             count += 1
 27 |     return count
 28 | 
 29 | 
 30 | def merge_files(filepaths, outpath):
 31 |     with open(outpath, 'w') as outfile:
 32 |         for fname in tqdm(filepaths):
 33 |             with open(fname, 'r') as infile:
 34 |                 for line in infile:
 35 |                     outfile.write(line)
 36 | 
 37 | 
 38 | def write_record(record, fp):
 39 |     record_str = json.dumps(record)
 40 |     fp.write(f"{record_str}\n")
 41 |     fp.flush()
 42 | 
 43 | 
 44 | def tag_word_length(doc):
 45 |     # split the document into sentences
 46 |     sents = sent_tokenize(json.loads(doc)['text'])
 47 |     new_sents = []
 48 |     for sent in sents:
 49 |         num_words = len(word_tokenize(sent))
 50 |         new_sent = f"<len> {num_words} <text> {sent}"
 51 |         new_sents.append(new_sent)
 52 |     return " ".join(new_sents)
 53 | 
 54 | 
 55 | def process_fn(save_dir, split, txt_data):
 56 |     print("Processing data")
 57 | 
 58 |     # count lines
 59 |     count = linecount(txt_data)
 60 | 
 61 |     chunk_size = count // args.total_chunks
 62 |     chunk_start_idx = chunk_size * args.chunk_idx
 63 |     chunk_end_idx = chunk_size * (args.chunk_idx + 1)
 64 |     chunk_path = f"{path}_{args.chunk_idx}"
 65 |     with open(chunk_path, 'w') as fw:
 66 |         with open(txt_data, 'r') as fr:
 67 |             with tqdm(total=chunk_size) as pbar:
 68 |                 for i, doc in enumerate(fr):
 69 |                     if i < chunk_start_idx:
 70 |                         continue
 71 |                     elif i >= chunk_end_idx:
 72 |                         break
 73 |                     else:
 74 |                         ############################
 75 |                         # TODO Replace with your own function for a custom dataset
 76 |                         doc = tag_word_length(doc)
 77 |                         write_record({'text': doc}, fw)
 78 |                         ############################
 79 | 
 80 |                         # move the progress bar 1 unit
 81 |                         pbar.update(1)
 82 |     return chunk_path
 83 | 
 84 | 
 85 | if __name__ == "__main__":
 86 |     import argparse
 87 |     parser = argparse.ArgumentParser(description='preprocess data in chunks')
 88 |     parser.add_argument('--data_dir', type=str, help="where the openwebtext data is downloaded")
 89 |     parser.add_argument('--output_dir', type=str, help="where to output the processed files")
 90 |     parser.add_argument('--chunk_idx', type=int, default=0, help="which chunk to generate")
 91 |     parser.add_argument('--total_chunks', type=int, default=8, help="total number of chunks")
 92 |     parser.add_argument('--reduce_step', action='store_true', help="whether to do the reduce step instead of the mapping step")
 93 |     args = parser.parse_args()
 94 | 
 95 |     data_dir = Path(args.data_dir)
 96 |     save_dir = Path(args.output_dir)
 97 |     save_dir.mkdir(exist_ok=True, parents=True)
 98 | 
 99 |     splits = ['train', 'val']
100 | 
101 |     if not args.reduce_step:
102 |         for split in splits:
103 |             path = save_dir / f'{split}.jsonl'
104 |             txt_path = data_dir / f'{split}.jsonl'
105 |             save_path = process_fn(save_dir, split, txt_path)
106 |             save_path = Path(save_path)
107 |     else:
108 |         # merge data (optional, not necessarily recommended)
109 |         for split in splits:
110 |             path = save_dir / f'{split}.jsonl'
111 |             filepaths = [f"{path}_{i}" for i in range(args.total_chunks)]
112 |             merge_files(filepaths, path)
113 | 
114 | 


--------------------------------------------------------------------------------
/src/run_clm.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding=utf-8
  3 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | """
 17 | Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.
 18 | 
 19 | Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
 20 | https://huggingface.co/models?filter=causal-lm
 21 | """
 22 | # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
 23 | 
 24 | import logging
 25 | import math
 26 | import os
 27 | import sys
 28 | from dataclasses import dataclass, field
 29 | from typing import Optional
 30 | import json
 31 | import torch
 32 | 
 33 | import datasets
 34 | from datasets import load_dataset, load_from_disk
 35 | 
 36 | import transformers
 37 | from transformers import (
 38 |     CONFIG_MAPPING,
 39 |     MODEL_FOR_CAUSAL_LM_MAPPING,
 40 |     AutoConfig,
 41 |     AutoModelForCausalLM,
 42 |     AutoTokenizer,
 43 |     HfArgumentParser,
 44 |     Trainer,
 45 |     TrainingArguments,
 46 |     default_data_collator,
 47 |     set_seed,
 48 | )
 49 | from transformers.testing_utils import CaptureLogger
 50 | from transformers.trainer_utils import get_last_checkpoint
 51 | from transformers.utils import check_min_version
 52 | from transformers.utils.versions import require_version
 53 | 
 54 | 
 55 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 56 | check_min_version("4.12.0")
 57 | 
 58 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
 59 | 
 60 | logger = logging.getLogger(__name__)
 61 | 
 62 | 
 63 | MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
 64 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
 65 | 
 66 | 
 67 | @dataclass
 68 | class ModelArguments:
 69 |     """
 70 |     Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
 71 |     """
 72 | 
 73 |     model_name_or_path: Optional[str] = field(
 74 |         default=None,
 75 |         metadata={
 76 |             "help": "The model checkpoint for weights initialization."
 77 |             "Don't set if you want to train a model from scratch."
 78 |         },
 79 |     )
 80 |     model_type: Optional[str] = field(
 81 |         default=None,
 82 |         metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
 83 |     )
 84 |     config_overrides: Optional[str] = field(
 85 |         default=None,
 86 |         metadata={
 87 |             "help": "Override some existing default config settings when a model is trained from scratch. Example: "
 88 |             "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
 89 |         },
 90 |     )
 91 |     config_name: Optional[str] = field(
 92 |         default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
 93 |     )
 94 |     tokenizer_name: Optional[str] = field(
 95 |         default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
 96 |     )
 97 |     cache_dir: Optional[str] = field(
 98 |         default=None,
 99 |         metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
100 |     )
101 |     use_fast_tokenizer: bool = field(
102 |         default=True,
103 |         metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
104 |     )
105 |     model_revision: str = field(
106 |         default="main",
107 |         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
108 |     )
109 |     use_auth_token: bool = field(
110 |         default=False,
111 |         metadata={
112 |             "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
113 |             "with private models)."
114 |         },
115 |     )
116 |     custom_model: bool = field(
117 |         default=False,
118 |         metadata={"help": "Whether to use custom model args"},
119 |     )
120 |     custom_num_layers: int = field(
121 |         default=12,
122 |         metadata={"help": "number of layers in the custom `small` model"},
123 |     )
124 |     custom_num_heads: int = field(
125 |         default=12,
126 |         metadata={"help": "number of heads in the custom `small` model"},
127 |     )
128 |     custom_tokenizer: bool = field(
129 |         default=False,
130 |         metadata={"help": "Whether to use custom tokenizer"},
131 |     )
132 |     from_scratch: bool = field(
133 |         default=False,
134 |         metadata={"help": "Whether to train from scratch"},
135 |     )
136 |     generate_cache: bool = field(
137 |         default=False,
138 |         metadata={"help": "Only generate cache"},
139 |     )
140 | 
141 |     def __post_init__(self):
142 |         if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
143 |             raise ValueError(
144 |                 "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
145 |             )
146 | 
147 | 
148 | @dataclass
149 | class DataTrainingArguments:
150 |     """
151 |     Arguments pertaining to what data we are going to input our model for training and eval.
152 |     """
153 | 
154 |     dataset_name: Optional[str] = field(
155 |         default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
156 |     )
157 |     dataset_config_name: Optional[str] = field(
158 |         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
159 |     )
160 |     tokenized_data_dir: Optional[str] = field(default=None, metadata={"help": "Data directory of tokenized data"})
161 |     train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
162 |     validation_file: Optional[str] = field(
163 |         default=None,
164 |         metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
165 |     )
166 |     max_train_samples: Optional[int] = field(
167 |         default=None,
168 |         metadata={
169 |             "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
170 |             "value if set."
171 |         },
172 |     )
173 |     max_eval_samples: Optional[int] = field(
174 |         default=None,
175 |         metadata={
176 |             "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
177 |             "value if set."
178 |         },
179 |     )
180 | 
181 |     block_size: Optional[int] = field(
182 |         default=None,
183 |         metadata={
184 |             "help": "Optional input sequence length after tokenization. "
185 |             "The training dataset will be truncated in block of this size for training. "
186 |             "Default to the model max input length for single sentence inputs (take into account special tokens)."
187 |         },
188 |     )
189 |     overwrite_cache: bool = field(
190 |         default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
191 |     )
192 |     validation_split_percentage: Optional[int] = field(
193 |         default=5,
194 |         metadata={
195 |             "help": "The percentage of the train set used as validation set in case there's no validation split"
196 |         },
197 |     )
198 |     preprocessing_num_workers: Optional[int] = field(
199 |         default=None,
200 |         metadata={"help": "The number of processes to use for the preprocessing."},
201 |     )
202 |     keep_linebreaks: bool = field(
203 |         default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
204 |     )
205 | 
206 |     def __post_init__(self):
207 |         if (self.dataset_name is None and self.train_file is None and self.validation_file is None and self.tokenized_data_dir is None) :
208 |             raise ValueError("Need either a dataset name or a training/validation file, or a tokenized data dir.")
209 |         else:
210 |             if self.train_file is not None:
211 |                 extension = self.train_file.split(".")[-1]
212 |                 assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
213 |             if self.validation_file is not None:
214 |                 extension = self.validation_file.split(".")[-1]
215 |                 assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
216 | 
217 | 
218 | def main():
219 |     # See all possible arguments in src/transformers/training_args.py
220 |     # or by passing the --help flag to this script.
221 |     # We now keep distinct sets of args, for a cleaner separation of concerns.
222 | 
223 |     parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
224 |     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
225 |         # If we pass only one argument to the script and it's the path to a json file,
226 |         # let's parse it to get our arguments.
227 |         model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
228 |     else:
229 |         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
230 | 
231 |     # Setup logging
232 |     logging.basicConfig(
233 |         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
234 |         datefmt="%m/%d/%Y %H:%M:%S",
235 |         handlers=[logging.StreamHandler(sys.stdout)],
236 |     )
237 | 
238 |     log_level = training_args.get_process_log_level()
239 |     logger.setLevel(log_level)
240 |     datasets.utils.logging.set_verbosity(log_level)
241 |     transformers.utils.logging.set_verbosity(log_level)
242 |     transformers.utils.logging.enable_default_handler()
243 |     transformers.utils.logging.enable_explicit_format()
244 | 
245 |     # Log on each process the small summary:
246 |     logger.warning(
247 |         f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
248 |         + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
249 |     )
250 |     logger.info(f"Training/evaluation parameters {training_args}")
251 | 
252 |     # Detecting last checkpoint.
253 |     last_checkpoint = None
254 |     if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
255 |         last_checkpoint = get_last_checkpoint(training_args.output_dir)
256 |         if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
257 |             raise ValueError(
258 |                 f"Output directory ({training_args.output_dir}) already exists and is not empty. "
259 |                 "Use --overwrite_output_dir to overcome."
260 |             )
261 |         elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
262 |             logger.info(
263 |                 f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
264 |                 "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
265 |             )
266 | 
267 |     # Set seed before initializing model.
268 |     set_seed(training_args.seed)
269 | 
270 |     # Load pretrained model and tokenizer
271 |     #
272 |     # Distributed training:
273 |     # The .from_pretrained methods guarantee that only one local process can concurrently
274 |     # download model & vocab.
275 | 
276 |     config_kwargs = {
277 |         "cache_dir": model_args.cache_dir,
278 |         "revision": model_args.model_revision,
279 |         "use_auth_token": True if model_args.use_auth_token else None,
280 |     }
281 |     if model_args.config_name:
282 |         config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
283 |     elif model_args.model_name_or_path:
284 |         config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
285 |     else:
286 |         config = CONFIG_MAPPING[model_args.model_type]()
287 |         logger.warning("You are instantiating a new config instance from scratch.")
288 |         if model_args.config_overrides is not None:
289 |             logger.info(f"Overriding config: {model_args.config_overrides}")
290 |             config.update_from_string(model_args.config_overrides)
291 | 
292 |     tokenizer_kwargs = {
293 |         "cache_dir": model_args.cache_dir,
294 |         "use_fast": model_args.use_fast_tokenizer,
295 |         "revision": model_args.model_revision,
296 |         "use_auth_token": True if model_args.use_auth_token else None,
297 |     }
298 | 
299 |     if model_args.custom_tokenizer:
300 |         from transformers import PreTrainedTokenizerFast
301 |         eot = '[endoftext]'
302 |         tokenizer = PreTrainedTokenizerFast(
303 |                 tokenizer_file=model_args.tokenizer_name,
304 |                 bos_token=eot,
305 |                 eos_token=eot,
306 |                 unk_token=eot)
307 | 
308 |     elif model_args.tokenizer_name:
309 |         tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
310 |     elif model_args.model_name_or_path:
311 |         tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
312 |     else:
313 |         raise ValueError(
314 |             "You are instantiating a new tokenizer from scratch. This is not supported by this script."
315 |             "You can do it from another script, save it, and load it from here, using --tokenizer_name."
316 |         )
317 | 
318 |     if model_args.custom_model:
319 |         config.vocab_size = tokenizer.vocab_size
320 |         config.n_layer = model_args.custom_num_layers
321 |         config.n_head = model_args.custom_num_heads
322 | 
323 |     if model_args.model_name_or_path and not model_args.from_scratch:
324 |         model = AutoModelForCausalLM.from_pretrained(
325 |             model_args.model_name_or_path,
326 |             from_tf=bool(".ckpt" in model_args.model_name_or_path),
327 |             config=config,
328 |             cache_dir=model_args.cache_dir,
329 |             revision=model_args.model_revision,
330 |             use_auth_token=True if model_args.use_auth_token else None,
331 |         )
332 |     else:
333 |         model = AutoModelForCausalLM.from_config(config)
334 |         n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
335 |         logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
336 | 
337 |     model.resize_token_embeddings(len(tokenizer))
338 | 
339 |     # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
340 |     tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
341 | 
342 |     # treat train file as a the save directory
343 |     logger.info("Loading dataset from disk")
344 |     lm_datasets = load_from_disk(data_args.tokenized_data_dir)
345 | 
346 |     if training_args.do_train:
347 |         if "train" not in lm_datasets:
348 |             raise ValueError("--do_train requires a train dataset")
349 |         train_dataset = lm_datasets["train"]
350 |         if data_args.max_train_samples is not None:
351 |             train_dataset = train_dataset.select(range(data_args.max_train_samples))
352 | 
353 |     if training_args.do_eval:
354 |         if "validation" not in lm_datasets:
355 |             raise ValueError("--do_eval requires a validation dataset")
356 |         eval_dataset = lm_datasets["validation"]
357 |         if data_args.max_eval_samples is not None:
358 |             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
359 | 
360 |     # Initialize our Trainer
361 |     trainer = Trainer(
362 |         model=model,
363 |         args=training_args,
364 |         train_dataset=train_dataset if training_args.do_train else None,
365 |         eval_dataset=eval_dataset if training_args.do_eval else None,
366 |         tokenizer=tokenizer,
367 |         # Data collator will default to DataCollatorWithPadding, so we change it.
368 |         data_collator=default_data_collator,
369 |     )
370 | 
371 |     # Training
372 |     if training_args.do_train:
373 |         checkpoint = None
374 |         if training_args.resume_from_checkpoint is not None:
375 |             checkpoint = training_args.resume_from_checkpoint
376 |         elif last_checkpoint is not None:
377 |             checkpoint = last_checkpoint
378 |         train_result = trainer.train(resume_from_checkpoint=checkpoint)
379 |         trainer.save_model()  # Saves the tokenizer too for easy upload
380 | 
381 |         metrics = train_result.metrics
382 | 
383 |         max_train_samples = (
384 |             data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
385 |         )
386 |         metrics["train_samples"] = min(max_train_samples, len(train_dataset))
387 | 
388 |         trainer.log_metrics("train", metrics)
389 |         trainer.save_metrics("train", metrics)
390 |         trainer.save_state()
391 | 
392 |     # Evaluation
393 |     if training_args.do_eval:
394 |         logger.info("*** Evaluate ***")
395 | 
396 |         metrics = trainer.evaluate()
397 | 
398 |         max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
399 |         metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
400 |         try:
401 |             perplexity = math.exp(metrics["eval_loss"])
402 |         except OverflowError:
403 |             perplexity = float("inf")
404 |         metrics["perplexity"] = perplexity
405 | 
406 |         trainer.log_metrics("eval", metrics)
407 |         trainer.save_metrics("eval", metrics)
408 | 
409 |     kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
410 |     if data_args.dataset_name is not None:
411 |         kwargs["dataset_tags"] = data_args.dataset_name
412 |         if data_args.dataset_config_name is not None:
413 |             kwargs["dataset_args"] = data_args.dataset_config_name
414 |             kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
415 |         else:
416 |             kwargs["dataset"] = data_args.dataset_name
417 | 
418 |     if training_args.push_to_hub:
419 |         trainer.push_to_hub(**kwargs)
420 |     else:
421 |         trainer.create_model_card(**kwargs)
422 | 
423 | 
424 | def _mp_fn(index):
425 |     # For xla_spawn (TPUs)
426 |     main()
427 | 
428 | 
429 | if __name__ == "__main__":
430 |     main()
431 | 
432 | 


--------------------------------------------------------------------------------
/src/tokenize_data.py:
--------------------------------------------------------------------------------
  1 | from datasets import load_dataset, load_from_disk
  2 | 
  3 | import numpy as np
  4 | import re
  5 | import os
  6 | from pathlib import Path
  7 | import json
  8 | import shutil
  9 | import multiprocessing
 10 | 
 11 | from transformers import (
 12 |     CONFIG_MAPPING,
 13 |     MODEL_FOR_CAUSAL_LM_MAPPING,
 14 |     AutoConfig,
 15 |     AutoModelForCausalLM,
 16 |     AutoTokenizer,
 17 |     HfArgumentParser,
 18 |     Trainer,
 19 |     TrainingArguments,
 20 |     default_data_collator,
 21 |     set_seed,
 22 | )
 23 | 
 24 | 
 25 | def get_paths(split='train'):
 26 |     paths = list(afs_dir.glob(f'{split}.jsonl_*'))
 27 |     paths = [str(p) for p in paths]
 28 |     # sort by numbers
 29 |     path_idxs = [int(p.split('.jsonl_')[1]) for p in paths]
 30 |     sort_idxs = np.argsort(path_idxs)
 31 |     paths = list(np.asarray(paths)[sort_idxs])
 32 | 
 33 |     return paths
 34 | 
 35 | if __name__ == "__main__":
 36 |     import argparse
 37 |     parser = argparse.ArgumentParser(description='Tokenize the processed data')
 38 |     parser.add_argument('--data_dir', type=str, help="where the data is saved")
 39 |     parser.add_argument('--cache_dir', type=str, help="cache for intermediate files")
 40 |     parser.add_argument('--model_name', type=str, help="Huggingface model name (e.g., gpt2-medium)")
 41 |     args = parser.parse_args()
 42 | 
 43 |     model_name_or_path = args.model_name
 44 |     afs_dir = Path(args.data_dir)
 45 |     cache_dir = Path(args.cache_dir)
 46 |     cache_dir.mkdir(exist_ok=True, parents=True)
 47 |     use_fast_tokenizer = True
 48 |     model_revision = 'main'
 49 |     use_auth_token = False
 50 | 
 51 |     json_path_train_ls = get_paths(split='train')
 52 |     json_path_val_ls = get_paths(split='val')
 53 | 
 54 |     tokenizer_kwargs = {
 55 |         "cache_dir": str(cache_dir),
 56 |         "use_fast": use_fast_tokenizer,
 57 |         "revision": model_revision,
 58 |         "use_auth_token": True if use_auth_token else None,
 59 |     }
 60 | 
 61 |     tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, **tokenizer_kwargs)
 62 | 
 63 |     block_size_10MB = 10<<20
 64 |     data_files = {
 65 |         'train': json_path_train_ls,
 66 |         'validation': json_path_val_ls,
 67 |             }
 68 |     json_cache = cache_dir / 'json_cache'
 69 |     json_cache.mkdir(exist_ok=True)
 70 |     datasets = load_dataset('json', data_files=data_files, chunksize=block_size_10MB, cache_dir=str(json_cache))
 71 | 
 72 |     # Preprocessing the datasets.
 73 |     # First we tokenize all the texts.
 74 |     column_names = datasets["train"].column_names
 75 |     text_column_name = "text" if "text" in column_names else column_names[0]
 76 | 
 77 |     def tokenize_function(examples):
 78 |         return tokenizer(examples[text_column_name])
 79 | 
 80 |     curr_save_dir = afs_dir / 'tokenized'
 81 |     if not curr_save_dir.exists():
 82 |         curr_save_dir.mkdir(exist_ok=True, parents=True)
 83 |         datasets = datasets.map(
 84 |             tokenize_function,
 85 |             batched=True,
 86 |             num_proc=multiprocessing.cpu_count(),
 87 |             remove_columns=column_names,
 88 |             load_from_cache_file=True,
 89 |         )
 90 |         datasets.save_to_disk(str(curr_save_dir))
 91 |     else:
 92 |         datasets = load_from_disk(str(curr_save_dir))
 93 | 
 94 |     block_size = 1024
 95 |     if block_size is None:
 96 |         block_size = tokenizer.model_max_length
 97 |         if block_size > 1024:
 98 |             logger.warn(
 99 |                 f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
100 |                 "Picking 1024 instead. You can change that default value by passing --block_size xxx."
101 |             )
102 |         block_size = 1024
103 |     else:
104 |         if block_size > tokenizer.model_max_length:
105 |             logger.warn(
106 |                 f"The block_size passed ({block_size}) is larger than the maximum length for the model"
107 |                 f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
108 |             )
109 |         block_size = min(block_size, tokenizer.model_max_length)
110 | 
111 |     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
112 |     def group_texts(examples):
113 |         # Concatenate all texts.
114 |         concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
115 |         total_length = len(concatenated_examples[list(examples.keys())[0]])
116 |         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
117 |         # customize this part to your needs.
118 |         total_length = (total_length // block_size) * block_size
119 |         # Split by chunks of max_len.
120 |         result = {
121 |             k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
122 |             for k, t in concatenated_examples.items()
123 |         }
124 |         result["labels"] = result["input_ids"].copy()
125 |         return result
126 | 
127 |     # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
128 |     # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
129 |     # to preprocess.
130 |     #
131 |     # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
132 |     # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
133 |     datasets = datasets.map(
134 |         group_texts,
135 |         batched=True,
136 |         num_proc=multiprocessing.cpu_count(),
137 |         load_from_cache_file=True,
138 |     )
139 |     curr_save_dir = afs_dir / 'tokenized_grouped'
140 |     curr_save_dir.mkdir(exist_ok=True)
141 |     datasets.save_to_disk(str(curr_save_dir))
142 | 
143 |     # remove tokenize dir to save space
144 |     tokenized_dir = afs_dir / 'tokenized'
145 |     shutil.rmtree(str(tokenized_dir))
146 | 
147 | 


--------------------------------------------------------------------------------
/src/train.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | if [[ $# -lt 2 ]]; then
 4 |     echo "Main entry point for training."
 5 |     echo "Usage:"
 6 |     echo
 7 |     echo "    $0 <dataset_name (e.g., openwebtext_wordlength)> <seed (e.g., 1)> [additional arguments]"
 8 |     echo
 9 |     echo "Additional arguments (see https://github.com/huggingface/transformers/blob/v4.12.5/src/transformers/training_args.py for the full list):"
10 |     echo ""
11 |     echo "    --max_steps: Total number of training steps to run."
12 |     echo "    --learning_rate: Highest learning rate in the learning rate schedule (by default, the learning rate schedule increases and decreases linearly, with the peak being this set learning rate)"
13 |     echo "    --warmup_steps: Number of steps to do a linear “warmup” from a small learning rate to the desired learning rate"
14 |     echo "    --save_steps: how many steps of training before saving (and evaluating on the val set)"
15 | 
16 |     echo "    --per_device_train_batch_size: batch size per GPU. The total batch size is [number of GPUs] * per_device_train_batch_size"
17 |     echo "    --gradient_accumulation_steps: Allows for accumulating gradients across multiple sequential steps. This allows you to increase the batch size while trading off computation time. If this parameter is > 1, then the total batch size is [num GPUs] * per_device_train_batch_size * gradient_accumulation_steps"
18 |     echo "    --lr_scheduler_type: learning rate scheduler. The default is “linear” which does a linear increase and decrease to and from the learning_rate."
19 |     echo "    --adafactor: use this flag if you want to use the Adafactor optimizer instead of AdamW (default). Adafactor can save on GPU memory by only saving 2 copies of the model instead of 3 needed for AdamW (mean, variance, gradient)"
20 |     echo "    --from_scratch: use this flag if you want to scratch"
21 |     echo "    --fp16: use 16-bit floating point (to save memory)"
22 |     exit 1
23 | fi
24 | 
25 | dataset_name=$1; shift
26 | seed=$1; shift
27 | rest_args="$@"
28 | 
29 | CACHE=./cache
30 | mkdir -p $CACHE
31 | 
32 | export HF_HOME=$CACHE
33 | export TRANSFORMERS_CACHE=$CACHE
34 | export HF_DATASETS_CACHE=$CACHE
35 | export HF_DATASETS_IN_MEMORY_MAX_SIZE=100000000000
36 | export TORCH_EXTENSIONS_DIR=$CACHE
37 | export WANDB_DISABLED=true
38 | 
39 | set -x
40 | 
41 | python src/run_clm.py \
42 |     --model_name_or_path gpt2 \
43 |     --block_size 1024 \
44 |     --do_train \
45 |     --do_eval \
46 |     --logging_steps 100 \
47 |     --evaluation_strategy steps \
48 |     --max_eval_samples 100 \
49 |     --preprocessing_num_workers 8 \
50 |     --tokenized_data_dir ${dataset_name}/tokenized_grouped \
51 |     --output_dir ${dataset_name}_seed${seed} \
52 |     --save_steps 10000 \
53 |     --lr_scheduler_type linear \
54 |     --seed $seed \
55 |     ${rest_args}
56 | 


--------------------------------------------------------------------------------
/wordlength_eval_data/wordlength_eval_data_0.json:
--------------------------------------------------------------------------------
 1 | {"text": "<len> 14 <text>", "num_words": "14"}
 2 | {"text": "<len> 17 <text>", "num_words": "17"}
 3 | {"text": "<len> 23 <text>", "num_words": "23"}
 4 | {"text": "<len> 2 <text>", "num_words": "2"}
 5 | {"text": "<len> 5 <text>", "num_words": "5"}
 6 | {"text": "<len> 29 <text>", "num_words": "29"}
 7 | {"text": "<len> 5 <text>", "num_words": "5"}
 8 | {"text": "<len> 9 <text>", "num_words": "9"}
 9 | {"text": "<len> 11 <text>", "num_words": "11"}
10 | {"text": "<len> 21 <text>", "num_words": "21"}
11 | {"text": "<len> 23 <text>", "num_words": "23"}
12 | {"text": "<len> 20 <text>", "num_words": "20"}
13 | {"text": "<len> 6 <text>", "num_words": "6"}
14 | {"text": "<len> 25 <text>", "num_words": "25"}
15 | {"text": "<len> 8 <text>", "num_words": "8"}
16 | {"text": "<len> 26 <text>", "num_words": "26"}
17 | {"text": "<len> 26 <text>", "num_words": "26"}
18 | {"text": "<len> 14 <text>", "num_words": "14"}
19 | {"text": "<len> 28 <text>", "num_words": "28"}
20 | {"text": "<len> 3 <text>", "num_words": "3"}
21 | {"text": "<len> 8 <text>", "num_words": "8"}
22 | {"text": "<len> 9 <text>", "num_words": "9"}
23 | {"text": "<len> 25 <text>", "num_words": "25"}
24 | {"text": "<len> 16 <text>", "num_words": "16"}
25 | {"text": "<len> 26 <text>", "num_words": "26"}
26 | {"text": "<len> 19 <text>", "num_words": "19"}
27 | {"text": "<len> 7 <text>", "num_words": "7"}
28 | {"text": "<len> 27 <text>", "num_words": "27"}
29 | {"text": "<len> 15 <text>", "num_words": "15"}
30 | {"text": "<len> 10 <text>", "num_words": "10"}
31 | {"text": "<len> 11 <text>", "num_words": "11"}
32 | {"text": "<len> 22 <text>", "num_words": "22"}
33 | {"text": "<len> 21 <text>", "num_words": "21"}
34 | {"text": "<len> 18 <text>", "num_words": "18"}
35 | {"text": "<len> 21 <text>", "num_words": "21"}
36 | {"text": "<len> 7 <text>", "num_words": "7"}
37 | {"text": "<len> 17 <text>", "num_words": "17"}
38 | {"text": "<len> 17 <text>", "num_words": "17"}
39 | {"text": "<len> 2 <text>", "num_words": "2"}
40 | {"text": "<len> 20 <text>", "num_words": "20"}
41 | {"text": "<len> 5 <text>", "num_words": "5"}
42 | {"text": "<len> 26 <text>", "num_words": "26"}
43 | {"text": "<len> 19 <text>", "num_words": "19"}
44 | {"text": "<len> 21 <text>", "num_words": "21"}
45 | {"text": "<len> 21 <text>", "num_words": "21"}
46 | {"text": "<len> 21 <text>", "num_words": "21"}
47 | {"text": "<len> 16 <text>", "num_words": "16"}
48 | {"text": "<len> 9 <text>", "num_words": "9"}
49 | {"text": "<len> 2 <text>", "num_words": "2"}
50 | {"text": "<len> 3 <text>", "num_words": "3"}
51 | 


--------------------------------------------------------------------------------
/wordlength_eval_data/wordlength_eval_data_1.json:
--------------------------------------------------------------------------------
 1 | {"text": "<len> 7 <text>", "num_words": "7"}
 2 | {"text": "<len> 13 <text>", "num_words": "13"}
 3 | {"text": "<len> 14 <text>", "num_words": "14"}
 4 | {"text": "<len> 10 <text>", "num_words": "10"}
 5 | {"text": "<len> 11 <text>", "num_words": "11"}
 6 | {"text": "<len> 13 <text>", "num_words": "13"}
 7 | {"text": "<len> 7 <text>", "num_words": "7"}
 8 | {"text": "<len> 17 <text>", "num_words": "17"}
 9 | {"text": "<len> 2 <text>", "num_words": "2"}
10 | {"text": "<len> 18 <text>", "num_words": "18"}
11 | {"text": "<len> 3 <text>", "num_words": "3"}
12 | {"text": "<len> 14 <text>", "num_words": "14"}
13 | {"text": "<len> 9 <text>", "num_words": "9"}
14 | {"text": "<len> 15 <text>", "num_words": "15"}
15 | {"text": "<len> 30 <text>", "num_words": "30"}
16 | {"text": "<len> 8 <text>", "num_words": "8"}
17 | {"text": "<len> 27 <text>", "num_words": "27"}
18 | {"text": "<len> 20 <text>", "num_words": "20"}
19 | {"text": "<len> 22 <text>", "num_words": "22"}
20 | {"text": "<len> 7 <text>", "num_words": "7"}
21 | {"text": "<len> 20 <text>", "num_words": "20"}
22 | {"text": "<len> 22 <text>", "num_words": "22"}
23 | {"text": "<len> 13 <text>", "num_words": "13"}
24 | {"text": "<len> 30 <text>", "num_words": "30"}
25 | {"text": "<len> 12 <text>", "num_words": "12"}
26 | {"text": "<len> 30 <text>", "num_words": "30"}
27 | {"text": "<len> 16 <text>", "num_words": "16"}
28 | {"text": "<len> 20 <text>", "num_words": "20"}
29 | {"text": "<len> 6 <text>", "num_words": "6"}
30 | {"text": "<len> 25 <text>", "num_words": "25"}
31 | {"text": "<len> 25 <text>", "num_words": "25"}
32 | {"text": "<len> 11 <text>", "num_words": "11"}
33 | {"text": "<len> 19 <text>", "num_words": "19"}
34 | {"text": "<len> 25 <text>", "num_words": "25"}
35 | {"text": "<len> 2 <text>", "num_words": "2"}
36 | {"text": "<len> 24 <text>", "num_words": "24"}
37 | {"text": "<len> 15 <text>", "num_words": "15"}
38 | {"text": "<len> 11 <text>", "num_words": "11"}
39 | {"text": "<len> 11 <text>", "num_words": "11"}
40 | {"text": "<len> 9 <text>", "num_words": "9"}
41 | {"text": "<len> 24 <text>", "num_words": "24"}
42 | {"text": "<len> 27 <text>", "num_words": "27"}
43 | {"text": "<len> 3 <text>", "num_words": "3"}
44 | {"text": "<len> 2 <text>", "num_words": "2"}
45 | {"text": "<len> 30 <text>", "num_words": "30"}
46 | {"text": "<len> 19 <text>", "num_words": "19"}
47 | {"text": "<len> 10 <text>", "num_words": "10"}
48 | {"text": "<len> 26 <text>", "num_words": "26"}
49 | {"text": "<len> 15 <text>", "num_words": "15"}
50 | {"text": "<len> 21 <text>", "num_words": "21"}
51 | 


--------------------------------------------------------------------------------
/wordlength_eval_data/wordlength_eval_data_2.json:
--------------------------------------------------------------------------------
 1 | {"text": "<len> 10 <text>", "num_words": "10"}
 2 | {"text": "<len> 17 <text>", "num_words": "17"}
 3 | {"text": "<len> 15 <text>", "num_words": "15"}
 4 | {"text": "<len> 10 <text>", "num_words": "10"}
 5 | {"text": "<len> 24 <text>", "num_words": "24"}
 6 | {"text": "<len> 13 <text>", "num_words": "13"}
 7 | {"text": "<len> 20 <text>", "num_words": "20"}
 8 | {"text": "<len> 13 <text>", "num_words": "13"}
 9 | {"text": "<len> 10 <text>", "num_words": "10"}
10 | {"text": "<len> 9 <text>", "num_words": "9"}
11 | {"text": "<len> 4 <text>", "num_words": "4"}
12 | {"text": "<len> 19 <text>", "num_words": "19"}
13 | {"text": "<len> 13 <text>", "num_words": "13"}
14 | {"text": "<len> 23 <text>", "num_words": "23"}
15 | {"text": "<len> 17 <text>", "num_words": "17"}
16 | {"text": "<len> 28 <text>", "num_words": "28"}
17 | {"text": "<len> 22 <text>", "num_words": "22"}
18 | {"text": "<len> 30 <text>", "num_words": "30"}
19 | {"text": "<len> 22 <text>", "num_words": "22"}
20 | {"text": "<len> 7 <text>", "num_words": "7"}
21 | {"text": "<len> 9 <text>", "num_words": "9"}
22 | {"text": "<len> 5 <text>", "num_words": "5"}
23 | {"text": "<len> 8 <text>", "num_words": "8"}
24 | {"text": "<len> 6 <text>", "num_words": "6"}
25 | {"text": "<len> 12 <text>", "num_words": "12"}
26 | {"text": "<len> 13 <text>", "num_words": "13"}
27 | {"text": "<len> 21 <text>", "num_words": "21"}
28 | {"text": "<len> 9 <text>", "num_words": "9"}
29 | {"text": "<len> 8 <text>", "num_words": "8"}
30 | {"text": "<len> 12 <text>", "num_words": "12"}
31 | {"text": "<len> 3 <text>", "num_words": "3"}
32 | {"text": "<len> 28 <text>", "num_words": "28"}
33 | {"text": "<len> 30 <text>", "num_words": "30"}
34 | {"text": "<len> 5 <text>", "num_words": "5"}
35 | {"text": "<len> 7 <text>", "num_words": "7"}
36 | {"text": "<len> 26 <text>", "num_words": "26"}
37 | {"text": "<len> 6 <text>", "num_words": "6"}
38 | {"text": "<len> 16 <text>", "num_words": "16"}
39 | {"text": "<len> 8 <text>", "num_words": "8"}
40 | {"text": "<len> 21 <text>", "num_words": "21"}
41 | {"text": "<len> 27 <text>", "num_words": "27"}
42 | {"text": "<len> 4 <text>", "num_words": "4"}
43 | {"text": "<len> 18 <text>", "num_words": "18"}
44 | {"text": "<len> 22 <text>", "num_words": "22"}
45 | {"text": "<len> 16 <text>", "num_words": "16"}
46 | {"text": "<len> 14 <text>", "num_words": "14"}
47 | {"text": "<len> 20 <text>", "num_words": "20"}
48 | {"text": "<len> 6 <text>", "num_words": "6"}
49 | {"text": "<len> 28 <text>", "num_words": "28"}
50 | {"text": "<len> 17 <text>", "num_words": "17"}
51 | 


--------------------------------------------------------------------------------
/wordlength_eval_data/wordlength_eval_data_3.json:
--------------------------------------------------------------------------------
 1 | {"text": "<len> 12 <text>", "num_words": "12"}
 2 | {"text": "<len> 26 <text>", "num_words": "26"}
 3 | {"text": "<len> 27 <text>", "num_words": "27"}
 4 | {"text": "<len> 5 <text>", "num_words": "5"}
 5 | {"text": "<len> 26 <text>", "num_words": "26"}
 6 | {"text": "<len> 10 <text>", "num_words": "10"}
 7 | {"text": "<len> 2 <text>", "num_words": "2"}
 8 | {"text": "<len> 23 <text>", "num_words": "23"}
 9 | {"text": "<len> 21 <text>", "num_words": "21"}
10 | {"text": "<len> 12 <text>", "num_words": "12"}
11 | {"text": "<len> 13 <text>", "num_words": "13"}
12 | {"text": "<len> 27 <text>", "num_words": "27"}
13 | {"text": "<len> 11 <text>", "num_words": "11"}
14 | {"text": "<len> 12 <text>", "num_words": "12"}
15 | {"text": "<len> 23 <text>", "num_words": "23"}
16 | {"text": "<len> 25 <text>", "num_words": "25"}
17 | {"text": "<len> 8 <text>", "num_words": "8"}
18 | {"text": "<len> 2 <text>", "num_words": "2"}
19 | {"text": "<len> 22 <text>", "num_words": "22"}
20 | {"text": "<len> 14 <text>", "num_words": "14"}
21 | {"text": "<len> 9 <text>", "num_words": "9"}
22 | {"text": "<len> 16 <text>", "num_words": "16"}
23 | {"text": "<len> 26 <text>", "num_words": "26"}
24 | {"text": "<len> 28 <text>", "num_words": "28"}
25 | {"text": "<len> 19 <text>", "num_words": "19"}
26 | {"text": "<len> 28 <text>", "num_words": "28"}
27 | {"text": "<len> 24 <text>", "num_words": "24"}
28 | {"text": "<len> 4 <text>", "num_words": "4"}
29 | {"text": "<len> 4 <text>", "num_words": "4"}
30 | {"text": "<len> 30 <text>", "num_words": "30"}
31 | {"text": "<len> 3 <text>", "num_words": "3"}
32 | {"text": "<len> 21 <text>", "num_words": "21"}
33 | {"text": "<len> 28 <text>", "num_words": "28"}
34 | {"text": "<len> 7 <text>", "num_words": "7"}
35 | {"text": "<len> 10 <text>", "num_words": "10"}
36 | {"text": "<len> 16 <text>", "num_words": "16"}
37 | {"text": "<len> 3 <text>", "num_words": "3"}
38 | {"text": "<len> 12 <text>", "num_words": "12"}
39 | {"text": "<len> 26 <text>", "num_words": "26"}
40 | {"text": "<len> 29 <text>", "num_words": "29"}
41 | {"text": "<len> 9 <text>", "num_words": "9"}
42 | {"text": "<len> 26 <text>", "num_words": "26"}
43 | {"text": "<len> 13 <text>", "num_words": "13"}
44 | {"text": "<len> 3 <text>", "num_words": "3"}
45 | {"text": "<len> 29 <text>", "num_words": "29"}
46 | {"text": "<len> 17 <text>", "num_words": "17"}
47 | {"text": "<len> 18 <text>", "num_words": "18"}
48 | {"text": "<len> 7 <text>", "num_words": "7"}
49 | {"text": "<len> 22 <text>", "num_words": "22"}
50 | {"text": "<len> 19 <text>", "num_words": "19"}
51 | 


--------------------------------------------------------------------------------
/wordlength_eval_data/wordlength_eval_data_4.json:
--------------------------------------------------------------------------------
 1 | {"text": "<len> 28 <text>", "num_words": "28"}
 2 | {"text": "<len> 16 <text>", "num_words": "16"}
 3 | {"text": "<len> 25 <text>", "num_words": "25"}
 4 | {"text": "<len> 7 <text>", "num_words": "7"}
 5 | {"text": "<len> 3 <text>", "num_words": "3"}
 6 | {"text": "<len> 10 <text>", "num_words": "10"}
 7 | {"text": "<len> 25 <text>", "num_words": "25"}
 8 | {"text": "<len> 10 <text>", "num_words": "10"}
 9 | {"text": "<len> 20 <text>", "num_words": "20"}
10 | {"text": "<len> 11 <text>", "num_words": "11"}
11 | {"text": "<len> 28 <text>", "num_words": "28"}
12 | {"text": "<len> 30 <text>", "num_words": "30"}
13 | {"text": "<len> 9 <text>", "num_words": "9"}
14 | {"text": "<len> 15 <text>", "num_words": "15"}
15 | {"text": "<len> 25 <text>", "num_words": "25"}
16 | {"text": "<len> 25 <text>", "num_words": "25"}
17 | {"text": "<len> 27 <text>", "num_words": "27"}
18 | {"text": "<len> 10 <text>", "num_words": "10"}
19 | {"text": "<len> 6 <text>", "num_words": "6"}
20 | {"text": "<len> 20 <text>", "num_words": "20"}
21 | {"text": "<len> 14 <text>", "num_words": "14"}
22 | {"text": "<len> 8 <text>", "num_words": "8"}
23 | {"text": "<len> 12 <text>", "num_words": "12"}
24 | {"text": "<len> 22 <text>", "num_words": "22"}
25 | {"text": "<len> 5 <text>", "num_words": "5"}
26 | {"text": "<len> 2 <text>", "num_words": "2"}
27 | {"text": "<len> 25 <text>", "num_words": "25"}
28 | {"text": "<len> 23 <text>", "num_words": "23"}
29 | {"text": "<len> 23 <text>", "num_words": "23"}
30 | {"text": "<len> 11 <text>", "num_words": "11"}
31 | {"text": "<len> 8 <text>", "num_words": "8"}
32 | {"text": "<len> 8 <text>", "num_words": "8"}
33 | {"text": "<len> 26 <text>", "num_words": "26"}
34 | {"text": "<len> 4 <text>", "num_words": "4"}
35 | {"text": "<len> 16 <text>", "num_words": "16"}
36 | {"text": "<len> 23 <text>", "num_words": "23"}
37 | {"text": "<len> 10 <text>", "num_words": "10"}
38 | {"text": "<len> 19 <text>", "num_words": "19"}
39 | {"text": "<len> 4 <text>", "num_words": "4"}
40 | {"text": "<len> 28 <text>", "num_words": "28"}
41 | {"text": "<len> 25 <text>", "num_words": "25"}
42 | {"text": "<len> 2 <text>", "num_words": "2"}
43 | {"text": "<len> 10 <text>", "num_words": "10"}
44 | {"text": "<len> 21 <text>", "num_words": "21"}
45 | {"text": "<len> 12 <text>", "num_words": "12"}
46 | {"text": "<len> 15 <text>", "num_words": "15"}
47 | {"text": "<len> 3 <text>", "num_words": "3"}
48 | {"text": "<len> 2 <text>", "num_words": "2"}
49 | {"text": "<len> 15 <text>", "num_words": "15"}
50 | {"text": "<len> 30 <text>", "num_words": "30"}
51 | 


--------------------------------------------------------------------------------