├── .gitignore
├── index.js
├── LICENSE
├── package.json
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | 


--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
 1 | const ChangesStream = require('changes-stream');
 2 | const Request = require('request');
 3 | const Normalize = require('normalize-registry-metadata');
 4 | 
 5 | const db = 'https://replicate.npmjs.com';
 6 | 
 7 | var changes = new ChangesStream({
 8 |   db: db,
 9 |   include_docs: true
10 | });
11 | 
12 | Request.get(db, function(err, req, body) {
13 |   var end_sequence = JSON.parse(body).update_seq;
14 |   changes.on('data', function(change) {
15 |     if (change.seq >= end_sequence) {
16 |       process.exit(0);
17 |     }
18 |     if (change.doc.name) {
19 |       console.log(Normalize(change.doc));
20 |     }
21 |   });
22 | });
23 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2016, ashley williams
 2 | 
 3 | Permission to use, copy, modify, and/or distribute this software for any
 4 | purpose with or without fee is hereby granted, provided that the above
 5 | copyright notice and this permission notice appear in all copies.
 6 | 
 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 8 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 9 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "registry-follower-tutorial",
 3 |   "version": "1.0.0",
 4 |   "description": "",
 5 |   "main": "index.js",
 6 |   "scripts": {
 7 |     "test": "echo \"Error: no test specified\" && exit 1"
 8 |   },
 9 |   "repository": {
10 |     "type": "git",
11 |     "url": "git+https://github.com/ashleygwilliams/registry-follower-tutorial.git"
12 |   },
13 |   "keywords": [],
14 |   "author": "",
15 |   "license": "ISC",
16 |   "bugs": {
17 |     "url": "https://github.com/ashleygwilliams/registry-follower-tutorial/issues"
18 |   },
19 |   "homepage": "https://github.com/ashleygwilliams/registry-follower-tutorial#readme",
20 |   "dependencies": {
21 |     "changes-stream": "^1.1.0",
22 |     "normalize-registry-metadata": "^1.1.2",
23 |     "request": "^2.72.0"
24 |   }
25 | }
26 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # so you want to write a follower
  2 | > ch-ch-ch-ch-changes
  3 | 
  4 | This tutorial will teach you how to write a generic boilerplate
  5 | NodeJS application that can manipulate, respond to, broadcast, analyze,
  6 | and otherwise play with package metadata as it changes in the npm registry.
  7 | 
  8 | Wait...what? Why?
  9 | 
 10 | Here's the deal: do you want to have some fun with the `package.json` data
 11 | from every version of every package in the npm registry? Some neat ideas:
 12 | 
 13 | - Find all the package `README`s that mention dogs
 14 | - Discover how many package authors are named "Kate"
 15 | - Calculate how many dependency changes occur on average in a major version
 16 |    bump
 17 | 
 18 | And more! So stop waiting and write a follower!
 19 | 
 20 | ## prerequisites
 21 | 
 22 | In order to follow along with this tutorial you'll need [NodeJS] and
 23 | [npm]. I recommend installing these using a version manager; I use [nvm].
 24 | 
 25 | [NodeJS]: https://nodejs.org
 26 | [npm]: https://www.npmjs.com/
 27 | [nvm]: https://github.com/creationix/nvm
 28 | 
 29 | ## application setup
 30 | 
 31 | Let's set up our application:
 32 | 
 33 | 1. Create a directory called `follower-tutorial`. 
 34 |   (`mkdir follower-tutorial`)
 35 | 2. Move into that directory (`cd follower-tutorial`).
 36 | 3. Initialize an npm project by typing `npm init --yes`. This will create a
 37 |   `package.json` with default values.
 38 | 4. Create a file called `.gitignore` and add the line `node_modules` to it.
 39 | 
 40 | Our application currently looks like this:
 41 | 
 42 | ```
 43 | + follower-tutorial
 44 |   |- .gitignore
 45 |   |- package.json
 46 | ```
 47 | 
 48 | ## dependencies
 49 | 
 50 | Our application is going to depend on a couple super helpful npm packages:
 51 | 
 52 | - [`changes-stream`]: This package gives us access to a [stream] of changes
 53 |   from the npm registry's CouchDB. We'll listen for and respond to events
 54 |   from this stream in our app. These events represent changes in the npm
 55 |   registry.
 56 | - [`request`]: This package allows us to make HTTP requests. We'll use this
 57 |   to retrieve the total number of changes currently in the
 58 |   database so that we can optionally end our program when it has received
 59 |   all the current changes.
 60 | 
 61 | [`changes-stream`]: https://www.npmjs.com/package/changes-stream
 62 | [stream]: https://nodejs.org/api/stream.html
 63 | [`request`]: https://www.npmjs.com/package/request
 64 | 
 65 | To install these packages we'll type:
 66 | 
 67 | ```sh
 68 | npm install changes-stream request --save
 69 | ```
 70 | ... which will add both of our dependencies to our `package.json`.
 71 | 
 72 | ## set up a changes stream
 73 | 
 74 | Let's start writing our application now! We'll be writing our application
 75 | in an `index.js` file at the root of our `follower-tutorial` application
 76 | directory:
 77 | 
 78 | ```
 79 | + follower-tutorial
 80 |   |- .gitignore
 81 |   |- index.js       // <-- here's where our app goes
 82 |   |- package.json
 83 | ```
 84 | 
 85 | The first thing we'll do inside our `index.js` is use the `changes-stream`
 86 | package to create a new `ChangesStream` to listen to the npm
 87 | registry. To do so we'll write:
 88 | 
 89 | ```js
 90 | 1  const ChangesStream = require('changes-stream');
 91 | 2 
 92 | 3  const db = 'https://replicate.npmjs.com';
 93 | 4 
 94 | 5  var changes = new ChangesStream({
 95 | 6    db: db
 96 | 7  });
 97 | ``` 
 98 | 
 99 | Let's talk about what's happening here:
100 | 
101 | - On line 1, we require the `changes-stream` package
102 | - On line 3, we save the URL of the npm registry db
103 | - On lines 5-7, we create a new ChangesStream instance, passing it an
104 |   options object that points to our db
105 | 
106 | Now that we've created a changes stream, let's listen to it! To do this, we
107 | write:
108 | 
109 | ```js
110 | 9  changes.on('data', function(change) {
111 | 10   console.log(change);
112 | 11 });
113 | ```
114 | 
115 | Let's test it out: Run this application by typing:
116 | 
117 | ```sh
118 | node index.js
119 | ```
120 | 
121 | If everything is working correctly, you'll see something like this start
122 | **streaming** through your console:
123 | 
124 | ```sh
125 | { seq: 445,
126 |   id: 'CompoundSignal',
127 |   changes: [ { rev: '5-a0695c30fdaa3471246ef0cd6c8a476d' } ] }
128 | { seq: 446,
129 |   id: 'amphibian',
130 |   changes: [ { rev: '5-1a864e76d844e90bf6c63cb94303b593' } ] }
131 | { seq: 447,
132 |   id: 'aop',
133 |   changes: [ { rev: '9-9acc0139df57a1db2604f13f12b500f2' } ] }
134 | { seq: 448,
135 |   id: 'dynamo-schema',
136 |   changes: [ { rev: '5-bf8052c0d4b6e80e6664625137efd610' } ] }
137 | { seq: 451,
138 |   id: 'password-reset',
139 |   changes: [ { rev: '21-948e6633799ffd56a993c3fb144d1728' } ] }
140 | ```
141 | 
142 | If you don't see that, and/or got an error, reread the sample code in this
143 | section and be sure you don't have any typos! If you continue having
144 | trouble, [file an issue on this repo].
145 | 
146 | [file an issue on this repo]: /npm/registry/issues/new
147 | 
148 | Otherwise... Congrats! You have a successful registry follower. Hurry up
149 | and hit `ctrl-c` - this stream won't ever exit the way we've written it
150 | now!
151 | 
152 | ## moar data please
153 | 
154 | So our follower works! But it's not that great right now because we don't 
155 | really have all that much interesting data. Let's look at what we have
156 | right now:
157 | 
158 | ```sh
159 | { seq: 446,
160 |   id: 'amphibian',
161 |   changes: [ { rev: '5-1a864e76d844e90bf6c63cb94303b593' } ] }
162 | ```
163 | 
164 | - `seq`: the package's order in the sequence of change events
165 | - `id`: the name of the `package` (sometimes this is something else! we'll
166 |   get to that in a bit tho, it doesn't matter too much right now.)
167 | - `changes`: an array containing a single object, with a single key `rev`
168 |   that points to a change id
169 | 
170 | Let's be real: this data is not *that* interesting. Where's the good stuff?
171 | It turns out that the fun data is in a key called `doc` that we need to
172 | tell our `ChangesStream` instance, `changes`, to specifically grab. To do
173 | this, we'll add `include_docs: true` to the options object we pass to the
174 | `ChangesStream` constructor.
175 | 
176 | Once we've told our ChangesStream to `include_docs`, we get some new
177 | awesome data.  This new data lives off of a key called `doc` on the `change`
178 | object we received from the stream.
179 | 
180 | The two changes we make to our code look like this:
181 | 
182 | ```js
183 | 5  var changes = new ChangesStream({
184 | 6    db: db,
185 | 7    include_docs: true            // <- this is the thing we're adding
186 | 8  })
187 | 9
188 | 10 changes.on('data', function(change) {
189 | 11   console.log(change.doc)      // <- so that we can add `.doc` here
190 | 12 });
191 | ```
192 | 
193 | Let's test it out by running `node index.js`. Assuming you've made all the
194 | changes we described above you should be seeing a LOT more data. Here's a
195 | summary of what you get:
196 | 
197 | - `_id`: the name of the package
198 | - `_rev`: the revision id
199 | - `name`: the name of the package
200 | - `description`: the package description
201 | - `'dist-tags'`: an object with all dist-tag names and versions
202 | - `versions`: a nested object where every version is a key, and an object of
203 |   all of the package metadata for that key is the value (this includes: 
204 |   `main`, `directories`, `dependencies`, `scripts`, `engines`, `bin`, 
205 |     `devDependencies`, and more)
206 | - `maintainers`: an array of objects, each representing info about a 
207 |   maintainer (name, email, website)
208 | - `time`: timestamps for every version of the package published, plus
209 |   `created` and `modified`
210 | - `author`: an object representing the author of the package (name, email,
211 |   website)
212 | - `repository`: an object representing the location of the package code,
213 |   e.g. `{ type: 'git', url: 'git://github.com/my/gitrepo.git' }`
214 | 
215 | Take a moment and play around with your application by exploring the
216 | different pieces of data you can get from this stream. You may notice that
217 | some nested structures appear like `[Object]` in your console. You can
218 | print those out by adding `JSON.stringify` to your log, like this:
219 | 
220 | ```js
221 | console.log(JSON.stringify (change.doc,null,' '));
222 | ```
223 | 
224 | ...which will ensure that the nested objects aren't flattened (e.g. appear
225 | like `[Object]`).
226 | 
227 | Note: you'll have to `ctrl-C` out of your application every time you run
228 | it. It's still a never ending stream! In the next section we'll explain how
229 | to make it stop. 
230 | 
231 | ## a never ending stream
232 | 
233 | As we mentioned in the previous section, our follower currently won't ever
234 | stop! Let's dive a little deeper into why that's the case:
235 | 
236 | Our application functions by listening for an event called `data` from
237 | a stream coming out of npm's registry. Each event hands us an object that
238 | represents a `change` in the registry... and the registry is changing all
239 | the time!
240 | 
241 | Luckily, our db endpoint gives us some useful info to help us consume just
242 | the current changes as they exist at the time of access, and then stop the 
243 | process.
244 | 
245 | Navigate your browser to our db url:
246 | 
247 | ```
248 | https://replicate.npmjs.com
249 | ```
250 | 
251 | ...you should see something that looks like this:
252 | 
253 | ```json
254 | {
255 |   "db_name": "registry",
256 |   "doc_count": 345391,
257 |   "doc_del_count": 355,
258 |   "update_seq": 2496579,
259 |   "purge_seq": 0,
260 |   "compact_running": false,
261 |   "disk_size": 1713074299,
262 |   "data_size": 1320944467,
263 |   "instance_start_time": "1466084344558224",
264 |   "disk_format_version": 6,
265 |   "committed_update_seq": 2496579
266 | }
267 | ```
268 | 
269 | Let's talk about what some of these mean:
270 | 
271 | - `doc_count`: is the number of documents that the db contains
272 | - `update_seq`: is the number of changes that are stored in the db
273 | 
274 | `update_seq` is the important bit of information here. As stated, it 
275 | represents the number of `change` resources contained in the db. Remember
276 | the `seq` attribute on the `change` object we received from the stream?
277 | That number counts up until `update_seq`! This means that we can use this
278 | number to tell our follower to stop when `change.seq` meets or exceeds the
279 | `update_seq` value, signifying that it has processed all the changes in the
280 | db up until the time we accessed the `update_seq` value.
281 | 
282 | That was a lot of words, let's take a look at what this would look like in
283 | code.
284 | 
285 | ```js
286 | 2  const Request = require('request');
287 | ...
288 | 11 Request.get(db, function(err, req, body) {        // <- make a request to the db
289 | 12   var end_sequence = JSON.parse(body).update_seq; // <- grab the update_seq value
290 | 13   changes.on('data', function(change) {
291 | 14     if (change.seq >= end_sequence) {               // <- if we're at the last change
292 | 15       process.exit(0);                            // <- end the program successfully ("0")
293 | 16     }
294 | 17     console.log(change.doc);
295 | 18   }) 
296 | 19 });
297 | ```
298 | 
299 | Let's walk through what this code is doing:
300 | 
301 | - On line 2, we require the `request` package
302 | - On line 11, we are making a request to our db using the `request` package
303 | - On line 12, we parse the response from our request and grab the `update_seq`
304 |   value.
305 | - On line 13, on every `data` event, we check to see if the `change.seq`
306 |   value we get is greater than or equal to `update_seq`. Why `>=` and
307 |   not just equal? Remember that the registry is *always* changing, and there's
308 |   a good chance it will change while we are following it! Using `>=` means that
309 |   we can account for the change that happens while our application is running. 
310 | - On line 15, we end our program. We send the value `0` to `process.exit` to
311 |   indicate that we are ending the program successfully, i.e. not with an error.
312 | 
313 | Ok! Given this code, our application will now run for all the current changes in the
314 | registry and then exit. Take a moment and give it a go! Note: There are a lot of
315 | changes, so this can take up to an hour.
316 | 
317 | ## clean up
318 | 
319 | So our follower is pretty much done! However, there's a few things that ain't quite
320 | right about our data. Let's fix that now so we can finish up.
321 | 
322 | Firstly, remember the `id`/`_id` key we receive from our changes stream? We had
323 | identified that as being the name of the package, but that was a generalization.
324 | It turns out that there are actually 2 types of things in the changes db: changes
325 | and "design docs".
326 | 
327 | "Design docs"? What? Right. Understanding this requires understanding how CouchDB works
328 | a bit. One way to program with CouchDB is to write an application **within** the db.
329 | At this point, npm is moving away from this structure, but at one point (and still!)
330 | the registry was/is written as a CouchDB application. These applications exist as 
331 | "design docs" inside the db, so when we receive data from the db, *sometimes* we receive
332 | these design docs. If you watched your follower closely, you'd notice that 
333 | *sometimes* it's logging `undefined`. Those are the "design docs".
334 | 
335 | To ignore these files, we can just check to see if a `change` is an actual package
336 | by checking if it has a `name`. We can accomplish this by checking if 
337 | `change.doc.name` has a value before we do anything with the `change` data. In our
338 | code, this looks like this:
339 | 
340 | ```js
341 | ...
342 | 17 if (change.doc.name) {             // <-- make sure the change is a change
343 | 18   console.log(change.doc);
344 | 19 }
345 | ``` 
346 | 
347 | Ok, so we're **almost** done. Actually, we are totally done. But there's one last
348 | thing we can do to make our data even better: we can normalize our data so that
349 | it is nearly identical to what the CLI uses and what is returned by http://registry.npmjs.com.
350 | 
351 | To do this, we'll add *one more* dependency to our application: [`normalize-registry-metadata`].
352 | 
353 | [`normalize-registry-metadata`]: https://github.com/npm/normalize-registry-metadata
354 | 
355 | First things first: let's install this package and save it to our `package.json`:
356 | 
357 | ```sh
358 | npm install normalize-registry-metadata --save
359 | ```
360 | 
361 | Next, we require it in our `index.js`:
362 | 
363 | ```js
364 | 3  const Normalize = require('normalize-registry-metadata');
365 | ```
366 | 
367 | Lastly, let's call `Normalize()` on the `change` data before we log it to the console:
368 | 
369 | ```js
370 | ...
371 | 18   console.log(Normalize(change.doc));        // <-- we only have to change this line!
372 | ...
373 | ```
374 | 
375 | And we're done! You can double check that your code is correct by looking at the 
376 | complete code [here].
377 | 
378 | [here]: https://github.com/ashleygwilliams/registry-follower-tutorial/blob/master/index.js
379 | 
380 | ## forever follower
381 | 
382 | Don't want to stop? Want to write a persistent follower? You can move
383 | forward with our app as it is currently written, however you'll likely have
384 | a better experience replacing `changes-stream` with 
385 | [`concurrent-couch-follower`] which is safer for operations that may 
386 | require async (like a file write!). [`concurrent-couch-follower`] remembers
387 | the last change you processed and can start back from there if at some point
388 | you need to restart the program.
389 | 
390 | [`concurrent-couch-follower`]: https://github.com/npm/concurrent-couch-follower
391 | 
392 | ## a few notes on performance
393 | 
394 | The vast majority of useful registry followers won't ever have
395 | any kind of follower-side performance problem.  In keeping with
396 | [long-established wisdom][wisdom], you probably shouldn't even _think_
397 | about this section until you hit a bottleneck in use and confirm it
398 | by measurement.  Logging Node.js [cpuUsage()] and [memoryUsage()],
399 | [heap analysis], and the built-in [profiler] are great places to start.
400 | 
401 | [cpuUsage()]: https://nodejs.org/api/process.html#process_process_cpuusage_previousvalue
402 | 
403 | [memoryUsage()]: https://nodejs.org/api/process.html#process_process_memoryusage
404 | 
405 | [heap analysis]: https://www.npmjs.com/package/heapdump
406 | 
407 | [profiler]: https://nodejs.org/en/docs/guides/simple-profiling/
408 | 
409 | [wisdom]: http://c2.com/cgi/wiki?PrematureOptimization
410 | 
411 | Under the hood, libraries like [`changes-stream`] `GET` the
412 | registry's CouchDB-style HTTPS replication endpoint, which streams
413 | newline-delimited JSON objects, one per database update, over
414 | long-lived responses.  These are the object chunks you receive from
415 | the stream.
416 | 
417 | Most registry update objects are manageably small, but the deviation
418 | is great, with a few updates weighing in close to 5 MB.  The bulk
419 | of this is often (highly repetitive) `README` file strings, one
420 | per version in `chunk.doc.versions`.  Some packages have thousands
421 | of versions.  And every once in a while, some fiendish jokester
422 | publishes a "novelty" package that "depends on" every other package
423 | in the registry, as if they were the first to think of it.
424 | 
425 | Especially if you're using a pipeline of many object-mode streams
426 | to process the chunks, you may have high memory usage with Node.js'
427 | default maximum stream internal buffer size, `highWaterMark`, of 16.
428 | Multiple buffers of 16 objects each, plus lingering data not yet
429 | picked up by the garbage collector, can eat your RAM lunch quick.
430 | To reduce this number:
431 | 
432 | ```
433 | new ChangesStream({
434 |   db: 'https://replicate.npmjs.com',
435 |   include_docs: true,
436 |   highWaterMark: 4
437 | })
438 | ```
439 | 
440 | Most tried-and-true stream packages, like those in the [Mississippi
441 | Streams Collection][mississippi], take optional options-object
442 | arguments that get passed along to the core [readable-stream]
443 | constructors.  You can set `{highWaterMark: Number}` in those
444 | arguments.
445 | 
446 | [mississippi]: https://www.npmjs.com/package/mississippi
447 | 
448 | [readable-stream]: https://www.npmjs.com/package/readable-stream
449 | 
450 | ## go forth and make something awesome!
451 | 
452 | We're seriously excited about what you'll build. Please share with us on
453 | twitter ([@npmjs])! And please don't hesitate to ask for help in the issues
454 | on this repo :)
455 | 
456 | [@npmjs]: https://twitter.com/npmjs
457 | 


--------------------------------------------------------------------------------