Browse Source

initial commit

master
Aaron Miller 4 years ago
parent
commit
25ccd1eb1b
5 changed files with 486 additions and 0 deletions
  1. 3
    0
      .gitignore
  2. 168
    0
      lib/fetch.js
  3. 85
    0
      lib/util.js
  4. 7
    0
      package.json
  5. 223
    0
      spider.js

+ 3
- 0
.gitignore View File

@@ -0,0 +1,3 @@
node_modules/*
images/
.hg*

+ 168
- 0
lib/fetch.js View File

@@ -0,0 +1,168 @@
/*global Math */
/*
* If you're not familiar, in some detail, with the Promises/A+ spec
* (and the Q library in particular), much of the following will
* probably make little sense to you.
*
* How thoroughly this parallelizes depends heavily on how you
* configure the request library. I've left it at defaults, which seem
* to maintain a pool of around 5 connections, and pipeline requests
* over them. I wouldn't mess with that much, especially as heavy
* parallelization will probably obviate the fetch delay.
*/

var Q = require('q');
var request = require('request');

// State variables used to manage the indefinite number of fetches.
// 'started' becomes true once the first fetch has been queued.
var started = false;
// Map of URI -> promise for every fetch currently in flight.
var unresolved = {};

// Running totals reported by the watcher.
var passes = 0; // number of successful fetches
var kbytes = 0; // kilobytes received by successful fetches
var fails = []; // records of failed fetches waiting for a retry

// NB times in milliseconds
var options = {
  // How long the watcher sleeps between status reports.
  loopSleep: 15000,
  // When true, log() writes progress messages to the console.
  verbose: false,
  // Pause before each request is issued. Keep this above 0 to stay
  // polite to the server being spidered.
  fetchDelay: 250
};

/**
 * Write a progress message through console.log, but only while
 * options.verbose is set.
 *
 * The flag is checked on every call rather than once at load time, so
 * code that changes options.verbose after loading this module gets the
 * behaviour it asked for.
 */
function log() {
  if (options.verbose) {
    console.log.apply(console, arguments);
  }
}

/**
 * Queue an asynchronous fetch (i.e. an HTTP GET request).
 *
 * The request is issued after options.fetchDelay milliseconds. A
 * response with a 2xx status is handed to `action`; any other outcome
 * is recorded in the `fails` list together with a retry() closure that
 * re-queues the same fetch with the same arguments.
 *
 * @param {string} uri - The (absolute) URI to retrieve.
 * @param {Function|undefined} action - The callback to invoke with the results of the GET request, as action(uri, response, body, extra).
 * @param {*} extra - Any extra data to pass to the action callback. Otherwise ignored by the fetch function.
 *
 * @returns {Promise} A promise for the result of the fetch. Fulfilled
 *   with {response, body} on success; rejected when the request
 *   errors, the status is not 2xx, or the action callback throws.
 */
function fetch(uri, action, extra) {
  var defer = Q.defer();

  // Update the watcher's state variables.
  //
  // 'started' tells it whether any fetches have been queued (this
  // makes it safe to start the watcher before issuing any fetches).
  //
  // 'unresolved' is a map<uri: promise> which the watcher uses to
  // tell how many fetches are currently in flight. When it's empty,
  // the watcher's event loop considers it OK to exit (i.e., the
  // spidering is done).
  started = true;
  unresolved[uri] = defer.promise;

  // Wait for options.fetchDelay milliseconds, then fire off the
  // fetch.
  Q.delay(options.fetchDelay).then(function() {
    request(
      {
        method: 'GET',
        uri: uri,
        encoding: null // otherwise we get mangled binaries
      },
      function(error, response, body) {
        // A transport-level failure may leave 'response' undefined, so
        // never dereference it without checking first.
        var status = response ? response.statusCode : undefined;
        var succeeded = !error && status >= 200 && status <= 299;

        try {
          if (succeeded) {
            // Handle a successful fetch.
            log('fetched ' + uri);

            // The action doesn't have to be a function, but if it
            // isn't, nothing worthwhile will happen with the fetch
            // result.
            // TODO throw if this condition isn't satisfied
            if (typeof action === 'function') {
              try {
                action(uri, response, body, extra);
              } catch (actionError) {
                // The callback threw: fail the fetch with its exception
                // and stop here, so it is not also counted as a pass.
                console.error('action failed for ' + uri + ': ' + actionError);
                defer.reject(actionError);
                return;
              }
            }

            // All went well: record the stats and mark the fetch
            // completed.
            passes++;
            kbytes += Math.floor((body ? body.length : 0) / 1024);
            defer.resolve({
              response: response,
              body: body
            });
          } else {
            // Handle a failed fetch (request gave back an error, or
            // response code isn't 2xx).
            console.error('failed ' + uri
                          + ': status ' + (status === undefined ? 'none' : status)
                          + ', err ' + error);
            // Stuff the fetch information into the fails list so the
            // watcher can retry it later.
            fails.push({
              uri: uri,
              response: response,
              error: error,
              retry: function() {
                fetch(uri, action, extra);
              }
            });
            // Reject with a real Error even when the only problem was
            // the status code (in which case 'error' is null).
            defer.reject(error || new Error('unexpected status ' + status + ' for ' + uri));
          }
        } finally {
          // Now that the fetch has finished one way or another, remove
          // it from the in-flight map so the watcher knows what's up.
          // Only remove our own entry: a newer fetch for the same URI
          // may have replaced it in the meantime.
          if (unresolved[uri] === defer.promise) {
            delete unresolved[uri];
          }
        }
      });
  });

  // Return the promise for the fetch results.
  return defer.promise;
}

/**
 * Monitor the fetcher until it's done fetching; at that point, report
 * some information about what it did overall.
 *
 * While no fetch has been queued yet, or while any fetch is still in
 * flight, each pass prints a status line, re-queues at most one failed
 * fetch, and sleeps for options.loopSleep milliseconds before checking
 * again.
 *
 * @returns {Promise} A promise that is fulfilled (with no value) once
 *   the final report has been printed, however many passes that took.
 */
function watch() {
  if (!started || Object.keys(unresolved).length > 0) {
    // If the fetcher hasn't started yet (no fetches queued), or if
    // any fetches are currently in flight, then just report stats,
    // wait, and go around the loop again.
    console.log(new Date().toString() + ': '
                + passes + ' ok, '
                + fails.length + ' ng, '
                + Math.floor(kbytes/1024) + ' MB');

    if (fails.length > 0) {
      console.log('pushing a retry');
      fails.shift().retry();
    }

    // Return the chained promise so the caller's promise settles when
    // a later pass finishes, instead of staying pending forever.
    return Q.delay(options.loopSleep).then(watch);
  }

  // We're out of inflight fetches, so the fetcher is done. Report the
  // final stats; nothing here exits the process explicitly.
  console.log('finished at ' + new Date().toString());
  console.log(passes + ' fetches done');
  console.log(Math.floor(kbytes/1024) + ' MBytes fetched');
  console.log(fails.length + ' fetches failed');
  return Q();
}

// Public interface of the fetcher module: the tunable options object
// plus the two entry points.
var fetcherApi = {};
fetcherApi.options = options;
fetcherApi.fetch = fetch;
fetcherApi.watch = watch;
module.exports = fetcherApi;

+ 85
- 0
lib/util.js View File

@@ -0,0 +1,85 @@
/*
* Random utility functions that don't have any other clean place to live.
*/

var url = require('url');
var fs = require('fs');

/**
 * Canonicalize a (possibly relative) URI reference against an absolute
 * base, following the usual URL resolution rules.
 *
 * Root-relative leaves ("/foo/bar.html"), document-relative leaves
 * ("bar.html", "../bar.html"), leaves carrying a query string, and
 * leaves that are already absolute URIs are all handled. The base's own
 * query string and fragment are not carried over into the result.
 *
 * @param {string} base - The absolute base URI (e.g. http://www.example.com).
 * @param {string} leaf - The URI reference to canonicalize (e.g. /foo/bar.html).
 * @returns {string} The canonicalized URI (e.g. http://www.example.com/foo/bar.html).
 *   When leaf is missing or empty, the base is returned unchanged.
 */
function canonicalize(base, leaf) {
  // Nothing to resolve (e.g. an element without an href/src attribute).
  if (leaf == null || leaf === '') {
    return base;
  }
  return url.resolve(base, String(leaf));
}

/**
 * Ensure every directory along a relative path exists, creating the
 * missing ones from the outermost inward (much like `mkdir -p`, with
 * extra refusals).
 *
 * Everything here is synchronous; each path component costs an
 * existence check plus either a mkdir or a stat.
 *
 * @param {string} path - A '/'-separated path below the current directory. It must not start with "/" or "..", and must not contain a ".." component.
 * @returns {string} The path exactly as it was passed in.
 * @throws {Error} If the path is rejected, or if an existing component is not a directory.
 */
function makeDirectories(path) {
  var startsAtRoot = path.charAt(0) === '/';
  var startsUpward = path.substring(0, 2) === '..';
  if (startsAtRoot || startsUpward) {
    throw new Error('makeDirectories can\'t use a path starting with "/" or ".."');
  }

  var segments = path.split('/');
  if (segments.indexOf('..') !== -1) {
    throw new Error('makeDirectories won\'t traverse backward (i.e. no ".." path elements allowed)');
  }

  // Grow the path one component at a time, checking each prefix.
  var current = null;
  segments.forEach(function(segment) {
    current = (current === null) ? segment : current + '/' + segment;

    if (fs.existsSync(current)) {
      // Already there: acceptable only if it is a directory.
      if (!fs.statSync(current).isDirectory()) {
        throw new Error(current + ' is not a directory');
      }
      return;
    }

    // Not there yet, so create it.
    fs.mkdirSync(current);
  });

  return path;
}

/**
 * Turn an arbitrary label into a single path component: every slash
 * (together with any whitespace around it) becomes an underscore, and
 * leading and trailing whitespace is removed.
 *
 * @param {string} pathPart - The text to sanitize.
 * @returns {string} The sanitized text.
 */
function makeSafePathPart(pathPart) {
  var paddedSlash = /\s*\/\s*/g;
  var leadingSpace = /^\s+/;
  var trailingSpace = /\s+$/;

  var withoutSlashes = pathPart.replace(paddedSlash, '_');
  var withoutLeading = withoutSlashes.replace(leadingSpace, '');
  return withoutLeading.replace(trailingSpace, '');
}

// Public interface of the utility module.
var utilApi = {};
utilApi.canonicalize = canonicalize;
utilApi.makeDirectories = makeDirectories;
utilApi.makeSafePathPart = makeSafePathPart;
module.exports = utilApi;

+ 7
- 0
package.json View File

@@ -0,0 +1,7 @@
{
"dependencies": {
"request": "~2.58.0",
"q": "~1.4.1",
"cheerio": "~0.19.0"
}
}

+ 223
- 0
spider.js View File

@@ -0,0 +1,223 @@
/*
* This is a breadth-first, recursive, asynchronous image-downloading
* web spider.
*
* If you don't know which website it's for, you'll have a hard time
* using it.
*/

var Buffer = require('buffer').Buffer;
var fs = require('fs');
var cheerio = require('cheerio');
// Project-local modules are loaded by explicit relative path. A bare
// 'lib/fetch' would be looked up like an installed package instead of
// relative to this file.
var fetcher = require('./lib/fetch');
var util = require('./lib/util');

/*
* Configuration values.
*/

// Starting point for the crawl (e.g. http://example.com/), taken from
// the first command-line argument. Without it, print usage and quit.
var baseURI = process.argv[2];
if (baseURI === undefined) {
  console.log('Usage: node spider.js <URI>');
  process.exit(1);
}

// Directory that receives the downloaded images. It is created on
// demand, and files already inside it may be overwritten without
// warning.
var basePath = './images/';

/*
 * Process-level setup.
 */

// Kick off the fetcher's monitoring loop, which keeps reporting until
// every queued download has finished.
fetcher.watch();

// Queue the first fetch; the parsers below queue everything else.
fetcher.fetch(baseURI, parseRoot);

/*
* Leaf parser functions follow.
*/

/**
 * Handle content retrieved from the base URI: queue a fetch for every
 * link found inside div#nav, to be parsed by parseTopNavLeaf.
 *
 * @param {string} uri - The canonical (absolute) URI of the fetched page.
 * @param {Object} response - The HTTP response, as received by the request library's callback.
 * @param {Buffer} body - A binary buffer containing the response body, as received by the request library's callback.
 */
function parseRoot(uri, response, body) {
  var $ = cheerio.load(body);
  // Find navigation links in the left-hand sidebar...
  $('div#nav').find('a').each(function(i, el) {
    var navLink = $(el);
    var href = navLink.attr('href');

    // ...skip anchors that don't actually link anywhere...
    if (!href) {
      return;
    }

    // ...and queue a fetch and parse for each. The target is built with
    // util.canonicalize, like in the other parsers, rather than by
    // gluing the href onto the base URI.
    fetcher.fetch(util.canonicalize(uri, href), parseTopNavLeaf, navLink.text());
  });
}

/**
 * Handle a page reached from the sidebar navigation: queue a fetch for
 * every link inside div#letters, to be parsed by parseSecondLevelLeaf.
 *
 * @param {string} uri - The canonical (absolute) URI of the fetched page.
 * @param {Object} response - The HTTP response, as received by the request library's callback.
 * @param {Buffer} body - A binary buffer containing the response body, as received by the request library's callback.
 * @param {string} navLinkName - The name of the main nav section we're parsing; passed along unchanged.
 */
function parseTopNavLeaf(uri, response, body, navLinkName) {
  var $ = cheerio.load(body);
  var letterLinks = $('div#letters').find('a');

  // One fetch per "by-letter" link in the top bar.
  letterLinks.each(function(index, anchor) {
    var letterPageUri = util.canonicalize(uri, $(anchor).attr('href'));
    fetcher.fetch(letterPageUri, parseSecondLevelLeaf, navLinkName);
  });
}

/**
 * Handle a "by-letter" listing page: for every div.gameiconcontainer,
 * queue a fetch of the game's own page and a download of its icon.
 *
 * @param {string} uri - The canonical (absolute) URI of the fetched page.
 * @param {Object} response - The HTTP response, as received by the request library's callback.
 * @param {Buffer} body - A binary buffer containing the response body, as received by the request library's callback.
 * @param {string} navLinkName - The name of the main nav section we're recursing over.
 */
function parseSecondLevelLeaf(uri, response, body, navLinkName) {
  var $ = cheerio.load(body);

  $('div.gameiconcontainer').each(function(index, container) {
    // Everything we need hangs off the icon image: its src, its alt
    // text (the game's name), and the link wrapped around it.
    var icon = $(container).find('div.gameiconbody').find('img');
    var iconUri = util.canonicalize(uri, icon.attr('src'));
    var pageUri = util.canonicalize(uri, icon.closest('a').attr('href'));
    var title = icon.attr('alt');
    var logoFile = util.makeSafePathPart(title) + '.png';

    // First the game's sprite sheet page...
    fetcher.fetch(pageUri, parseGamePage, {
      gameName: title,
      navLinkName: navLinkName
    });

    // ...then the game's "logo" image, saved under the nav section's
    // directory.
    var logoTarget = {
      path: [basePath, util.makeSafePathPart(navLinkName)].join('/'),
      name: logoFile
    };
    fetcher.fetch(iconUri, saveImage, logoTarget);
  });
}

/**
 * Parse an individual game's page: for every link inside each
 * div.updatesheeticons group, queue a fetch of the linked sheet page,
 * to be parsed by parseSheetPage.
 *
 * @param {string} uri - The canonical (absolute) URI of the fetched page.
 * @param {Object} response - The HTTP response, as received by the request library's callback.
 * @param {Buffer} body - A binary buffer containing the response body, as received by the request library's callback.
 * @param {Object} names - Metadata about this game, and its position in the site's hierarchy (gameName, navLinkName), which we'll need to properly name the images we save.
 */
function parseGamePage(uri, response, body, names) {
  var $ = cheerio.load(body);

  // Find sheet groups...
  $('div.updatesheeticons').each(function(groupIndex, groupEl) {
    var sheetGroup = $(groupEl);
    // ...get each one's type-name from the immediately preceding <div class="section">...
    var sheetType = sheetGroup.prev('div.section').text();

    // ...find each sheet link within the group...
    sheetGroup.find('a').each(function(linkIndex, linkEl) {
      // ...fetch some necessary metadata about it...
      var sheetLink = $(linkEl);
      var sheetName = sheetLink.find('img').attr('alt');
      var sheetPageUri = util.canonicalize(uri, sheetLink.attr('href'));

      // ...and queue a fetch-and-parse for the sheet page.
      fetcher.fetch(sheetPageUri, parseSheetPage, {
        nav: names.navLinkName,
        game: names.gameName,
        sheet: {
          type: sheetType,
          name: sheetName
        }
      });
    });
  });
}

/**
 * Handle an individual sheet's page: queue a download for every image
 * in div#content whose alt text equals the sheet name recorded on the
 * game page.
 *
 * @param {string} uri - The canonical (absolute) URI of the fetched page.
 * @param {Object} response - The HTTP response, as received by the request library's callback.
 * @param {Buffer} body - A binary buffer containing the response body, as received by the request library's callback.
 * @param {Object} sheetInfo - Naming metadata for this sheet: nav, game, and sheet ({type, name}), used to build the output path and file name.
 */
function parseSheetPage(uri, response, body, sheetInfo) {
  var $ = cheerio.load(body);

  $('div#content img').each(function(index, node) {
    var candidate = $(node);

    // Only the image labelled with the expected sheet name is wanted.
    if (candidate.attr('alt') === sheetInfo.sheet.name) {
      var imageUri = util.canonicalize(uri, candidate.attr('src'));

      // Files land in <basePath>/<nav>/<game>/ and are named
      // "<type> - <name>.png".
      var directory = [
        basePath,
        util.makeSafePathPart(sheetInfo.nav),
        util.makeSafePathPart(sheetInfo.game)
      ].join('/');
      var fileName = [
        util.makeSafePathPart(sheetInfo.sheet.type),
        util.makeSafePathPart(sheetInfo.sheet.name)
      ].join(' - ') + '.png';

      fetcher.fetch(imageUri, saveImage, {
        path: directory,
        name: fileName
      });
    }
  });
}

/**
 * Take a file fetched by the request library, and write it verbatim
 * into a file at a given path.
 *
 * This function fulfills the interface required of fetch action
 * delegates (lib/fetch.js, q.v.), but uses only the "body" and
 * "fileinfo" arguments it receives.
 *
 * @param {string} uri - The URI of the fetched file. Ignored.
 * @param {Object} response - The HTTP response. Ignored.
 * @param {Buffer} body - A binary buffer containing the response body, as received by the request library's callback.
 * @param {Object} fileinfo - An object containing path and basename (name) for the file to be written.
 */
function saveImage(uri, response, body, fileinfo) {
  // Recursively create any directories in the target path which don't
  // already exist. (like `mkdir -p`)
  util.makeDirectories(fileinfo.path);

  // Assemble the complete path for the file to save.
  var filePath = [fileinfo.path, fileinfo.name].join('/');

  // Write the response body out as-is. The body is handed straight to
  // writeFile; no intermediate copy is needed. The 'binary' encoding
  // argument is kept so that a string body, should one ever be passed,
  // is not re-encoded as utf8.
  fs.writeFile(filePath, body, 'binary', function(err) {
    if (err) throw err;
    console.log('saved ' + filePath);
  });
}

Loading…
Cancel
Save