Browse Source

fix file extn bug; fix (possible) name colln bug; lint; add stats reporting, quiet down output

master
Aaron Miller 5 years ago
parent
commit
4de80f7603
3 changed files with 100 additions and 36 deletions
  1. 21
    8
      lib/fetch.js
  2. 28
    11
      lib/util.js
  3. 51
    17
      spider.js

+ 21
- 8
lib/fetch.js View File

@@ -1,3 +1,5 @@
/* jshint -W014,-W002*/

/*global Math */
/*
* If you're not familiar in some details with the Promises/A+ spec,
@@ -22,16 +24,23 @@ var passes = 0;
var kbytes = 0;
var fails = [];

// Custom stats loggers.
var stats = [];

// NB times in milliseconds
var options = {
loopSleep: 15000,
loopSleep: 1000,
verbose: false,
// Set this to 0 if you're an asshole.
fetchDelay: 250
fetchDelay: 200
};

var log = (options.verbose ? console.log : function () {});

/**
 * Register a custom stats function with the watcher's stats report.
 *
 * Registered functions are called in registration order each time the
 * report line is printed, and each return value is joined into that line.
 *
 * @param {Function} fn - Zero-argument function returning a stats fragment.
 * @throws {TypeError} If fn is not a function.
 */
function registerStats(fn) {
  // Fail here, at the registration site, rather than later inside the
  // report loop where a bad entry would be invoked as fn().
  if (typeof fn !== 'function') {
    throw new TypeError('registerStats expects a function, got ' + typeof fn);
  }
  stats.push(fn);
}

/**
* Queue an asynchronous fetch (i.e. an HTTP GET request).
*
@@ -123,7 +132,7 @@ function fetch(uri, action, extra) {

// Return the promise for the fetch results.
return defer.promise;
};
}

/**
* Monitor the fetcher until it's done fetching; at that point, report
@@ -136,10 +145,13 @@ function watch() {
// If the fetcher hasn't started yet (no fetches queued), or if
// any fetches are currently in flight, then just report stats,
// wait, and go around the loop again.
console.log(new Date().toString() + ': '
+ passes + ' ok, '
+ fails.length + ' ng, '
+ Math.floor(kbytes/1024) + ' MB');
console.log(stats.map(function(fn) {
return fn() + ', ';
}).join('')
+ passes + ' gets, '
+ fails.length + ' fails, '
+ Math.floor(kbytes/1024) + ' M total'
);

if (fails.length > 0) {
console.log('pushing a retry');
@@ -158,11 +170,12 @@ function watch() {
}

return defer.promise;
};
}

// Expose all the stuff we just wrote.
module.exports = {
// Settings object (loopSleep, verbose, fetchDelay); times are in milliseconds.
options: options,
// Adds a function to the list of custom stats loggers.
registerStats: registerStats,
// Queues an asynchronous fetch (i.e. an HTTP GET request).
fetch: fetch,
// Monitors the fetcher until it's done fetching.
watch: watch
};

+ 28
- 11
lib/util.js View File

@@ -1,3 +1,5 @@
/* jshint -W014 */

/*
* Random utility functions that don't have any other clean place to live.
*/
@@ -5,6 +7,14 @@
var url = require('url');
var fs = require('fs');

/**
 * Get the current time as a Unix timestamp.
 * @returns {number} Whole seconds elapsed since the Unix epoch.
 */
function timestamp() {
  var millis = Date.now();
  return Math.floor(millis / 1000);
}

/**
* Canonicalize a relative URI against an absolute base.
*
@@ -23,7 +33,7 @@ function canonicalize(base, leaf) {
parsed.pathname = leaf;
parsed.path = leaf;
return url.format(parsed);
};
}

/**
* Recursively create any directories which don't yet exist along a
@@ -47,9 +57,9 @@ function makeDirectories(path) {
var subdir;
var stat;

if (dirs.some(function(dir) { return dir === '..' })) {
if (dirs.some(function(dir) { return dir === '..'; })) {
throw new Error('makeDirectories won\'t traverse backward (i.e. no ".." path elements allowed)');
};
}

// Walk the path...
for (var i = 0; i < dirs.length; i++) {
@@ -64,21 +74,28 @@ function makeDirectories(path) {
if (! stat.isDirectory()) {
throw new Error(subdir + ' is not a directory');
}
};
};
}
}

return path;
};
}

/**
* Given a string, return it with slashes (and any whitespace around them)
* replaced by underscores and leading/trailing whitespace trimmed, so
* that it's a safe (non-multiple-directory) path part.
*
* @param {String} pathPart - The path part to sanitize.
* @returns {String} The sanitized path part.
*/
function makeSafePathPart(pathPart) {
// make pathPart legal (e.g. strip slashes)
return pathPart
.replace(/\s*\/\s*/g, '_')
.replace(/^\s+/, '')
.replace(/\s+$/, '');
};
.replace(/\s*\/\s*/g, '_')
.replace(/^\s+/, '')
.replace(/\s+$/, '');
}

module.exports = {
timestamp: timestamp,
canonicalize: canonicalize,
makeDirectories: makeDirectories,
makeSafePathPart: makeSafePathPart

+ 51
- 17
spider.js View File

@@ -1,3 +1,5 @@
/* jshint -W014 */

/*
* This is a breadth-first, recursive, asynchronous image-downloading
* web spider.
@@ -34,6 +36,23 @@ var basePath = './images/';
* Process-level setup.
*/

// Run start time and counters, read by the stats functions below.
var start = util.timestamp();
var saved = 0;
var collisions = 0;

// Custom stats functions for the watcher's stats reporter. They are
// registered in array order, and each one returns a string that's
// joined into the stats report line.
[
  function statsRunTime() {
    return (util.timestamp() - start) + ' seconds';
  },
  function statsSaved() {
    return saved.toString(10) + ' saved';
  },
  function statsColln() {
    return collisions.toString(10) + ' name coll';
  }
].forEach(function (statsFn) {
  fetcher.registerStats(statsFn);
});

// Kick off the fetcher's monitoring loop, which keeps going until all
// recursive downloads are complete.
fetcher.watch();
@@ -61,7 +80,7 @@ function parseRoot(uri, response, body) {
// ...and queue a fetch and parse for each.
fetcher.fetch(uri + $(el).attr('href'), parseTopNavLeaf, navLinkName);
});
};
}

/**
* Parse a sidebar navigation link.
@@ -81,7 +100,7 @@ function parseTopNavLeaf(uri, response, body, navLinkName) {
parseSecondLevelLeaf,
navLinkName);
});
};
}

/**
* Parse a top-bar "by-letter" navigation link.
@@ -97,25 +116,19 @@ function parseSecondLevelLeaf(uri, response, body, navLinkName) {
// Find individual games' links...
$('div.gameiconcontainer').each(function(i, el) {
// ...obtain some metadata for them that we'll need to pass along...
var gameImageElement = $(el).find('div.gameiconbody').find('img')
var gameImageElement = $(el).find('div.gameiconbody').find('img');
var gameImageUri = util.canonicalize(uri, gameImageElement.attr('src'));
var gamePageUri = util.canonicalize(uri, gameImageElement.closest('a').attr('href'));
var gameName = gameImageElement.attr('alt');
var gameDir = util.makeSafePathPart(gameName);

// ...queue a fetch-and-parse for each game's sprite sheet page...
// ...and queue a fetch-and-parse for each game's sprite sheet page.
fetcher.fetch(gamePageUri, parseGamePage, {
gameName: gameName,
navLinkName: navLinkName
});

// ...and queue a download for the game's "logo" image.
fetcher.fetch(gameImageUri, saveImage, {
path: [basePath, util.makeSafePathPart(navLinkName)].join('/'),
name: gameDir + '.png'
});
});
};
}

/**
* Parse an individual game's page.
@@ -154,7 +167,7 @@ function parseGamePage(uri, response, body, names) {
});
});
});
};
}

/**
* Parse an individual sheet's page.
@@ -176,6 +189,11 @@ function parseSheetPage(uri, response, body, sheetInfo) {
if (img.attr('alt') !== sheetInfo.sheet.name) return;
var sheetUri = util.canonicalize(uri, img.attr('src'));
var sheetExtension = sheetUri.match(/\.(\w+)$/);
if (sheetExtension === null) {
throw new Error(sheetUri + ': no recognizable extension');
}
sheetExtension = sheetExtension[1];

// ...and queue a download for the sheet image, with a suitable
// output filename so we know how to find it.
@@ -185,10 +203,11 @@ function parseSheetPage(uri, response, body, sheetInfo) {
util.makeSafePathPart(sheetInfo.game)].join('/'),
name: [util.makeSafePathPart(sheetInfo.sheet.type),
util.makeSafePathPart(sheetInfo.sheet.name)]
.join(' - ') + '.png'
.join(' - '),
extn: sheetExtension
});
});
};
}

/**
* Take a file fetched by the request library, and write it verbatim
@@ -204,12 +223,27 @@ function parseSheetPage(uri, response, body, sheetInfo) {
* @param {Object} fileinfo - An object containing path and basename for the file to be written.
*/
function saveImage(uri, response, body, fileinfo) {
var n = 1;
// Recursively create any directories in the target path which don't
// already exist. (like `mkdir -p`)
util.makeDirectories(fileinfo.path);

// Assemble the complete path for the file to save.
var filePath = [fileinfo.path, fileinfo.name].join('/');
var filePath = [fileinfo.path,
fileinfo.name + '.' + fileinfo.extn].join('/');

// If a file by that name already exists, keep adjusting this file's
// name until we find a unique one.
while (fs.existsSync(filePath)) {
n += 1;
filePath = [fileinfo.path,
fileinfo.name + ' (' + n.toString(10) + ')'
+ '.' + fileinfo.extn].join('/');
}
if (n > 1) {
collisions++;
}

// Write out the response body into the target file, protecting it
// from lossage caused by Node's habit of defaulting strings to utf8
@@ -218,6 +252,6 @@ function saveImage(uri, response, body, fileinfo) {
var buf = new Buffer(body, 'binary');
fs.writeFile(filePath, body, 'binary', function(err) {
if (err) throw err;
console.log('saved ' + filePath);
saved++;
});
};
}

Loading…
Cancel
Save