From 220320a5b50ccd2d5e894a8585d5c81a0abaa88e Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Sat, 4 Jul 2015 11:03:09 -0400 Subject: [PATCH] more detailed stats tracking; fix a bug in the fail handling --- .gitignore | 4 +++- lib/fetch.js | 15 +++++++++------ spider.js | 13 ++++++++++--- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 844a42a..c72b2dd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ node_modules/* images/ -.hg* \ No newline at end of file +.hg* +*.tar.bz2 +2015-* diff --git a/lib/fetch.js b/lib/fetch.js index 1965c31..333043e 100644 --- a/lib/fetch.js +++ b/lib/fetch.js @@ -20,6 +20,7 @@ var request = require('request'); var started = false; var unresolved = {}; +var lastUri = '(unstarted)'; var passes = 0; var kbytes = 0; var fails = []; @@ -29,10 +30,10 @@ var stats = []; // NB times in milliseconds var options = { - loopSleep: 1000, + loopSleep: 15000, verbose: false, // Set this to 0 if you're an asshole. - fetchDelay: 200 + fetchDelay: 250 }; var log = (options.verbose ? console.log : function () {}); @@ -68,6 +69,7 @@ function fetch(uri, action, extra) { // Wait for options.fetchDelay milliseconds, then fire off the // fetch. Q.delay(options.fetchDelay).then(function() { + lastUri = uri; request( { method: 'GET', @@ -108,7 +110,7 @@ function fetch(uri, action, extra) { // Handle a failed fetch (request gave back an error, or // response code isn't 2xx). console.error('failed ' + uri - + ': status ' + response.statusCode + + ': status ' + (response ? response.statusCode : 'NONE') + ', err ' + error); // Stuff the fetch information into the fails list. fails.push({ @@ -140,8 +142,9 @@ function fetch(uri, action, extra) { */ function watch() { var defer = Q.defer(); + var qlen = Object.keys(unresolved).length; - if (!started || Object.keys(unresolved).length > 0) { + if (!started || qlen > 0) { // If the fetcher hasn't started yet (no fetches queued), or if // any fetches are currently in flight, then just report stats, // wait, and go around the loop again. @@ -150,8 +153,8 @@ function watch() { }).join('') + passes + ' gets, ' + fails.length + ' fails, ' - + Math.floor(kbytes/1024) + ' M total' - ); + + Math.floor(kbytes/1024) + 'M, ' + + 'fetchq ' + qlen); if (fails.length > 0) { console.log('pushing a retry'); diff --git a/spider.js b/spider.js index 4fdb062..8efc744 100644 --- a/spider.js +++ b/spider.js @@ -38,6 +38,7 @@ var basePath = './images/'; var collisions = 0; var saved = 0; +var saveQ = 0; var start = util.timestamp(); // Register some custom stats functions into the watcher's stats @@ -46,12 +47,15 @@ var start = util.timestamp(); fetcher.registerStats(function statsRunTime() { return (util.timestamp() - start) + ' seconds'; }); +fetcher.registerStats(function statsColln () { + return collisions.toString(10) + ' coll'; +}); +fetcher.registerStats(function statsSaveQ () { + return saveQ.toString(10) + ' saveq'; +}); fetcher.registerStats(function statsSaved () { return saved.toString(10) + ' saved'; }); -fetcher.registerStats(function statsColln () { - return collisions.toString(10) + ' name coll'; -}); // Start the fetcher's monitoring loop; it will continue until all // recursive downloads are complete. @@ -197,6 +201,8 @@ function parseSheetPage(uri, response, body, sheetInfo) { // ...and queue a download for the sheet image, with a suitable // output filename so we know how to find it. + saveQ++; + fetcher.fetch(sheetUri, saveImage, { path: [basePath, util.makeSafePathPart(sheetInfo.nav), @@ -253,5 +259,6 @@ function saveImage(uri, response, body, fileinfo) { fs.writeFile(filePath, body, 'binary', function(err) { if (err) throw err; saved++; + saveQ--; }); } \ No newline at end of file