Browse Source

more detailed stats tracking; fix a bug in the fail handling

master
Aaron Miller 7 years ago
parent
commit
220320a5b5
  1. 4
      .gitignore
  2. 15
      lib/fetch.js
  3. 13
      spider.js

4
.gitignore vendored

@@ -1,3 +1,5 @@
node_modules/*
images/
.hg*
.hg*
*.tar.bz2
2015-*

15
lib/fetch.js

@@ -20,6 +20,7 @@ var request = require('request');
var started = false;
var unresolved = {};
var lastUri = '(unstarted)';
var passes = 0;
var kbytes = 0;
var fails = [];
@@ -29,10 +30,10 @@ var stats = [];
// NB times in milliseconds
var options = {
loopSleep: 1000,
loopSleep: 15000,
verbose: false,
// Set this to 0 if you're an asshole.
fetchDelay: 200
fetchDelay: 250
};
var log = (options.verbose ? console.log : function () {});
@@ -68,6 +69,7 @@ function fetch(uri, action, extra) {
// Wait for options.fetchDelay milliseconds, then fire off the
// fetch.
Q.delay(options.fetchDelay).then(function() {
lastUri = uri;
request(
{
method: 'GET',
@@ -108,7 +110,7 @@ function fetch(uri, action, extra) {
// Handle a failed fetch (request gave back an error, or
// response code isn't 2xx).
console.error('failed ' + uri
+ ': status ' + response.statusCode
+ ': status ' + (response ? response.statusCode : 'NONE')
+ ', err ' + error);
// Stuff the fetch information into the fails list.
fails.push({
@@ -140,8 +142,9 @@ function fetch(uri, action, extra) {
*/
function watch() {
var defer = Q.defer();
var qlen = Object.keys(unresolved).length;
if (!started || Object.keys(unresolved).length > 0) {
if (!started || qlen > 0) {
// If the fetcher hasn't started yet (no fetches queued), or if
// any fetches are currently in flight, then just report stats,
// wait, and go around the loop again.
@@ -150,8 +153,8 @@ function watch() {
}).join('')
+ passes + ' gets, '
+ fails.length + ' fails, '
+ Math.floor(kbytes/1024) + ' M total'
);
+ Math.floor(kbytes/1024) + 'M, '
+ 'fetchq ' + qlen);
if (fails.length > 0) {
console.log('pushing a retry');

13
spider.js

@@ -38,6 +38,7 @@ var basePath = './images/';
var collisions = 0;
var saved = 0;
var saveQ = 0;
var start = util.timestamp();
// Register some custom stats functions into the watcher's stats
@@ -46,12 +47,15 @@ var start = util.timestamp();
fetcher.registerStats(function statsRunTime() {
return (util.timestamp() - start) + ' seconds';
});
fetcher.registerStats(function statsColln () {
return collisions.toString(10) + ' coll';
});
fetcher.registerStats(function statsSaveQ () {
return saveQ.toString(10) + ' saveq';
});
fetcher.registerStats(function statsSaved () {
return saved.toString(10) + ' saved';
});
fetcher.registerStats(function statsColln () {
return collisions.toString(10) + ' name coll';
});
// Start the fetcher's monitoring loop; it will continue until all
// recursive downloads are complete.
@@ -197,6 +201,8 @@ function parseSheetPage(uri, response, body, sheetInfo) {
// ...and queue a download for the sheet image, with a suitable
// output filename so we know how to find it.
saveQ++;
fetcher.fetch(sheetUri, saveImage, {
path: [basePath,
util.makeSafePathPart(sheetInfo.nav),
@@ -253,5 +259,6 @@ function saveImage(uri, response, body, fileinfo) {
fs.writeFile(filePath, body, 'binary', function(err) {
if (err) throw err;
saved++;
saveQ--;
});
}
Loading…
Cancel
Save