
/**************************************************************************
 *                                                                        *
 *   Copyright (C) 2001 Grub, Inc.                                        *
 *                                                                        *
 *   This program is free software; you can redistribute it and/or modify *
 *   it under the terms of the GNU General Public License as published by *
 *   the Free Software Foundation; either version 1, or (at your option)  *
 *   any later version.                                                   *
 *                                                                        *
 *   This program is distributed in the hope that it will be useful,      *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of       *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        *
 *   GNU General Public License for more details.                         *
 *                                                                        *
 *   You should have received a copy of the GNU General Public License    *
 *   along with this program; if not, write to the Free Software          *
 *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.            *
 *                                                                        *
 *                                                                        *
 **************************************************************************/  

/* Project: GRUB-CLIENT
 * <http://www.grub.org>
 * module: CRW (crawler)
 * Author: Kord Campbell (kord@grub.org)
 * Last revision: October, 2001
 * Files: NewCrawler.cpp NewCrawler.h
 *
 * Class Crawler retrieves the URLs prescribed by the server. It
 * performs this function using the cURL C++ API, available at
 * http://curl.haxx.se.  This version is a re-write of the version
 * authored by Kosta Damevski in Spring, 2001.
 * 
 * Each child process controls a single cURL session, pulling new URLs
 * from the shared memory segment controlled by the parent process.  The
 * older version of this portion of the client used GNU's wget and pipes
 * to communicate with the wget processes.  This evidently caused some 
 * synchronization problems that should now be alleviated in the rewrite.
 * 
 * Each child crawler creates a separate shared memory segment for each
 * page that it pulls down, and the headers for that page.  It then can
 * pass that info on to the parent for reading, by specifying the key
 * of the shared memory area, and the size, back to the parent via the
 * primary share area where urls, pids and other relevant crawl info is
 * stored.
 */


#include "Crawler.h"

using namespace std;

// Header parsing helper (defined further down): looks for key inside
// content_start, stores the start of the value that follows it in
// *key_start and returns the offset of the first delimiter after that
// start, or -1 when the key or a delimiter cannot be found.
int parserFunk(char *key, char *delimiters, char *content_start, char **key_start);

// Returns 1 when current_size is close enough to last_size to suggest a
// dynamically generated page, 0 otherwise (defined further down).
int sizeThresholdCheck(int current_size, int last_size);

// cURL write callbacks (defined further down): append the received body /
// header bytes to the MemoryStruct passed in through data.
size_t WriteMemoryCallback(void *ptr, size_t size, size_t nmemb, void *data);
size_t WriteHeaderCallback(void *ptr, size_t size, size_t nmemb, void *data);

// Crawler that crw_sig_handle() notifies.  Nothing visible in this file
// assigns it, so the handler is a no-op unless it is set elsewhere.
static Crawler *crw_sig = NULL;

// mutex protecting the shared bandwidth structure (Bandwidth_Info); taken
// by WriteMemoryCallback and by Crawler::adjustBandwidth
pthread_mutex_t band_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Signal handler for SIGTERM / SIGINT in a child: passes the request on
 * to the registered Crawler, and does nothing when none is registered. */
static void crw_sig_handle( int signum )
{
	if ( crw_sig == NULL )
	{
		return;
	}

	crw_sig->signaledEnd();
}

// Argument bundle handed to startHere() through pthread_create()'s void
// pointer.  It is allocated with new in Crawler::checkChildCrawlers.
struct starthere_t 
{
	Crawler *child;   // crawler whose startChildCrawler() the new thread runs
	int child_num;    // child slot number forwarded to startChildCrawler()
};

void *startHere(void *stuff)
{
	starthere_t *start = (starthere_t *)stuff;

	start->child->startChildCrawler( start->child_num );
	delete start;

	return NULL;
}

// Builds a crawler bound to the given client database.  The per-host thread
// limit, the number of children and the bandwidth ceiling are copied from
// the global configuration, one empty URL handler slot is reserved per
// child, and the CRC table and the random number generator are primed.
Crawler::Crawler(ClientDB *ClientDb)
{
	clientdb = ClientDb;

	// limits taken from the config file
	max_threads_per_host     = Config_File_Info.ThreadsPerHost;
	num_of_children_crawlers = Config_File_Info.NumOfCrawlersToRun;
	maximum_bandwidth        = Config_File_Info.MaxAmountOfBandwidth;

	// one handler slot per child, every slot starts out empty
	dbhandle = new URLHandler *[num_of_children_crawlers];
	int slot = 0;
	while (slot < num_of_children_crawlers)
	{
		dbhandle[slot] = NULL;
		++slot;
	}

	// crc table first, then the random number generator
	gen_crc_table();
	srand(time(0));
}

// Releases every URL handler that is still held and then the handler table
// itself.  Does nothing when the table has already been released.
Crawler::~Crawler()
{
	if (dbhandle == NULL)
	{
		return;
	}

	for (int slot = 0; slot < num_of_children_crawlers; ++slot)
	{
		delete dbhandle[slot];   // deleting a NULL pointer is a no-op
		dbhandle[slot] = NULL;
	}

	delete [] dbhandle;
	dbhandle = NULL;
}

int Crawler::start()
{
	// set up local variables
	bool more_urls = true;
	bool multi_hosts = false;
	int  check_return = 0;

	// initilize class variables
	temp_down = 0;

	// initilize bandwidth setting
	Bandwidth_Info.bandwidth_limit = START_BAND_LIMIT;
	Bandwidth_Info.bandwidth_usage = 0;
	Bandwidth_Info.bandwidth_time = 0;
	band_start_time = 0;
	band_end_time = 0;
	throughput_start = 0;
	throughput_end = 0;

	// reset crawl info values
	for (int x = 0; x < num_of_children_crawlers; x++)
	{ 
		resetCrawlInfoValues(x, true);	
	}

	// Start main loop
	while(check_return == 0)
	{
		// you want your machine to be slammed?  didn't think so..
		// those waitpids in checkChildCrawler are brutal if run
		// non stop.  we a mu sleep via a select here.  we could 
		// also use real semaphores, but we don't
		struct timeval tv;
		tv.tv_sec = 0;
		tv.tv_usec = 50000;
		select (0, NULL, NULL, NULL, &tv);

		// checkChildCrawlers checks to see if all the child
		// process are running and, if not, starts them up.
		// it also signals the children to die if we are out
		// of urls.  Child processes will never come out of 
		// this function call.  return values, 0: crawling
		// -1: all children dead, no more urls
		check_return = checkChildCrawlers(num_of_children_crawlers, &more_urls);
		
		// Send out URLs to the children and look for results
		manageChildInfo(num_of_children_crawlers, &more_urls, &multi_hosts);
		
		// Update current number of active crawler count
		Crawler_Status_Info.engaged = countOfEngagedCrawlers(num_of_children_crawlers);
		
		// do all our bandwidth sampling and set up new limits
		adjustBandwidth();
	
		// test to see if we need to exit - this is brutal
		if (Crawler_Status_Info.crawler_quit == true)
		{
			break;
		}
	}

	// delete all of our database handlers
	for (int x=0; dbhandle && x < num_of_children_crawlers; x++)
	{
		if (dbhandle[x] != NULL)
		{
			delete dbhandle[x];
			dbhandle[x] = NULL;
		}
	}

	if (dbhandle != NULL)
	{
		delete [] dbhandle;
		dbhandle = NULL;
	}

	return 0;
}

// Terminates the whole process with exit status 0.  Despite the int return
// type, control never goes back to the caller.
int Crawler::end()
{
	exit(0);

	return 0;   // not reached - only here because the signature returns int
}

/* this function tests the semaphores for different states of the child
 * the only flag that the child can set is the "done" flag, all other
 * flags are controlled by the parent process.  the wait flag will be
 * used later for simultaneous crawling/processing.
 *
 * the states are as follows:
 * 				
 * go	wait	exit	done	dead		STATE
 * ------------------------------------------------------------------------------
 * 0	0	0	0	0		child ready to crawl
 * 1	0	0	0	0		crawling, no results yet
 * 1	0	0	1	0		crawling, results ready
 *  	 	1	 	0		client will exit, if possible
 *  	 	1	 	1		child saw exit request 
 *
*/	
void Crawler::manageChildInfo(int num_crawlers, bool *more_urls, bool *multi_hosts)
{
	// set up local variables
	string input_url = "", output_url = "", redirect_url = "", mime_url = "", contents_url = "";
	char url_guts[MAX_URL_LENGTH], url_host[MAX_URL_LENGTH], url_protocol[MAX_URL_LENGTH];
	long  code_url = 0; 
	
	// routines to check DOWN urls.  this ensures that the client 
	// has connectivity and if not, halts executions of the client
	if(temp_down > CRITICAL_DOWN_AMOUNT)
	{
		Verboseprintf("Excessive down hosts - exiting the crawler due to network errors.\n");
		clog(GCLOG_ERR, "Excessive down hosts - exiting the crawler...\n"); 	
		Crawler_Status_Info.gui_quit = true ;
		Crawler_Status_Info.coordinator_quit = true ;
		Crawler_Status_Info.crawler_quit = true ;
	}

	for (int x=0; x < num_crawlers; x++)  // in this context x is == to child number
	{
		// if he is dead, then let us skip to the next child
		// perhaps we should clean him up here or restart him?
		if (Crawler_Info[x].crawl_dead)
		{
			continue;	
		}
		
		// if true, we have a child that is done with his crawl and is 
		// waiting to cough up his results to the parent for processing
		if (Crawler_Info[x].crawl_done)
		{
			// child needs to be told not to crawl by setting go to false.
			// if we zero out done first, then the child will try to start
			// crawling again (remember, that go at this point is still 1)
			Crawler_Info[x].crawl_go = 0;           // go comes first
			Crawler_Info[x].crawl_done = 0;	        // done comes second

			// define variables for submission to database
			status_t page_status;
			long page_size = 0;
			unsigned long page_crc = 0;

			input_url = Crawler_Info[x].crawl_url;  // save off the last URL that the child is now done with

			// if error_code is 0 then we have found a server
			if (Crawler_Info[x].crawl_error_code == 0)
			{
				// move contents into string - a rewrite of the archive 
				// class is going to be necessary.  We currently can't 
				// take binary data because of this.
				if (Crawler_Info[x].crawl_result_address)
				{
					contents_url = Crawler_Info[x].crawl_result_address;
				}
				else
				{
					contents_url = " ";
				}

				// reset mime, redirect, code
				mime_url = " ";
				redirect_url = " ";
				code_url = 0;

				// parse the headers
				parseHeaderResults(x, mime_url, redirect_url);
				
				// now get our results http code
				code_url = Crawler_Info[x].crawl_http_code;

				// page is not found
				if (code_url == 404)
				{
					contents_url = " ";
					page_crc = 0;
					mime_url = "text/plain";
					redirect_url = " ";
					page_status = NOTFOUND;
					Verboseprintf("Crawled %s, status is NOT-FOUND\n", input_url.c_str());
				}
				else if (redirect_url == " ")  // no redirect found
				{
					// compute CRC for contents
					page_file_data = create_file_data();
					if ( contents_url != " " )
						file_data(page_file_data,
							Crawler_Info[x].crawl_result_address,
							Crawler_Info[x].crawl_result_length );
					else
						file_data(page_file_data, contents_url.c_str(), contents_url.length());

					destroy_file_data(page_file_data, &page_size, &page_crc);

					// check CRC codes to see if they match
					if ( page_crc == dbhandle[x]->CRC )	
					{
						contents_url = " ";
						page_status = UNCHANGED;
						Verboseprintf("Crawled %s, status is UNCHANGED\n", input_url.c_str());
					}
					// must be an update
					else
					{
						// bump it up next to the last known size, if close we may have a dynamic page on our hands
						// contents_url could be a space OR could have data in it
						if ( ( contents_url != " " &&
							sizeThresholdCheck(Crawler_Info[x].crawl_result_length, dbhandle[x]->size) ) ||
							sizeThresholdCheck(contents_url.length(), dbhandle[x]->size) )
						{
							contents_url = " ";
							page_status = NOCRAWL;
							Verboseprintf("Crawled %s, status is NO-CRAWL - possibly DYNAMIC\n", input_url.c_str());
						}
						else
						{
							page_status = UPDATE;
							Verboseprintf("Crawled %s, status is UPDATE\n", input_url.c_str());
						}
					}
				}
				else if (redirect_url > " ")
				{	
					contents_url = " ";
					page_crc = 0;
					page_status = REDIRECT;
					Verboseprintf("Crawled %s, status is REDIRECT\n", input_url.c_str());
				}
			}
			// curl exited on an error and limit exceeded - we generate the error 23 (write erro)
			// by closing off access to the write back function with our file size limit
			else if (Crawler_Info[x].crawl_error_code == 23)
			{
				// site exceeded client limit on size this 
				// is set with FILE_SIZE_LIMIT in Crawler.h
				contents_url = " ";
				page_crc = 0;
				mime_url = "text/plain";
				redirect_url = " ";
				page_status = NOCRAWL;
				Verboseprintf("Crawled %s, status is NO-CRAWL\n", input_url.c_str());
			}
			else
			{
				// else we assume the site is down hard or something else nasty happened
				contents_url = " ";
				page_crc = 0;
				mime_url = "text/plain";
				redirect_url = " ";
				page_status = DOWN;
				Verboseprintf("Crawled %s, status is DOWN\n", input_url.c_str());
			}

			// update the status counters
			switch (page_status)
			{
				case REDIRECT:
					Crawler_Status_Info.redirect_urls++;
					Crawler_Status_Info.total_urls++;
					temp_down = 0;
					break;
				case DOWN:
					Crawler_Status_Info.down_urls++;
					Crawler_Status_Info.total_urls++;
					// we only up the temporary down host counter if we are not
					// observing multiple connections to a single host entry
					// this keeps us from exiting because a single host was down
					// but we were crawling it multiple times
					if (!*multi_hosts)
					{
						temp_down++;
					}
					break;
				case UNCHANGED:
					Crawler_Status_Info.unchanged_urls++;
					Crawler_Status_Info.total_urls++;
					temp_down = 0;
					break;
				case NOCRAWL:
					Crawler_Status_Info.no_crawl_urls++;
					Crawler_Status_Info.total_urls++;
					temp_down = 0;
					break;	
				case UPDATE:
					Crawler_Status_Info.updated_urls++;
					Crawler_Status_Info.total_urls++;
					temp_down = 0;
					break;
				case NOTFOUND:
					Crawler_Status_Info.not_found_urls++;
					Crawler_Status_Info.total_urls++;
					temp_down = 0;
					break;
			}

			

			// update the total number of bytes returned
			Crawler_Status_Info.total_bytes += Crawler_Info[x].crawl_result_length;

			// insert resulting data into the database 
			if ( contents_url != " " )
				clientdb->CrawlInsert(Crawler_Info[x].crawl_url, Crawler_Info[x].crawl_result_address,
						Crawler_Info[x].crawl_result_length,
						page_crc, page_status, mime_url.c_str(), redirect_url.c_str());	
			else
				clientdb->CrawlInsert(Crawler_Info[x].crawl_url, contents_url.c_str(),
						Crawler_Info[x].crawl_result_length, 
						page_crc, page_status, mime_url.c_str(), redirect_url.c_str());	

			// do a little clean up
			if ( dbhandle[x] != NULL ) { 
		
				delete dbhandle[x]; 
				dbhandle[x] = NULL; 
			}
		}

		// child is still crawling if true
		if (Crawler_Info[x].crawl_go || Crawler_Info[x].crawl_done || *multi_hosts)
		{
			// check to see if we are crawling more than a certain number of the last host fetched
			if (checkForMultipleHosts(url_host, num_crawlers, max_threads_per_host))
			{
				*multi_hosts = true;
			}
			else
			{
				*multi_hosts = false;
			}

		}
		else
		{
			// child is ready for a new url
		
			// reset crawl related items, now we have them in local variables
			resetCrawlInfoValues(x, false);	
			
			// do a little clean up
			if ( dbhandle[x] != NULL ) { 
		
				delete dbhandle[x]; 
				dbhandle[x] = NULL; 
			}
			// create database handle for child
			dbhandle[x] = clientdb->GetRetrieve(); 

			// check it to see if it is null
			if (dbhandle[x] == NULL)
			{
				*more_urls = false;
			}

			if (*more_urls)
			{	
				Crawler_Status_Info.current_url++;

				output_url = dbhandle[x]->URL; // get the next url 

				strip_url(output_url.c_str(), url_guts, url_host, url_protocol); // strip out the host

				// check to see if we are crawling this host currently
				if (checkForMultipleHosts(url_host, num_crawlers, max_threads_per_host))
				{
					*multi_hosts = true;
				}
				strncpy(Crawler_Info[x].crawl_url, output_url.c_str(), sizeof(Crawler_Info[x].crawl_url)); // give the url 
				strncpy(Crawler_Info[x].crawl_host, url_host, sizeof(url_host)); // give the host name 
				Crawler_Info[x].crawl_url[MAX_URL_LENGTH-1] = '\0'; // stomp out buffer overflows
				Crawler_Info[x].crawl_go = 1; // send him on his way 
			}
		}
	}

}

/* Crawler::checkChildCrawlers(int num_crawlers);
 * function to check each child and, if not running, starts a new
 * child.  if a child was running, and exited for any reason, this
 * function will start another child in its place.  the purpose of
 * this function is to keep all the crawlers running all the time
 * return values, 0: crawling, -1: all children dead, no more urls
*/
int Crawler::checkChildCrawlers(int num_crawlers, bool *more_urls)
{
	int num_stopped_children = 0;

	for (int child_num=0; child_num < num_crawlers; child_num++)
	{
		// if we have more urls available then we need to check
		// the status of our children.  if any have exited, then
		// we restart them.
		if (*more_urls)
		{
			if (Crawler_Info[child_num].crawl_pid == 0) // pid not assigned so start
			{
				// reset child's semaphores
				resetCrawlInfoValues(child_num, true);	

				// set up the pthread paramaters
				pthread_attr_t attr;
				pthread_attr_init(&attr);
				pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
		
				int p_error;
		
				starthere_t *start = new starthere_t;
				start->child = this; 
				start->child_num = child_num;

				// call the child crawler code and thread off
				p_error = pthread_create(&Crawler_Info[child_num].crawl_pid, &attr, startHere, start);
			}
		}
		else // no more urls available so we try to stop this child
		{
			// check to see if the child is doing anything
			if (Crawler_Info[child_num].crawl_go == 0 && Crawler_Info[child_num].crawl_done == 0)
			{
				// indicate to the child that he should exit
				Crawler_Info[child_num].crawl_exit = 1;
			}
		}
	
		// check to see if he is dead yet, it's ok to loop back later
		if (Crawler_Info[child_num].crawl_dead == 1)
		{
			num_stopped_children++;
		}
	}	

	// if all chidren are stopped, then we need to stop the parent now
	if (num_stopped_children == num_crawlers)
	{
		return -1;
	}
	else
	{
		return 0;
	}
}

void Crawler::startChildCrawler(int child_num) 
{
	// set up child variables
	int curlerror = 0;
	int speederror = 0;
	int codeerror = 0;
	long httpcode = 0;
	double download_speed = 0;

	// Memory area for cURL retrieval
	struct MemoryStruct chunk;
	struct MemoryStruct thunk;

	// We use the pointer in MemoryStruct to pass the address of our
	// shared memory for bandwidth.  We have to do this because the
	// address of chunk (and thunk) is passed to the WriteMemoryCallback
	// functions that we use to write into memory from curl and in which
	// we must calculate (and wait for) our bandwidth usage.

	// reset our cURL result memory
	chunk.memory=NULL;
	chunk.size = 0;

	// set the pointers for passing to the WriteMemory callback function
	chunk.bandwidth_pointer = &Bandwidth_Info;

	thunk.memory=NULL;
	thunk.size = 0;
	thunk.bandwidth_pointer = NULL;  // Don't need this value so NULL it

	// initilize our cURL session
	CURL *curl_handle;
	curl_handle = curl_easy_init();
	initCurlOpts(curl_handle, &chunk, &thunk);

	while (!Crawler_Info[child_num].crawl_exit) // run until told to die
	{
		// sleep a little
		struct timeval tv;
		tv.tv_sec = 0;
		tv.tv_usec = 1000000;
		select (0, NULL, NULL, NULL, &tv);

		// check to see if we are cleared to crawl and last results have been checked
		if (Crawler_Info[child_num].crawl_go && !Crawler_Info[child_num].crawl_done)
		{
			// delete memory, reset size
			if (chunk.memory) 
			{
				delete [] chunk.memory;
				chunk.memory = NULL;
				chunk.size = 0;
			}
			if (thunk.memory) 
			{
				delete [] thunk.memory;
				thunk.memory = NULL;
				thunk.size = 0;
			}

			// retrieve the url content
			curl_easy_setopt(curl_handle, CURLOPT_URL, Crawler_Info[child_num].crawl_url);
			curlerror = curl_easy_perform(curl_handle);

			// set crawl error code
			Crawler_Info[child_num].crawl_error_code = curlerror;
				
			if (curlerror == 0)
			{
				// set the http return code
				codeerror = curl_easy_getinfo(curl_handle, CURLINFO_HTTP_CODE, &httpcode);
				if (codeerror == 0) Crawler_Info[child_num].crawl_http_code = httpcode;

				// get our speeds
				speederror = curl_easy_getinfo(curl_handle, CURLINFO_SPEED_DOWNLOAD, &download_speed);
				if (speederror == 0) Crawler_Info[child_num].crawl_download_speed = (int)download_speed;

				// add our null terminator to prevent buffer overflow
				if (chunk.memory)
				{
					chunk.memory[chunk.size]='\0';
				}

				if (thunk.memory)
				{
					thunk.memory[thunk.size]='\0';
				}

				// pass address info back to parent
				Crawler_Info[child_num].crawl_result_address = chunk.memory;
				Crawler_Info[child_num].crawl_result_length = chunk.size;
				Crawler_Info[child_num].crawl_header_address = thunk.memory;
				Crawler_Info[child_num].crawl_header_length = thunk.size;
			}
			else
			{
				clog(GCLOG_ERR, "Error with CURL session.  cURL error number: %d", curlerror); 	
			}

			// tell parent that child is done with this run
			Crawler_Info[child_num].crawl_done = 1; 
		}
	}

	// delete memory as we exit
	if (chunk.memory) 
	{
		delete [] chunk.memory;
		chunk.memory = NULL;
		chunk.size = 0;
	}
	if (thunk.memory) 
	{
		delete [] thunk.memory;
		thunk.memory = NULL;
		thunk.size = 0;
	}

	// clean up cURL stuff
	curl_easy_cleanup(curl_handle);

	// let parent know we know we are dead
	Crawler_Info[child_num].crawl_dead = 1;

	// main child code ends
	pthread_exit(NULL);
}
// end child code

// Applies the fixed option set to a cURL easy handle.
//
// curl_handle  session to configure
// chunk        receives the body through WriteMemoryCallback
// thunk        receives the headers through WriteHeaderCallback
//
// curl_easy_setopt() is variadic, so every numeric option is cast to long
// explicitly: passing a plain int where libcurl reads a long is undefined
// on platforms where the two differ in size.
void Crawler::initCurlOpts(CURL *curl_handle, struct MemoryStruct *chunk, struct MemoryStruct *thunk)
{
	curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, WriteMemoryCallback); // send all content to this function 
	curl_easy_setopt(curl_handle, CURLOPT_HEADERFUNCTION, WriteHeaderCallback); // send all headers to this function 
	curl_easy_setopt(curl_handle, CURLOPT_FILE, (void *)chunk); // we pass our 'chunk' struct to the MemoryCallback function 
	curl_easy_setopt(curl_handle, CURLOPT_WRITEHEADER, (void *)thunk); // we pass our 'thunk' struct to the HeaderCallback function 
	curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, BROWSER_VERSION VERSION LOG_PROPEGANDA ); // hello server 
	curl_easy_setopt(curl_handle, CURLOPT_CONNECTTIMEOUT, (long)(CRAWL_TIMEOUT)); // set our connect timeout value 
	curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT, (long)(CRAWL_TIMEOUT*10)); // set our total timeout value 
	curl_easy_setopt(curl_handle, CURLOPT_LOW_SPEED_LIMIT, (long)(CRAWL_LOW_SPEED_LIMIT)); // slowest transfer speed we put up with 
	curl_easy_setopt(curl_handle, CURLOPT_LOW_SPEED_TIME, (long)(CRAWL_LOW_SPEED_TIME)); // and for how long we put up with it 
	curl_easy_setopt(curl_handle, CURLOPT_PASSWDFUNCTION, NULL); // don't ask us for passwords

#if defined(LIBCURL_VERSION_NUM) && (LIBCURL_VERSION_NUM >= 0x070a00)
	// every crawler runs in its own thread; libcurl's signal based
	// timeout handling is not safe there, so switch it off where the
	// library offers the option (7.10 and later)
	curl_easy_setopt(curl_handle, CURLOPT_NOSIGNAL, 1L);
#endif

	// if proxy information was specified in our config file, we use it here
	// (only when both a proxy host and a non-zero port are configured)
	if (Config_File_Info.Proxy != NULL && Config_File_Info.ProxyPort != 0)
	{
		curl_easy_setopt(curl_handle, CURLOPT_PROXYPORT, (long)Config_File_Info.ProxyPort);
		curl_easy_setopt(curl_handle, CURLOPT_PROXY, Config_File_Info.Proxy);
	}
}

// content memory callback
//
// cURL calls this with size * nmemb bytes of body data at ptr; data is the
// MemoryStruct registered for the transfer.  The bytes are appended to
// mem->memory (grown with realloc, always kept NUL terminated) after the
// shared bandwidth budget has been charged - the call sleeps while the
// budget is used up.
//
// Returns the number of bytes taken.  Returns 0, which makes cURL abort
// the transfer with a write error (23), when the body would exceed
// FILE_SIZE_LIMIT or when the buffer cannot be grown.
size_t WriteMemoryCallback(void *ptr, size_t size, size_t nmemb, void *data)
{
	int realsize = size * nmemb;
	struct MemoryStruct *mem = (struct MemoryStruct *)data;

	// check for a really large file
	// if it is bigger than file size limit then we return
	// a zero which signals the cURL library that we have
	// encountered an error, and wish to exit.  This will
	// cause cURL to generate an error and return 23
	if ((realsize + mem->size) > FILE_SIZE_LIMIT) return 0;

	// begin our bandwidth limiting routine
	while (mem->bandwidth_pointer)
	{	
		// critical section - we don't want any other thread
		// accessing the bandwidth info struct whilst we are
		pthread_mutex_lock(&band_mutex);
		
		int usage = mem->bandwidth_pointer->bandwidth_usage;
		int limit = mem->bandwidth_pointer->bandwidth_limit;

		if (usage > limit)
		{
			// fetch our sleep time, at least one second
			int sleep_time = mem->bandwidth_pointer->bandwidth_time - time(NULL);
			if (sleep_time < 1) sleep_time = 1;

			// release the mutex before we start waiting
			pthread_mutex_unlock(&band_mutex);

			// hang out until we have bandwidth
			sleep(sleep_time);
		}
		else
		{
			// increment the current bandwidth usage
			mem->bandwidth_pointer->bandwidth_usage += realsize;
			
			// release our mutex
			pthread_mutex_unlock(&band_mutex);

			// break out of loop and proceed
			break;
		}
	}

	// grow through a temporary: on failure realloc leaves the old block
	// untouched, and overwriting mem->memory with NULL would both leak it
	// and silently drop everything received so far
	char *grown = (char *)realloc(mem->memory, mem->size + realsize + 1);
	if (grown == NULL)
	{
		return 0;
	}

	mem->memory = grown;
	memcpy(&(mem->memory[mem->size]), ptr, realsize);
	mem->size += realsize;
	mem->memory[mem->size] = 0;

	return realsize;
}

// header memory callback
//
// cURL calls this with size * nmemb bytes of header data at ptr; data is
// the MemoryStruct registered for the headers.  The bytes are appended to
// mem->memory (grown with realloc, always kept NUL terminated).
//
// Returns the number of bytes taken, or 0 - which makes cURL abort the
// transfer - when the buffer cannot be grown.
size_t WriteHeaderCallback(void *ptr, size_t size, size_t nmemb, void *data)
{
	int realsize = size * nmemb;
	struct MemoryStruct *mem = (struct MemoryStruct *)data;

	// we don't limit bandwidth on fetching headers - sorry

	// grow through a temporary: on failure realloc leaves the old block
	// untouched, and overwriting mem->memory with NULL would both leak it
	// and silently drop the headers received so far
	char *grown = (char *)realloc(mem->memory, mem->size + realsize + 1);
	if (grown == NULL)
	{
		return 0;
	}

	mem->memory = grown;
	memcpy(&(mem->memory[mem->size]), ptr, realsize);
	mem->size += realsize;
	mem->memory[mem->size] = 0;

	return realsize;
}

// reset all settings for a particular child_num
//
// child_num  index of the Crawler_Info slot to clear
// flag       true:  also clear the thread id and the go / done / wait /
//                   exit / dead control flags (a cleared crawl_pid is what
//                   makes checkChildCrawlers start a thread for the slot)
//            false: leave those alone and clear only the per-crawl results
//
// crawl_host, crawl_http_code and crawl_download_speed are not touched here.
void Crawler::resetCrawlInfoValues(int child_num, bool flag)
{
	// reset semaphores for a child
	if (flag)
	{
		Crawler_Info[child_num].crawl_pid = 0;
		Crawler_Info[child_num].crawl_done = 0;
		Crawler_Info[child_num].crawl_go = 0;
		Crawler_Info[child_num].crawl_wait = 0;
		Crawler_Info[child_num].crawl_exit = 0;
		Crawler_Info[child_num].crawl_dead = 0; 
	}

	// reset crawl related items
	// the url becomes an empty string and its last byte is kept terminated
	Crawler_Info[child_num].crawl_url[0] = '\0';
	Crawler_Info[child_num].crawl_url[MAX_URL_LENGTH-1] = '\0';
	Crawler_Info[child_num].crawl_result_length = 0;
	Crawler_Info[child_num].crawl_header_length = 0;
	Crawler_Info[child_num].crawl_error_code = 0;
	// only the pointers are dropped here; nothing is freed by this function
	Crawler_Info[child_num].crawl_result_address = NULL;
	Crawler_Info[child_num].crawl_header_address = NULL;
}		

// Returns how many of the first num_crawlers child slots currently have
// their crawl_go flag raised, i.e. have been sent off on a URL.
int Crawler::countOfEngagedCrawlers(int num_crawlers)
{
	int busy = 0;

	for (int slot = 0; slot < num_crawlers; ++slot)
	{
		busy += Crawler_Info[slot].crawl_go ? 1 : 0;
	}

	return busy;
}

// Host protection: tells whether host is already being crawled by at least
// max_num_hosts children (the limit is forced into the range 1..30).  This
// keeps us from slamming a single hostname when the scheduler hands out
// several of its URLs in a row.
//
// Only children with crawl_go raised are counted.  The outcome is also
// published to the gui through Crawler_Status_Info.host_protect.
bool Crawler::checkForMultipleHosts(char *host, int num_crawlers, int max_num_hosts)
{
	// sanity clamp, so nobody configures their way into getting blocked
	int limit = max_num_hosts;
	if (limit < 1)
	{
		limit = 1;
	}
	else if (limit > 30)
	{
		limit = 30;
	}

	int hits = 0;

	for (int slot = 0; slot < num_crawlers; ++slot)
	{
		// idle children do not count against the host
		if (!Crawler_Info[slot].crawl_go)
		{
			continue;
		}

		if (strcmp(host, Crawler_Info[slot].crawl_host) != 0)
		{
			continue;
		}

		// same host and actively crawling - stop at the limit
		if (++hits >= limit)
		{
			Crawler_Status_Info.host_protect = true;
			return true;
		}
	}

	Crawler_Status_Info.host_protect = false;
	return false;
}

// parsing function: finds key inside content_start and measures the value
// that follows it.
//
// key            text to look for, e.g. "Location:"
// delimiters     set of characters, any of which ends the value
// content_start  NUL terminated text to search
// key_start      out: start of the value, which is the 2nd character AFTER
//                the key (the character right behind the key is skipped)
//
// returns the index, counted from *key_start, of the closest delimiter, or
// -1 when an argument is NULL, the key is missing, the text ends right
// behind the key, or no delimiter follows.  *key_start is only written
// once the key has been found with at least one character behind it.
int parserFunk(char *key, char *delimiters, char *content_start, char **key_start)
{
	if (!key || !delimiters || !content_start || !key_start) return -1;

	// find the keying factor
	char *value = strstr(content_start, key);

	// if we don't find the key, then return
	if (!value) return -1;

	// move to the end of the key
	value += strlen(key);

	// the key is the very last thing in the text: stepping over the
	// "separator" would walk past the terminator
	if (*value == '\0') return -1;

	// skip the one character separating key and value
	value += 1;

	// save it to key start
	*key_start = value;

	// length of the run that contains none of the delimiters; when that
	// run goes all the way to the terminator no delimiter was found
	size_t span = strcspn(value, delimiters);
	if (value[span] == '\0')
	{
		return -1;
	}

	return (int)span;
}

// Dynamic page detection: compares the size just crawled with the size seen
// last time.  Returns 1 when current_size lies strictly inside
// last_size +/- (SIZE_THRESHOLD + 1), and 0 otherwise.  Pages smaller than
// SMALL_PAGE_SIZE always yield 0.
int sizeThresholdCheck(int current_size, int last_size)
{
	const int margin = SIZE_THRESHOLD + 1;

	// keep the lower edge of the window from going negative
	const int anchor = (last_size < margin) ? margin : last_size;

	const int lower_edge = anchor - margin;
	const int upper_edge = anchor + margin;

	// small pages are passed through without a comparison
	if (current_size < SMALL_PAGE_SIZE)
	{
		return 0;
	}

	const bool close_match = (current_size > lower_edge) && (current_size < upper_edge);

	return close_match ? 1 : 0;
}

// parsing functions for the headers
void Crawler::parseHeaderResults(int child_num, string &mime_url, string &redirect_url)
{
	char *reusable_header_pointer = NULL;
	char *reusable_key_pointer = NULL;
	char *pointer_one = NULL;
	char *pointer_two = NULL;
	int header_index = 0;
	
	// first off, look and see if we have something to parse
	// we consider 10 characters enough to parse.  any less,
	// we just blow it off.
	if (Crawler_Info[child_num].crawl_header_length > 10)
	{
		// redirect header parsing	
		reusable_header_pointer = Crawler_Info[child_num].crawl_header_address; 	
		header_index = parserFunk("Location:", " \r\n", reusable_header_pointer, &reusable_key_pointer); 

		// fix relly long redirect to MAX_REDIRECT_LEN characters
		// so the protocol doesn't puke on it and crash.
		if (header_index > MAX_REDIRECT_LEN) header_index = MAX_REDIRECT_LEN;

		if (header_index > 0)
		{
			reusable_key_pointer[header_index] = '\0';
			redirect_url = reusable_key_pointer;
			reusable_key_pointer[header_index] = ' ';
		}
		else
		{
			redirect_url = " ";
		}

		// mime type header parsing	
		reusable_header_pointer = Crawler_Info[child_num].crawl_header_address; 	

		header_index = parserFunk("Content-Type:", " ;\r\n", reusable_header_pointer, &reusable_key_pointer); 
		if (header_index > 0)
		{
			reusable_key_pointer[header_index] = '\0';
			mime_url = reusable_key_pointer;
			reusable_key_pointer[header_index] = ' ';
		}
		else
		{
			mime_url = "text/html";
		}
	}
	else // we have no header content to parse, so we make some up
	{
		redirect_url = " ";
		mime_url = "text/html";
	}
}

// Ends the whole process with exit status 0.  Called by crw_sig_handle()
// when a SIGTERM or SIGINT arrives.
// NOTE(review): exit() runs atexit handlers and flushes stdio, which is not
// async-signal-safe - confirm this is acceptable in the signal path.
void Crawler::signaledEnd()
{
	exit(0);
}

// compute and save off our current bandwidth.  all calculations
// are done in bytes/sec, keeping in mind that the config file
// uses bits/sec 
//
// Called once per pass of the main loop.  Every call refreshes the gui's
// "bandwidth limit reached" flag.  Once more than BANDWIDTH_SAMPLE_TIME
// seconds have gone by since the last sample it also publishes the
// measured usage (in kilobits/sec), tells waiting crawlers when to look
// again, and raises the byte budget for the next sample period.
//
// Bandwidth_Info is shared with the crawler threads, so every access to it
// happens with band_mutex held - including the first read of the usage
// counter, which used to be taken before the lock.
void Crawler::adjustBandwidth(void)
{
	// first we set the end time equal to the current time
	band_end_time = time(NULL);

	// grab the mutex before touching the shared structure
	pthread_mutex_lock(&band_mutex);

	// now we get our current throughput
	throughput_end = Bandwidth_Info.bandwidth_usage;

	// if the first time through, we set the start time, and bandwidth
	if (band_start_time == 0)
	{
		band_start_time = band_end_time;
		throughput_start = throughput_end;
	}

	int usage = Bandwidth_Info.bandwidth_usage;
	int limit = Bandwidth_Info.bandwidth_limit;

	// let the gui know whether the crawlers are being held back
	if (usage > limit)
	{
		Crawler_Status_Info.bandwidth_limit = true;
	}
	else
	{
		Crawler_Status_Info.bandwidth_limit = false;
	}
		
	// now we compute our difference in times from the last
	// time that we were here, assuming we've been here before
	if ((band_end_time - band_start_time) > BANDWIDTH_SAMPLE_TIME) 
	{
		// set the wait time for the clients
		Bandwidth_Info.bandwidth_time = band_end_time + BANDWIDTH_SAMPLE_TIME;

		// compute the current bandwidth
		int time_passed = band_end_time - band_start_time;
		int throughput_passed = throughput_end - throughput_start;

		// Update current bandwidth usage - converting from bytes/sec to kilobits/sec
		Crawler_Status_Info.usage = (throughput_passed / time_passed)/125;

		// set the bandwidth limit - not to be confused with the usage above
		// the budget is cumulative: bytes used so far plus one sample
		// period's worth at the configured maximum
		int max_in_bytes = maximum_bandwidth / 8;
		Bandwidth_Info.bandwidth_limit = throughput_end + (max_in_bytes * BANDWIDTH_SAMPLE_TIME);
		
		// set start time equal to the current time
		// these will be used next time around
		band_start_time = time(NULL);

		// set the start throughput 
		throughput_start = Bandwidth_Info.bandwidth_usage;
	}

	// release our mutex
	pthread_mutex_unlock(&band_mutex);
}
