Files
tubestation/webshell/tests/viewer/nsWebCrawler.cpp
dougt@netscape.com a70fce6f87 Revising nsIChannel to allow for overlapped i/o. This consists of three parts:
1. Factoring nsIChannel into a protocol-specific part, the nsIChannel, and a socket-specific part, the nsITransport.
2. Deriving nsIChannel from nsIRequest.
3. Changing the notification system in necko and the URILoader to pass the nsIRequest interface instead of the nsIChannel interface.

This change stems from wanting to have simultaneously active AsyncRead and AsyncWrite operations on nsSocketTransport.
This is desirable because it would greatly simplify the task of maintaining persistent/reusable socket connections
for FTP, HTTP, and IMAP (and potentially other protocols). The problem with the existing nsIChannel interface is
that it does not allow one to selectively suspend just one of the read or write operations while keeping the other active.
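
As an illustrative sketch of the factoring (a sketch only; the method signatures shown are assumptions for illustration, not the checked-in IDL):

// Sketch only -- signatures are assumptions, not the actual interfaces.
class nsIRequest : public nsISupports {
public:
  NS_IMETHOD Cancel(nsresult aStatus) = 0;
  NS_IMETHOD Suspend() = 0;
  NS_IMETHOD Resume() = 0;
};

// Protocol-specific half: every channel is now itself a request.
class nsIChannel : public nsIRequest {
public:
  NS_IMETHOD GetURI(nsIURI** aURI) = 0;
};

// Socket-specific half: the read and write sides are independent requests,
// so one can be suspended while the other stays active.
class nsITransport : public nsISupports {
public:
  NS_IMETHOD AsyncRead(nsIStreamListener* aListener, nsISupports* aContext,
                       nsIRequest** aRequest) = 0;
  NS_IMETHOD AsyncWrite(nsIStreamProvider* aProvider, nsISupports* aContext,
                        nsIRequest** aRequest) = 0;
};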

The full details of the change are written up in the netlib newsgroup.

r=darin@netscape.com
sr=rpotts@netscape.com
2001-02-10 00:16:26 +00:00

1164 lines
29 KiB
C++

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 *
 * The contents of this file are subject to the Netscape Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/NPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code is Mozilla Communicator client code.
 *
 * The Initial Developer of the Original Code is Netscape Communications
 * Corporation. Portions created by Netscape are
 * Copyright (C) 1998 Netscape Communications Corporation. All
 * Rights Reserved.
 *
 * Contributor(s):
 *   Pierre Phaneuf <pp@ludusdesign.com>
 *
 * This Original Code has been modified by IBM Corporation. Modifications
 * made by IBM described herein are Copyright (c) International Business
 * Machines Corporation, 2000.
 * Modifications to Mozilla code or documentation identified per MPL
 * Section 3.3
 *
 * Date         Modified by   Description of modification
 * 04/20/2000   IBM Corp.     OS/2 VisualAge build.
 */
#include "nsCOMPtr.h"
#include "nsWebCrawler.h"
#include "nsViewerApp.h"
#include "nsIWebShell.h"
#include "nsIContentViewer.h"
#include "nsIDocumentViewer.h"
#include "nsIDocument.h"
#include "nsIContent.h"
#include "nsIPresShell.h"
#include "nsIPresContext.h"
#include "nsIViewManager.h"
#include "nsIFrame.h"
#include "nsIFrameDebug.h"
#include "nsIURL.h"
#include "nsNetUtil.h"
#include "nsITimer.h"
#include "nsIAtom.h"
#include "nsIFrameUtil.h"
#include "nsIComponentManager.h"
#include "nsLayoutCID.h"
#include "nsRect.h"
#include "plhash.h"
#include "nsINameSpaceManager.h"
#include "nsXPIDLString.h"
#include "nsIServiceManager.h"
#include "nsIEventQueueService.h"
#include "nsIEventQueue.h"
#include "prprf.h"
#include "nsIContentViewer.h"
#include "nsIContentViewerFile.h"
#include "nsIDocShell.h"
#include "nsIWebNavigation.h"
static NS_DEFINE_IID(kIDocumentLoaderObserverIID, NS_IDOCUMENTLOADEROBSERVER_IID);
static NS_DEFINE_IID(kFrameUtilCID, NS_FRAME_UTIL_CID);
static NS_DEFINE_IID(kIFrameUtilIID, NS_IFRAME_UTIL_IID);
static NS_DEFINE_IID(kIXMLContentIID, NS_IXMLCONTENT_IID);
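
// Atoms are interned (one nsIAtom instance per distinct string), so identity
// hashing and pointer comparison are sufficient for keying a hash table.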
static PLHashNumber
HashKey(nsIAtom* key)
{
  return (PLHashNumber) key;
}

static PRIntn
CompareKeys(nsIAtom* key1, nsIAtom* key2)
{
  return key1 == key2;
}
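
// Minimal wrapper around PLHashTable, keyed on atom identity. The crawler
// uses one of these (mVisited) to remember which URLs it has already seen.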
class AtomHashTable {
public:
  AtomHashTable();
  ~AtomHashTable();

  const void* Get(nsIAtom* aKey);
  const void* Put(nsIAtom* aKey, const void* aValue);
  const void* Remove(nsIAtom* aKey);

protected:
  PLHashTable* mTable;
};

AtomHashTable::AtomHashTable()
{
  mTable = PL_NewHashTable(8, (PLHashFunction) HashKey,
                           (PLHashComparator) CompareKeys,
                           (PLHashComparator) nsnull,
                           nsnull, nsnull);
}

static PRIntn PR_CALLBACK
DestroyEntry(PLHashEntry *he, PRIntn i, void *arg)
{
  ((nsIAtom*)he->key)->Release();
  return HT_ENUMERATE_NEXT;
}

AtomHashTable::~AtomHashTable()
{
  PL_HashTableEnumerateEntries(mTable, DestroyEntry, 0);
  PL_HashTableDestroy(mTable);
}
/**
 * Get the data associated with an Atom.
 */
const void*
AtomHashTable::Get(nsIAtom* aKey)
{
  PRInt32 hashCode = (PRInt32) aKey;
  PLHashEntry** hep = PL_HashTableRawLookup(mTable, hashCode, aKey);
  PLHashEntry* he = *hep;
  if (nsnull != he) {
    return he->value;
  }
  return nsnull;
}

/**
 * Create an association between an Atom and some data. This call
 * returns an old association if there was one (or nsnull if there
 * wasn't).
 */
const void*
AtomHashTable::Put(nsIAtom* aKey, const void* aData)
{
  PRInt32 hashCode = (PRInt32) aKey;
  PLHashEntry** hep = PL_HashTableRawLookup(mTable, hashCode, aKey);
  PLHashEntry* he = *hep;
  if (nsnull != he) {
    const void* oldValue = he->value;
    he->value = NS_CONST_CAST(void*, aData);
    return oldValue;
  }
  NS_ADDREF(aKey);
  PL_HashTableRawAdd(mTable, hep, hashCode, aKey, NS_CONST_CAST(void*, aData));
  return nsnull;
}

/**
 * Remove an association between an Atom and its data. This returns
 * the old associated data.
 */
const void*
AtomHashTable::Remove(nsIAtom* aKey)
{
  PRInt32 hashCode = (PRInt32) aKey;
  PLHashEntry** hep = PL_HashTableRawLookup(mTable, hashCode, aKey);
  PLHashEntry* he = *hep;
  void* oldValue = nsnull;
  if (nsnull != he) {
    oldValue = he->value;
    PL_HashTableRawRemove(mTable, hep, he);
  }
  return oldValue;
}
//----------------------------------------------------------------------
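
// URLs handed to the crawler may carry a "delay:=N" directive; when present,
// the number after the '=' is parsed out and used (in seconds) as the
// inter-page delay for subsequent loads.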
PRInt32
GetDelay(nsString& aURL)
{
  PRInt32 delay = -1;
  if (aURL.Find("delay:=") >= 0) {
    char buf[128];
    PRInt32 offset = aURL.Find("=") + 1;
    aURL.ToCString(&buf[0], 128, offset);
    sscanf(&buf[0], "%d", &delay);
  }
  return delay;
}
nsWebCrawler::nsWebCrawler(nsViewerApp* aViewer)
  : mHaveURLList(PR_FALSE),
    mQueuedLoadURLs(0)
{
  NS_INIT_REFCNT();
  mBrowser = nsnull;
  mViewer = aViewer;
  mCrawl = PR_FALSE;
  mJiggleLayout = PR_FALSE;
  mPostExit = PR_FALSE;
  mDelay = 0;
  mLastDelay = 0;
  mMaxPages = -1;
  mRecord = nsnull;
  mLinkTag = getter_AddRefs(NS_NewAtom("a"));
  mFrameTag = getter_AddRefs(NS_NewAtom("frame"));
  mIFrameTag = getter_AddRefs(NS_NewAtom("iframe"));
  mHrefAttr = getter_AddRefs(NS_NewAtom("href"));
  mSrcAttr = getter_AddRefs(NS_NewAtom("src"));
  mBaseHrefAttr = getter_AddRefs(NS_NewAtom("_base_href"));
  mVisited = new AtomHashTable();
  mVerbose = PR_FALSE;
  LL_I2L(mStartLoad, 0);
  mRegressing = PR_FALSE;
  mPrinterTestType = 0;
  mIncludeStyleInfo = PR_TRUE;
  mLastWebShell = nsnull;
  mLastURL = nsnull;
}

static void FreeStrings(nsVoidArray& aArray)
{
  PRInt32 i, n = aArray.Count();
  for (i = 0; i < n; i++) {
    nsString* s = (nsString*) aArray.ElementAt(i);
    delete s;
  }
  aArray.Clear();
}
nsWebCrawler::~nsWebCrawler()
{
  FreeStrings(mSafeDomains);
  FreeStrings(mAvoidDomains);
  NS_IF_RELEASE(mLastWebShell);
  NS_IF_RELEASE(mLastURL);
  NS_IF_RELEASE(mBrowser);
  delete mVisited;
}

NS_IMPL_ISUPPORTS1(nsWebCrawler, nsIDocumentLoaderObserver)
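
// Dump the regression data for the presentation in aWebShell. Depending on
// mPrinterTestType this either exercises one of the printing paths (print
// to a file, or to the printer with/without a dialog) or writes the frame
// tree's regression data (a ".rgd" file) into mOutputDir; when mRegressing
// is set the fresh dump is immediately compared against the baseline.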
void
nsWebCrawler::DumpRegressionData(nsIWebShell* aWebShell,
                                 nsIURI* aURL)
{
#ifdef NS_DEBUG
  if (mOutputDir.Length() > 0) {
    nsIPresShell* shell = GetPresShell(aWebShell);
    if (!shell) return;
    if (mPrinterTestType > 0) {
      nsCOMPtr<nsIContentViewer> viewer;
      nsCOMPtr<nsIDocShell> docShell(do_QueryInterface(aWebShell));
      docShell->GetContentViewer(getter_AddRefs(viewer));
      if (viewer) {
        nsCOMPtr<nsIContentViewerFile> viewerFile = do_QueryInterface(viewer);
        if (viewerFile) {
          nsAutoString regressionFileName;
          FILE *fp = GetOutputFile(aURL, regressionFileName);
          switch (mPrinterTestType) {
          case 1:
            // dump print data to a file for regression testing
            viewerFile->Print(PR_TRUE, fp);
            break;
          case 2:
            // visual printing tests, all go to the printer, no printer dialog
            viewerFile->Print(PR_TRUE, 0);
            break;
          case 3:
            // visual printing tests, all go to the printer, with a printer dialog
            viewerFile->Print(PR_FALSE, 0);
            break;
          default:
            break;
          }
          fclose(fp);
        }
      }
    }
    else {
      nsIFrame* root;
      shell->GetRootFrame(&root);
      if (nsnull != root) {
        nsCOMPtr<nsIPresContext> presContext;
        shell->GetPresContext(getter_AddRefs(presContext));
        if (mOutputDir.Length() > 0) {
          nsAutoString regressionFileName;
          FILE *fp = GetOutputFile(aURL, regressionFileName);
          if (fp) {
            nsIFrameDebug* fdbg;
            if (NS_SUCCEEDED(root->QueryInterface(NS_GET_IID(nsIFrameDebug), (void**) &fdbg))) {
              fdbg->DumpRegressionData(presContext, fp, 0, mIncludeStyleInfo);
            }
            fclose(fp);
            if (mRegressing) {
              PerformRegressionTest(regressionFileName);
            }
            else {
              fputs(regressionFileName, stdout);
              printf(" - being written\n");
            }
          }
          else {
            char* file;
            (void)aURL->GetPath(&file);
            printf("could not open output file for %s\n", file);
            nsCRT::free(file);
          }
        }
        else {
          nsIFrameDebug* fdbg;
          if (NS_SUCCEEDED(root->QueryInterface(NS_GET_IID(nsIFrameDebug), (void**) &fdbg))) {
            fdbg->DumpRegressionData(presContext, stdout, 0, mIncludeStyleInfo);
          }
        }
      }
    }
    NS_RELEASE(shell);
  }
#endif
}
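
// When running with a delay, the regression data for the previous page is
// dumped lazily here, just as the next document starts to load; the last
// webshell/URL pair is then updated for the following round.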
NS_IMETHODIMP
nsWebCrawler::OnStartDocumentLoad(nsIDocumentLoader* loader, nsIURI* aURL,
                                  const char* aCommand)
{
  if (mDelay > 0) {
    if (mLastWebShell && mLastURL) {
      DumpRegressionData(mLastWebShell, mLastURL);
    }
  }
  NS_IF_RELEASE(mLastWebShell);
  mBrowser->GetWebShell(mLastWebShell);
  NS_IF_RELEASE(mLastURL);
  mLastURL = aURL;
  NS_ADDREF(mLastURL);
  return NS_OK;
}
NS_IMETHODIMP
nsWebCrawler::OnEndDocumentLoad(nsIDocumentLoader* loader,
                                nsIRequest* request,
                                nsresult aStatus)
{
  nsresult rv;
  PRTime endLoadTime = PR_Now();

  if (loader != mDocLoader.get()) {
    // This notification is not for the "main" document...
    return NS_OK;
  }
  if (NS_BINDING_ABORTED == aStatus) {
    //
    // Sometimes a Refresh will interrupt a document that is loading...
    // When this happens just ignore the ABORTED notification and wait
    // for the notification that the Refreshed document has finished..
    //
    return NS_OK;
  }

  nsCOMPtr<nsIURI> aURL;
  nsCOMPtr<nsIChannel> channel = do_QueryInterface(request);
  rv = channel->GetURI(getter_AddRefs(aURL));
  if (NS_FAILED(rv)) {
    return rv;
  }
  if (nsnull == aURL) {
    return NS_OK;
  }

  // Ignore this notification unless it's for the current url. That way
  // we skip over embedded webshell notifications (e.g. frame cells,
  // iframes, etc.)
  char* spec;
  aURL->GetSpec(&spec);
  if (!spec) {
    return NS_ERROR_OUT_OF_MEMORY;
  }
  nsCOMPtr<nsIURI> currentURL;
  rv = NS_NewURI(getter_AddRefs(currentURL), mCurrentURL);
  if (NS_FAILED(rv)) {
    nsCRT::free(spec);
    return rv;
  }
  char* spec2;
  currentURL->GetSpec(&spec2);
  if (!spec2) {
    nsCRT::free(spec);
    return NS_ERROR_OUT_OF_MEMORY;
  }
  if (PL_strcmp(spec, spec2)) {
    nsCRT::free(spec);
    nsCRT::free(spec2);
    return NS_OK;
  }
  nsCRT::free(spec2);

  char buf[400];
  PRTime delta, cvt, rounder;
  LL_I2L(cvt, 1000);
  LL_I2L(rounder, 499);
  LL_SUB(delta, endLoadTime, mStartLoad);
  LL_ADD(delta, delta, rounder);
  LL_DIV(delta, delta, cvt);
  PR_snprintf(buf, sizeof(buf), "%s: done loading (%lld msec)",
              spec, delta);
  printf("%s\n", buf);
  nsCRT::free(spec);

  // Make sure the document bits make it to the screen at least once
  nsIPresShell* shell = GetPresShell();
  if (nsnull != shell) {
    nsCOMPtr<nsIViewManager> vm;
    shell->GetViewManager(getter_AddRefs(vm));
    if (vm) {
      nsIView* rootView;
      vm->GetRootView(rootView);
      vm->UpdateView(rootView, NS_VMREFRESH_IMMEDIATE);
    }
    if (0 == mDelay) {
      nsIWebShell* webShell;
      mBrowser->GetWebShell(webShell);
      if (webShell) {
        DumpRegressionData(webShell, aURL);
        NS_RELEASE(webShell);
      }
    }
    if (mJiggleLayout) {
      nsRect r;
      mBrowser->GetContentBounds(r);
      nscoord oldWidth = r.width;
      while (r.width > 100) {
        r.width -= 10;
        mBrowser->SizeWindowTo(r.width, r.height, PR_FALSE, PR_FALSE);
      }
      while (r.width < oldWidth) {
        r.width += 10;
        mBrowser->SizeWindowTo(r.width, r.height, PR_FALSE, PR_FALSE);
      }
    }
    if (mCrawl) {
      FindMoreURLs();
    }
    if (0 == mDelay) {
      LoadNextURL(PR_TRUE);
    }
    NS_RELEASE(shell);
  }
  else {
    fputs("null pres shell\n", stdout);
  }
  if (mPostExit && (0 == mQueuedLoadURLs) && (0 == mPendingURLs.Count())) {
    QueueExit();
  }
  return NS_OK;
}
NS_IMETHODIMP
nsWebCrawler::OnStartURLLoad(nsIDocumentLoader* loader,
                             nsIRequest* request)
{
  return NS_OK;
}

NS_IMETHODIMP
nsWebCrawler::OnProgressURLLoad(nsIDocumentLoader* loader,
                                nsIRequest* request,
                                PRUint32 aProgress,
                                PRUint32 aProgressMax)
{
  return NS_OK;
}

NS_IMETHODIMP
nsWebCrawler::OnStatusURLLoad(nsIDocumentLoader* loader,
                              nsIRequest* request,
                              nsString& aMsg)
{
  return NS_OK;
}

NS_IMETHODIMP
nsWebCrawler::OnEndURLLoad(nsIDocumentLoader* loader, nsIRequest* request,
                           nsresult aStatus)
{
  return NS_OK;
}
FILE*
nsWebCrawler::GetOutputFile(nsIURI *aURL, nsString& aOutputName)
{
  static const char kDefaultOutputFileName[] = "test.txt"; // the default

  FILE *result = nsnull;
  if (nsnull != aURL) {
    char *inputFileName;
    char* file;
    (void)aURL->GetPath(&file);
    nsAutoString inputFileFullPath;
    inputFileFullPath.AssignWithConversion(file);
    nsCRT::free(file);
    PRInt32 fileNameOffset = inputFileFullPath.RFindChar('/');
    if (-1 == fileNameOffset) {
      inputFileName = new char[strlen(kDefaultOutputFileName) + 1];
      strcpy(inputFileName, kDefaultOutputFileName);
    }
    else {
      PRInt32 len = inputFileFullPath.Length() - fileNameOffset;
      inputFileName = new char[len + 1 + 20];
      char *c = inputFileName;
      for (PRInt32 i = fileNameOffset + 1; i < fileNameOffset + len; i++) {
        char ch = (char) inputFileFullPath.CharAt(i);
        if (ch == '.') {
          // Stop on dot so that we don't keep the old extension
          break;
        }
        *c++ = ch;
      }
      // Tack on ".rgd" extension for "regression data"
      *c++ = '.';
      *c++ = 'r';
      *c++ = 'g';
      *c++ = 'd';
      *c++ = '\0';
      aOutputName.Truncate();
      aOutputName.AppendWithConversion(inputFileName);
    }
    nsAutoString outputFileName(mOutputDir);
    outputFileName.AppendWithConversion(inputFileName);
    PRInt32 bufLen = outputFileName.Length() + 1;
    char *buf = new char[bufLen + 1];
    outputFileName.ToCString(buf, bufLen);
    result = fopen(buf, "wt");
    delete [] buf;
    delete [] inputFileName;
  }
  return result;
}
void
nsWebCrawler::AddURL(const nsString& aURL)
{
  nsString* url = new nsString(aURL);
  mPendingURLs.AppendElement(url);
  if (1 == mPendingURLs.Count()) {
    mLastDelay = mDelay;
  }
  PRInt32 delay = GetDelay(*url);
  if (delay >= 0) {
    SetDelay(delay);
    mLastDelay = delay;
  }
  if (mVerbose) {
    printf("WebCrawler: adding '");
    fputs(aURL, stdout);
    printf("'\n");
  }
}

void
nsWebCrawler::AddSafeDomain(const nsString& aDomain)
{
  nsString* s = new nsString(aDomain);
  mSafeDomains.AppendElement(s);
}

void
nsWebCrawler::AddAvoidDomain(const nsString& aDomain)
{
  nsString* s = new nsString(aDomain);
  mAvoidDomains.AppendElement(s);
}

void
nsWebCrawler::SetOutputDir(const nsString& aOutputDir)
{
  mOutputDir = aOutputDir;
}

void
nsWebCrawler::SetRegressionDir(const nsString& aDir)
{
  mRegressionDir = aDir;
}

void
nsWebCrawler::Start()
{
  // Enable observing each URL load...
  nsIWebShell* shell = nsnull;
  mBrowser->GetWebShell(shell);
  nsCOMPtr<nsIDocShell> docShell(do_QueryInterface(shell));
  docShell->SetDocLoaderObserver(this);
  shell->GetDocumentLoader(*getter_AddRefs(mDocLoader));
  NS_RELEASE(shell);
  if (mPendingURLs.Count() >= 1) {
    mHaveURLList = PR_TRUE;
    // duplicate the last url if there is a delay since the regression data
    // for the url gets written when the next url is encountered. Not
    // perfect, but simple.
    if (mLastDelay != 0) {
      nsString* last = (nsString *) mPendingURLs.ElementAt(mPendingURLs.Count() - 1);
      if (last) {
        nsString* dupLast = new nsString(*last);
        mPendingURLs.AppendElement(dupLast);
      }
    }
  }
  LoadNextURL(PR_FALSE);
}

void
nsWebCrawler::EnableCrawler()
{
  mCrawl = PR_TRUE;
}
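
// ASCII case-folding table: indices 'A'..'Z' map to 'a'..'z', everything
// else maps to itself. Used by EndsWith/StartsWith below for
// case-insensitive comparisons without per-character branching.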
static const unsigned char kLowerLookup[256] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
   32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
   48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
   64,
   97, 98, 99,100,101,102,103,104,105,106,107,108,109,
  110,111,112,113,114,115,116,117,118,119,120,121,122,
   91, 92, 93, 94, 95, 96, 97, 98, 99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
  144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
  160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
  192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
  208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
  224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
  240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
};
static PRBool
EndsWith(const nsString& aDomain, const char* aHost, PRInt32 aHostLen)
{
  PRInt32 slen = aDomain.Length();
  if (slen < aHostLen) {
    return PR_FALSE;
  }
  const PRUnichar* uc = aDomain.GetUnicode();
  uc += slen - aHostLen;
  const PRUnichar* end = uc + aHostLen;
  while (uc < end) {
    unsigned char uch = (unsigned char) ((*uc++) & 0xff);
    unsigned char ch = (unsigned char) ((*aHost++) & 0xff);
    if (kLowerLookup[uch] != kLowerLookup[ch]) {
      return PR_FALSE;
    }
  }
  return PR_TRUE;
}

static PRBool
StartsWith(const nsString& s1, const char* s2)
{
  PRInt32 s1len = s1.Length();
  PRInt32 s2len = strlen(s2);
  if (s1len < s2len) {
    return PR_FALSE;
  }
  const PRUnichar* uc = s1.GetUnicode();
  const PRUnichar* end = uc + s2len;
  while (uc < end) {
    unsigned char uch = (unsigned char) ((*uc++) & 0xff);
    unsigned char ch = (unsigned char) ((*s2++) & 0xff);
    if (kLowerLookup[uch] != kLowerLookup[ch]) {
      return PR_FALSE;
    }
  }
  return PR_TRUE;
}
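
// A URL is loadable if it uses a crawlable scheme (http, ftp, file,
// resource), its host does not end with any entry in mAvoidDomains, and,
// when mSafeDomains is non-empty, its host ends with one of those entries.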
PRBool
nsWebCrawler::OkToLoad(const nsString& aURLSpec)
{
  if (!StartsWith(aURLSpec, "http:") && !StartsWith(aURLSpec, "ftp:") &&
      !StartsWith(aURLSpec, "file:") &&
      !StartsWith(aURLSpec, "resource:")) {
    return PR_FALSE;
  }

  PRBool ok = PR_TRUE;
  nsIURI* url;
  nsresult rv;
  rv = NS_NewURI(&url, aURLSpec);
  if (NS_OK == rv) {
    nsXPIDLCString host;
    rv = url->GetHost(getter_Copies(host));
    if (rv == NS_OK) {
      PRInt32 hostlen = PL_strlen(host);

      // Check domains to avoid
      PRInt32 i, n = mAvoidDomains.Count();
      for (i = 0; i < n; i++) {
        nsString* s = (nsString*) mAvoidDomains.ElementAt(i);
        if (s && EndsWith(*s, host, hostlen)) {
          printf("Avoiding '");
          fputs(aURLSpec, stdout);
          printf("'\n");
          return PR_FALSE;
        }
      }

      // Check domains to stay within
      n = mSafeDomains.Count();
      if (n == 0) {
        // If we don't care then all the domains that we aren't
        // avoiding are OK
        return PR_TRUE;
      }
      for (i = 0; i < n; i++) {
        nsString* s = (nsString*) mSafeDomains.ElementAt(i);
        if (s && EndsWith(*s, host, hostlen)) {
          return PR_TRUE;
        }
      }
      ok = PR_FALSE;
    }
    NS_RELEASE(url);
  }
  return ok;
}
void
nsWebCrawler::RecordLoadedURL(const nsString& aURL)
{
  if (nsnull != mRecord) {
    fputs(aURL, mRecord);
    fputs("\n", mRecord);
    fflush(mRecord);
  }
}
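
// Recursively walk the content tree rooted at aNode, harvesting the href
// of <a> tags and the src of <frame>/<iframe> tags. Each absolute URL is
// interned as an atom and checked against mVisited so a page is queued at
// most once.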
void
nsWebCrawler::FindURLsIn(nsIDocument* aDocument, nsIContent* aNode)
{
  nsCOMPtr<nsIAtom> atom;
  aNode->GetTag(*getter_AddRefs(atom));
  if ((atom == mLinkTag) || (atom == mFrameTag) || (atom == mIFrameTag)) {
    // Get absolute url that tag targets
    nsAutoString base, src, absURLSpec;
    if (atom == mLinkTag) {
      aNode->GetAttribute(kNameSpaceID_HTML, mHrefAttr, src);
    }
    else {
      aNode->GetAttribute(kNameSpaceID_HTML, mSrcAttr, src);
    }
    nsIURI* docURL = aDocument->GetDocumentURL();
    nsresult rv;
    rv = NS_MakeAbsoluteURI(absURLSpec, src, docURL);
    if (NS_OK == rv) {
      nsCOMPtr<nsIAtom> urlAtom = getter_AddRefs(NS_NewAtom(absURLSpec));
      if (0 == mVisited->Get(urlAtom)) {
        // Remember the URL as visited so that we don't go there again
        mVisited->Put(urlAtom, "visited");
        if (OkToLoad(absURLSpec)) {
          mPendingURLs.AppendElement(new nsString(absURLSpec));
          if (mVerbose) {
            printf("Adding '");
            fputs(absURLSpec, stdout);
            printf("'\n");
          }
        }
        else {
          if (mVerbose) {
            printf("Skipping '");
            fputs(absURLSpec, stdout);
            printf("'\n");
          }
        }
      }
      else {
        if (mVerbose) {
          printf("Already visited '");
          fputs(absURLSpec, stdout);
          printf("'\n");
        }
      }
    }
    NS_RELEASE(docURL);
  }

  PRBool canHaveKids;
  aNode->CanContainChildren(canHaveKids);
  if (canHaveKids) {
    PRInt32 i, n;
    aNode->ChildCount(n);
    for (i = 0; i < n; i++) {
      nsIContent* kid;
      aNode->ChildAt(i, kid);
      if (nsnull != kid) {
        FindURLsIn(aDocument, kid);
        NS_RELEASE(kid);
      }
    }
  }
}
void
nsWebCrawler::FindMoreURLs()
{
  nsIWebShell* shell = nsnull;
  mBrowser->GetWebShell(shell);
  nsCOMPtr<nsIDocShell> docShell(do_QueryInterface(shell));
  if (docShell) {
    nsIContentViewer* cv = nsnull;
    docShell->GetContentViewer(&cv);
    if (nsnull != cv) {
      nsIDocumentViewer* docv = nsnull;
      cv->QueryInterface(NS_GET_IID(nsIDocumentViewer), (void**) &docv);
      if (nsnull != docv) {
        nsIDocument* doc = nsnull;
        docv->GetDocument(doc);
        if (nsnull != doc) {
          nsIContent* root;
          root = doc->GetRootContent();
          if (nsnull != root) {
            FindURLsIn(doc, root);
            NS_RELEASE(root);
          }
          NS_RELEASE(doc);
        }
        NS_RELEASE(docv);
      }
      NS_RELEASE(cv);
    }
    NS_RELEASE(shell);
  }
}
void
nsWebCrawler::SetBrowserWindow(nsBrowserWindow* aWindow)
{
  NS_IF_RELEASE(mBrowser);
  mBrowser = aWindow;
  NS_IF_ADDREF(mBrowser);
}

void
nsWebCrawler::GetBrowserWindow(nsBrowserWindow** aWindow)
{
  NS_IF_ADDREF(mBrowser);
  *aWindow = mBrowser;
}

static void
TimerCallBack(nsITimer *aTimer, void *aClosure)
{
  nsWebCrawler* wc = (nsWebCrawler*) aClosure;
  wc->LoadNextURL(PR_TRUE);
}
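
// Advance to the next pending URL. A leading "delay:=N" entry updates the
// delay first; when a delay is active a timer re-enters this method later.
// Otherwise the first loadable URL is popped, recorded, and either loaded
// directly or (aQueueLoad) dispatched through the event queue; mMaxPages is
// decremented per page. When the queue runs dry the record file is closed.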
void
nsWebCrawler::LoadNextURL(PRBool aQueueLoad)
{
  nsString* url = (nsString*) mPendingURLs.ElementAt(0);
  if (nsnull != url) {
    PRInt32 delay = GetDelay(*url);
    if (delay >= 0) {
      SetDelay(delay);
      mPendingURLs.RemoveElementAt(0);
      char buf[128];
      url->ToCString(&buf[0], 128);
      printf("%s\n", buf);
    }
  }
  if ((0 != mDelay) && (mPendingURLs.Count() > 0)) {
    mTimer = do_CreateInstance("@mozilla.org/timer;1");
    mTimer->Init(TimerCallBack, (void *)this, mDelay * 1000);
  }
  if ((mMaxPages < 0) || (mMaxPages > 0)) {
    while (0 != mPendingURLs.Count()) {
      nsString* url = (nsString*) mPendingURLs.ElementAt(0);
      mPendingURLs.RemoveElementAt(0);
      if (nsnull != url) {
        if (OkToLoad(*url)) {
          RecordLoadedURL(*url);
          nsIWebShell* webShell;
          mBrowser->GetWebShell(webShell);
          if (aQueueLoad) {
            // Call stop to cancel any pending URL Refreshes...
            /// webShell->Stop();
            QueueLoadURL(*url);
          }
          else {
            mCurrentURL = *url;
            mStartLoad = PR_Now();
            nsCOMPtr<nsIWebNavigation> webNav(do_QueryInterface(webShell));
            webNav->LoadURI(url->GetUnicode(), nsIWebNavigation::LOAD_FLAGS_NONE);
          }
          NS_RELEASE(webShell);
          if (mMaxPages > 0) {
            --mMaxPages;
          }
          delete url;
          return;
        }
        delete url;
      }
    }
  }
  if (nsnull != mRecord) {
    fclose(mRecord);
    mRecord = nsnull;
  }
}
nsIPresShell*
nsWebCrawler::GetPresShell(nsIWebShell* aWebShell)
{
  nsIWebShell* webShell = aWebShell;
  if (webShell) {
    NS_ADDREF(webShell);
  }
  else {
    mBrowser->GetWebShell(webShell);
  }
  nsIPresShell* shell = nsnull;
  nsCOMPtr<nsIDocShell> docShell(do_QueryInterface(webShell));
  if (nsnull != webShell) {
    nsIContentViewer* cv = nsnull;
    docShell->GetContentViewer(&cv);
    if (nsnull != cv) {
      nsIDocumentViewer* docv = nsnull;
      cv->QueryInterface(NS_GET_IID(nsIDocumentViewer), (void**) &docv);
      if (nsnull != docv) {
        nsIPresContext* cx;
        docv->GetPresContext(cx);
        if (nsnull != cx) {
          cx->GetShell(&shell);
          NS_RELEASE(cx);
        }
        NS_RELEASE(docv);
      }
      NS_RELEASE(cv);
    }
    NS_RELEASE(webShell);
  }
  return shell;
}
static FILE*
OpenRegressionFile(const nsString& aBaseName, const nsString& aOutputName)
{
  nsAutoString a;
  a.Append(aBaseName);
  a.AppendWithConversion("/");
  a.Append(aOutputName);
  char* fn = a.ToNewCString();
  FILE* fp = fopen(fn, "r");
  if (!fp) {
    printf("Unable to open regression data file %s\n", fn);
  }
  delete[] fn;
  return fp;
}
#define BUF_SIZE 1024

// Load up both data files (original and the one we just output) into
// two independent xml content trees. Then compare them.
void
nsWebCrawler::PerformRegressionTest(const nsString& aOutputName)
{
  // First load the trees
  nsIFrameUtil* fu;
  nsresult rv = nsComponentManager::CreateInstance(kFrameUtilCID, nsnull,
                                                   kIFrameUtilIID, (void **)&fu);
  if (NS_FAILED(rv)) {
    printf("Can't find nsIFrameUtil implementation\n");
    return;
  }
  FILE* f1 = OpenRegressionFile(mRegressionDir, aOutputName);
  if (!f1) {
    NS_RELEASE(fu);
    return;
  }
  FILE* f2 = OpenRegressionFile(mOutputDir, aOutputName);
  if (!f2) {
    fclose(f1);
    NS_RELEASE(fu);
    return;
  }
  rv = fu->CompareRegressionData(f1, f2);
  NS_RELEASE(fu);

  char dirName[BUF_SIZE];
  char fileName[BUF_SIZE];
  mOutputDir.ToCString(dirName, BUF_SIZE - 1);
  aOutputName.ToCString(fileName, BUF_SIZE - 1);
  printf("regression test %s%s %s\n", dirName, fileName,
         NS_SUCCEEDED(rv) ? "passed" : "failed");
}
//----------------------------------------------------------------------
static NS_DEFINE_IID(kEventQueueServiceCID, NS_EVENTQUEUESERVICE_CID);
static NS_DEFINE_IID(kIEventQueueServiceIID, NS_IEVENTQUEUESERVICE_IID);
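
// Post a PLEvent to the current thread's event queue via the event queue
// service; the event's handler will run on a later spin of the event loop.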
static nsresult
QueueEvent(PLEvent* aEvent)
{
  nsISupports* is;
  nsresult rv = nsServiceManager::GetService(kEventQueueServiceCID,
                                             kIEventQueueServiceIID,
                                             &is,
                                             nsnull);
  if (NS_FAILED(rv)) {
    return rv;
  }
  nsCOMPtr<nsIEventQueueService> eqs = do_QueryInterface(is);
  if (eqs) {
    nsCOMPtr<nsIEventQueue> eq;
    rv = eqs->GetThreadEventQueue(NS_CURRENT_THREAD, getter_AddRefs(eq));
    if (eq) {
      eq->PostEvent(aEvent);
    }
  }
  nsServiceManager::ReleaseService(kEventQueueServiceCID, is, nsnull);
  return rv;
}
//----------------------------------------------------------------------
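
// ExitEvent and LoadEvent (below) defer Exit() and LoadURI() calls through
// the event queue, presumably so the crawler never re-enters the document
// loader from inside one of its own notification callbacks.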
struct ExitEvent : public PLEvent {
  ExitEvent(nsWebCrawler* aCrawler);
  ~ExitEvent();

  void DoIt() {
    crawler->Exit();
  }

  nsWebCrawler* crawler;

  static void PR_CALLBACK HandleMe(ExitEvent* e);
  static void PR_CALLBACK DeleteMe(ExitEvent* e);
};

ExitEvent::ExitEvent(nsWebCrawler* aCrawler)
  : crawler(aCrawler)
{
  PL_InitEvent(this, crawler, (PLHandleEventProc) HandleMe,
               (PLDestroyEventProc) DeleteMe);
  NS_ADDREF(aCrawler);
}

ExitEvent::~ExitEvent()
{
  NS_RELEASE(crawler);
}

void
ExitEvent::HandleMe(ExitEvent* e)
{
  e->DoIt();
}

void
ExitEvent::DeleteMe(ExitEvent* e)
{
  delete e;
}

void
nsWebCrawler::QueueExit()
{
  ExitEvent* event = new ExitEvent(this);
  QueueEvent(event);
}

void
nsWebCrawler::Exit()
{
  mViewer->Exit();
}
//----------------------------------------------------------------------
struct LoadEvent : public PLEvent {
  LoadEvent(nsWebCrawler* aCrawler, const nsString& aURL);
  ~LoadEvent();

  void DoIt() {
    crawler->GoToQueuedURL(url);
  }

  nsString url;
  nsWebCrawler* crawler;

  static void PR_CALLBACK HandleMe(LoadEvent* e);
  static void PR_CALLBACK DeleteMe(LoadEvent* e);
};

LoadEvent::LoadEvent(nsWebCrawler* aCrawler, const nsString& aURL)
  : url(aURL),
    crawler(aCrawler)
{
  PL_InitEvent(this, crawler, (PLHandleEventProc) HandleMe,
               (PLDestroyEventProc) DeleteMe);
  NS_ADDREF(aCrawler);
}

LoadEvent::~LoadEvent()
{
  NS_RELEASE(crawler);
}

void
LoadEvent::HandleMe(LoadEvent* e)
{
  e->DoIt();
}

void
LoadEvent::DeleteMe(LoadEvent* e)
{
  delete e;
}

void
nsWebCrawler::GoToQueuedURL(const nsString& aURL)
{
  nsIWebShell* webShell;
  mBrowser->GetWebShell(webShell);
  nsCOMPtr<nsIWebNavigation> webNav(do_QueryInterface(webShell));
  if (webNav) {
    mCurrentURL = aURL;
    mStartLoad = PR_Now();
    webNav->LoadURI(aURL.GetUnicode(), nsIWebNavigation::LOAD_FLAGS_NONE);
    NS_RELEASE(webShell);
  }
  mQueuedLoadURLs--;
}

nsresult
nsWebCrawler::QueueLoadURL(const nsString& aURL)
{
  LoadEvent* event = new LoadEvent(this, aURL);
  nsresult rv = QueueEvent(event);
  if (NS_SUCCEEDED(rv)) {
    mQueuedLoadURLs++;
  }
  return rv;
}