1. Factor nsIChannel into a protocol-specific part, nsIChannel, and a socket-specific part, nsITransport. 2. Derive nsIChannel from nsIRequest. 3. Change the notification system from necko and the URILoader to pass the nsIRequest interface instead of the nsIChannel interface. This goal stems from wanting to allow simultaneously active AsyncRead and AsyncWrite operations on nsSocketTransport, which would greatly simplify the task of maintaining persistent/reusable socket connections for FTP, HTTP, and IMAP (and potentially other protocols). The problem with the existing nsIChannel interface is that it does not allow one to selectively suspend just one of the read or write operations while keeping the other active. The full details of the change are written up in the netlib newsgroup. r=darin@netscape.com sr=rpotts@netscape.com
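To make the resulting layering concrete, here is a minimal sketch of the interface relationship the checkin describes. It is abbreviated from the real XPCOM IDL (the actual interfaces carry more members and the usual nsISupports plumbing), so treat it as an illustration rather than the exact declarations:

  // Sketch only: abbreviated from the real nsIRequest/nsIChannel contracts.
  class nsIRequest : public nsISupports {
  public:
    NS_IMETHOD IsPending(PRBool* aResult) = 0;
    NS_IMETHOD GetStatus(nsresult* aStatus) = 0;
    NS_IMETHOD Cancel(nsresult aStatus) = 0;
    NS_IMETHOD Suspend(void) = 0;
    NS_IMETHOD Resume(void) = 0;
  };

  // The protocol-specific surface (URI, content type, and so on) stays on
  // nsIChannel, which now inherits the generic request controls above.
  class nsIChannel : public nsIRequest { /* URI, content type, ... */ };

Since an AsyncRead and an AsyncWrite on nsSocketTransport are represented by separate requests under this scheme, one can be suspended or resumed without disturbing the other, which is what the persistent-connection work requires.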
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 *
 * The contents of this file are subject to the Netscape Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/NPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code is Mozilla Communicator client code.
 *
 * The Initial Developer of the Original Code is Netscape Communications
 * Corporation. Portions created by Netscape are
 * Copyright (C) 1998 Netscape Communications Corporation. All
 * Rights Reserved.
 *
 * Contributor(s):
 *   Pierre Phaneuf <pp@ludusdesign.com>
 * This Original Code has been modified by IBM Corporation. Modifications made by IBM
 * described herein are Copyright (c) International Business Machines Corporation, 2000.
 * Modifications to Mozilla code or documentation identified per MPL Section 3.3
 *
 * Date             Modified by     Description of modification
 * 04/20/2000       IBM Corp.       OS/2 VisualAge build.
 */
#include "nsCOMPtr.h"
#include "nsWebCrawler.h"
#include "nsViewerApp.h"
#include "nsIWebShell.h"
#include "nsIContentViewer.h"
#include "nsIDocumentViewer.h"
#include "nsIDocument.h"
#include "nsIContent.h"
#include "nsIPresShell.h"
#include "nsIPresContext.h"
#include "nsIViewManager.h"
#include "nsIFrame.h"
#include "nsIFrameDebug.h"
#include "nsIURL.h"
#include "nsNetUtil.h"
#include "nsITimer.h"
#include "nsIAtom.h"
#include "nsIFrameUtil.h"
#include "nsIComponentManager.h"
#include "nsLayoutCID.h"
#include "nsRect.h"
#include "plhash.h"
#include "nsINameSpaceManager.h"
#include "nsXPIDLString.h"
#include "nsIServiceManager.h"
#include "nsIEventQueueService.h"
#include "nsIEventQueue.h"
#include "prprf.h"
#include "nsIContentViewerFile.h"
#include "nsIDocShell.h"
#include "nsIWebNavigation.h"

static NS_DEFINE_IID(kIDocumentLoaderObserverIID, NS_IDOCUMENTLOADEROBSERVER_IID);
static NS_DEFINE_IID(kFrameUtilCID, NS_FRAME_UTIL_CID);
static NS_DEFINE_IID(kIFrameUtilIID, NS_IFRAME_UTIL_IID);
static NS_DEFINE_IID(kIXMLContentIID, NS_IXMLCONTENT_IID);

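// Atoms are interned (one instance per string), so the table below can key
// on the atom pointer itself: the pointer value serves as the hash and
// pointer equality serves as key equality.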
static PLHashNumber
HashKey(nsIAtom* key)
{
  return (PLHashNumber) key;
}

static PRIntn
CompareKeys(nsIAtom* key1, nsIAtom* key2)
{
  return key1 == key2;
}

class AtomHashTable {
public:
  AtomHashTable();
  ~AtomHashTable();

  const void* Get(nsIAtom* aKey);
  const void* Put(nsIAtom* aKey, const void* aValue);
  const void* Remove(nsIAtom* aKey);

protected:
  PLHashTable* mTable;
};

AtomHashTable::AtomHashTable()
{
  mTable = PL_NewHashTable(8, (PLHashFunction) HashKey,
                           (PLHashComparator) CompareKeys,
                           (PLHashComparator) nsnull,
                           nsnull, nsnull);
}

static PRIntn PR_CALLBACK
DestroyEntry(PLHashEntry *he, PRIntn i, void *arg)
{
  ((nsIAtom*)he->key)->Release();
  return HT_ENUMERATE_NEXT;
}

AtomHashTable::~AtomHashTable()
{
  PL_HashTableEnumerateEntries(mTable, DestroyEntry, 0);
  PL_HashTableDestroy(mTable);
}

/**
 * Get the data associated with an Atom.
 */
const void*
AtomHashTable::Get(nsIAtom* aKey)
{
  PRInt32 hashCode = (PRInt32) aKey;
  PLHashEntry** hep = PL_HashTableRawLookup(mTable, hashCode, aKey);
  PLHashEntry* he = *hep;
  if (nsnull != he) {
    return he->value;
  }
  return nsnull;
}

/**
 * Create an association between an Atom and some data. This call
 * returns an old association if there was one (or nsnull if there
 * wasn't).
 */
const void*
AtomHashTable::Put(nsIAtom* aKey, const void* aData)
{
  PRInt32 hashCode = (PRInt32) aKey;
  PLHashEntry** hep = PL_HashTableRawLookup(mTable, hashCode, aKey);
  PLHashEntry* he = *hep;
  if (nsnull != he) {
    const void* oldValue = he->value;
    he->value = NS_CONST_CAST(void*, aData);
    return oldValue;
  }
  NS_ADDREF(aKey);
  PL_HashTableRawAdd(mTable, hep, hashCode, aKey, NS_CONST_CAST(void*, aData));
  return nsnull;
}

/**
 * Remove an association between an Atom and its data. This returns
 * the old associated data.
 */
const void*
AtomHashTable::Remove(nsIAtom* aKey)
{
  PRInt32 hashCode = (PRInt32) aKey;
  PLHashEntry** hep = PL_HashTableRawLookup(mTable, hashCode, aKey);
  PLHashEntry* he = *hep;
  void* oldValue = nsnull;
  if (nsnull != he) {
    oldValue = he->value;
    PL_HashTableRawRemove(mTable, hep, he);
  }
  return oldValue;
}

//----------------------------------------------------------------------
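// Parse an optional "delay:=<seconds>" directive embedded in a URL string;
// returns the delay in seconds, or -1 if no directive is present.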
PRInt32
GetDelay(nsString& aURL)
{
  PRInt32 delay = -1;
  if (aURL.Find("delay:=") >= 0) {
    char buf[128];
    PRInt32 offset = aURL.Find("=") + 1;
    aURL.ToCString(&buf[0], 128, offset);
    sscanf(&buf[0], "%d", &delay);
  }
  return delay;
}

nsWebCrawler::nsWebCrawler(nsViewerApp* aViewer)
  : mHaveURLList(PR_FALSE),
    mQueuedLoadURLs(0)
{
  NS_INIT_REFCNT();

  mBrowser = nsnull;
  mViewer = aViewer;
  mCrawl = PR_FALSE;
  mJiggleLayout = PR_FALSE;
  mPostExit = PR_FALSE;
  mDelay = 0;
  mLastDelay = 0;
  mMaxPages = -1;
  mRecord = nsnull;
  mLinkTag = getter_AddRefs(NS_NewAtom("a"));
  mFrameTag = getter_AddRefs(NS_NewAtom("frame"));
  mIFrameTag = getter_AddRefs(NS_NewAtom("iframe"));
  mHrefAttr = getter_AddRefs(NS_NewAtom("href"));
  mSrcAttr = getter_AddRefs(NS_NewAtom("src"));
  mBaseHrefAttr = getter_AddRefs(NS_NewAtom("_base_href"));
  mVisited = new AtomHashTable();
  mVerbose = nsnull;
  LL_I2L(mStartLoad, 0);
  mRegressing = PR_FALSE;
  mPrinterTestType = 0;
  mIncludeStyleInfo = PR_TRUE;
  mLastWebShell = nsnull;
  mLastURL = nsnull;
}

static void FreeStrings(nsVoidArray& aArray)
{
  PRInt32 i, n = aArray.Count();
  for (i = 0; i < n; i++) {
    nsString* s = (nsString*) aArray.ElementAt(i);
    delete s;
  }
  aArray.Clear();
}

nsWebCrawler::~nsWebCrawler()
{
  FreeStrings(mSafeDomains);
  FreeStrings(mAvoidDomains);
  NS_IF_RELEASE(mLastWebShell);
  NS_IF_RELEASE(mLastURL);
  NS_IF_RELEASE(mBrowser);
  delete mVisited;
}

NS_IMPL_ISUPPORTS1(nsWebCrawler, nsIDocumentLoaderObserver)

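// Debug builds only: write regression data for the page that just finished
// loading. Depending on mPrinterTestType this is either print output or a
// frame-tree dump, written to a ".rgd" file produced by GetOutputFile().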
void
nsWebCrawler::DumpRegressionData(nsIWebShell* aWebShell,
                                 nsIURI* aURL)
{
#ifdef NS_DEBUG
  if (mOutputDir.Length() > 0) {
    nsIPresShell* shell = GetPresShell(aWebShell);
    if (!shell) return;
    if (mPrinterTestType > 0) {
      nsCOMPtr<nsIContentViewer> viewer;
      nsCOMPtr<nsIDocShell> docShell(do_QueryInterface(aWebShell));
      docShell->GetContentViewer(getter_AddRefs(viewer));

      if (viewer) {
        nsCOMPtr<nsIContentViewerFile> viewerFile = do_QueryInterface(viewer);
        if (viewerFile) {
          nsAutoString regressionFileName;
          FILE *fp = GetOutputFile(aURL, regressionFileName);

          switch (mPrinterTestType) {
          case 1:
            // dump print data to a file for regression testing
            viewerFile->Print(PR_TRUE, fp);
            break;
          case 2:
            // visual printing tests, all go to the printer, no printer dialog
            viewerFile->Print(PR_TRUE, 0);
            break;
          case 3:
            // visual printing tests, all go to the printer, with a printer dialog
            viewerFile->Print(PR_FALSE, 0);
            break;
          default:
            break;
          }
          fclose(fp);
        }
      }
    }
    else {
      nsIFrame* root;
      shell->GetRootFrame(&root);
      if (nsnull != root) {
        nsCOMPtr<nsIPresContext> presContext;
        shell->GetPresContext(getter_AddRefs(presContext));

        if (mOutputDir.Length() > 0) {
          nsAutoString regressionFileName;
          FILE *fp = GetOutputFile(aURL, regressionFileName);
          if (fp) {
            nsIFrameDebug* fdbg;
            if (NS_SUCCEEDED(root->QueryInterface(NS_GET_IID(nsIFrameDebug), (void**) &fdbg))) {
              fdbg->DumpRegressionData(presContext, fp, 0, mIncludeStyleInfo);
            }
            fclose(fp);
            if (mRegressing) {
              PerformRegressionTest(regressionFileName);
            }
            else {
              fputs(regressionFileName, stdout);
              printf(" - being written\n");
            }
          }
          else {
            char* file;
            (void)aURL->GetPath(&file);
            printf("could not open output file for %s\n", file);
            nsCRT::free(file);
          }
        }
        else {
          nsIFrameDebug* fdbg;
          if (NS_SUCCEEDED(root->QueryInterface(NS_GET_IID(nsIFrameDebug), (void**) &fdbg))) {
            fdbg->DumpRegressionData(presContext, stdout, 0, mIncludeStyleInfo);
          }
        }
      }
    }
    NS_RELEASE(shell);
  }
#endif
}

NS_IMETHODIMP
nsWebCrawler::OnStartDocumentLoad(nsIDocumentLoader* loader, nsIURI* aURL,
                                  const char* aCommand)
{
  if (mDelay > 0) {
    if (mLastWebShell && mLastURL) {
      DumpRegressionData(mLastWebShell, mLastURL);
    }
  }
  NS_IF_RELEASE(mLastWebShell);
  mBrowser->GetWebShell(mLastWebShell);

  NS_IF_RELEASE(mLastURL);
  mLastURL = aURL;
  NS_ADDREF(mLastURL);

  return NS_OK;
}

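// End of a top-level document load: report the elapsed load time, then (as
// configured) dump regression data, harvest more URLs, and kick off the
// next load.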
NS_IMETHODIMP
nsWebCrawler::OnEndDocumentLoad(nsIDocumentLoader* loader,
                                nsIRequest* request,
                                nsresult aStatus)
{
  nsresult rv;
  PRTime endLoadTime = PR_Now();

  if (loader != mDocLoader.get()) {
    // This notification is not for the "main" document...
    return NS_OK;
  }

  if (NS_BINDING_ABORTED == aStatus) {
    //
    // Sometimes a Refresh will interrupt a document that is loading...
    // When this happens just ignore the ABORTED notification and wait
    // for the notification that the Refreshed document has finished...
    //
    return NS_OK;
  }

  nsCOMPtr<nsIURI> aURL;
  nsCOMPtr<nsIChannel> channel = do_QueryInterface(request);
  rv = channel->GetURI(getter_AddRefs(aURL));
  if (NS_FAILED(rv)) {
    return rv;
  }
  if (nsnull == aURL) {
    return NS_OK;
  }

  // Ignore this notification unless it's for the current url. That way
  // we skip over embedded webshell notifications (e.g. frame cells,
  // iframes, etc.)
  char* spec;
  aURL->GetSpec(&spec);
  if (!spec) {
    return NS_ERROR_OUT_OF_MEMORY;
  }
  nsCOMPtr<nsIURI> currentURL;
  rv = NS_NewURI(getter_AddRefs(currentURL), mCurrentURL);
  if (NS_FAILED(rv)) {
    nsCRT::free(spec);
    return rv;
  }
  char* spec2;
  currentURL->GetSpec(&spec2);
  if (!spec2) {
    nsCRT::free(spec);
    return NS_ERROR_OUT_OF_MEMORY;
  }
  if (PL_strcmp(spec, spec2)) {
    nsCRT::free(spec);
    nsCRT::free(spec2);
    return NS_OK;
  }
  nsCRT::free(spec2);
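
  // PRTime is a 64-bit microsecond count and the LL_* macros provide
  // portable 64-bit arithmetic; convert the elapsed time to milliseconds,
  // rounding to the nearest millisecond.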
  char buf[400];
  PRTime delta, cvt, rounder;
  LL_I2L(cvt, 1000);
  LL_I2L(rounder, 499);
  LL_SUB(delta, endLoadTime, mStartLoad);
  LL_ADD(delta, delta, rounder);
  LL_DIV(delta, delta, cvt);
  PR_snprintf(buf, sizeof(buf), "%s: done loading (%lld msec)",
              spec, delta);
  printf("%s\n", buf);
  nsCRT::free(spec);

  // Make sure the document bits make it to the screen at least once
  nsIPresShell* shell = GetPresShell();
  if (nsnull != shell) {
    nsCOMPtr<nsIViewManager> vm;
    shell->GetViewManager(getter_AddRefs(vm));
    if (vm) {
      nsIView* rootView;
      vm->GetRootView(rootView);
      vm->UpdateView(rootView, NS_VMREFRESH_IMMEDIATE);
    }
    if (0 == mDelay) {
      nsIWebShell* webShell;
      mBrowser->GetWebShell(webShell);
      if (webShell) {
        DumpRegressionData(webShell, aURL);
        NS_RELEASE(webShell);
      }
    }
    if (mJiggleLayout) {
      nsRect r;
      mBrowser->GetContentBounds(r);
      nscoord oldWidth = r.width;
      while (r.width > 100) {
        r.width -= 10;
        mBrowser->SizeWindowTo(r.width, r.height, PR_FALSE, PR_FALSE);
      }
      while (r.width < oldWidth) {
        r.width += 10;
        mBrowser->SizeWindowTo(r.width, r.height, PR_FALSE, PR_FALSE);
      }
    }

    if (mCrawl) {
      FindMoreURLs();
    }

    if (0 == mDelay) {
      LoadNextURL(PR_TRUE);
    }
    NS_RELEASE(shell);
  }
  else {
    fputs("null pres shell\n", stdout);
  }

  if (mPostExit && (0 == mQueuedLoadURLs) && (0 == mPendingURLs.Count())) {
    QueueExit();
  }

  return NS_OK;
}

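// Per-URL notifications (images, style sheets, etc.) are ignored; the
// crawler only acts on whole-document loads.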
NS_IMETHODIMP
nsWebCrawler::OnStartURLLoad(nsIDocumentLoader* loader,
                             nsIRequest* request)
{
  return NS_OK;
}

NS_IMETHODIMP
nsWebCrawler::OnProgressURLLoad(nsIDocumentLoader* loader,
                                nsIRequest* request,
                                PRUint32 aProgress,
                                PRUint32 aProgressMax)
{
  return NS_OK;
}

NS_IMETHODIMP
nsWebCrawler::OnStatusURLLoad(nsIDocumentLoader* loader,
                              nsIRequest* request,
                              nsString& aMsg)
{
  return NS_OK;
}

NS_IMETHODIMP
nsWebCrawler::OnEndURLLoad(nsIDocumentLoader* loader, nsIRequest* request,
                           nsresult aStatus)
{
  return NS_OK;
}

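// Map a URL to a writable FILE* under mOutputDir, replacing the leaf name's
// extension with ".rgd" ("regression data"); the chosen leaf name is
// returned in aOutputName.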
FILE*
nsWebCrawler::GetOutputFile(nsIURI *aURL, nsString& aOutputName)
{
  static const char kDefaultOutputFileName[] = "test.txt";   // the default
  FILE *result = nsnull;
  if (nsnull != aURL)
  {
    char *inputFileName;
    char* file;
    (void)aURL->GetPath(&file);
    nsAutoString inputFileFullPath; inputFileFullPath.AssignWithConversion(file);
    nsCRT::free(file);
    PRInt32 fileNameOffset = inputFileFullPath.RFindChar('/');
    if (-1 == fileNameOffset)
    {
      inputFileName = new char[strlen(kDefaultOutputFileName) + 1];
      strcpy(inputFileName, kDefaultOutputFileName);
    }
    else
    {
      PRInt32 len = inputFileFullPath.Length() - fileNameOffset;
      inputFileName = new char[len + 1 + 20];
      char *c = inputFileName;
      for (PRInt32 i = fileNameOffset + 1; i < fileNameOffset + len; i++)
      {
        char ch = (char) inputFileFullPath.CharAt(i);
        if (ch == '.') {
          // Stop on dot so that we don't keep the old extension
          break;
        }
        *c++ = ch;
      }

      // Tack on ".rgd" extension for "regression data"
      *c++ = '.';
      *c++ = 'r';
      *c++ = 'g';
      *c++ = 'd';
      *c++ = '\0';
      aOutputName.Truncate();
      aOutputName.AppendWithConversion(inputFileName);
    }
    nsAutoString outputFileName(mOutputDir);
    outputFileName.AppendWithConversion(inputFileName);
    PRInt32 bufLen = outputFileName.Length() + 1;
    char *buf = new char[bufLen + 1];
    outputFileName.ToCString(buf, bufLen);
    result = fopen(buf, "wt");
    delete [] buf;
    delete [] inputFileName;
  }
  return result;
}

void
nsWebCrawler::AddURL(const nsString& aURL)
{
  nsString* url = new nsString(aURL);
  mPendingURLs.AppendElement(url);
  if (1 == mPendingURLs.Count()) {
    mLastDelay = mDelay;
  }
  PRInt32 delay = GetDelay(*url);
  if (delay >= 0) {
    SetDelay(delay);
    mLastDelay = delay;
  }
  if (mVerbose) {
    printf("WebCrawler: adding '");
    fputs(aURL, stdout);
    printf("'\n");
  }
}

void
nsWebCrawler::AddSafeDomain(const nsString& aDomain)
{
  nsString* s = new nsString(aDomain);
  mSafeDomains.AppendElement(s);
}

void
nsWebCrawler::AddAvoidDomain(const nsString& aDomain)
{
  nsString* s = new nsString(aDomain);
  mAvoidDomains.AppendElement(s);
}

void
nsWebCrawler::SetOutputDir(const nsString& aOutputDir)
{
  mOutputDir = aOutputDir;
}

void
nsWebCrawler::SetRegressionDir(const nsString& aDir)
{
  mRegressionDir = aDir;
}

void
nsWebCrawler::Start()
{
  // Enable observing each URL load...
  nsIWebShell* shell = nsnull;
  mBrowser->GetWebShell(shell);
  nsCOMPtr<nsIDocShell> docShell(do_QueryInterface(shell));
  docShell->SetDocLoaderObserver(this);
  shell->GetDocumentLoader(*getter_AddRefs(mDocLoader));
  NS_RELEASE(shell);
  if (mPendingURLs.Count() >= 1) {
    mHaveURLList = PR_TRUE;
    // Duplicate the last url if there is a delay, since the regression data
    // for a url gets written when the next url is encountered. Not perfect,
    // but simple.
    if (mLastDelay != 0) {
      nsString* last = (nsString*) mPendingURLs.ElementAt(mPendingURLs.Count() - 1);
      if (last) {
        nsString* dupLast = new nsString(*last);
        mPendingURLs.AppendElement(dupLast);
      }
    }
  }
  LoadNextURL(PR_FALSE);
}

void
nsWebCrawler::EnableCrawler()
{
  mCrawl = PR_TRUE;
}

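// ASCII case-folding table: maps 'A'-'Z' to 'a'-'z' and leaves every other
// byte value unchanged, giving cheap case-insensitive comparisons for the
// host/domain checks below.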
static const unsigned char kLowerLookup[256] = {
  0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  64,
  97,98,99,100,101,102,103,104,105,106,107,108,109,
  110,111,112,113,114,115,116,117,118,119,120,121,122,

  91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,

  128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
  144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,
  160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
  192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
  208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
  224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,
  240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
};

static PRBool
EndsWith(const nsString& aDomain, const char* aHost, PRInt32 aHostLen)
{
  PRInt32 slen = aDomain.Length();
  if (slen < aHostLen) {
    return PR_FALSE;
  }
  const PRUnichar* uc = aDomain.GetUnicode();
  uc += slen - aHostLen;
  const PRUnichar* end = uc + aHostLen;
  while (uc < end) {
    unsigned char uch = (unsigned char) ((*uc++) & 0xff);
    unsigned char ch = (unsigned char) ((*aHost++) & 0xff);
    if (kLowerLookup[uch] != kLowerLookup[ch]) {
      return PR_FALSE;
    }
  }
  return PR_TRUE;
}

static PRBool
StartsWith(const nsString& s1, const char* s2)
{
  PRInt32 s1len = s1.Length();
  PRInt32 s2len = strlen(s2);
  if (s1len < s2len) {
    return PR_FALSE;
  }
  const PRUnichar* uc = s1.GetUnicode();
  const PRUnichar* end = uc + s2len;
  while (uc < end) {
    unsigned char uch = (unsigned char) ((*uc++) & 0xff);
    unsigned char ch = (unsigned char) ((*s2++) & 0xff);
    if (kLowerLookup[uch] != kLowerLookup[ch]) {
      return PR_FALSE;
    }
  }
  return PR_TRUE;
}

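// A URL may be loaded when its scheme is one the crawler understands and
// its host is not on the avoid list; if a safe-domain list was given, the
// host must also match one of its entries.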
PRBool
nsWebCrawler::OkToLoad(const nsString& aURLSpec)
{
  if (!StartsWith(aURLSpec, "http:") && !StartsWith(aURLSpec, "ftp:") &&
      !StartsWith(aURLSpec, "file:") &&
      !StartsWith(aURLSpec, "resource:")) {
    return PR_FALSE;
  }

  PRBool ok = PR_TRUE;
  nsIURI* url;
  nsresult rv;
  rv = NS_NewURI(&url, aURLSpec);

  if (NS_OK == rv) {
    nsXPIDLCString host;
    rv = url->GetHost(getter_Copies(host));
    if (rv == NS_OK) {
      PRInt32 hostlen = PL_strlen(host);

      // Check domains to avoid
      PRInt32 i, n = mAvoidDomains.Count();
      for (i = 0; i < n; i++) {
        nsString* s = (nsString*) mAvoidDomains.ElementAt(i);
        if (s && EndsWith(*s, host, hostlen)) {
          printf("Avoiding '");
          fputs(aURLSpec, stdout);
          printf("'\n");
          return PR_FALSE;
        }
      }

      // Check domains to stay within
      n = mSafeDomains.Count();
      if (n == 0) {
        // If we don't care then all the domains that we aren't
        // avoiding are OK
        return PR_TRUE;
      }
      for (i = 0; i < n; i++) {
        nsString* s = (nsString*) mSafeDomains.ElementAt(i);
        if (s && EndsWith(*s, host, hostlen)) {
          return PR_TRUE;
        }
      }
      ok = PR_FALSE;
    }
    NS_RELEASE(url);
  }
  return ok;
}

void
nsWebCrawler::RecordLoadedURL(const nsString& aURL)
{
  if (nsnull != mRecord) {
    fputs(aURL, mRecord);
    fputs("\n", mRecord);
    fflush(mRecord);
  }
}

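// Recursively walk the content tree looking for <a href>, <frame src> and
// <iframe src> targets, resolve each against the document URL, and queue
// any not-yet-visited URLs that pass OkToLoad().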
void
nsWebCrawler::FindURLsIn(nsIDocument* aDocument, nsIContent* aNode)
{
  nsCOMPtr<nsIAtom> atom;
  aNode->GetTag(*getter_AddRefs(atom));
  if ((atom == mLinkTag) || (atom == mFrameTag) || (atom == mIFrameTag)) {
    // Get absolute url that tag targets
    nsAutoString base, src, absURLSpec;
    if (atom == mLinkTag) {
      aNode->GetAttribute(kNameSpaceID_HTML, mHrefAttr, src);
    }
    else {
      aNode->GetAttribute(kNameSpaceID_HTML, mSrcAttr, src);
    }
    nsIURI* docURL = aDocument->GetDocumentURL();
    nsresult rv;
    rv = NS_MakeAbsoluteURI(absURLSpec, src, docURL);
    if (NS_OK == rv) {
      nsCOMPtr<nsIAtom> urlAtom = getter_AddRefs(NS_NewAtom(absURLSpec));
      if (0 == mVisited->Get(urlAtom)) {
        // Remember the URL as visited so that we don't go there again
        mVisited->Put(urlAtom, "visited");
        if (OkToLoad(absURLSpec)) {
          mPendingURLs.AppendElement(new nsString(absURLSpec));
          if (mVerbose) {
            printf("Adding '");
            fputs(absURLSpec, stdout);
            printf("'\n");
          }
        }
        else {
          if (mVerbose) {
            printf("Skipping '");
            fputs(absURLSpec, stdout);
            printf("'\n");
          }
        }
      }
      else {
        if (mVerbose) {
          printf("Already visited '");
          fputs(absURLSpec, stdout);
          printf("'\n");
        }
      }
    }
    NS_RELEASE(docURL);
  }

  PRBool canHaveKids;
  aNode->CanContainChildren(canHaveKids);
  if (canHaveKids) {
    PRInt32 i, n;
    aNode->ChildCount(n);
    for (i = 0; i < n; i++) {
      nsIContent* kid;
      aNode->ChildAt(i, kid);
      if (nsnull != kid) {
        FindURLsIn(aDocument, kid);
        NS_RELEASE(kid);
      }
    }
  }
}

void
nsWebCrawler::FindMoreURLs()
{
  nsIWebShell* shell = nsnull;
  mBrowser->GetWebShell(shell);
  nsCOMPtr<nsIDocShell> docShell(do_QueryInterface(shell));
  if (docShell) {
    nsIContentViewer* cv = nsnull;
    docShell->GetContentViewer(&cv);
    if (nsnull != cv) {
      nsIDocumentViewer* docv = nsnull;
      cv->QueryInterface(NS_GET_IID(nsIDocumentViewer), (void**) &docv);
      if (nsnull != docv) {
        nsIDocument* doc = nsnull;
        docv->GetDocument(doc);
        if (nsnull != doc) {
          nsIContent* root;
          root = doc->GetRootContent();
          if (nsnull != root) {
            FindURLsIn(doc, root);
            NS_RELEASE(root);
          }
          NS_RELEASE(doc);
        }
        NS_RELEASE(docv);
      }
      NS_RELEASE(cv);
    }
    NS_RELEASE(shell);
  }
}

void
nsWebCrawler::SetBrowserWindow(nsBrowserWindow* aWindow)
{
  NS_IF_RELEASE(mBrowser);
  mBrowser = aWindow;
  NS_IF_ADDREF(mBrowser);
}

void
nsWebCrawler::GetBrowserWindow(nsBrowserWindow** aWindow)
{
  NS_IF_ADDREF(mBrowser);
  *aWindow = mBrowser;
}

static void
TimerCallBack(nsITimer *aTimer, void *aClosure)
{
  nsWebCrawler* wc = (nsWebCrawler*) aClosure;
  wc->LoadNextURL(PR_TRUE);
}

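// Take the next pending URL and start loading it, either directly or via a
// queued event; honors any per-URL "delay:=" directive and the mMaxPages
// budget.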
void
nsWebCrawler::LoadNextURL(PRBool aQueueLoad)
{
  nsString* url = (nsString*) mPendingURLs.ElementAt(0);
  if (nsnull != url) {
    PRInt32 delay = GetDelay(*url);
    if (delay >= 0) {
      SetDelay(delay);
      mPendingURLs.RemoveElementAt(0);
      char buf[128];
      url->ToCString(&buf[0], 128);
      printf("%s\n", buf);
    }
  }

  if ((0 != mDelay) && (mPendingURLs.Count() > 0)) {
    mTimer = do_CreateInstance("@mozilla.org/timer;1");
    mTimer->Init(TimerCallBack, (void *)this, mDelay * 1000);
  }

  if ((mMaxPages < 0) || (mMaxPages > 0)) {
    while (0 != mPendingURLs.Count()) {
      nsString* url = (nsString*) mPendingURLs.ElementAt(0);
      mPendingURLs.RemoveElementAt(0);
      if (nsnull != url) {
        if (OkToLoad(*url)) {
          RecordLoadedURL(*url);
          nsIWebShell* webShell;
          mBrowser->GetWebShell(webShell);
          if (aQueueLoad) {
            // Call stop to cancel any pending URL Refreshes...
            /// webShell->Stop();
            QueueLoadURL(*url);
          }
          else {
            mCurrentURL = *url;
            mStartLoad = PR_Now();
            nsCOMPtr<nsIWebNavigation> webNav(do_QueryInterface(webShell));
            webNav->LoadURI(url->GetUnicode(), nsIWebNavigation::LOAD_FLAGS_NONE);
          }
          NS_RELEASE(webShell);

          if (mMaxPages > 0) {
            --mMaxPages;
          }
          delete url;
          return;
        }
        delete url;
      }
    }
  }

  if (nsnull != mRecord) {
    fclose(mRecord);
    mRecord = nsnull;
  }
}

nsIPresShell*
nsWebCrawler::GetPresShell(nsIWebShell* aWebShell)
{
  nsIWebShell* webShell = aWebShell;
  if (webShell) {
    NS_ADDREF(webShell);
  }
  else {
    mBrowser->GetWebShell(webShell);
  }
  nsIPresShell* shell = nsnull;
  nsCOMPtr<nsIDocShell> docShell(do_QueryInterface(webShell));
  if (nsnull != webShell) {
    nsIContentViewer* cv = nsnull;
    docShell->GetContentViewer(&cv);
    if (nsnull != cv) {
      nsIDocumentViewer* docv = nsnull;
      cv->QueryInterface(NS_GET_IID(nsIDocumentViewer), (void**) &docv);
      if (nsnull != docv) {
        nsIPresContext* cx;
        docv->GetPresContext(cx);
        if (nsnull != cx) {
          cx->GetShell(&shell);
          NS_RELEASE(cx);
        }
        NS_RELEASE(docv);
      }
      NS_RELEASE(cv);
    }
    NS_RELEASE(webShell);
  }
  return shell;
}

static FILE*
OpenRegressionFile(const nsString& aBaseName, const nsString& aOutputName)
{
  nsAutoString a;
  a.Append(aBaseName);
  a.AppendWithConversion("/");
  a.Append(aOutputName);
  char* fn = a.ToNewCString();
  FILE* fp = fopen(fn, "r");
  if (!fp) {
    printf("Unable to open regression data file %s\n", fn);
  }
  delete[] fn;
  return fp;
}

#define BUF_SIZE 1024
// Load up both data files (original and the one we just output) into
// two independent xml content trees. Then compare them.
void
nsWebCrawler::PerformRegressionTest(const nsString& aOutputName)
{
  // First load the trees
  nsIFrameUtil* fu;
  nsresult rv = nsComponentManager::CreateInstance(kFrameUtilCID, nsnull,
                                                   kIFrameUtilIID, (void **)&fu);
  if (NS_FAILED(rv)) {
    printf("Can't find nsIFrameUtil implementation\n");
    return;
  }
  FILE* f1 = OpenRegressionFile(mRegressionDir, aOutputName);
  if (!f1) {
    NS_RELEASE(fu);
    return;
  }
  FILE* f2 = OpenRegressionFile(mOutputDir, aOutputName);
  if (!f2) {
    fclose(f1);
    NS_RELEASE(fu);
    return;
  }
  rv = fu->CompareRegressionData(f1, f2);
  NS_RELEASE(fu);

  char dirName[BUF_SIZE];
  char fileName[BUF_SIZE];
  mOutputDir.ToCString(dirName, BUF_SIZE-1);
  aOutputName.ToCString(fileName, BUF_SIZE-1);

  printf("regression test %s%s %s\n", dirName, fileName, NS_SUCCEEDED(rv) ? "passed" : "failed");
}

//----------------------------------------------------------------------

static NS_DEFINE_IID(kEventQueueServiceCID, NS_EVENTQUEUESERVICE_CID);
static NS_DEFINE_IID(kIEventQueueServiceIID, NS_IEVENTQUEUESERVICE_IID);

static nsresult
QueueEvent(PLEvent* aEvent)
{
  nsISupports* is;
  nsresult rv = nsServiceManager::GetService(kEventQueueServiceCID,
                                             kIEventQueueServiceIID,
                                             &is,
                                             nsnull);
  if (NS_FAILED(rv)) {
    return rv;
  }

  nsCOMPtr<nsIEventQueueService> eqs = do_QueryInterface(is);
  if (eqs) {
    nsCOMPtr<nsIEventQueue> eq;
    rv = eqs->GetThreadEventQueue(NS_CURRENT_THREAD, getter_AddRefs(eq));
    if (eq) {
      eq->PostEvent(aEvent);
    }
  }

  nsServiceManager::ReleaseService(kEventQueueServiceCID, is, nsnull);
  return rv;
}

//----------------------------------------------------------------------

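// Exit and URL-load requests are marshalled through the thread's PLEvent
// queue so that they run from the event loop rather than from inside a
// document-loader callback.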
struct ExitEvent : public PLEvent {
  ExitEvent(nsWebCrawler* aCrawler);
  ~ExitEvent();

  void DoIt() {
    crawler->Exit();
  }

  nsWebCrawler* crawler;

  static void PR_CALLBACK HandleMe(ExitEvent* e);
  static void PR_CALLBACK DeleteMe(ExitEvent* e);
};

ExitEvent::ExitEvent(nsWebCrawler* aCrawler)
  : crawler(aCrawler)
{
  PL_InitEvent(this, crawler, (PLHandleEventProc) HandleMe,
               (PLDestroyEventProc) DeleteMe);
  NS_ADDREF(aCrawler);
}

ExitEvent::~ExitEvent()
{
  NS_RELEASE(crawler);
}

void
ExitEvent::HandleMe(ExitEvent* e)
{
  e->DoIt();
}

void
ExitEvent::DeleteMe(ExitEvent* e)
{
  delete e;
}

void
nsWebCrawler::QueueExit()
{
  ExitEvent* event = new ExitEvent(this);
  QueueEvent(event);
}

void
nsWebCrawler::Exit()
{
  mViewer->Exit();
}

//----------------------------------------------------------------------

struct LoadEvent : public PLEvent {
  LoadEvent(nsWebCrawler* aCrawler, const nsString& aURL);
  ~LoadEvent();

  void DoIt() {
    crawler->GoToQueuedURL(url);
  }

  nsString url;
  nsWebCrawler* crawler;

  static void PR_CALLBACK HandleMe(LoadEvent* e);
  static void PR_CALLBACK DeleteMe(LoadEvent* e);
};

LoadEvent::LoadEvent(nsWebCrawler* aCrawler, const nsString& aURL)
  : url(aURL),
    crawler(aCrawler)
{
  PL_InitEvent(this, crawler, (PLHandleEventProc) HandleMe,
               (PLDestroyEventProc) DeleteMe);
  NS_ADDREF(aCrawler);
}

LoadEvent::~LoadEvent()
{
  NS_RELEASE(crawler);
}

void
LoadEvent::HandleMe(LoadEvent* e)
{
  e->DoIt();
}

void
LoadEvent::DeleteMe(LoadEvent* e)
{
  delete e;
}

void
nsWebCrawler::GoToQueuedURL(const nsString& aURL)
{
  nsIWebShell* webShell;
  mBrowser->GetWebShell(webShell);
  nsCOMPtr<nsIWebNavigation> webNav(do_QueryInterface(webShell));
  if (webNav) {
    mCurrentURL = aURL;
    mStartLoad = PR_Now();
    webNav->LoadURI(aURL.GetUnicode(), nsIWebNavigation::LOAD_FLAGS_NONE);
    NS_RELEASE(webShell);
  }
  mQueuedLoadURLs--;
}

nsresult
nsWebCrawler::QueueLoadURL(const nsString& aURL)
{
  LoadEvent* event = new LoadEvent(this, aURL);
  nsresult rv = QueueEvent(event);
  if (NS_SUCCEEDED(rv)) {
    mQueuedLoadURLs++;
  }
  return rv;
}