//-*- coding: utf-8 -*- #include "gb-include.h" #include "hash.h" #include "XmlDoc.h" #include "Indexdb.h" // for TERMID_MASK definition and g_indexdb.getTermId() #include "Conf.h" #include "Query.h" // getFieldCode() #include "Clusterdb.h" // g_clusterdb #include "Categories.h" // g_categories #include "iana_charset.h" //#include "Checksumdb.h" //#include "Msg24.h" #include "Stats.h" #include "Sanity.h" #include "Speller.h" #include "CountryCode.h" //#include "SiteBonus.h" #include "linkspam.h" #include "Tagdb.h" //#include "Dates.h" #include "Repair.h" //#include "Links.h" #include "HashTableX.h" #include "LanguageIdentifier.h" // g_langId #include "CountryCode.h" // g_countryCode #include "sort.h" #include "Wiki.h" #include "Speller.h" #include "SiteGetter.h" #include "Placedb.h" #include "Test.h" #include "Synonyms.h" //#include "Revdb.h" #include "Timedb.h" #ifdef _USETURKS_ //#include "PageTurk.h" #endif #include "PageInject.h" #include "HttpServer.h" #include "Facebook.h" #include "Posdb.h" #include "Highlight.h" #include "Wiktionary.h" #include "seo.h" // Msg99Request etc. //#include #include "PingServer.h" #include "Parms.h" extern int g_inMemcpy; #define MAXDOCLEN (1024*1024) HashTableX *g_ct = NULL; XmlDoc *g_doc = NULL; char *g_ptr = NULL; int32_t *g_int32_t = NULL; #define SENT_UNITS 30 static int32_t getIsContacty ( Url *url , LinkInfo *info1 , int32_t hops , uint8_t ct , bool isRoot , int32_t niceness ); static int32_t getTopGigabits ( HashTableX *ht , GigabitInfo **top , int32_t max , int32_t minDocCount ) ; static void getWordToPhraseRatioWeights ( int64_t pid1 , // pre phrase int64_t wid1 , int64_t pid2 , int64_t wid2 , // post word float *ww , HashTableX *tt1 , int32_t titleRecVersion ) ; static bool addGigabit ( HashTableX *ht , char *s , int32_t slen , int64_t docId , Section *sp , bool singleWord , uint8_t langId , // starts with word #i int32_t i , int32_t ptsArg = -1 ) ; static bool getWordPosVec ( Words *words , Sections *sections, //int32_t wordStart, //int32_t wordEnd, int32_t startDist, char *fragVec, int32_t niceness , SafeBuf *wpos ) ; static void getMetaListWrapper ( void *state ) ; char *getFirstJSONObject ( char *p , int32_t niceness , bool *isProduct , bool *isImage ) ; char *getJSONObjectEnd ( char *p , int32_t niceness ) ; XmlDoc::XmlDoc() { m_esbuf.setLabel("exputfbuf"); for ( int32_t i = 0 ; i < MAX_XML_DOCS ; i++ ) m_xmlDocs[i] = NULL; m_freed = false; m_contentInjected = false; m_wasContentInjected = false; //m_coll = NULL; m_ubuf = NULL; m_pbuf = NULL; //m_contactDoc = NULL; m_rootDoc = NULL; m_oldDoc = NULL; m_dx = NULL; m_printedMenu = false; // reset all *valid* flags to false void *p = &m_VALIDSTART; void *pend = &m_VALIDEND; memset ( p , 0 , (char *)pend - (char *)p );//(int32_t)pend-(int32_t)p m_msg22Request.m_inUse = 0; m_msg4Waiting = false; m_msg4Launched = false; //m_sectiondbData = NULL; //m_placedbData = NULL; m_dupTrPtr = NULL; m_oldTitleRec = NULL; m_filteredContent = NULL; m_filteredContentAllocSize = 0; m_metaList = NULL; m_metaListSize = 0; m_metaListAllocSize = 0; //m_titleRec = NULL; //m_freeTitleRec = true; m_rootTitleRec = NULL; m_outlinkHopCountVector = NULL; //m_gsbuf = NULL; m_extraDoc = NULL; m_ahrefsDoc = NULL; m_wikiqbuf = NULL; //m_cr = NULL; //m_msg3aArray = NULL; m_msg3a = NULL; m_query3a = NULL; //m_numMsg99Replies = 0; m_numMsg95Replies = 0; m_seoSocket = NULL; m_hackSocket = NULL; m_doingSEO = false; //m_newxd = NULL; //m_newxd2 = NULL; //m_newMsg20 = NULL; m_registeredSocketCallback = false; //m_numMsg98Requests = 
0; //m_numMsg98Replies = 0; m_numMsg8eReplies = 0; m_numMsg8eRequests = 0; m_tempMsg25Page = NULL; m_tempMsg25Site = NULL; m_numLinkRequestsOut = 0; m_numLinkRequestsIn = 0; m_numMsg3fReplies = 0; m_numMsg3fRequests = 0; m_numMsg4fRequests = 0; m_numMsg4fReplies = 0; m_sentMsg4fRequests = false; //m_notifyBlocked = 0; //m_mcasts = NULL; //for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) // m_currentBinPtrs[i] = NULL; reset(); }; XmlDoc::~XmlDoc() { setStatus("freeing this xmldoc"); reset(); m_freed = true; }; static int64_t s_lastTimeStart = 0LL; void XmlDoc::reset ( ) { m_isImporting = false; m_printedMenu = false; // for hashing CT_STATUS docs consistently, this might be invalid // so call it 0 m_pubDate = 0; m_tmpBuf2.purge(); m_gotFacets = false; m_bodyStartPos = 0; m_mcastArray = NULL; m_skipIframeExpansion = false; m_indexedTime = 0; m_didDelete = false; m_metaList2.purge(); m_zbuf.purge(); m_kbuf.purge(); m_mySiteLinkInfoBuf.purge(); m_myPageLinkInfoBuf.purge(); m_myTempLinkInfoBuf.purge(); // reset count for nukeJSONObjects() function m_joc = 0; // notifications pending? //if ( m_notifyBlocked ) { char *xx=NULL;*xx=0; } m_sentToDiffbot = 0; m_gotDiffbotSuccessfulReply = 0; m_loaded = false; m_msg4Launched = false; m_diffbotReplyError = 0; m_diffbotJSONCount = 0; //m_downloadAttempted = false; m_incrementedAttemptsCount = false; m_incrementedDownloadCount = false; if ( m_dx ) { mdelete ( m_dx , sizeof(XmlDoc), "xddx" ); delete ( m_dx ); m_dx = NULL; //log("diffbot: deleting m_dx2"); } m_isDiffbotJSONObject = false; m_dmozBuf.purge(); m_fakeIpBuf.purge(); m_fakeTagRecPtrBuf.purge(); m_tlbufTimer = 0LL; m_gsbuf.reset(); //m_launchedAll = false; m_qstringTable.reset(); //m_setForReplyPtrs = false; //m_setForLinkPtrs = false; // must be none outstanding if ( m_numMsg3fReplies != m_numMsg3fRequests ) { char *xx=NULL;*xx=0;} if ( m_numMsg4fReplies != m_numMsg4fRequests ) { char *xx=NULL;*xx=0;} m_numMsg4fRequests = 0; m_numMsg4fReplies = 0; m_sentMsg4fRequests = false; // free table's mem if used //m_tmpDupTable.reset(); //m_newxd2Blocked = false; m_lastPrintedDocId = 0LL; m_loggedMsg3 = false; m_progressBar = 0; m_triedToAddWordPosInfoToCachedb = false; if ( m_numLinkRequestsOut > m_numLinkRequestsIn ){char *xx=NULL;*xx=0;} m_doConsistencyTesting = g_conf.m_doConsistencyTesting; m_computedMetaListCheckSum = false; m_msg3aErrno = 0; m_hadMatchError = 0; m_clientClosed = false; m_lastCheckTime = 0; m_calledMsg25ForSite = false; m_calledMsg25ForPage = false; m_checkedCachedbForSite = false; m_checkedCachedbForPage = false; m_allHashed = false; // nuke it if ( m_tempMsg25Page ) { mdelete ( m_tempMsg25Page , sizeof(Msg25), "m25li" ); delete ( m_tempMsg25Page ); m_tempMsg25Page = NULL; } if ( m_tempMsg25Site ) { mdelete ( m_tempMsg25Site , sizeof(Msg25), "m25li" ); delete ( m_tempMsg25Site ); m_tempMsg25Site = NULL; } m_numLinkRequestsOut = 0; m_seoDebug = 0; //m_seoInfoSetFromCache = false; m_checkedCachedb = false; m_processedCachedbReply = false; m_cacheList.freeList(); for ( int32_t i = 0; m_numMsg8eReplies && i < g_hostdb.m_numHosts;i++) { if ( ! m_msg8eReply[i] ) continue; mfree ( m_msg8eReply[i] , m_msg8eReplySize[i] , "8erep" ); m_msg8eReply[i] = NULL; } m_numMsg8eRequests = 0; m_numMsg8eReplies = 0; for ( int32_t i = 0; m_numMsg95Replies && i < g_hostdb.m_numHosts;i++) { if ( ! 
m_msg95ReplyPtrs[i] ) continue; mfree ( m_msg95ReplyPtrs[i] , m_msg95ReplySizes[i] , "95rep" ); m_msg95ReplyPtrs[i] = NULL; } m_numMsg95Replies = 0; m_numMsg3fRequests = 0; m_numMsg3fReplies = 0; m_qcursor = 0; //m_binError = 0; //m_msg98ReplyError = 0; //m_binErrorForReplyPtrs = 0; //m_binErrorForLinkPtrs = 0; //m_msg17.reset(); //m_triedCache = false; //m_cacheRec = NULL; //m_cacheRecSize = 0; // reset this crap m_beginTimeAllMatch = 0LL; m_beginTimeMatchUrl = 0LL; m_beginTimeFullQueries = 0LL; m_beginTimeLinks = 0LL; //m_beginMsg98s = 0LL; m_beginRelatedQueries = 0LL; m_doledbKey.n0 = 0LL; m_doledbKey.n1 = 0; // sanity check, any outstanding? //if( m_numMsg98Requests != m_numMsg98Replies ) { char *xx=NULL;*xx=0;} // reset them now //m_numMsg98Requests = 0; //m_numMsg98Replies = 0; //if ( m_newxd ) { // mdelete ( m_newxd , sizeof(XmlDoc),"newxd"); // delete ( m_newxd ); // m_newxd = NULL; //} //if ( m_newxd2 ) { // mdelete ( m_newxd2 , sizeof(XmlDoc),"newxd2"); // delete ( m_newxd2 ); // m_newxd2 = NULL; //} /* if ( m_newMsg20 ) { mdelete ( m_newMsg20 , sizeof(Msg20),"newmsg20"); delete ( m_newMsg20 ); m_newMsg20 = NULL; }*/ /* NO! we use this for clientClosedConnection() function now if ( m_seoSocket ) { TcpServer *tcp = m_seoSocket->m_this; // gotta set this so it can be destroyed and closed m_seoSocket->m_waitingOnHandler = false; tcp->destroySocket ( m_seoSocket ); m_seoSocket = NULL; } */ if ( m_registeredSocketCallback ) { char *xx=NULL; *xx=0; } //for ( int32_t i = 0 ; i < m_numMsg99Replies ; i++ ) { // if ( ! m_msg99ReplyPtrs[i] ) continue; // mfree ( m_msg99ReplyPtrs [i] , // m_msg99ReplyAlloc[i] , // "m99reply" ); //} //m_numMsg99Replies = 0; //m_sentMsg99Requests = false; if ( m_msg3a ) { mdelete ( m_msg3a , sizeof(Msg3a) , "xdmsg3a" ); delete ( m_msg3a ); m_msg3a = NULL; } if ( m_query3a ) { mdelete ( m_query3a , sizeof(Query),"xdqry3a"); delete ( m_query3a ); m_query3a = NULL; } //m_twbuf.purge(); m_topMatchingQueryBuf.purge(); //m_queryPtrs.purge(); m_queryOffsets.purge(); m_extraQueryBuf.purge(); //m_socketWriteBuf.purge(); m_relatedDocIdBuf.purge(); m_relatedTitleBuf.purge(); m_commonQueryNumBuf.purge(); m_queryLinkBuf.purge(); //m_relatedQueryLinksIntersected.purge(); m_queryLinkStringBuf.purge(); //m_queryRelBuf.purge(); //m_relPtrs.purge(); m_sortedPosdbListBuf.purge(); m_wpSortedPosdbListBuf.purge(); m_termListBuf.purge(); m_insertableTermsBuf.purge(); //m_iwfiBuf.purge(); m_wordPosInfoBuf.purge(); //m_msg20ReplyPtrBuf.purge(); m_recommendedLinksBuf.purge(); m_tmpMsg0Buf.purge(); m_msg20Array.purge(); m_newLinkerBuf.purge(); //m_msg99ReplyBuf.purge(); m_matchingQueryBuf.purge(); m_relatedQueryBuf.purge(); m_queryLinkBuf.purge(); m_matchingQueryStringBuf.purge(); m_relatedQueryStringBuf.purge(); m_queryLinkStringBuf.purge(); m_docIdListBuf.purge(); m_queryChangeBuf.purge(); m_queryLogBuf.purge(); //m_itStrBuf.purge(); m_debugScoreInfoBuf.purge(); m_origScoreInfoBuf.purge(); m_msg20Buf.purge(); m_topDocIdsBuf.purge(); m_missingTermBuf.purge(); m_termInfoBuf.purge(); m_newTermInfoBuf.purge(); m_matchingTermBuf.purge(); m_termId32Buf.purge(); m_storeList.freeList(); //m_queryHashTable.reset(); m_tidTable32.reset(); m_queryOffsetTable.reset(); m_tmpTable.reset(); m_fullQueryDedup.reset(); //m_dupVotes.reset(); m_wordSpamBuf.purge(); m_fragBuf.purge(); m_downloadLevel = 0; for ( int32_t i = 0 ; i < MAX_XML_DOCS ; i++ ) { if ( ! 
m_xmlDocs[i] ) continue; mdelete ( m_xmlDocs[i] , sizeof(XmlDoc), "xdarr" ); delete ( m_xmlDocs[i] ); m_xmlDocs[i] = NULL; } s_lastTimeStart = 0LL; m_req = NULL; m_doneWithAhrefs = false; m_useAhrefs = false; m_linkDedupTablePtr = NULL; m_domDedupTablePtr = NULL; m_storeTermListInfo = false; m_gotDupStats = false; //m_nextSection = (Section *)-1; m_si = (Section *)-1; // for limiting # of iframe tag expansions m_numExpansions = 0; // . are not allowed to exit if waiting for msg4 to complete // . yes we are, it should be saved as addsinprogress.dat if ( m_msg4Waiting ) { log("doc: resetting xmldoc with outstanding msg4. should " "me saved in addsinprogress.dat. docid=%"UINT64"",m_docId); //char *xx=NULL;*xx=0; } } m_ei = 0; m_lastLaunch = -1; m_pbuf = NULL; m_wts = NULL; m_deleteFromIndex = false; //if ( m_contactDocValid ) nukeDoc ( m_contactDoc ); if ( m_rootDocValid ) nukeDoc ( m_rootDoc ); if ( m_oldDocValid ) nukeDoc ( m_oldDoc ); if ( m_extraDocValid ) nukeDoc ( m_extraDoc ); if ( m_ahrefsDocValid ) nukeDoc ( m_ahrefsDoc ); if ( m_linkInfo1Valid && ptr_linkInfo1 && m_freeLinkInfo1 ) { // it now points into m_myPageLinkInfoBuf ! //mfree ( ptr_linkInfo1 , size_linkInfo1, "LinkInfo1"); ptr_linkInfo1 = NULL; m_linkInfo1Valid = false; } if ( m_linkInfo2Valid && ptr_linkInfo2 && m_freeLinkInfo2 ) { // should point into a safebuf as well //mfree ( ptr_linkInfo2 , size_linkInfo2, "LinkInfo2"); ptr_linkInfo2 = NULL; m_linkInfo2Valid = false; } if ( m_rawUtf8ContentValid && m_rawUtf8Content && !m_setFromTitleRec // was content supplied by pageInject.cpp? //! m_contentInjected ) { ) { mfree ( m_rawUtf8Content, m_rawUtf8ContentAllocSize,"Xml3"); } // reset this m_contentInjected = false; m_rawUtf8ContentValid = false; m_wasContentInjected = false; m_rootDoc = NULL; // if this is true, then only index if new m_newOnly = 0; //if ( m_sectiondbData ) { // mfree ( m_sectiondbData , m_sectiondbDataSize ,"sdbdata" ); // m_sectiondbData = NULL; //} //if ( m_placedbData ) { // mfree ( m_placedbData , m_placedbDataSize ,"pdbdata" ); // m_placedbData = NULL; //} if ( m_httpReplyValid && m_httpReply ) { mfree(m_httpReply,m_httpReplyAllocSize,"httprep"); m_httpReply = NULL; m_httpReplyValid = false; } if ( m_filteredContentAllocSize ) { mfree (m_filteredContent,m_filteredContentAllocSize,"xdfc"); m_filteredContent = NULL; m_filteredContentAllocSize = 0; } //if ( m_utf8ContentValid && ! 
m_setFromTitleRec && ptr_utf8Content ) // mfree ( ptr_utf8Content , m_utf8ContentAllocSize,"Xml3"); if ( m_metaList ) { // m_metaListValid && m_metaList ) { mfree ( m_metaList , m_metaListAllocSize , "metalist"); m_metaList = NULL; m_metaListSize = 0; m_metaListAllocSize = 0; } if ( m_ubuf ) { mfree ( m_ubuf , m_ubufAlloc , "ubuf"); m_ubuf = NULL; } //if ( m_freeTitleRec && m_titleRec ) { // && m_titleRecValid ) { // mfree ( m_titleRec , m_titleRecAllocSize , "trec" ); //} //m_titleRec = NULL; m_titleRecBuf.purge(); if ( m_dupTrPtr ) { mfree ( m_dupTrPtr , m_dupTrSize , "trecd" ); m_dupTrPtr = NULL; } if ( m_oldTitleRecValid && m_oldTitleRec ) { mfree ( m_oldTitleRec , m_oldTitleRecSize , "treca" ); m_oldTitleRec = NULL; m_oldTitleRecValid = false; } if ( m_rootTitleRecValid && m_rootTitleRec ) { mfree ( m_rootTitleRec , m_rootTitleRecSize , "treca" ); m_rootTitleRec = NULL; m_rootTitleRecValid = false; } if ( m_outlinkHopCountVectorValid && m_outlinkHopCountVector ) { int32_t sz = m_outlinkHopCountVectorSize; mfree ( m_outlinkHopCountVector,sz,"ohv"); } m_outlinkHopCountVector = NULL; //if ( m_gsbufValid && m_gsbuf ) { // mfree ( m_gsbuf , m_gsbufAllocSize , "gsbuf" ); //} //m_gsbuf = NULL; m_gsbuf.reset(); // reset all *valid* flags to false void *p = &m_VALIDSTART; void *pend = &m_VALIDEND; memset ( p , 0 , (char *)pend - (char *)p ); m_hashedMetas = false; m_mcastBuf.purge(); m_serpBuf.purge(); // Doc.cpp: m_mime.reset(); m_words.reset(); m_phrases.reset(); m_bits.reset(); m_sections.reset(); //m_weights.reset(); m_countTable.reset(); m_dates.reset(); m_addresses.reset(); // other crap m_xml.reset(); m_links.reset(); m_bits2.reset(); m_pos.reset(); //m_synonyms.reset(); m_synBuf.reset(); //m_nsvt.reset(); //m_osvt.reset(); m_turkVotingTable.reset(); m_turkBitsTable.reset(); m_vtr.reset(); m_rdtab.reset(); m_vctab.reset(); m_vcduptab.reset(); m_images.reset(); m_countTable.reset(); m_mime.reset(); m_tagRec.reset(); m_newTagBuf.reset(); m_catRec.reset(); m_clockCandidatesTable.reset(); m_cctbuf.reset(); m_dupList.reset(); //m_oldMetaList.reset(); m_msg8a.reset(); //m_siteLinkInfo.reset(); //m_msg25.reset(); //m_msgc.reset(); m_msg13.reset(); m_tmpsb1.reset(); m_tmpsb2.reset(); m_turkBuf.reset(); m_msg0b.reset(); //m_siteGetter.reset(); m_msge0.reset(); m_msge1.reset(); m_reply.reset(); // mroe stuff skipped m_wtsTable.reset(); m_wbuf.reset(); m_pageLinkBuf.reset(); m_siteLinkBuf.reset(); m_esbuf.reset(); m_xbuf.reset(); m_tagRecBuf.reset(); //m_titleRec = NULL; //m_titleRecSize = 0; // origin of this XmlDoc m_setFromTitleRec = false; m_setFromUrl = false; m_setFromDocId = false; m_setFromSpiderRec = false; m_freeLinkInfo1 = false; m_freeLinkInfo2 = false; m_checkedUrlFilters = false; m_indexCode = 0; m_masterLoop = NULL; m_masterState = NULL; //m_isAddUrl = false; m_isInjecting = false; m_useFakeMime = false; m_useSiteLinkBuf = false; m_usePageLinkBuf = false; m_printInXml = false; m_check1 = false; m_check2 = false; m_prepared = false; // keep track of updates to the rdbs we have done, so we do not re-do m_listAdded = false; m_listFlushed = false; m_updatedCounts = false; m_updatedCounts2 = false; //m_updatedTagdb1 = false; //m_updatedTagdb2 = false; //m_updatedTagdb3 = false; //m_updatedTagdb4 = false; //m_updatedTagdb5 = false; m_copied1 = false; m_updatingSiteLinkInfoTags = false; m_addressSetCalled = false; m_hashedTitle = false; m_registeredSleepCallback = false; m_addedNegativeDoledbRec = false; m_numRedirects = 0; m_numOutlinksAdded = 0; // . use sameDomain and sameIp waits? // . 
these may be bypassed in getContactDoc() //m_throttleDownload = true; m_spamCheckDisabled = false; m_useRobotsTxt = true; m_redirectFlag = false; // Scraper.cpp sets this to true //m_isScraping = false; m_allowSimplifiedRedirs = false; //m_calledMsg22a = false; //m_calledMsg22b = false; //m_calledMsg22c = false; m_didDelay = false; m_didDelayUnregister = false; m_calledMsg22d = 0LL; m_calledMsg22e = false; m_calledMsg22f = false; m_calledMsg25 = false; m_calledMsg25b = false; m_calledMsg40 = false; m_calledSections = false; m_calledThread = false; m_alreadyRegistered = false; m_loaded = false; m_firstEntry = true; m_firstEntry2 = true; m_launchedSpecialMsg8a = false; m_launchedMsg8a2 = false; m_numSectiondbReads = 0; m_numSectiondbNeeds = 0; m_sectiondbRecall = 0; //m_triedVoteCache = false; //m_storedVoteCache = false; m_setTr = false; //m_checkedRobots = false; m_triedTagRec = false; m_didGatewayPage = false; m_didQuickDupCheck = false; m_calledMsg8b = false; m_recycleContent = false; //m_loadFromOldTitleRec = false; m_callback1 = NULL; m_callback2 = NULL; m_state = NULL; // used for getHasContactInfo() m_processed0 = false; m_hasContactInfo = false; m_hasContactInfo2 = false; //m_checkForRedir = true; m_processedLang = false; m_doingConsistencyCheck = false; // used for getting contact info //m_triedRoot = false; //m_winner = -2; // tell Msg13 to just call HttpServer::getDoc() and not to forward // the download request to another host. although this does not // exclude possible forwarding it to a compression proxy if // g_conf.m_useCompressionProxy is set m_forwardDownloadRequest = false; m_isChildDoc = false; // for utf8 content functions m_savedp = NULL; m_oldp = NULL; m_didExpansion = false; // Repair.cpp now explicitly sets these to false if needs to m_usePosdb = true; //m_useDatedb = true; m_useClusterdb = true; m_useLinkdb = true; m_useSpiderdb = true; m_useTitledb = true; m_useTagdb = true; m_usePlacedb = true; //m_useTimedb = true; // only use for custom crawls for now to save disk space m_useSectiondb = false; //m_useRevdb = true; m_useSecondaryRdbs = false; //m_useIpsTxtFile = true; // used by Msg13.cpp only. kinda a hack. m_isSpiderProxy = false; // do not cache the http reply in msg13 etc. m_maxCacheAge = 0; // reset these ptrs too! void *px = &ptr_firstUrl; void *pxend = &size_firstUrl; memset ( px , 0 , (char *)pxend - (char *)px ); } // . set the url with the intention of adding it or deleting it from the index // . Msg7 and Repair.cpp can also set other members of XmlDoc rather than just // m_firstUrl. they can provide the ip, the http reply, content, filtered // content, the forced next spider time and the forced first indexed date, // the hop count // . they might also want to skip deduping, or any algo deemed unnecessary // by setting, for instance, m_isDupValid = true, or something bool XmlDoc::set1 ( char *url , char *coll , SafeBuf *pbuf , int32_t niceness ) { reset(); // this is true m_setFromUrl = true; //m_coll = coll; m_pbuf = pbuf; m_niceness = niceness; m_version = TITLEREC_CURRENT_VERSION; m_versionValid = true; // sanity check if ( m_niceness == 0 ) { char *xx=NULL; *xx=0; } // copy this in case collection gets deleted i guess... //m_forceDelete = forceDelete; // did we get this url from PageAddUrl? //m_isAddUrl = isAddUrl; // set m_indexCode so that XmlDoc::indexDoc() will delete it //if ( forceDelete ) m_indexCode = EDOCFORCEDELETE; // set this important member var //cr = g_collectiondb.getRec ( m_coll , gbstrlen(m_coll) ); //if ( ! 
cr ) return false; if ( ! setCollNum ( coll ) ) return false; setFirstUrl ( url , false ); //setSpideredTime(); return true; } char *XmlDoc::getTestDir ( ) { CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // return NULL if we are not the "qatest123" collection if ( strcmp(cr->m_coll,"qatest123") ) return NULL; // if Test.cpp explicitly set SpiderRequest::m_useTestSpiderDir bit // then return "test-spider" otherwise... if ( m_sreqValid && m_sreq.m_useTestSpiderDir ) return "qa";//"test-spider"; // ... default to "test-parser" //return "test-parser"; return "qa"; /* if ( getIsPageParser() ) return "test-page-parser"; //if ( m_sreqValid && m_sreq.m_isInjecting ) // return "test-page-inject"; else if ( g_conf.m_testParserEnabled ) return "test-parser"; else if ( g_conf.m_testSpiderEnabled ) return "test-spider"; // default to being from PageInject return "test-page-inject"; */ //else { char *xx=NULL;*xx=0; } //return NULL; } int32_t XmlDoc::getSpideredTime ( ) { // stop if already set if ( m_spideredTimeValid ) return m_spideredTime; // tmp var int32_t date = 0; CollectionRec *cr = getCollRec(); if ( ! cr ) return 0; // if not test collection keep it simple if ( strcmp(cr->m_coll,"qatest123") ) { // . set spider time to current time // . this might already be valid if we set it in // getTestSpideredDate() m_spideredTime = getTimeGlobal(); m_spideredTimeValid = true; return m_spideredTime; } char *testDir = getTestDir(); // get url Url *cu = getCurrentUrl(); if ( ! cu || cu == (void *)-1 ) { char *xx=NULL;*xx=0; } // this returns false if not in there, in which case, add it if ( ! getTestSpideredDate(cu,&date,testDir) ) { m_spideredTime = getTimeGlobal(); m_spideredTimeValid = true; addTestSpideredDate ( cu , m_spideredTime , testDir ); return m_spideredTime; } // if we are injecting into the test coll for the 2nd+ time // we need to use the spidered date from the first time we // injected the doc in order to ensure things are parsed // exactly the same way since some things depend on the // spideredTime, like Dates (for setting "in future" // flags) m_spideredTimeValid = true; m_spideredTime = date; // hack for test coll which has fake vals for these because // the SpiderRequest::m_addedTime and m_parentPrevSpiderTime //m_minPubDate = m_spideredTime - 48*3600; //m_maxPubDate = m_spideredTime - 24*3600; return m_spideredTime; } // . we need this so PageGet.cpp can get the cached web page // . but not for Msg20::getSummary(), that uses XmlDoc::set(Msg20Request*) // . returns false and sets g_errno on error bool XmlDoc::set3 ( int64_t docId , char *coll , int32_t niceness ) { reset(); // this is true m_setFromDocId = true; m_docId = docId; m_docIdValid = true; //m_coll = coll; m_niceness = niceness; // . sanity check // . why can't we allow this??? MDW //if ( m_niceness == 0 ) { char *xx=NULL; *xx=0; } // set this important member var //cr = g_collectiondb.getRec ( m_coll , gbstrlen(m_coll) ); //if ( ! cr ) { m_errno = ENOCOLLREC; return false; } if ( ! setCollNum ( coll ) ) return false; // solidify some parms //m_eliminateMenus = cr->m_eliminateMenus; //m_eliminateMenusValid = true; return true; } void loadFromOldTitleRecWrapper ( void *state ) { XmlDoc *THIS = (XmlDoc *)state; // make sure has not been freed from under us! if ( THIS->m_freed ) { char *xx=NULL;*xx=0;} // note it THIS->setStatus ( "loading from old title rec wrapper" ); // return if it blocked if ( ! 
THIS->loadFromOldTitleRec ( ) ) return; char *coll = ""; CollectionRec *cr = THIS->getCollRec(); if ( cr ) coll = cr->m_coll; // error? if ( g_errno ) log("doc: loadfromtitlerec coll=%s: %s", coll, mstrerror(g_errno)); // otherwise, all done, call the caller callback if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state ); else THIS->m_callback2 ( THIS->m_state ); } // returns false if blocked, returns true and sets g_errno on error otherwise bool XmlDoc::loadFromOldTitleRec ( ) { // . we are an entry point. // . if anything blocks, this will be called when it comes back if ( ! m_masterLoop ) { m_masterLoop = loadFromOldTitleRecWrapper; m_masterState = this; } // if we already loaded! if ( m_loaded ) return true; // if set from a docid, use msg22 for this! char **otr = getOldTitleRec ( ); // error? if ( ! otr ) return true; // blocked? if ( otr == (void *)-1 ) return false; // this is a not found if ( ! *otr ) { // so we do not retry m_loaded = true; // make it an error g_errno = ENOTFOUND; return true; } CollectionRec *cr = getCollRec(); if ( ! cr ) return true; // use that. decompress it! this will also set // m_setFromTitleRec to true if ( ! set2 ( m_oldTitleRec , m_oldTitleRecSize , // maxSize cr->m_coll , NULL , // pbuf m_niceness )) { // we are now loaded, do not re-call m_loaded = true; // return true with g_errno set on error uncompressing return true; } // we are now loaded, do not re-call m_loaded = true; // sanity check if ( ! m_titleRecBufValid ) { char *xx=NULL;*xx=0; } // good to go return true; } bool XmlDoc::setCollNum ( char *coll ) { CollectionRec *cr; cr = g_collectiondb.getRec ( coll , gbstrlen(coll) ); if ( ! cr ) { g_errno = ENOCOLLREC; return log("build: collrec not found for %s",coll); } // we can store this safely: m_collnum = cr->m_collnum; m_collnumValid = true; // if user "resets" the collection we need to know m_lastCollRecResetCount = cr->m_lastResetCount; return true; } CollectionRec *XmlDoc::getCollRec ( ) { if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; } CollectionRec *cr = g_collectiondb.m_recs[m_collnum]; if ( ! cr ) { log("build: got NULL collection rec."); g_errno = ENOCOLLREC; return NULL; } // was it reset since we started spidering this url? // we don't do it this way, when resetting a coll when delete it and // re-add under a different collnum to avoid getting msg4 adds to it. //if ( cr->m_lastResetCount != m_lastCollRecResetCount ) { // log("build: collection rec was reset. returning null."); // g_errno = ENOCOLLREC; // return NULL; //} return cr; } // returns false and sets g_errno on error bool XmlDoc::set4 ( SpiderRequest *sreq , key_t *doledbKey , char *coll , SafeBuf *pbuf , int32_t niceness , char *utf8ContentArg , bool deleteFromIndex , int32_t forcedIp , uint8_t contentType , uint32_t spideredTime , bool contentHasMime ) { // sanity check if ( sreq->m_dataSize == 0 ) { char *xx=NULL;*xx=0; } reset(); if ( g_conf.m_logDebugSpider ) log("xmldoc: set4 uh48=%"UINT64" parentdocid=%"UINT64"", sreq->getUrlHash48(),sreq->getParentDocId()); // used by PageSpiderdb.cpp m_startTime = gettimeofdayInMilliseconds(); m_startTimeValid = true; // this is true m_setFromSpiderRec = true; // did page inject (pageinject) request to delete it? 
	m_deleteFromIndex = deleteFromIndex;
	// PageReindex.cpp will set this in the spider request
	if ( sreq->m_forceDelete ) m_deleteFromIndex = true;
	char *utf8Content = utf8ContentArg;
	if ( contentHasMime && utf8Content ) {
		// get length of it all
		int32_t clen = gbstrlen(utf8Content);
		// return true on error with g_errno set
		if ( ! m_mime.set ( utf8ContentArg , clen , NULL ) ) {
			if ( ! g_errno ) g_errno = EBADMIME;
			log("xmldoc: could not set mime: %s",
			    mstrerror(g_errno));
			return false;
		}
		// it's valid
		m_mimeValid = true;
		// advance
		utf8Content = m_mime.getContent();
	}
	// sometimes they supply the content they want! like when zaks
	// injects pages from PageInject.cpp
	if ( utf8Content ) {
		int32_t slen = gbstrlen(utf8Content);
		// . this is the most basic content from the http reply
		// . only set this since sometimes it is facebook xml and
		//   contains encoded html which needs to be decoded,
		//   like "Ben &amp; Jerry's", otherwise sentence formation
		//   stops at the ';' in the "&amp;" and we also index "amp",
		//   which is bad.
		m_content = utf8Content;
		m_contentLen = slen;
		m_contentValid = true;
		//m_rawUtf8Content = utf8Content;
		//m_expandedUtf8Content = utf8Content;
		//ptr_utf8Content = utf8Content;
		//size_utf8Content = slen+1;
		//m_rawUtf8ContentValid = true;
		//m_expandedUtf8ContentValid = true;
		//m_utf8ContentValid = true;
		m_contentInjected = true;
		m_wasContentInjected = true;
		m_contentType = contentType;
		m_contentTypeValid = true;
		// use this ip as well for now to avoid ip lookup
		//m_ip = atoip("127.0.0.1");
		//m_ipValid = true;
		// use this to avoid ip lookup if it is not zero
		if ( forcedIp ) {
			m_ip = forcedIp;
			m_ipValid = true;
		}
		// do not need robots.txt then
		m_isAllowed = true;
		m_isAllowedValid = true;
		// nor mime
		m_httpStatus = 200;
		m_httpStatusValid = true;
		// this too
		m_downloadStatus = 0;
		m_downloadStatusValid = true;
		// assume this is the download time since the content
		// was pushed/provided to us
		if ( spideredTime ) m_downloadEndTime = spideredTime;
		else m_downloadEndTime = gettimeofdayInMillisecondsGlobal();
		// either way, validate it
		m_downloadEndTimeValid = true;
		// and need a legit mime
		if ( ! m_mimeValid ) {
			m_mime.m_bufLen = 1;
			m_mimeValid = true;
			m_mime.m_contentType = contentType;
		}
		m_isContentTruncated = false;
		m_isContentTruncatedValid = true;
		// no redir
		ptr_redirUrl = NULL;
		size_redirUrl = 0;
		m_redirUrl.reset();
		m_redirUrlPtr = NULL;//&m_redirUrl;
		m_redirUrlValid = true;
		m_redirErrorValid = true;
		m_redirError = 0;
		m_crawlDelay = -1;
		m_crawlDelayValid = true;
	}
	// override content type based on mime for application/json
	if ( m_mimeValid ) {
		m_contentType = m_mime.m_contentType;
		m_contentTypeValid = true;
	}
	//m_coll = coll;
	m_pbuf = pbuf;
	m_niceness = niceness;
	m_version = TITLEREC_CURRENT_VERSION;
	m_versionValid = true;
	/*
	// set min/max pub dates right away
	m_minPubDate = -1;
	m_maxPubDate = -1;
	// parentPrevSpiderTime is 0 if that was the first time that the
	// parent was spidered, in which case isNewOutlink will always be set
	// for every outlink it had!
	if ( sreq->m_isNewOutlink && sreq->m_parentPrevSpiderTime ) {
		// sanity check
		if ( ! sreq->m_parentPrevSpiderTime ) {char *xx=NULL;*xx=0;}
		// pub date is somewhere between these two times
		m_minPubDate = sreq->m_parentPrevSpiderTime;
		m_maxPubDate = sreq->m_addedTime;
	}
	*/
	// this is used to remove the rec from doledb after we spider it
	m_doledbKey.setMin();
	if ( doledbKey ) m_doledbKey = *doledbKey;
	// . sanity check
	// . we really don't want the parser holding up the query pipeline
	//   even if this page is being turked!
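	// (added note, not in the original source: the commented-out niceness
	//  assert below was disabled because the spider proxy legitimately
	//  runs this class at niceness 0 when expanding iframe tags, as its
	//  own comment explains.)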
//if ( m_niceness == 0 && // // spider proxy uses xmldoc class to expand iframe tags and // // sometimes the initiating msg13 class was re-niced to 0 // // in the niceness converstion logic. // ! g_hostdb.m_myHost->m_isProxy ) { // char *xx=NULL; *xx=0; } m_sreqValid = true; // store the whole rec, key+dataSize+data, in case it disappears. gbmemcpy ( &m_sreq , sreq , sreq->getRecSize() ); // set m_collnum etc. if ( ! setCollNum ( coll ) ) return log("XmlDoc: set4() coll %s invalid",coll); // it should be valid since we just set it CollectionRec *cr = getCollRec(); m_useRobotsTxt = cr->m_useRobotsTxt; // solidify some parms //m_eliminateMenus = cr->m_eliminateMenus; //m_eliminateMenusValid = true; // validate these here too /* m_titleWeight = cr->m_titleWeight; m_headerWeight = cr->m_headerWeight; m_urlPathWeight = cr->m_urlPathWeight; m_externalLinkTextWeight = cr->m_externalLinkTextWeight; m_internalLinkTextWeight = cr->m_internalLinkTextWeight; m_conceptWeight = cr->m_conceptWeight; m_titleWeightValid = true; m_headerWeightValid = true; m_urlPathWeightValid = true; m_externalLinkTextWeightValid = true; m_internalLinkTextWeightValid = true; m_conceptWeightValid = true; */ // fix some corruption i've seen if ( m_sreq.m_urlIsDocId && ! is_digit(m_sreq.m_url[0]) ) { log("xmldoc: fixing sreq %s to non docid",m_sreq.m_url); m_sreq.m_urlIsDocId = 0; } // if url is a docid... we are from pagereindex.cpp //if ( sreq->m_isPageReindex ) { // now we can have url-based page reindex requests because // if we have a diffbot json object fake url reindex request // we add a spider request of the PARENT url for it as page reindex //if ( is_digit ( sreq->m_url[0] ) ) { // watch out for 0.r.msn.com!! if ( m_sreq.m_urlIsDocId ) { m_docId = atoll(m_sreq.m_url); // assume its good m_docIdValid = true; // similar to set3() above m_setFromDocId = true; // use content and ip from old title rec to save time // . crap this is making the query reindex not actually // re-download the content. // . we already check the m_deleteFromIndex flag below // in getUtf8Content() and use the old content in that case // so i'm not sure why we are recycling here, so take // this out. MDW 9/25/2014. //m_recycleContent = true; // sanity if ( m_docId == 0LL ) { char *xx=NULL;*xx=0; } } else { // add www is now REQUIRED for all! // crap, injection of tmblr.co/ZHw5yo1E5TAaW fails because // www.tmblr.co has no IP setFirstUrl ( m_sreq.m_url , false );//true ); // false ); // you can't call this from a docid based url until you // know the uh48 //setSpideredTime(); } // now query reindex can specify a recycle content option so it // can replace the rebuild tool. try to recycle on global index. if ( m_sreqValid ) m_recycleContent = m_sreq.m_recycleContent; return true; } // . set our stuff from the TitleRec (from titledb) // . returns false and sets g_errno on error bool XmlDoc::set2 ( char *titleRec , int32_t maxSize , char *coll , SafeBuf *pbuf , int32_t niceness , SpiderRequest *sreq ) { // NO! can't do this. see below //reset(); setStatus ( "setting xml doc from title rec"); // . it resets us, so save this // . we only save these for set2() not the other sets()! //void (*cb1)(void *state) = m_callback1; //bool (*cb2)(void *state) = m_callback2; //void *state = m_state; // . clear it all out // . no! this is clearing our msg20/msg22 reply... // . 
	//   ok, but repair.cpp needs it so do it there then
	//reset();
	// restore callbacks
	//m_callback1 = cb1;
	//m_callback2 = cb2;
	//m_state = state;
	// sanity check - since we do not reset
	if ( m_contentValid ) { char *xx=NULL;*xx=0; }
	// this is true
	m_setFromTitleRec = true;
	// this is valid i guess. includes key, etc.
	//m_titleRec = titleRec;
	//m_titleRecSize = *(int32_t *)(titleRec+12) + sizeof(key_t) + 4;
	//m_titleRecValid = true;
	// . should we free m_cbuf on our reset/destruction?
	// . no because doConsistencyCheck calls XmlDoc::set2 with a titleRec
	//   that should not be freed, besides the alloc size is not known!
	//m_freeTitleRec = false;
	int32_t titleRecSize = *(int32_t *)(titleRec+12) + sizeof(key_t) + 4;
	// . should we free m_cbuf on our reset/destruction?
	// . no because doConsistencyCheck calls XmlDoc::set2 with a titleRec
	//   that should not be freed, besides the alloc size is not known!
	m_titleRecBuf.setBuf ( titleRec ,
			       titleRecSize , // bufmax
			       titleRecSize , // bytes in use
			       false, // ownData?
			       csUTF8); // encoding
	m_titleRecBufValid = true;
	//m_coll = coll;
	m_pbuf = pbuf;
	m_niceness = niceness;
	// . sanity check
	// . NO! could be from XmlDoc::getMsg20Reply()!
	//if ( m_niceness == 0 ) { char *xx=NULL; *xx=0; }
	// it must be there!
	if ( !titleRec||titleRecSize==0 ) {g_errno=ENOTFOUND; return false;}
	// set our collection number
	if ( ! setCollNum ( coll ) ) return false;
	// store the whole rec, key+dataSize+data, in case it disappears.
	if ( sreq ) {
		gbmemcpy ( &m_sreq , sreq , sreq->getRecSize() );
		m_sreqValid = true;
	}
	m_hashedTitle = false;
	m_hashedMetas = false;
	// save the compressed buffer in case we should free it when done
	//m_titleRec = titleRec;
	// should we free m_cbuf on our reset/destruction?
	//m_freeTitleRec = true;
	// our record may not occupy all of m_cbuf, careful
	//m_titleRecAllocSize = maxSize;
	// get a parse ptr
	char *p = titleRec ;
	// . this is just like a serialized RdbList key/dataSize/data of 1 rec
	// . first thing is the key
	// . key should have docId embedded in it
	m_titleRecKey = *(key_t *) p ;
	p += sizeof(key_t);
	// bail on error
	if ( (m_titleRecKey.n0 & 0x01) == 0x00 ) {
		g_errno = EBADTITLEREC;
		log("db: Titledb record is a negative key.");
		char *xx=NULL; *xx=0;
		return false;
	}
	// set m_docId from key
	m_docId = g_titledb.getDocIdFromKey ( m_titleRecKey );
	// validate that
	m_docIdValid = true;
	// then the size of the data that follows this
	int32_t dataSize = *(int32_t *) p ;
	p += 4;
	// bail on error
	if ( dataSize < 4 ) {
		g_errno = EBADTITLEREC;
		return log("db: Titledb record has size of %"INT32" which "
			   "is less than 4. Probable disk corruption in a "
			   "titledb file.",
			   dataSize);
	}
	// what is the size of cbuf/titleRec in bytes?
	int32_t cbufSize = dataSize + 4 + sizeof(key_t);
	// . the actual data follows "dataSize"
	// . what's the uncompressed size of the compressed stuff below here?
	m_ubufSize = *(int32_t *) p ;
	p += 4;
	// . because of disk/network data corruption this may be wrong!
	// . we can now have absolutely huge titlerecs...
	if ( m_ubufSize <= 0 ) { //m_ubufSize > 2*1024*1024 || m_ubufSize < 0 )
		g_errno = EBADTITLEREC;
		return log("db: TitleRec::set: uncompress uncompressed "
			   "size=%"INT32".",m_ubufSize );
	}
	// trying to uncompress corrupt titlerecs sometimes results in
	// a seg fault... watch out
	if ( m_ubufSize > 100*1024*1024 ) {
		g_errno = EBADTITLEREC;
		return log("db: TitleRec::set: uncompress uncompressed "
			   "size=%"INT32" > 100MB. unacceptable, probable "
			   "corruption.",m_ubufSize );
	}
	// make buf space for holding the uncompressed stuff
	m_ubufAlloc = m_ubufSize;
	m_ubuf = (char *) mmalloc ( m_ubufAlloc ,"TitleRecu1");
	if ( ! m_ubuf ) {
		// we had bad ubufsizes on gb6, like > 1GB, print out key
		// so we can manually make a titledb.dat file to delete these
		// bad keys
		log("build: alloc failed ubufsize=%"INT32" key.n1=%"UINT32" "
		    "n0=%"UINT64,
		    m_ubufAlloc,m_titleRecKey.n1,m_titleRecKey.n0);
		return false;
	}
	// we need to loop since uncompress is weird, sometimes it needs more
	// space than it should. see how much it actually took.
	int32_t realSize = m_ubufSize;
	// time it
	int64_t startTime = gettimeofdayInMilliseconds();
	// debug msg
	setStatus( "Uncompressing title rec." );
	// . uncompress the data into m_ubuf
	// . m_ubufSize should remain unchanged since we stored it
	int err = gbuncompress ( (unsigned char *) m_ubuf ,
				 (uint32_t *) &realSize ,
				 (unsigned char *) p ,
				 (uint32_t ) (dataSize - 4) );
	// hmmmm...
	if ( err == Z_BUF_ERROR ) {
		log("db: Buffer is too small to hold uncompressed "
		    "document. Probable disk corruption in a titledb file.");
		g_errno = EUNCOMPRESSERROR;
		return false;
	}
	// set g_errno and return false on error
	if ( err != Z_OK ) {
		g_errno = EUNCOMPRESSERROR;
		return log("db: Uncompress of document failed. ZG_ERRNO=%i. "
			   "cbufSize=%"INT32" ubufsize=%"INT32" realSize=%"INT32"",
			   err , cbufSize , m_ubufSize , realSize );
	}
	if ( realSize != m_ubufSize ) {
		g_errno = EBADENGINEER;
		return log("db: Uncompressed document size is not what we "
			   "recorded it to be. Probable disk corruption in "
			   "a titledb file.");
	}
	// . add the stat
	// . use white for the stat
	g_stats.addStat_r ( 0 ,
			    startTime ,
			    gettimeofdayInMilliseconds(),
			    0x00ffffff );
	// first 2 bytes in m_ubuf are the header size
	int32_t headerSize = *(uint16_t *)m_ubuf;
	int32_t shouldbe = (char *)&ptr_firstUrl - (char *)&m_headerSize;
	if ( headerSize != shouldbe ) {
		g_errno = ECORRUPTDATA;
		return log("doc: bad header size in title rec");
	}
	// set our easy stuff
	gbmemcpy ( (void *)this , m_ubuf , headerSize );
	// NOW set the XmlDoc::ptr_* and XmlDoc::size_* members
	// like in Msg.cpp and Msg20Reply.cpp
	if ( m_pbuf ) {
		int32_t crc = hash32(m_ubuf,headerSize);
		m_pbuf->safePrintf("crchdr=0x%"XINT32" sizehdr=%"INT32", ",
				   crc,headerSize);
	}
	// point to the string data
	char *up = m_ubuf + headerSize;
	// end of the rec
	char *upend = m_ubuf + m_ubufSize;
	// how many XmlDoc::ptr_* members do we have? set "np" to that
	int32_t np = ((char *)&size_firstUrl - (char *)&ptr_firstUrl) ;
	np /= sizeof(char *);
	// point to the first ptr
	char **pd = (char **)&ptr_firstUrl;
	// point to the first size
	int32_t *ps = (int32_t *)&size_firstUrl;
	// loop over them
	for ( int32_t i = 0 ; i < np ; i++ , pd++ , ps++ ) {
		// zero out the ith ptr_ and size_ member
		*pd = 0;
		*ps = 0;
		// make the mask
		uint32_t mask = 1 << i ;
		// do we have this member? skip if not.
		if ( ! (m_internalFlags1 & mask) ) continue;
		// watch out for corruption
		if ( up > upend ) {
			g_errno = ECORRUPTDATA;
			return log("doc: corrupt titlerec.");
		}
		// get the size
		*ps = *(int32_t *)up;
		// this should never be 0, otherwise, why was its flag set?
		if ( *ps <= 0 ) { char *xx=NULL;*xx=0; }
		// skip over to point to data
		up += 4;
		// point to the data. could be 64-bit ptr.
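		// (added note, reconstructed from the parsing code above and
		//  not part of the original comments: the titlerec walked
		//  here is laid out as
		//
		//    key_t   key                 - docId embedded in the key
		//    int32_t dataSize            - bytes that follow
		//    int32_t m_ubufSize          - uncompressed size
		//    byte    compressed[dataSize-4]
		//
		//  and the uncompressed m_ubuf is the fixed-size header, whose
		//  first two bytes are the header size itself, followed by a
		//  4-byte length plus payload for each ptr_*/size_* member
		//  whose bit is set in m_internalFlags1. at this point "up"
		//  points at that payload for member #i.)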
*pd = up;//(int32_t)up; // debug if ( m_pbuf ) { int32_t crc = hash32(up,*ps); m_pbuf->safePrintf("crc%"INT32"=0x%"XINT32" size%"INT32"=%"INT32", ", i,crc,i,*ps); } // skip over data up += *ps; // watch out for corruption if ( up > upend ) { g_errno = ECORRUPTDATA; return log("doc: corrupt titlerec."); } } // cap it char *pend = m_ubuf + m_ubufSize; // sanity check. must match exactly. if ( up != pend ) { char *xx=NULL;*xx=0; } // set the urls i guess m_firstUrl.set ( ptr_firstUrl ); if ( ptr_redirUrl ) { m_redirUrl.set ( ptr_redirUrl ); m_currentUrl.set ( ptr_redirUrl ); m_currentUrlValid = true; m_redirUrlPtr = &m_redirUrl; } else { m_currentUrl.set ( ptr_firstUrl ); m_currentUrlValid = true; m_redirUrlPtr = NULL; } m_firstUrlValid = true; m_redirUrlValid = true; // convert 8 bit to a 32 bit //m_numBannedOutlinks = score8to32 ( m_numBannedOutlinks8 ); // validate *shadow* members since bit flags cannot be returned m_isRSS2 = m_isRSS; m_isPermalink2 = m_isPermalink; m_isAdult2 = m_isAdult; m_spiderLinks2 = m_spiderLinks; m_isContentTruncated2 = m_isContentTruncated; m_isLinkSpam2 = m_isLinkSpam; m_hasAddress2 = m_hasAddress; m_hasTOD2 = m_hasTOD; //m_hasSiteVenue2 = m_hasSiteVenue; m_hasContactInfo2 = m_hasContactInfo; //m_skipIndexingByte = m_skipIndexing; m_isSiteRoot2 = m_isSiteRoot; // these members are automatically validated m_ipValid = true; m_spideredTimeValid = true; m_indexedTimeValid = true; m_pubDateValid = true; m_firstIndexedValid = true; m_outlinksAddedDateValid = true; m_charsetValid = true; m_countryIdValid = true; /* m_titleWeightValid = true; m_headerWeightValid = true; m_urlPathWeightValid = true; m_externalLinkTextWeightValid = true; m_internalLinkTextWeightValid = true; m_conceptWeightValid = true; */ // new stuff m_siteNumInlinksValid = true; m_siteNumInlinksUniqueIpValid = true; m_siteNumInlinksUniqueCBlockValid = true; m_siteNumInlinksTotalValid = true; //m_sitePopValid = true; m_rootLangIdValid = true; m_hasContactInfoValid = true; m_metaListCheckSum8Valid = true; m_hopCountValid = true; //m_numBannedOutlinksValid = true; m_langIdValid = true; m_contentTypeValid = true; m_isRSSValid = true; m_isPermalinkValid = true; m_isAdultValid = true; //m_eliminateMenusValid = true; m_spiderLinksValid = true; m_isContentTruncatedValid = true; m_isLinkSpamValid = true; m_hasAddressValid = true; m_tagRecDataValid = true; m_gigabitHashesValid = true; m_contentHash32Valid = true; //m_tagHash32Valid = true; m_tagPairHash32Valid = true; m_adVectorValid = true; m_wikiDocIdsValid = true; m_imageDataValid = true; m_catIdsValid = true; m_indCatIdsValid = true; // ptr_dmozTitles/Summs/Anchors valid: m_dmozInfoValid = true; m_utf8ContentValid = true; //m_sectionsReplyValid = true; //m_sectionsVotesValid = true; //m_addressReplyValid = true; m_siteValid = true; m_linkInfo1Valid = true; m_linkInfo2Valid = true; m_versionValid = true; m_httpStatusValid = true; m_crawlDelayValid = true; //m_sectiondbDataValid = true; //m_placedbDataValid = true; m_clockCandidatesDataValid = true; //m_skipIndexingValid = true; m_isSiteRootValid = true; // ptr_linkInfo2 is valid. so getDiffbotTitleHashes() works. m_diffbotTitleHashBufValid = true; // set "m_oldTagRec" from ptr_tagRecData //gbmemcpy ( &m_oldTagRec , ptr_tagRecData , size_tagRecData ); //m_oldTagRecValid = true; // there was no issue indexing it... 
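	// (added note: per the comment above, a stored titlerec implies the
	//  document was indexed without error, so the index/redirect codes
	//  below are simply cleared rather than loaded from the record.)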
m_indexCode = 0; m_indexCodeValid = true; m_redirError = 0; m_redirErrorValid = true; // stop core when importing and calling getNewSpiderReply() m_downloadEndTime = m_spideredTime; m_downloadEndTimeValid = true; // make a copy for new tag rec too, this one we modify //gbmemcpy ( &m_newTagRec , ptr_tagRecData , size_tagRecData ); // set "m_siteNumInlinks" from m_oldTagRec //Tag *tag = m_oldTagRec.getTag("sitenuminlinks"); // must always be there! //if ( ! tag ) { char *xx=NULL;*xx=0; } // must be null terminated //if ( tag->getTagData()[tag->getTagData()Size-1] != 0 ) { // char *xx=NULL;*xx=0; } // grab that //m_siteNumInlinks = atol(tag->getTagData()); //m_siteNumInlinksValid = true; // must not be negative if ( m_siteNumInlinks < 0 ) { char *xx=NULL;*xx=0; } // set m_hasContactInfo in case someone calls ::getHasContactInfo() // which will do a bunch of parsing!! //tag = m_oldTagRec.getTag ("hascontactinfo"); //if ( tag ) m_hasContactInfo = true; //else m_hasContactInfo = false; //m_hasContactInfoValid = true; // sanity check. if m_siteValid is true, this must be there if ( ! ptr_site ) { char *xx=NULL;*xx=0; } // lookup the tagdb rec fresh if setting for a summary. that way we // can see if it is banned or not //if ( m_req ) m_tagRecDataValid = false; // debug thing ptr_sectiondbData = NULL; size_sectiondbData = 0; // set m_sections.m_nsvt from data. ptr_sectiondbData is the m_osvt // serialized, which is from our read of sectiondb at the time we // indexed it. but now that we may have nulled out our content to // save space in titledb because m_skipIndexing is true, then we have // to save our votes as well, BUT, only if we skipped indexing. // and not allowed to serialize UNLESS we skipped because // that would waste space as well //if (! m_skipIndexing && size_sectionsVotes ) { char *xx=NULL;*xx=0; } // success, return true then return true; } bool XmlDoc::setFirstUrl ( char *u , bool addWWW , Url *baseUrl ) { m_firstUrl.reset(); m_currentUrl.reset(); m_firstUrlValid = true; // sanity check. "u" must be normalized //if ( strncmp(u,"http",4 ) != 0 ) { char *xx=NULL;*xx=0; } // assume url is not correct format ptr_firstUrl = NULL; size_firstUrl = 0; if ( ! u || ! u[0] ) { //if ( ! m_indexCode ) m_indexCode = EBADURL; return true; } //if ( gbstrlen (u) + 1 > MAX_URL_LEN ) // m_indexCode = EURLTOOLONG; m_firstUrl.set ( baseUrl , u , gbstrlen(u) , addWWW ) ; // it is the active url m_currentUrl.set ( &m_firstUrl , false ); m_currentUrlValid = true; // set this to the normalized url ptr_firstUrl = m_firstUrl.getUrl(); size_firstUrl = m_firstUrl.getUrlLen() + 1; // is it is a link loop? //if ( m_firstUrl.isLinkLoop() ) { // if ( ! m_indexCode ) m_indexCode = ELINKLOOP; // return true; //} // it it illegal? //if ( m_firstUrl.m_host && m_firstUrl.m_host[0] == '.' ) { // if ( ! m_indexCode ) m_indexCode = EBADURL; // return true; //} // check if url is porn words in it //if ( cr->m_doUrlSpamCheck && m_firstUrl.isSpam() ) { // if ( ! 
m_indexCode ) m_indexCode = EDOCURLSPAM; // return true; //} return true; } //CollectionRec *XmlDoc::getCollRec ( ) { // return g_collectiondb.getRec ( m_coll , gbstrlen(m_coll) ); //} //bool XmlDoc::setRedirUrl ( char *u , bool addWWW ) { // m_redirUrl.set ( u , gbstrlen(u) , addWWW ); // ptr_redirUrl = m_redirUrl.getUrl(); // size_redirUrl = m_redirUrl.getUrlLen()+1; // return true; //} void XmlDoc::setStatus ( char *s ) { m_statusMsg = s; m_statusMsgValid = true; static char *s_last = NULL; if ( s == s_last ) return; bool timeIt = false; // if ( m_sreqValid && // m_sreq.m_isInjecting && // m_sreq.m_isPageInject ) // timeIt = true; if ( g_conf.m_logDebugBuildTime ) timeIt = true; // log times to detect slowness if ( timeIt ) { int64_t now = gettimeofdayInMillisecondsLocal(); if ( s_lastTimeStart == 0LL ) s_lastTimeStart = now; int32_t took = now - s_lastTimeStart; //if ( took > 100 ) log("xmldoc: %s (xd=0x%"PTRFMT" " "u=%s) took %"INT32"ms", s_last, (PTRTYPE)this, m_firstUrl.m_url, took); s_lastTimeStart = now; } s_last = s; if ( ! g_conf.m_logDebugBuild ) return ; //return; if ( m_firstUrlValid ) logf(LOG_DEBUG,"build: status = %s for %s (this=0x%"PTRFMT")", s,m_firstUrl.m_url,(PTRTYPE)this); else logf(LOG_DEBUG,"build: status = %s for docId %"INT64" " "(this=0x%"PTRFMT")", s,m_docId, (PTRTYPE)this); } // caller must now call XmlDoc::setCallback() void XmlDoc::setCallback ( void *state, void (* callback) (void *state) ) { m_state = state; m_callback1 = callback; // add this additional state==this constraint to prevent core when // doing a page parser if ( state == this && // i don't remember why i added this sanity check... callback == getMetaListWrapper ) { char *xx=NULL;*xx=0; } } void XmlDoc::setCallback ( void *state, bool (*callback) (void *state) ) { m_state = state; m_callback2 = callback; } // . similar to XmlDoc::indexDoc() but just adds m_firstUrl to spiderdb // . used by PageAddUrl.cpp /* bool XmlDoc::addToSpiderdb ( ) { // set a flag m_isAddUrl = true; // url must be valid if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; } // do not add if something wrong with url if ( m_indexCode ) return true; // this should just add to spiderdb because m_isAddUrl is true return indexDoc(false,false,false,false,true,false); } */ void indexDocWrapper ( void *state ) { XmlDoc *THIS = (XmlDoc *)state; // make sure has not been freed from under us! if ( THIS->m_freed ) { char *xx=NULL;*xx=0;} // note it THIS->setStatus ( "in index doc wrapper" ); // return if it blocked if ( ! THIS->indexDoc( ) ) return; // otherwise, all done, call the caller callback if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state ); else THIS->m_callback2 ( THIS->m_state ); } // for registerSleepCallback void indexDocWrapper2 ( int fd , void *state ) { indexDocWrapper ( state ); } // . inject from http request // . replace more of Msg7.cpp logic with this? //bool XmlDoc::injectDoc ( HttpRequest *hr ) { //} // . the highest level function in here // . user is requesting to inject this url // . returns false if blocked and your callback will be called when done // . 
// . returns true and sets g_errno on error
bool XmlDoc::injectDoc ( char *url ,
			 CollectionRec *cr ,
			 char *content ,
			 char *diffbotReply, // usually null
			 bool contentHasMime ,
			 int32_t hopCount,
			 int32_t charset,
			 bool deleteUrl,
			 char *contentTypeStr, // text/html application/json
			 bool spiderLinks ,
			 bool newOnly, // index iff new
			 void *state,
			 void (*callback)(void *state) ,
			 uint32_t firstIndexed,
			 uint32_t lastSpidered ) {
	// wait until we are synced with host #0
	if ( ! isClockInSync() ) {
		log("xmldoc: got injection request but clock not yet "
		    "synced with host #0");
		g_errno = ETRYAGAIN;//CLOCKNOTSYNCED;
		return true;
	}
	// normalize url
	Url uu;
	// do not add www to fix tmblr.co/ZHw5yo1E5TAaW injection
	// which has no www.tmblr.co IP!
	uu.set(url,gbstrlen(url),false);//true);
	// remove >'s i guess and store in st1->m_url[] buffer
	char cleanUrl[MAX_URL_LEN+1];
	cleanInput ( cleanUrl,
		     MAX_URL_LEN,
		     uu.getUrl(),
		     uu.getUrlLen() );
	int32_t contentType = getContentTypeFromStr(contentTypeStr);
	// use CT_HTML if contentTypeStr is empty or blank. default
	if ( ! contentTypeStr || ! contentTypeStr[0] )
		contentType = CT_HTML;
	// this can go on the stack since set4() copies it
	SpiderRequest sreq;
	sreq.setFromInject ( cleanUrl );
	if ( lastSpidered )
		sreq.m_addedTime = lastSpidered;
	if ( deleteUrl )
		sreq.m_forceDelete = 1;
	//static char s_dummy[3];
	// sometimes the content is indeed NULL...
	//if ( newOnly && ! content ) {
	//	// don't let it be NULL because then xmldoc will
	//	// try to download the page!
	//	s_dummy[0] = '\0';
	//	content = s_dummy;
	//	//char *xx=NULL;*xx=0; }
	//}
	// . use the enormous power of our new XmlDoc class
	// . this returns false with g_errno set on error
	if ( ! set4 ( &sreq ,
		      NULL ,
		      cr->m_coll ,
		      NULL , // pbuf
		      // from PageInject.cpp:
		      // give it a niceness of 1, we have to be
		      // careful since we are a niceness of 0!!!!
		      1, // niceness, // 1 ,
		      // inject this content
		      content ,
		      deleteUrl, // false, // deleteFromIndex ,
		      0,//forcedIp ,
		      contentType ,
		      lastSpidered,//lastSpidered override
		      contentHasMime )) {
		// g_errno should be set if that returned false
		if ( ! g_errno ) { char *xx=NULL;*xx=0; }
		return true;
	}
	// a diffbot reply? should be in json
	if ( diffbotReply ) {
		if ( ! m_diffbotReply.safeStrcpy(diffbotReply) )
			return true;
		// it was injected so assume no error
		m_diffbotReplyError = 0;
		m_diffbotReplyValid = true;
	}
	//m_doConsistencyTesting = doConsistencyTesting;
	// . set xd from the old title rec if recycle is true
	// . can also use XmlDoc::m_loadFromOldTitleRec flag
	//if ( recycleContent ) m_recycleContent = true;
	// other crap. used for importing from titledb of another coll/cluster.
	if ( firstIndexed ) {
		m_firstIndexedDate = firstIndexed;
		m_firstIndexedDateValid = true;
	}
	if ( lastSpidered ) {
		m_spideredTime = lastSpidered;
		m_spideredTimeValid = true;
	}
	if ( hopCount != -1 ) {
		m_hopCount = hopCount;
		m_hopCountValid = true;
	}
	if ( charset != -1 && charset != csUnknown ) {
		m_charset = charset;
		m_charsetValid = true;
	}
	// avoid looking up ip of each outlink to add "firstip" tag to tagdb
	// because that can be slow!!!!!!!
	m_spiderLinks = spiderLinks;
	m_spiderLinks2 = spiderLinks;
	m_spiderLinksValid = true;
	// . newOnly is true --> do not inject if document is already indexed!
	// .
maybe just set indexCode m_newOnly = newOnly; // do not re-lookup the robots.txt m_isAllowed = true; m_isAllowedValid = true; m_crawlDelay = -1; // unknown m_crawlDelayValid = true; m_isInjecting = true; m_isInjectingValid = true; // set this now //g_inPageInject = true; // log it now //log("inject: indexing injected doc %s",cleanUrl); // make this our callback in case something blocks setCallback ( state , callback ); // . now tell it to index // . this returns false if blocked // . eventually it will call "callback" when done if it blocks bool status = indexDoc ( ); if ( ! status ) return false; // log it. i guess only for errors when it does not block? // because xmldoc.cpp::indexDoc calls logIt() if ( status ) logIt(); // undo it //g_inPageInject = false; return true; } // XmlDoc::injectDoc uses a fake spider request so we have to add // a real spider request into spiderdb so that the injected doc can // be spidered again in the future by the spidering process, otherwise, // injected docs can never be re-spidered. they would end up having // a SpiderReply in spiderdb but no matching SpiderRequest as well. void XmlDoc::getRevisedSpiderRequest ( SpiderRequest *revisedReq ) { if ( ! m_sreqValid ) { char *xx=NULL; *xx=0; } // we are doing this because it has a fake first ip if ( ! m_sreq.m_fakeFirstIp ) { char *xx=NULL;*xx=0; } // copy it over from our current spiderrequest gbmemcpy ( revisedReq , &m_sreq , m_sreq.getRecSize() ); // this must be valid for us of course if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; } // wtf? it might be invalid!!! parent caller will handle it... //if ( m_firstIp == 0 || m_firstIp == -1 ) { char *xx=NULL;*xx=0; } // store the real ip in there now revisedReq->m_firstIp = m_firstIp; // but turn off this flag! the whole point of all this... revisedReq->m_fakeFirstIp = 0; // re-make the key since it contains m_firstIp int64_t uh48 = m_sreq.getUrlHash48(); int64_t parentDocId = m_sreq.getParentDocId(); // set the key properly to reflect the new "first ip" since // we shard spiderdb by that. revisedReq->m_key = g_spiderdb.makeKey ( m_firstIp, uh48, true, // is request? parentDocId , false );// isDel ); revisedReq->setDataSize(); } void XmlDoc::getRebuiltSpiderRequest ( SpiderRequest *sreq ) { // memset 0 sreq->reset(); // assume not valid sreq->m_siteNumInlinks = -1; if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; } // how many site inlinks? sreq->m_siteNumInlinks = m_siteNumInlinks; sreq->m_siteNumInlinksValid = true; if ( ! 
m_firstIpValid ) { char *xx=NULL;*xx=0; } // set other fields besides key sreq->m_firstIp = m_firstIp; sreq->m_hostHash32 = m_hostHash32a; //sreq->m_domHash32 = m_domHash32; //sreq->m_siteNumInlinks = m_siteNumInlinks; //sreq->m_pageNumInlinks = m_pageNumInlinks; sreq->m_hopCount = m_hopCount; sreq->m_parentHostHash32 = 0;//m_sreq.m_parentHostHash32; sreq->m_parentDomHash32 = 0;//m_sreq.m_parentDomHash32; sreq->m_parentSiteHash32 = 0;//m_sreq.m_parentSiteHash32; sreq->m_parentFirstIp = 0;//m_sreq.m_parentFirstIp; Url *fu = getFirstUrl(); sreq->m_isNewOutlink = 0; sreq->m_isAddUrl = 0;//m_isAddUrl; sreq->m_isPingServer = fu->isPingServer(); //sreq->m_isUrlPermalinkFormat = m_isUrlPermalinkFormat; // transcribe from old spider rec, stuff should be the same sreq->m_addedTime = m_firstIndexedDate; sreq->m_sameDom = 0;//m_sreq.m_sameDom; sreq->m_sameHost = 0;//m_sreq.m_sameHost; sreq->m_sameSite = 0;//m_sreq.m_sameSite; sreq->m_wasParentIndexed = 0;//m_sreq.m_parentWasIndexed; sreq->m_parentIsRSS = 0;//m_sreq.m_parentIsRSS; sreq->m_parentIsPermalink = 0;//m_sreq.m_parentIsPermalink; sreq->m_parentIsPingServer = 0;//m_sreq.m_parentIsPingServer; // validate the stuff so getUrlFilterNum() acks it sreq->m_hopCountValid = 1; // we need this now for ucp ucr upp upr new url filters that do // substring matching on the url if ( m_firstUrlValid ) strcpy(sreq->m_url,m_firstUrl.m_url); // re-make the key since it contains m_firstIp long long uh48 = fu->getUrlHash48(); // set the key properly to reflect the new "first ip" // since we shard spiderdb by that. sreq->m_key = g_spiderdb.makeKey ( m_firstIp,//ip, uh48, true,//is req? 0LL, // parentDocId , false );//isDel sreq->setDataSize(); } //////////////////////////////////////////////////////////////////// // THIS IS THE HEART OF HOW THE PARSER ADDS TO THE RDBS //////////////////////////////////////////////////////////////////// // . returns false if blocked, true otherwise // . sets g_errno on error and returns true // . this is now a WRAPPER for indexDoc2() and it will deal with // g_errnos by adding an error spider reply so we offload the // logic to the url filters table bool XmlDoc::indexDoc ( ) { // return from the msg4.addMetaList() below? if ( m_msg4Launched ) { // must have been waiting if ( ! m_msg4Waiting ) { char *xx=NULL;*xx=0; } return true; } // return true with g_errno set on error CollectionRec *cr = getCollRec(); if ( ! cr ) return true; if ( ! m_masterLoop ) { m_masterLoop = indexDocWrapper; m_masterState = this; } // do not index if already indexed and we are importing // from the code in PageInject.cpp from a foreign titledb file if ( m_isImporting && m_isImportingValid ) { char *isIndexed = getIsIndexed(); if ( ! isIndexed ) { log("import: import had error: %s",mstrerror(g_errno)); return true; } if ( isIndexed == (char *)-1) return false; if ( *isIndexed ) { log("import: skipping import for %s. already indexed.", m_firstUrl.getUrl()); return true; } } // . even if not using diffbot, keep track of these counts // . even if we had something like EFAKEFIRSTIP, OOM, or whatever // it was an attempt we made to crawl this url if ( ! m_isDiffbotJSONObject && ! 
m_incrementedAttemptsCount ) { // do not repeat m_incrementedAttemptsCount = true; // log debug //log("build: attempted %s count=%"INT64"",m_firstUrl.getUrl(), // cr->m_localCrawlInfo.m_pageDownloadAttempts); // this is just how many urls we tried to index //cr->m_localCrawlInfo.m_urlsConsidered++; // avoid counting if it is a fake first ip bool countIt = true; // pagereindex.cpp sets this as does any add url (bulk job) if ( m_sreqValid && m_sreq.m_fakeFirstIp ) countIt = false; if ( countIt ) { cr->m_localCrawlInfo.m_pageDownloadAttempts++; cr->m_globalCrawlInfo.m_pageDownloadAttempts++; // changing status, resend local crawl info to all cr->localCrawlInfoUpdate(); } // need to save collection rec now during auto save cr->m_needsSave = true; // update this just in case we are the last url crawled //int64_t now = gettimeofdayInMillisecondsGlobal(); //cr->m_diffbotCrawlEndTime = now; } bool status = true; if ( ! g_errno ) status = indexDoc2 ( ); // blocked? if ( ! status ) return false; // done with no error? bool success = true; if ( g_errno ) success = false; // if we were trying to spider a fakefirstip request then // pass through because we lookup the real firstip below and // add a new request as well as a reply for this one if ( m_indexCodeValid && m_indexCode == EFAKEFIRSTIP ) success = false; if ( success ) return true; // . ignore failed child docs like diffbot pages // . they are getting EMALFORMEDSECTIONS if ( m_isChildDoc ) { log("build: done indexing child doc. error=%s. not adding " "spider reply for %s", mstrerror(g_errno), m_firstUrl.m_url); return true; } /// // otherwise, an internal error. we must add a SpiderReply // to spiderdb to release the lock. /// logErr: if ( m_firstUrlValid && g_errno ) log("build: %s had internal error = %s. adding spider " "error reply.", m_firstUrl.m_url,mstrerror(g_errno)); else if ( g_errno ) log("build: docid=%"INT64" had internal error = %s. adding spider " "error reply.", m_docId,mstrerror(g_errno)); // seems like this was causing a core somehow... if ( g_errno == ENOMEM ) return true; // if docid not found when trying to do a query reindex... // this really shouldn't happen but i think we were adding // additional SpiderRequests since we were using a fake first ip. // but i have since fixed that code. so if the titlerec was not // found when trying to do a force delete... it's not a temporary // error and should not be retried. if we set indexCode to // EINTERNALERROR it seems to be retried. if ( g_errno == ENOTFOUND ) { m_indexCode = g_errno; m_indexCodeValid = true; } if ( g_errno == EBADURL ) { m_indexCode = g_errno; m_indexCodeValid = true; } if ( g_errno == ENOTITLEREC ) { m_indexCode = g_errno; m_indexCodeValid = true; } if ( ! m_indexCodeValid ) { m_indexCode = EINTERNALERROR;//g_errno; m_indexCodeValid = true; } // if our spiderrequest had a fake "firstip" so that it could be // injected quickly into spiderdb, then do the firstip lookup here // and re-add the new spider request with that, and add the reply // to the fake firstip request below. if ( m_indexCodeValid && m_indexCode == EFAKEFIRSTIP ) { // at least get this if possible int32_t *fip = getFirstIp(); if ( fip == (void *) -1 ) return false; // error? g_errno will be changed if this is NULL if ( ! fip ) { log("build: error getting real firstip: %s", mstrerror(g_errno)); m_indexCode = EINTERNALERROR; m_indexCodeValid = true; goto logErr; } // sanity log if ( ! 
m_firstIpValid ) { char *xx=NULL;*xx=0; } // sanity log if ( *fip == 0 || *fip == -1 ) { char *url = "unknown"; if ( m_sreqValid ) url = m_sreq.m_url; log("build: error2 getting real firstip of %"INT32" for " "%s. Not adding new spider req", (int32_t)*fip,url); goto skipNewAdd1; } // store the new request (store reply for this below) char rd = RDB_SPIDERDB; if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2; m_metaList2.pushChar(rd); // store it here SpiderRequest revisedReq; // this fills it in getRevisedSpiderRequest ( &revisedReq ); // and store that new request for adding if ( ! m_metaList2.safeMemcpy (&revisedReq, revisedReq.getRecSize())) return true; // make sure to log the size of the spider request m_addedSpiderRequestSize = revisedReq.getRecSize(); m_addedSpiderRequestSizeValid = true; } skipNewAdd1: SpiderReply *nsr = NULL; // if only rebuilding posdb do not rebuild spiderdb if ( m_useSpiderdb ) { //// // // make these fake so getNewSpiderReply() below does not block // //// nsr = getFakeSpiderReply ( ); // this can be NULL and g_errno set to ENOCOLLREC or something if ( ! nsr ) return true; //SafeBuf metaList; char rd = RDB_SPIDERDB; if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2; if ( ! m_metaList2.pushChar( rd ) ) return true; if ( ! m_metaList2.safeMemcpy ( (char *)nsr,nsr->getRecSize())) return true; m_addedSpiderReplySize = nsr->getRecSize(); m_addedSpiderReplySizeValid = true; } m_msg4Launched = true; // display the url that had the error logIt(); // log this for debug now if ( nsr ) { SafeBuf tmp; nsr->print(&tmp); log("xmldoc: added reply %s",tmp.getBufStart()); } // clear g_errno g_errno = 0; // "cr" might have been deleted by calling indexDoc() above i think // so use collnum here, not "cr" if ( ! m_msg4.addMetaList ( m_metaList2.getBufStart() , m_metaList2.length() , m_collnum,//cr->m_coll , m_masterState , // state m_masterLoop , m_niceness ) ) { // spider hang bug //if ( g_conf.m_testSpiderEnabled ) // logf(LOG_DEBUG,"build: msg4 meta add3 blocked" // "msg4=0x%"XINT32"" ,(int32_t)&m_msg4); m_msg4Waiting = true; return false; } //logf(LOG_DEBUG,"build: msg4 meta add3 did NOT block" ); m_msg4Launched = false; // all done return true; } // . returns false if blocked, true otherwise // . sets g_errno on error and returns true bool XmlDoc::indexDoc2 ( ) { if ( g_isYippy ) return true; // if anything blocks, this will be called when it comes back if ( ! m_masterLoop ) { m_masterLoop = indexDocWrapper; m_masterState = this; } CollectionRec *cr = getCollRec(); if ( ! cr ) return true; // do this before we increment pageDownloadAttempts below so that // john's smoke tests, which use those counts, are not affected if ( m_sreqValid && m_sreq.m_fakeFirstIp && // only do for add url, not for injects. injects expect // the doc to be indexed while the browser waits. add url // is really just adding the spider request and returning // to the browser without delay. ! m_sreq.m_isInjecting && // not for page reindexes either! ! m_sreq.m_isPageReindex && // just add url m_sreq.m_isAddUrl && // diffbot requests are ok though! ! strstr(m_sreq.m_url,"-diffbotxyz") ) { m_indexCodeValid = true; m_indexCode = EFAKEFIRSTIP; return true; } // ensure that CollectionRec::m_globalCrawlInfo (spider stats) // is at least 1 minute in sync with counts of // all hosts in network. 
this returns false if it sent out requests // to update the counts from all the hosts in the network, and // when it updates CollectionRec::m_crawlInfoGlobal with all the // requests from each hosts in the network it will call the // specified callback, m_masterLoop with m_masterState. this code // is all in Spider.cpp. // this is now in a sleep wrapper in spider.cpp. //setStatus ( "updating crawl info" ); //if ( ! g_errno && // ! updateCrawlInfo ( cr , m_masterState , m_masterLoop ) ) // return false; // MDW: we do this in indexDoc() above why do we need it here? /* // even if not using diffbot, keep track of these counts if ( ! m_isDiffbotJSONObject && ! m_incrementedAttemptsCount ) { // do not repeat m_incrementedAttemptsCount = true; // this is just how many urls we tried to index //cr->m_localCrawlInfo.m_urlsConsidered++; cr->m_localCrawlInfo.m_pageDownloadAttempts++; cr->m_globalCrawlInfo.m_pageDownloadAttempts++; // need to save collection rec now during auto save cr->m_needsSave = true; // update this just in case we are the last url crawled int64_t now = gettimeofdayInMillisecondsGlobal(); cr->m_diffbotCrawlEndTime = now; } */ /* // if we are being called from Spider.cpp and we met our max // to crawl requirement, then bail out on this. this might // become true when we are in the middle of processing this url... if ( ! m_isDiffbotJSONObject && // this is just for this collection, from all hosts in network cr->m_globalCrawlInfo.m_pageDownloadSuccesses >= //Attempts >= cr->m_diffbotMaxToCrawl ) { // set the code to badness m_indexCode = EHITCRAWLLIMIT;//EABANDONED; m_indexCodeValid = true; log("diffbot: abandoning url because we hit crawl limit " "of %"INT64". downloaded %"INT64". Disabling spiders." ,cr->m_diffbotMaxToCrawl ,cr->m_globalCrawlInfo.m_pageDownloadSuccesses ); g_errno = m_indexCode; // if spiders already off.. if ( ! cr->m_spideringEnabled ) return true; // do not repeat call sendNotification() cr->m_spideringEnabled = false; // set this m_emailInfo.reset(); m_emailInfo.m_finalCallback = m_masterLoop; m_emailInfo.m_finalState = m_masterState; m_emailInfo.m_collnum = m_collnum; // note it setStatus("sending notification"); // this returns false if it would block, so we ret fals if ( ! sendNotification ( &m_emailInfo ) ) return false; // it didn't block g_errno = m_indexCode; return true; } // likewise if we hit the max processing limit... if ( ! m_isDiffbotJSONObject && cr->m_globalCrawlInfo.m_pageProcessSuccesses >= // Attempts >= cr->m_diffbotMaxToProcess ) { // set the code to badness m_indexCode = EHITPROCESSLIMIT;//EABANDONED; m_indexCodeValid = true; log("diffbot: abandoning url because we hit process limit " "of %"INT64". processed %"INT64". Disabling spiders." , cr->m_diffbotMaxToProcess , cr->m_globalCrawlInfo.m_pageProcessSuccesses ); g_errno = m_indexCode; // if spiders already off... if ( ! cr->m_spideringEnabled ) return true; // turn them off and send notification (email or url) cr->m_spideringEnabled = false; // set this m_emailInfo.reset(); m_emailInfo.m_finalCallback = m_masterLoop; m_emailInfo.m_finalState = m_masterState; m_emailInfo.m_collnum = m_collnum; // note it setStatus("sending notification"); // . this returns false if it would block, so we ret fals // . this is now in PingServer.cpp if ( ! sendNotification( &m_emailInfo ) ) return false; // it didn't block g_errno = m_indexCode; return true; } */ setStatus("indexing doc"); // maybe a callback had g_errno set? 
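// . illustrative sketch (commented out, not part of the build): how the
//   fake-firstip round trip above fits together. an add url SpiderRequest
//   is stored with m_fakeFirstIp=1 so it can go into spiderdb without an
//   ip lookup; indexDoc2() then bails with EFAKEFIRSTIP and indexDoc()
//   looks up the real ip and re-adds a revised request via
//   getRevisedSpiderRequest(). the helper name below is hypothetical; the
//   members and the g_spiderdb.makeKey() arguments are assumed to match
//   their use in getRevisedSpiderRequest() above.
/*
static void sketchReviseFakeFirstIpRequest ( SpiderRequest *req ,
                                             int32_t realFirstIp ) {
	// only requests stored with a placeholder ip need this
	if ( ! req->m_fakeFirstIp ) return;
	// swap in the real ip and clear the flag, the whole point of this
	req->m_firstIp     = realFirstIp;
	req->m_fakeFirstIp = 0;
	// spiderdb is sharded by firstIp and the ip is embedded in the key,
	// so the key must be remade or the record lands on the wrong shard
	int64_t uh48        = req->getUrlHash48();
	int64_t parentDocId = req->getParentDocId();
	req->m_key = g_spiderdb.makeKey ( realFirstIp ,
	                                  uh48 ,
	                                  true ,        // is request?
	                                  parentDocId ,
	                                  false );      // is del?
	req->setDataSize();
}
*/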
if ( g_errno ) return true; // before indexing this doc, index its inlinks it has according // to ahrefs? if ( m_downloadLevel == 1 && m_useAhrefs && ! m_doneWithAhrefs ) { // do not repeat this call! m_doneWithAhrefs = true; // call it if ( ! injectAhrefsLinks () ) return false; } // . now get the meta list from it to add // . returns NULL and sets g_errno on error char *metaList = getMetaList ( ); // error? if ( ! metaList ) { // sanity check. g_errno must be set if ( ! g_errno ) { log("build: Error UNKNOWN error spidering. setting " "to bad engineer."); g_errno = EBADENGINEER; //char *xx=NULL;*xx=0; } } log("build: Error spidering for doc %s: %s", m_firstUrl.m_url,mstrerror(g_errno)); return true; } // did it block? return false if so, we will be recalled since // we set m_masterLoop to indexDoc if ( metaList == (char *) -1 ) return false; // before we add the meta list let's updateTagdb() //char *ret = updateTagdb(); // it returns NULL on error //if ( ret == NULL ) return true; // return false if it blocked //if ( ret == (char *)-1 ) return false; // . let's update tagdb's venue address default too // . no. that is in getTitleRecBuf() // must be valid int32_t *indexCode = getIndexCode(); if (! indexCode || indexCode == (void *)-1) return (char *)indexCode; // . check to make sure the parser is consistent so we can cleanly // delete the various rdb records if we need to in the future solely // based on the titleRec. // . force = false // . unless we force it, the test is only done at random intervals // for performance reasons if ( ! *indexCode ) doConsistencyTest ( false ); // ignore errors from that g_errno = 0; // unregister any sleep callback if ( m_registeredSleepCallback ) { g_loop.unregisterSleepCallback(m_masterState,indexDocWrapper2); m_registeredSleepCallback = false; } ////////// // . add the doledb negative key quickly to our tree to avoid a // respider because the msg4 doledb negative key is buffered by msg4 // . make it negative // . well it should not be respidered because the lock is on it!! // -- so let's comment this out ///////// /* key_t negative = m_doledbKey; // make it negative negative.n0 &= 0xfffffffffffffffeLL; // . store it in our tree if we can // . returns false and sets g_errno on error // . i.e. g_errno == ETRYAGAIN if ( ! m_addedNegativeDoledbRec && ! g_doledb.m_rdb.addRecord(m_coll,(char *)&negative, NULL,0,m_niceness)){ log("build: error trying to add to doledb: %s", mstrerror(g_errno)); // set sleep wrapper g_loop.registerSleepCallback(1000,m_masterState, indexDocWrapper2,m_niceness); // note it m_registeredSleepCallback = true; // sleep and retry return false; } */ // we did that m_addedNegativeDoledbRec = true; // now add it if ( ! m_listAdded && m_metaListSize ) { // only call thuis once m_listAdded = true; // show it for now //printMetaList(m_metaList , m_metaList + m_metaListSize,NULL); // test it verifyMetaList ( m_metaList , m_metaList + m_metaListSize , false ); // do it if ( ! m_msg4.addMetaList ( m_metaList , m_metaListSize , m_collnum,//cr->m_coll , m_masterState , // state m_masterLoop , m_niceness ) ) { // spider hang bug if ( g_conf.m_testSpiderEnabled ) logf(LOG_DEBUG,"build: msg4 meta add blocked" "msg4=0x%"PTRFMT"" ,(PTRTYPE)&m_msg4); m_msg4Waiting = true; return false; } // error with msg4? bail if ( g_errno ) return logIt(); } // make sure our msg4 is no longer in the linked list! 
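// . illustrative sketch (commented out, not part of the build): the
//   non-blocking convention used above by indexDoc(), getMetaList() and
//   m_msg4.addMetaList(). a call that launches async work returns false
//   (or -1 for pointer getters) and the wrapper stored in m_masterLoop
//   just re-enters the same driver; the m_*Valid flags let the steps that
//   already finished fall through instantly. the wrapper name below is
//   hypothetical; m_callback1/m_callback2/m_state are used the same way
//   by getTitleRecBufWrapper() further down.
/*
static void sketchMasterLoopWrapper ( void *state ) {
	XmlDoc *THIS = (XmlDoc *)state;
	// re-enter the driver. returns false if it blocked yet again.
	if ( ! THIS->indexDoc() ) return;
	// otherwise all done (or done with g_errno set), so hand control
	// back to whoever kicked off the index attempt
	if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
	else                     THIS->m_callback2 ( THIS->m_state );
}
*/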
if (m_msg4Waiting && isInMsg4LinkedList(&m_msg4)){char *xx=NULL;*xx=0;} if ( m_msg4Waiting && g_conf.m_testSpiderEnabled ) logf(LOG_DEBUG,"build: msg4=0x%"PTRFMT" returned" ,(PTRTYPE)&m_msg4); // we are not waiting for the msg4 to return m_msg4Waiting = false; bool flush = false; if ( m_contentInjected ) flush = true; if ( m_sreqValid && m_sreq.m_isPageInject ) flush = true; // to keep our qa runs consistent if ( strcmp(cr->m_coll,"qatest123") ) flush = true; if ( ! m_listAdded ) flush = false; if ( m_listFlushed ) flush = false; // HACK: flush it if we are injecting it in case the next thing we // spider is dependent on this one if ( flush ) { // note it setStatus ( "flushing msg4" ); // only do it once m_listFlushed = true; // do it if ( ! flushMsg4Buffers ( m_masterState , m_masterLoop ) ) return false; } // . all done with that. core if we block i guess. // . but what if we were not the function that set this to begin w/? //m_masterLoop = NULL; return logIt(); /* // if not doing exact quotas, we're done if ( ! cr->m_exactQuotas ) return logIt(); char *isIndexed = getIsIndexed(); // this means it blocked if ( isIndexed == (char *)-1) { char *xx=NULL; *xx=0; } // returns NULL with g_errno set if ( isIndexed ) return logIt(); // otherwise, tell Msg36 to update our quota count for this site // so we don't have to keep merging site: termlists m_incCount = false; m_decCount = false; if ( m_indexCode ) m_decCount = true; //if ( m_forceDelete ) m_decCount = true; // fix for the exact quota bug found on eurekster collection. bug 229 // if we're not a new doc, then don't increment the count because // we have been already counted as the old doc. MDW: i added the // condition that if decCount is true we need to update the count! if ( *isIndexed && ! m_decCount ) return logIt(); // if it is new and we are not adding it to the index then no need // to update any quota count... if ( ! *isIndexed && m_decCount ) return logIt(); // if not decrementing the count, must be incrementing it then! if ( ! m_decCount ) m_incCount = true; */ // i am not using quotas, so disable this for now /* log(LOG_DEBUG,"build: inc'ing quota to REMOTE table " "for termIdHost %"UINT64" termIdDom %"UINT64" for %s.", m_msg16.m_termIdHost,m_msg16.m_termIdDom,m_url.getUrl()); setStatus ( "updating quota cache" ); // sanity checks if ( m_msg16.m_termIdHost == 0 ) { char *xx = NULL; *xx = 0; } if ( m_msg16.m_termIdDom == 0 ) { char *xx = NULL; *xx = 0; } // . Msg36 gets the correct count from disk and puts it in cache. It // doesn't try to increment or decrement the quotas in cache, because // then it would have to be done on all twins, and also the correct // split will have to be found. // . Actually, we should only use the cache on one host to hold the // sum of all splits. This will be the authority cache. if ( ! m_updatedCounts ) { // only call this once m_updatedCounts = true; // do it if ( ! m_msg36.getTermFreq ( m_coll , 0 , // maxAge m_msg16.m_termIdHost , this , m_masterLoop , m_niceness , m_exactQuotas , m_incCount , m_decCount , false )) // we blocked return false; // error? if ( g_errno ) return logIt(); } // add the second entry for domain if ( ! m_updatedCounts2 ) { // only call this once m_updateCounts2 = true; // do it if ( ! m_msg36.getTermFreq ( m_coll , 0 , // maxAge m_msg16.m_termIdDom , this , doneAddingMsg36Entry2, m_niceness , m_exactQuotas , m_incCount , m_decCount , false )) // we blocked return false; // error? if ( g_errno ) return logIt(); } // that is it! 
return logIt(); */ } void getTitleRecBufWrapper ( void *state ) { XmlDoc *THIS = (XmlDoc *)state; // make sure has not been freed from under us! if ( THIS->m_freed ) { char *xx=NULL;*xx=0;} // note it THIS->setStatus ( "in get title rec wrapper" ); // return if it blocked if ( THIS->getTitleRecBuf() == (void *)-1 ) return; // otherwise, all done, call the caller callback if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state ); else THIS->m_callback2 ( THIS->m_state ); } key_t *XmlDoc::getTitleRecKey() { if ( m_titleRecBufValid ) return &m_titleRecKey; SafeBuf *tr = getTitleRecBuf(); if ( ! tr || tr == (void *)-1 ) return (key_t *)tr; return &m_titleRecKey; } int32_t *XmlDoc::getIndexCode ( ) { int32_t *indexCode = getIndexCode2(); if ( ! indexCode || indexCode == (void *)-1 ) return indexCode; // if zero good! if ( *indexCode == 0 ) return indexCode; // // should we neutralize it? // // in the case of indexing dmoz urls outputted from // 'dmozparse urldump -s' it outputs a meta tag // () that // indicates to index the links even in the case of some errors, // so that we can be assured to have exactly the same urls the dmoz // has in our index. so when we do a gbcatid:xxx query we get the same // urls in the search results that dmoz has for that category id. if ( ! m_sreqValid || ! m_sreq.m_ignoreExternalErrors ) return indexCode; // only neutralize certain errors if ( * indexCode != EDNSTIMEDOUT && *indexCode != ETCPTIMEDOUT && *indexCode != EUDPTIMEDOUT // from m_redirError && *indexCode != EDOCSIMPLIFIEDREDIR && *indexCode != EDOCNONCANONICAL && *indexCode != EDNSDEAD && *indexCode != ENETUNREACH && *indexCode != EHOSTUNREACH && *indexCode != EDOCFILTERED && *indexCode != EDOCREPEATSPAMMER && *indexCode != EDOCDUP && *indexCode != EDOCISERRPG && *indexCode != EDOCHIJACKED && *indexCode != EDOCBADHTTPSTATUS && *indexCode != EDOCDISALLOWED && *indexCode != EBADCHARSET && *indexCode != EDOCDUPWWW && *indexCode != EBADIP && *indexCode != EDOCEVILREDIRECT // fix video.google.com dmoz && *indexCode != EBADMIME // index.t and .exe files are in dmoz but those // extensions are "bad" according to Url::isBadExtension() && *indexCode != EDOCBADCONTENTTYPE // repeat url path components are ok: && *indexCode != ELINKLOOP && *indexCode != ECONNREFUSED // malformed sections: && *indexCode != EDOCBADSECTIONS && *indexCode != ECORRUPTHTTPGZIP ) return indexCode; // ok, neutralize it *indexCode = 0; // if we could not get an ip we need to make a fake one if ( ! m_ipValid || m_ip == 0 || m_ip == -1 ) { log("build: ip unattainable. forcing ip address of %s " "to 10.5.123.45",m_firstUrl.m_url); m_ip = atoip("10.5.123.45"); m_ipValid = true; } // make certain things valid to avoid core in getNewSpiderReply() if ( ! m_crawlDelayValid ) { m_crawlDelayValid = true; m_crawlDelay = -1; } return indexCode; } // . return NULL and sets g_errno on error // . returns -1 if blocked int32_t *XmlDoc::getIndexCode2 ( ) { // return it now if we got it already if ( m_indexCodeValid ) return &m_indexCode; setStatus ( "getting index code"); // page inject can set deletefromindex to true if ( m_deleteFromIndex ) { m_indexCode = EDOCFORCEDELETE; m_indexCodeValid = true; return &m_indexCode; } // . internal callback // . so if any of the functions we end up calling directly or // indirectly block and return -1, we will be re-called from the top //if ( ! m_masterLoop ) { // m_masterLoop = getTitleRecWrapper; // m_masterState = this; //} if ( ! 
m_firstUrlValid ) { char *xx=NULL;*xx=0; } if ( m_firstUrl.m_ulen <= 5 ) { m_indexCode = EBADURL; m_indexCodeValid = true; return &m_indexCode; } if ( m_firstUrl.m_ulen + 1 >= MAX_URL_LEN ) { m_indexCode = EURLTOOLONG; m_indexCodeValid = true; return &m_indexCode; } CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // "url is repeating path components" error? if ( ! m_check1 ) { m_check1 = true; if ( cr->m_isCustomCrawl == 0 && m_firstUrl.isLinkLoop() ) { m_indexCode = ELINKLOOP; m_indexCodeValid = true; return &m_indexCode; } } // fix for "http://.xyz.com/...." if ( m_firstUrl.m_host && m_firstUrl.m_host[0] == '.' ) { m_indexCode = EBADURL; m_indexCodeValid = true; return &m_indexCode; } if ( cr->m_doUrlSpamCheck && ! m_check2 ) { m_check2 = true; if ( m_firstUrl.isSpam() ) { m_indexCode = EDOCURLSPAM; m_indexCodeValid = true; return &m_indexCode; } } // . don't spider robots.txt urls for indexing! // . quickly see if we are a robots.txt url originally int32_t fulen = getFirstUrl()->getUrlLen(); char *fu = getFirstUrl()->getUrl(); char *fp = fu + fulen - 11; if ( fulen > 12 && fp[1] == 'r' && ! strncmp ( fu + fulen - 11 , "/robots.txt" , 11 )) { m_indexCode = EBADURL; m_indexCodeValid = true; return &m_indexCode; } // if this is an injection and "newonly" is not zero then we // only want to do the injection if the url is "new", meaning not // already indexed. "m_wasContentInjected" will be true if this is // an injection. "m_newOnly" will be true if the injector only // wants to proceed with the injection if this url is not already // indexed. if ( m_wasContentInjected && m_newOnly ) { XmlDoc **pod = getOldXmlDoc ( ); if ( ! pod || pod == (XmlDoc **)-1 ) return (int32_t *)pod; XmlDoc *od = *pod; // if the old doc does exist and WAS NOT INJECTED itself // then abandon this injection. it was spidered the old // fashioned way and we want to preserve it and NOT overwrite // it with this injection. if ( od && ! od->m_wasContentInjected ) { m_indexCode = EABANDONED; m_indexCodeValid = true; return &m_indexCode; } // if it was injected itself, only abandon this injection // in the special case that m_newOnly is "2". otherwise // if m_newOnly is 1 then we will overwrite any existing // titlerecs that were injected themselves. if ( od && od->m_wasContentInjected && m_newOnly == 2 ) { m_indexCode = EABANDONED; m_indexCodeValid = true; return &m_indexCode; } } // need tagrec to see if banned TagRec *gr = getTagRec(); if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr; // this is an automatic ban! if ( gr->getLong("manualban",0) ) { m_indexCode = EDOCBANNED; m_indexCodeValid = true; return &m_indexCode; } // get the ip of the current url int32_t *ip = getIp ( ); if ( ! ip || ip == (int32_t *)-1 ) return (int32_t *)ip; if ( *ip == 0 ) { m_indexCode = EBADIP; m_indexCodeValid = true; return &m_indexCode; } // . check robots.txt // . uses the curernt url // . if we end in /robots.txt then this quickly returns true // . no, we still might want to index if we got link text, so just // check this again below bool *isAllowed = getIsAllowed(); if ( ! isAllowed || isAllowed == (void *)-1) return (int32_t *)isAllowed; /* if ( ! *isAllowed ) { m_indexCode = EDOCDISALLOWED; m_indexCodeValid = true; return &m_indexCode; } */ // . TCPTIMEDOUT, NOROUTETOHOST, EDOCUNCHANGED, etc. // . this will be the reply from diffbot.com if using diffbot int32_t *dstatus = getDownloadStatus(); if ( ! 
dstatus || dstatus == (void *)-1 ) return (int32_t *)dstatus; if ( *dstatus ) { m_indexCode = *dstatus; m_indexCodeValid = true; return &m_indexCode; } // check the mime HttpMime *mime = getMime(); if ( ! mime || mime == (HttpMime *)-1 ) return (int32_t *)mime; // no, now the smart compression will nuke a reply if it has // no good date or for other reasons... // if empty, bad mime //if ( mime->getMimeLen() <= 0 && ! m_recycleContent ) { // m_indexCode = EBADMIME; // m_indexCodeValid = true; // return &m_indexCode; //} // check redir url Url **redirp = getRedirUrl(); if ( ! redirp || redirp == (void *)-1 ) return (int32_t *)redirp; // this must be valid now if ( ! m_redirErrorValid ) { char *xx=NULL;*xx=0; } if ( m_redirError ) { m_indexCode = m_redirError; m_indexCodeValid = true; return &m_indexCode; } int64_t *d = getDocId(); if ( ! d || d == (void *)-1 ) return (int32_t *)d; if ( *d == 0LL ) { m_indexCode = ENODOCID; m_indexCodeValid = true; return &m_indexCode; } // . is the same url but with a www. present already in titledb? // . example: if we are xyz.com and www.xyz.com is already in titledb // then nuke ourselves by setting m_indexCode to EDOCDUPWWW char *isWWWDup = getIsWWWDup (); if ( ! isWWWDup || isWWWDup == (char *)-1) return (int32_t *)isWWWDup; if ( *isWWWDup ) { m_indexCode = EDOCDUPWWW; m_indexCodeValid = true; return &m_indexCode; } uint16_t *charset = getCharset(); if ( ! charset && g_errno == EBADCHARSET ) { g_errno = 0; m_indexCode = EBADCHARSET; m_indexCodeValid = true; return &m_indexCode; } if ( ! charset || charset == (void *)-1) return (int32_t *)charset; // we had a 2024 for charset come back and that had a NULL // get_charset_str() but it was not supported if ( ! supportedCharset(*charset) ) { //&&get_charset_str(*charset) ) { m_indexCode = EBADCHARSET; m_indexCodeValid = true; return &m_indexCode; } // get local link info LinkInfo *info1 = getLinkInfo1(); if ( ! info1 || info1 == (LinkInfo *)-1 ) return (int32_t *)info1; // get remote link info LinkInfo **pinfo2 = getLinkInfo2(); if ( ! pinfo2 || pinfo2 == (void *)-1 ) return (int32_t *)pinfo2; LinkInfo *info2 = *pinfo2; // if robots.txt said no, and if we had no link text, then give up bool disallowed = true; if ( *isAllowed ) disallowed = false; if ( info1 && info1->hasLinkText() ) disallowed = false; if ( info2 && info2->hasLinkText() ) disallowed = false; // if we generated a new sitenuminlinks to store in tagdb, we might // want to add this for that only reason... consider! if ( disallowed ) { m_indexCode = EDOCDISALLOWED; m_indexCodeValid = true; return &m_indexCode; } // check for bad url extension, like .jpg Url *cu = getCurrentUrl(); if ( ! cu || cu == (void *)-1 ) return (int32_t *)cu; // take this check out because it is hurting // http://community.spiceworks.com/profile/show/Mr.T // because 't' was in the list of bad extensions. // now we use the url filters table to exclude the extensions we want. // and we use the 'ismedia' directive to exclude common media // extensions. having this check here is no longer needed and confusing // BUT on the otherhand stuff like .exe .rpm .deb is good to avoid! // so i'll just edit the list to remove more ambiguous extensions // like .f and .t bool badExt = cu->isBadExtension ( m_version ); if ( badExt && ! info1->hasLinkText() && ( ! info2 || ! info2->hasLinkText() ) ) { m_indexCode = EDOCBADCONTENTTYPE; m_indexCodeValid = true; return &m_indexCode; } int16_t *hstatus = getHttpStatus(); if ( ! 
hstatus || hstatus == (void *)-1 ) return (int32_t *)hstatus; if ( *hstatus != 200 ) { m_indexCode = EDOCBADHTTPSTATUS; m_indexCodeValid = true; return &m_indexCode; } // debug point //if ( cr->m_localCrawlInfo.m_pageDownloadAttempts >= 2 ) { // m_indexCode = ETCPTIMEDOUT; // m_indexCodeValid = true; // return &m_indexCode; //} // if this page is hijacked, toss it! char *hj = getIsHijacked(); if ( ! hj || hj == (char *)-1 ) return (int32_t *)hj; // if not allowed m_indexCode will be set if ( *hj ) { m_indexCode = EDOCHIJACKED; m_indexCodeValid = true; return &m_indexCode; } // check for EDOCISERRPG (custom error pages) char *isErrorPage = getIsErrorPage(); if ( !isErrorPage||isErrorPage==(void *)-1) return (int32_t *)isErrorPage; if ( *isErrorPage ) { m_indexCode = EDOCISERRPG; m_indexCodeValid = true; return &m_indexCode; } // . i moved this up to perhaps fix problems of two dup pages being // downloaded at about the same time // . are we a dup of another doc from any other site already indexed? char *isDup = getIsDup(); if ( ! isDup || isDup == (char *)-1 ) return (int32_t *)isDup; if ( *isDup ) { m_indexCode = EDOCDUP; m_indexCodeValid = true; return &m_indexCode; } // . is a non-canonical page that have // . also sets m_canonicanlUrl.m_url to it if we are not // . returns NULL if we are the canonical url // . do not do this check if the page was injected bool checkCanonical = true; if ( m_wasContentInjected ) checkCanonical = false; if ( m_isInjecting && m_isInjectingValid ) checkCanonical = false; // do not do canonical deletion if recycling content either i guess if ( m_sreqValid && m_sreq.m_recycleContent ) checkCanonical = false; // do not delete from being canonical if doing a query reindex if ( m_sreqValid && m_sreq.m_isPageReindex ) checkCanonical = false; if ( checkCanonical ) { Url **canon = getCanonicalRedirUrl(); if ( ! canon || canon == (void *)-1 ) return (int32_t *)canon; // if there is one then we are it's leaf, it is the primary // page so we should not index ourselves if ( *canon ) { m_indexCode = EDOCNONCANONICAL; m_indexCodeValid = true; return &m_indexCode; } } // was page unchanged since last time we downloaded it? XmlDoc **pod = getOldXmlDoc ( ); if ( ! pod || pod == (XmlDoc **)-1 ) return (int32_t *)pod; XmlDoc *od = NULL; if ( *pod ) od = *pod; // if recycling content is true you gotta have an old title rec. if ( ! od && m_recycleContent ) { m_indexCode = ENOTITLEREC; m_indexCodeValid = true; return &m_indexCode; } bool check = true; if ( ! od ) check = false; // do not do this logic for diffbot because it might want to get // the diffbot reply even if page content is the same, because it // might have an ajax call that updates the product price. // onlyProcessIfNewUrl defaults to true, so typically even diffbot // crawls will do this check. if ( cr->m_isCustomCrawl && ! cr->m_diffbotOnlyProcessIfNewUrl && // but allow urls like *-diffbotxyz2445187448 to be deduped, // that is the whole point of this line ! m_isDiffbotJSONObject ) check = false; if ( m_sreqValid && m_sreq.m_ignoreDocUnchangedError ) check = false; // or if recycling content turn this off as well! otherwise // it will always be 100% the same if ( m_recycleContent ) check = false; if ( check ) { // check inlinks now too! LinkInfo *info1 = getLinkInfo1 (); if ( ! info1 || info1 == (LinkInfo *)-1 ) return (int32_t *)info1; LinkInfo *info2 = od->getLinkInfo1 (); if ( ! 
info2 || info2 == (LinkInfo *)-1 ) return (int32_t *)info2; Inlink *k1 = NULL; Inlink *k2 = NULL; char *s1, *s2; int32_t len1,len2; if ( info1->getNumGoodInlinks() != info2->getNumGoodInlinks() ) goto changed; for ( ; k1=info1->getNextInlink(k1) , k2=info2->getNextInlink(k2); ) { if ( ! k1 ) break; if ( ! k2 ) break; if ( k1->m_siteNumInlinks != k2->m_siteNumInlinks ) goto changed; s1 = k1->getLinkText(); len1 = k1->size_linkText - 1; // exclude \0 s2 = k2->getLinkText(); len2 = k2->size_linkText - 1; // exclude \0 if ( len1 != len2 ) goto changed; if ( len1 > 0 && memcmp(s1,s2,len1) != 0 ) goto changed; } // no change in link text, look for change in page content now int32_t *ch32 = getContentHash32(); if ( ! ch32 || ch32 == (void *)-1 ) return (int32_t *)ch32; if ( *ch32 == od->m_contentHash32 ) { m_indexCode = EDOCUNCHANGED; m_indexCodeValid = true; return &m_indexCode; } } changed: // words Words *words = getWords(); if ( ! words || words == (Words *)-1 ) return (int32_t *)words; // we set the D_IS_IN_DATE flag for these bits Bits *bits = getBits(); if ( ! bits ) return NULL; // . check for date buffer overflow before setting sections // . returns false and sets g_errno on error /* if ( ! m_dates.parseDates ( words , DF_FROM_BODY , bits )) { // sanity check if ( ! g_errno ) { char *xx=NULL;*xx=0; } // note it log("doc: parseDates: %s",mstrerror(g_errno)); // this just means we ran out of stack space to parse // out all the dates, so ignore and continue... that way // Spider.cpp does not give up and keep retrying us over // and over again if ( g_errno != EBUFOVERFLOW ) return NULL; g_errno = 0; m_indexCode = EDOCBADDATES; m_indexCodeValid = true; return &m_indexCode; } */ // bad sections? fixes http://www.beerexpedition.com/northamerica.shtml // being continuously respidered when its lock expires every // MAX_LOCK_AGE seconds Sections *sections = getSections(); // on EBUFOVERFLOW we will NEVER be able to parse this url // correctly so do not retry! if ( ! sections && g_errno == EBUFOVERFLOW ) { g_errno = 0; m_indexCode = EBUFOVERFLOW; m_indexCodeValid = true; return &m_indexCode; } if (!sections||sections==(Sections *)-1) return (int32_t *)sections; if ( sections->m_numSections == 0 && words->m_numWords > 0 ) { m_indexCode = EDOCBADSECTIONS; m_indexCodeValid = true; return &m_indexCode; } // i think an oom error is not being caught by Sections.cpp properly if ( g_errno ) { char *xx=NULL;*xx=0; } Dates *dp = getDates(); if ( ! dp && g_errno == EBUFOVERFLOW ) { g_errno = 0; m_indexCode = EBUFOVERFLOW; m_indexCodeValid = true; return &m_indexCode; } if ( ! dp || dp == (Dates *)-1 ) return (int32_t *)dp; // make sure address buffers did not overflow Addresses *aa = getAddresses (); if ( (! aa && g_errno == EBUFOVERFLOW) || // it sets m_breached now if there's a problem (aa && aa->m_breached) ) { g_errno = 0; m_indexCode = EBUFOVERFLOW; m_indexCodeValid = true; return &m_indexCode; } if ( ! aa || aa == (void *)-1 ) return (int32_t *)aa; // are we a root? char *isRoot = getIsSiteRoot(); if ( ! isRoot || isRoot == (char *)-1 ) return (int32_t *)isRoot; // get the tag rec //TagRec *gr = getTagRec (); //if ( ! 
gr || gr == (TagRec *)-1 ) return (int32_t *)gr; bool spamCheck = true; // if we are a root, allow repeat spam if ( *isRoot ) spamCheck = false; // if we are being spidered deep, allow repeat spam if ( gr->getLong("deep",0) ) spamCheck = false; // not for crawlbot if ( cr->m_isCustomCrawl ) spamCheck = false; // only html for now if ( m_contentTypeValid && m_contentType != CT_HTML ) spamCheck =false; // turn this off for now spamCheck = false; // otherwise, check the weights if ( spamCheck ) { char *ws = getWordSpamVec(); if ( ! ws || ws == (void *)-1 ) return (int32_t *)ws; if ( m_isRepeatSpammer ) { m_indexCode = EDOCREPEATSPAMMER; m_indexCodeValid = true; return &m_indexCode; } } // validate this here so getSpiderPriority(), which calls // getUrlFilterNum(), which calls getNewSpiderReply(), which calls // us, getIndexCode() does not repeat all this junk //m_indexCodeValid = true; //m_indexCode = 0; // fix query reindex on global-index from coring because // the spider request is null if ( m_isDiffbotJSONObject ) { m_indexCode = 0; m_indexCodeValid = true; return &m_indexCode; } // this needs to be last! int32_t *priority = getSpiderPriority(); if ( ! priority || priority == (void *)-1) { // allow this though if ( g_errno == EBUFOVERFLOW ) { g_errno = 0; m_indexCode = EBUFOVERFLOW; m_indexCodeValid = true; return &m_indexCode; } // but if it blocked, then un-validate it m_indexCodeValid = false; // and return to be called again i hope return (int32_t *)priority; } if ( *priority == SPIDER_PRIORITY_FILTERED ) { m_indexCode = EDOCFILTERED; m_indexCodeValid = true; return &m_indexCode; } if ( *priority == SPIDER_PRIORITY_BANNED ) { m_indexCode = EDOCBANNED; m_indexCodeValid = true; return &m_indexCode; } // . if using diffbot and the diffbot reply had a time out error // or otherwise... diffbot failure demands a re-try always i guess. // put this above getSpiderPriority() call otherwise we end up in // a recursive loop with getIndexCode() and getNewSpiderReply() // . NO, don't do this anymore, however, if there is a diffbot // reply error then record it in the spider reply BUT only if it is // a diffbot reply error that warrants a retry. for instance, // EDIFFBOTCOULDNOTDOWNLOAD happens when diffbot got a 404 or 500 // error trying to download the page so it probably should not // retry. but EDIFFBOTREQUESTTIMEDOUT should retry. // SafeBuf *dbr = getDiffbotReply(); // if ( ! dbr || dbr == (void *)-1 ) return (int32_t *)dbr; // if ( m_diffbotReplyValid && m_diffbotReplyError ) { // m_indexCode= m_diffbotReplyError; // m_indexCodeValid = true; // return &m_indexCode; // } // no error otherwise m_indexCode = 0; m_indexCodeValid = true; return &m_indexCode; } char *XmlDoc::prepareToMakeTitleRec ( ) { // do not re-call this for speed if ( m_prepared ) return (char *)1; int32_t *indexCode = getIndexCode(); if (! indexCode || indexCode == (void *)-1) return (char *)indexCode; if ( *indexCode ) { m_prepared = true; return (char *)1; } // // do all the sets here // // . this gets our old doc from titledb, if we got it // . TODO: make sure this is cached in the event of a backoff, we // will redo this again!!! IMPORTANT!!! char *isIndexed = getIsIndexed(); if ( ! isIndexed || isIndexed == (char *)-1) return (char *)isIndexed; CollectionRec *cr = getCollRec(); if ( ! 
cr ) return NULL; // if we are injecting into the "qatest123" coll, then we need to have // m_spideredTimeValid be true before calling getIsSpam() which calls // getSiteNumInlinks() which adds tags to tagdb using that date, but // only for the "qatest123" coll! // that keeps our parser output consistent across runs! char **content = NULL; if ( ! strcmp ( cr->m_coll,"qatest123") ) { content = getContent ( ); if ( ! content || content == (void *)-1 ) return (char *)content; } // get our site root char *mysite = getSite(); if ( ! mysite || mysite == (void *)-1 ) return (char *)mysite; // if we are a root page, update tagdb with the root lang id //bool *status1 = updateRootLangId(); //if ( ! status1 || status1 == (void *)-1 ) return (char *)status1; // if we are a root page, update tagdb with the root lang id //bool *status2 = updateSiteTitleBuf(); //if ( ! status2 || status2 == (void *)-1 ) return (char *)status2; // if we found some default venue addresses on page, add to tagdb //bool *status3 = updateVenueAddresses(); //if ( ! status3 || status3 == (void *)-1 ) return (char *)status3; // add "firstip" to tag rec if we need to //bool *status4 = updateFirstIp(); //if ( ! status4 || status4 == (void *)-1 ) return (char *)status4; uint8_t *langId = getLangId(); if ( ! langId || langId == (uint8_t *)-1 ) return (char *) langId; int32_t *datedbDate = getPubDate(); if ( ! datedbDate || datedbDate == (int32_t *)-1 ) return (char *)datedbDate; getHostHash32a(); getContentHash32(); //Images *images = getImages(); //if ( ! images || images == (Images *)-1 ) return (char *)images; char **id = getThumbnailData(); if ( ! id || id == (void *)-1 ) return (char *)id; int8_t *hopCount = getHopCount(); if ( ! hopCount || hopCount == (void *)-1 ) return (char *)hopCount; char *spiderLinks = getSpiderLinks(); if ( ! spiderLinks || spiderLinks == (char *)-1 ) return (char *)spiderLinks; //int32_t *nextSpiderTime = getNextSpiderTime(); //if ( ! nextSpiderTime || nextSpiderTime == (int32_t *)-1 ) // return (char *)nextSpiderTime; //int8_t *nextSpiderPriority = getNextSpiderPriority(); //if ( ! nextSpiderPriority || nextSpiderPriority == (void *)-1 ) // return (char *)nextSpiderPriority; int32_t *firstIndexedDate = getFirstIndexedDate(); if ( ! firstIndexedDate || firstIndexedDate == (int32_t *)-1 ) return (char *)firstIndexedDate; int32_t *outlinksAddedDate = getOutlinksAddedDate(); if ( ! outlinksAddedDate || outlinksAddedDate == (int32_t *)-1 ) return (char *)outlinksAddedDate; uint16_t *countryId = getCountryId(); if ( ! countryId||countryId==(uint16_t *)-1) return (char *)countryId; char *trunc = getIsContentTruncated(); if ( ! trunc || trunc == (char *)-1 ) return (char *)trunc; char *pl = getIsPermalink(); if ( ! pl || pl == (char *)-1 ) return (char *)pl; //int32_t *numBannedOutlinks = getNumBannedOutlinks(); // set this //m_numBannedOutlinks8 = score32to8 ( *numBannedOutlinks ); Dates *dp = getDates(); if ( ! dp || dp == (Dates *)-1 ) return (char *)dp; // . before storing this into title Rec, make sure all tags // are valid and tagRec is up to date // . like we might need to update the contact info, siteNumInlinks, // or other tags because, for instance, contact info might not // be in there because isSpam() never required it. int32_t *sni = getSiteNumInlinks(); if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni; char *hci = getHasContactInfo(); if ( ! hci || hci == (char *)-1 ) return (char *)hci; char *ict = getIsContentTruncated(); if ( ! 
ict || ict == (char *)-1 ) return (char *)ict; int64_t **wd = getWikiDocIds(); if ( ! wd || wd == (void *)-1 ) return (char *)wd; int64_t **avp = getAdVector(); if ( ! avp || avp == (void *)-1 ) return (char *)avp; char *at = getIsAdult(); if ( ! at || at == (void *)-1 ) return (char *)at; char *ls = getIsLinkSpam(); if ( ! ls || ls == (void *)-1 ) return (char *)ls; uint32_t *tph = getTagPairHash32(); if ( ! tph || tph == (uint32_t *)-1 ) return (char *)tph; // sets the ptr_sectionsReply, that is all we need it to do //char **sd = getSectionsReply ( ) ; //if ( ! sd || sd == (void *)-1 ) return (char *)sd; // sets the ptr_addressReply, that is all we need it to do //char **ad = getAddressReply ( ) ; //if ( ! ad || ad == (void *)-1 ) return (char *)ad; uint8_t *rl = getRootLangId(); if ( ! rl || rl == (void *)-1 ) return (char *)rl; int32_t **pcids = getCatIds(); if ( ! pcids || pcids == (void *)-1) return (char *)pcids; // get dmoz ptr_dmozTitles, ptr_dmozSumms, ptr_dmozAnchors if ( ! setDmozInfo() ) return (char *)-1; m_prepared = true; return (char *)1; } #define MAX_DMOZ_TITLES 10 int32_t *XmlDoc::getNumDmozEntries() { // MDW: wth is this? //int32_t **getDmozCatIds(); int32_t nc = size_catIds / 4; if ( nc > MAX_DMOZ_TITLES ) nc = MAX_DMOZ_TITLES; m_numDmozEntries = nc; return &m_numDmozEntries; } // list of \0 terminated titles, etc. use getNumDmozTitles() to get # char **XmlDoc::getDmozTitles ( ) { // returns false if blocked if ( ! setDmozInfo() ) return (char **)-1; if ( g_errno ) return NULL; return &ptr_dmozTitles; } char **XmlDoc::getDmozSummaries ( ) { // returns false if blocked if ( ! setDmozInfo() ) return (char **)-1; if ( g_errno ) return NULL; return &ptr_dmozSumms; } char **XmlDoc::getDmozAnchors ( ) { // returns false if blocked if ( ! setDmozInfo() ) return (char **)-1; if ( g_errno ) return NULL; return &ptr_dmozAnchors; } // returns false if blocked, true otherwise. sets g_errno on error & rets true bool XmlDoc::setDmozInfo () { if ( m_dmozInfoValid ) return true; g_errno = 0; // return true and set g_errno on error if ( ! m_dmozBuf.reserve(12000) ) { log("xmldoc: error getting dmoz info: %s",mstrerror(g_errno)); // ensure log statement does not clear g_errno if ( ! g_errno ) { char *xx=NULL;*xx=0; } return true; } // start here char *dmozBuf = m_dmozBuf.getBufStart(); char *titles = dmozBuf; char *summs = dmozBuf+5000; char *anchors = dmozBuf+10000; // the end of it char *dtend = dmozBuf + 5000; char *dsend = dmozBuf + 10000; char *daend = dmozBuf + 12000; // point into those bufs char *dt = titles; char *ds = summs; char *da = anchors; // MDW: i limit this to 10 to save stack space! int32_t nc = size_catIds / 4; if ( nc > MAX_DMOZ_TITLES ) nc = MAX_DMOZ_TITLES; for (int32_t i = 0; i < nc ; i++) { // breathe QUICKPOLL ( m_niceness ); // temp stuff int32_t dtlen = 0; int32_t dslen = 0; unsigned char dalen = 0; // . store all dmoz info separated by \0's into titles[] buffer // . crap, this does a disk read and blocks on that // // . TODO: make it non-blocking!!!! 
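// . illustrative usage sketch (commented out, not part of the build): the
//   layout the loop below produces. m_dmozBuf packs up to MAX_DMOZ_TITLES
//   titles at offset 0, summaries at offset 5000 and anchors at offset
//   10000, each entry \0-terminated back to back, and size_dmozTitles/
//   size_dmozSumms/size_dmozAnchors cover the packed runs. a consumer
//   would walk one run like this; the function name is hypothetical.
/*
static void sketchPrintDmozTitles ( char *titles , int32_t titlesSize ) {
	char *p    = titles;
	char *pend = titles + titlesSize;
	while ( p < pend ) {
		// each packed entry is a NUL-terminated string, maybe empty
		log("dmoz: title=%s", p );
		p += strlen(p) + 1;
	}
}
*/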
// g_categories->getTitleAndSummary ( m_firstUrl.getUrl(), m_firstUrl.getUrlLen(), ptr_catIds[i], dt,//&titles[titlesLen], &dtlen,//&titleLens[i], dtend-dt, ds,//&summs[summsLen], &dslen,//&summLens[i], dsend-ds, da,//&anchors[anchorsLen], &dalen,//&anchorLens[i], daend-da, m_niceness); // advance ptrs dt += dtlen; ds += dslen; da += dalen; // null terminate *dt++ = 0; *ds++ = 0; *da++ = 0; } // if empty, make it a \0 to keep in sync with the rest if ( dt == titles ) *dt++ = '\0'; if ( ds == summs ) *ds++ = '\0'; if ( da == anchors ) *da++ = '\0'; // set these ptr_dmozTitles = titles; ptr_dmozSumms = summs; ptr_dmozAnchors = anchors; size_dmozTitles = dt - titles; size_dmozSumms = ds - summs; size_dmozAnchors = da - anchors; m_dmozInfoValid = true; return true; } // . create and store the titlerec into "buf". // . it is basically the header part of all the member vars in this XmlDoc. // . it has a key,dataSize,compressedData so it can be a record in an Rdb // . return true on success, false on failure bool XmlDoc::setTitleRecBuf ( SafeBuf *tbuf, int64_t docId, int64_t uh48 ){ //setStatus ( "making title rec"); // assume could not make one because we were banned or something tbuf->purge(); // m_titleRec = NULL; // start setting members in THIS's header before compression m_version = TITLEREC_CURRENT_VERSION; // tag rec must have "sitenuminlinks" in it //if (! m_newTagRec.getTag("sitenuminlinks") ) { char *xx=NULL;*xx=0; } // we often update m_oldTagRec above by calling updateRootLangId(), etc // so update the size of our tag rec here //size_tagRecData = m_oldTagRec.getSize(); // and sanity check this //if( ptr_tagRecData != (char *)&m_oldTagRec ) { char *xx=NULL;*xx=0; } // lookup dmoz title and summary for this site //int32_t titleLens [10]; //int32_t summLens [10]; //unsigned char anchorLens [10]; //int32_t titlesLen = 0; //int32_t summsLen = 0; //int32_t anchorsLen = 0; //char titles [10*1024]; //char summs [10*4096]; //char anchors [10* 256]; /* MDW oct 12 2013 - why is this here? we should store this info at spider time? char *titles = m_dmozBuf; char *summs = m_dmozBuf+5000; char *anchors = m_dmozBuf+10000; // the end of it char *dtend = m_dmozBuf + 5000; char *dsend = m_dmozBuf + 10000; char *daend = m_dmozBuf + 12000; // point into those bufs char *dt = titles; char *ds = summs; char *da = anchors; // MDW: i limit this to 10 to save stack space! int32_t nc = size_catIds / 4; if ( nc > 10 ) nc = 10; for (int32_t i = 0; i < nc ; i++) { // breathe QUICKPOLL ( m_niceness ); // temp stuff int32_t dtlen = 0; int32_t dslen = 0; unsigned char dalen = 0; // . store all dmoz info separated by \0's into titles[] buffer // . crap, this does a disk read and blocks on that // // . TODO: make it non-blocking!!!! // g_categories->getTitleAndSummary ( m_firstUrl.getUrl(), m_firstUrl.getUrlLen(), ptr_catIds[i], dt,//&titles[titlesLen], &dtlen,//&titleLens[i], dtend-dt, ds,//&summs[summsLen], &dslen,//&summLens[i], dsend-ds, da,//&anchors[anchorsLen], &dalen,//&anchorLens[i], daend-da, m_niceness); // advance ptrs dt += dtlen; ds += dslen; da += dalen; // null terminate if ( dtlen>0 && dt[dtlen-1]!='\0' ) { *dt++=0; dtlen++; } if ( dslen>0 && ds[dslen-1]!='\0' ) { *ds++=0; dslen++; } if ( dalen>0 && da[dalen-1]!='\0' ) { *da++=0; dalen++; } // must always be something!
if ( dtlen==0 ) {*dt++=0; dtlen++;} if ( dslen==0 ) {*ds++=0; dslen++;} if ( dalen==0 ) {*da++=0; dalen++;} } // set these ptr_dmozTitles = titles; ptr_dmozSumms = summs; ptr_dmozAnchors = anchors; size_dmozTitles = dt - titles; size_dmozSumms = ds - summs; size_dmozAnchors = da - anchors; */ // set our crap that is not necessarily set //ptr_firstUrl = m_firstUrl.getUrl(); //ptr_redirUrl = m_redirUrl.getUrl(); //ptr_tagRecData = (char *)&m_oldTagRec; // this must be valid now //if ( ! m_skipIndexingValid ) { char *xx=NULL;*xx=0; } // set this m_headerSize = (char *)&ptr_firstUrl - (char *)&m_headerSize; // add in variable length data int32_t *ps = (int32_t *)&size_firstUrl; // data ptr, consider a NULL to mean empty too! char **pd = (char **)&ptr_firstUrl; // how many XmlDoc::ptr_* members do we have? set "np" to that int32_t np = ((char *)&size_firstUrl - (char *)&ptr_firstUrl) ; np /= sizeof(char *); // count up total we need to alloc int32_t need1 = m_headerSize; // clear these m_internalFlags1 = 0; // loop over em for ( int32_t i = 0 ; i < np ; i++ , ps++ , pd++ ) { // skip if empty if ( *ps <= 0 ) continue; // or empty string ptr if ( ! *pd ) continue; // skip utf8content if we should -- no events or addresses //if ( m_skipIndexing && pd == &ptr_utf8Content ) continue; // 4 bytes for the size need1 += 4; // add it up need1 += *ps; // make the mask uint32_t mask = 1 << i ; // add it in m_internalFlags1 |= mask; } // alloc the buffer char *ubuf = (char *) mmalloc ( need1 , "xdtrb" ); // return NULL with g_errno set on error if ( ! ubuf ) return false; // serialize into it char *p = ubuf; // copy our crap into there gbmemcpy ( p , &m_headerSize , m_headerSize ); // skip it p += m_headerSize; // reset data ptrs pd = (char **)&ptr_firstUrl; // reset data sizes ps = (int32_t *)&size_firstUrl; // then variable length data for ( int32_t i = 0 ; i < np ; i++ , ps++ , pd++ ) { // skip if empty, do not serialize if ( ! *ps ) continue; // or empty string ptr if ( ! *pd ) continue; // skip utf8content if we should -- no events or addresses //if ( m_skipIndexing && pd == &ptr_utf8Content ) continue; // store size first *(int32_t *)p = *ps; p += 4; // then the data gbmemcpy ( p , *pd , *ps ); // skip *ps bytes we wrote. should include a \0 p += *ps; } // sanity check if ( p != ubuf + need1 ) { char *xx=NULL; *xx=0; } // now restore it for other functions to use //size_content = saved; // . now compress our "title rec" data into a titleRec // . cbuf should not be set //if ( cbuf ) { // log(LOG_LOGIC,"db: titlerec: compress: cbuf is set."); // char *p = NULL; *p = 0; exit(-1); //} // should we free cbuf on our reset/destruction? //m_owncbuf = ownCompressedData; // . make a buf big enough to hold compressed, we'll realloc afterwards // . according to zlib.h line 613 compress buffer must be .1% larger // than source plus 12 bytes. (i add one for round off error) // . now i added another extra 12 bytes cuz compress seemed to want it int32_t need2 = ((int64_t)need1 * 1001LL) / 1000LL + 13 + 12; // we also need to store a key then regular dataSize then // the uncompressed size in cbuf before the compression of m_ubuf int32_t hdrSize = sizeof(key_t) + 4 + 4; // . now i add 12 bytes more so Msg14.cpp can also squeeze in a // negative key to delete the old titleRec, cuz we use this cbuf // to set our list that we add to our twins with // . 
we now store the negative rec before the positive rec in Msg14.cpp //hdrSize += sizeof(key_t) + 4; need2 += hdrSize; // alloc what we need //char *cbuf = (char *) mmalloc ( need2 ,"TitleRecc"); //if ( ! cbuf ) return false; // return false on error if ( ! tbuf->reserve ( need2 ,"titbuf" ) ) return false; // int16_tcut char *cbuf = tbuf->getBufStart(); // set cbuf sizes, we set cbufSize below to fit exactly used buf //int32_t cbufMaxSize = need2; // . how big is the buf we're passing to ::compress()? // . don't include the last 12 byte, save for del key in Msg14.cpp int32_t size = need2 - hdrSize ; // . uncompress the data into ubuf // . this will reset cbufSize to a smaller value probably // . "size" is set to how many bytes we wrote into "cbuf + hdrSize" int err = gbcompress ( (unsigned char *)cbuf + hdrSize, (uint32_t *)&size, (unsigned char *)ubuf , (uint32_t )need1 ); // note it //log("test: compressed %s from %"INT32" to %"INT32" bytes", // m_firstUrl.m_url,need2-hdrSize,size); // free the buf we were trying to compress now mfree ( ubuf , need1 , "trub" ); // we should check ourselves if ( err == Z_OK && size > (need2 - hdrSize ) ) { //mfree ( cbuf , need2 ,"TitleRecc" ); tbuf->purge(); g_errno = ECOMPRESSFAILED; log("db: Failed to compress document of %"INT32" bytes. " "Provided buffer of %"INT32" bytes.", size, (need2 - hdrSize ) ); return false; } // check for error if ( err != Z_OK ) { //mfree ( cbuf , need2 ,"TitleRecc" ); tbuf->purge(); g_errno = ECOMPRESSFAILED; log("db: Failed to compress document."); return false; } // calc cbufSize, the uncompressed header + compressed stuff //cbufSize = hdrSize + size ; //int64_t uh48 = getFirstUrlHash48(); // . make the key from docId // . false = delkey? //m_titleRecKey = g_titledb.makeKey (*getDocId(),uh48,false);//delkey? key_t tkey = g_titledb.makeKey (docId,uh48,false);//delkey? // validate it //m_titleRecKeyValid = true; // get a ptr to the Rdb record at start of the header p = cbuf; // skip over the negative rec reserved space for Msg14.cpp //p += 12 + 4; // . store key in header of cbuf // . store in our host byte ordering so we can be a rec in an RdbList *(key_t *) p = tkey; p += sizeof(key_t); // store total dataSize in header (excluding itself and key only) int32_t dataSize = size + 4; *(int32_t *) p = dataSize ; p += 4; // store uncompressed size in header *(int32_t *) p = need1 ; p += 4; // sanity check if ( p != cbuf + hdrSize ) { char *xx = NULL; *xx = 0; } // sanity check if ( need1 <= 0 ) { char *xx = NULL; *xx = 0; } // advance over data p += size; // update safebuf::m_length so it is correct tbuf->setLength ( p - cbuf ); return true; } // . return NULL and sets g_errno on error // . returns -1 if blocked SafeBuf *XmlDoc::getTitleRecBuf ( ) { // return it now if we got it already if ( m_titleRecBufValid ) return &m_titleRecBuf; setStatus ( "making title rec"); // did one of our many blocking function calls have an error? if ( g_errno ) return NULL; // . HACK so that TitleRec::isEmpty() return true // . faster than calling m_titleRec.reset() //m_titleRec.m_url.m_ulen = 0; int32_t *indexCode = getIndexCode(); // not allowed to block here if ( indexCode == (void *)-1) { char *xx=NULL;*xx=0; } // return on errors with g_errno set if ( ! indexCode ) return NULL; // force delete? EDOCFORCEDELETE if ( *indexCode ) { m_titleRecBufValid = true; return &m_titleRecBuf; } // . internal callback // . so if any of the functions we end up calling directly or // indirectly block and return -1, we will be re-called from the top if ( ! 
m_masterLoop ) { m_masterLoop = getTitleRecBufWrapper; m_masterState = this; } /* // parsing knobs if ( ! m_titleWeightValid ) { // TODO: watchout for overruns!! these are 16-bits only! //m_eliminateMenus = cr->m_eliminateMenus; m_titleWeight = cr->m_titleWeight; m_headerWeight = cr->m_headerWeight; m_urlPathWeight = cr->m_urlPathWeight; m_externalLinkTextWeight = cr->m_externalLinkTextWeight; m_internalLinkTextWeight = cr->m_internalLinkTextWeight; m_conceptWeight = cr->m_conceptWeight; //int32_t siteNumInlinksBoost = cr->m_siteNumInlinksBoost; // validate these //m_eliminateMenusValid = true; m_titleWeightValid = true; m_headerWeightValid = true; m_urlPathWeightValid = true; m_externalLinkTextWeightValid = true; m_internalLinkTextWeightValid = true; m_conceptWeightValid = true; } */ ///////// // // IF ANY of these validation sanity checks fail then update // prepareToMakeTitleRec() so it makes them valid!!! // ///////// // verify key parts if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; } // verify record parts //if ( ! m_versionValid ) { char *xx=NULL;*xx=0; } if ( ! m_ipValid ) { char *xx=NULL;*xx=0; } if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; } if ( ! m_pubDateValid ) { char *xx=NULL;*xx=0; } if ( ! m_firstIndexedDateValid ) { char *xx=NULL;*xx=0; } if ( ! m_outlinksAddedDateValid ) { char *xx=NULL;*xx=0; } if ( ! m_charsetValid ) { char *xx=NULL;*xx=0; } if ( ! m_countryIdValid ) { char *xx=NULL;*xx=0; } if ( ! m_httpStatusValid ) { char *xx=NULL;*xx=0; } /* if ( ! m_titleWeightValid ) { char *xx=NULL;*xx=0; } if ( ! m_headerWeightValid ) { char *xx=NULL;*xx=0; } if ( ! m_urlPathWeightValid ) { char *xx=NULL;*xx=0; } if ( ! m_externalLinkTextWeightValid ) { char *xx=NULL;*xx=0; } if ( ! m_internalLinkTextWeightValid ) { char *xx=NULL;*xx=0; } if ( ! m_conceptWeightValid ) { char *xx=NULL;*xx=0; } */ if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; } if ( ! m_siteNumInlinksUniqueIpValid ) { char *xx=NULL;*xx=0; } if ( ! m_siteNumInlinksUniqueCBlockValid ) { char *xx=NULL;*xx=0; } if ( ! m_siteNumInlinksTotalValid ) { char *xx=NULL;*xx=0; } //if ( ! m_sitePopValid ) { char *xx=NULL;*xx=0; } if ( ! m_rootLangIdValid ) { char *xx=NULL;*xx=0; } if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; } if ( ! m_metaListCheckSum8Valid ) { char *xx=NULL;*xx=0; } //if ( ! m_numBannedOutlinksValid ) { char *xx=NULL;*xx=0; } if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; } if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; } if ( ! m_isRSSValid ) { char *xx=NULL;*xx=0; } if ( ! m_isPermalinkValid ) { char *xx=NULL;*xx=0; } if ( ! m_isAdultValid ) { char *xx=NULL;*xx=0; } //if ( ! m_eliminateMenusValid ) { char *xx=NULL;*xx=0; } if ( ! m_spiderLinksValid ) { char *xx=NULL;*xx=0; } if ( ! m_isContentTruncatedValid ) { char *xx=NULL;*xx=0; } if ( ! m_isLinkSpamValid ) { char *xx=NULL;*xx=0; } // buffers if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; } if ( ! m_redirUrlValid ) { char *xx=NULL;*xx=0; } //if ( ! m_metaRedirUrlValid ) { char *xx=NULL;*xx=0; } if ( ! m_tagRecValid ) { char *xx=NULL;*xx=0; } if ( ! m_gigabitHashesValid ) { char *xx=NULL;*xx=0; } if ( ! m_adVectorValid ) { char *xx=NULL;*xx=0; } if ( ! m_wikiDocIdsValid ) { char *xx=NULL;*xx=0; } if ( ! m_imageDataValid ) { char *xx=NULL;*xx=0; } if ( ! m_catIdsValid ) { char *xx=NULL;*xx=0; } if ( ! m_indCatIdsValid ) { char *xx=NULL;*xx=0; } if ( ! m_dmozInfoValid ) { char *xx=NULL;*xx=0; } // if m_recycleContent is true, these are not valid if ( ! m_recycleContent ) { if ( ! m_rawUtf8ContentValid ) { char *xx=NULL;*xx=0; } if ( ! 
m_expandedUtf8ContentValid ) { char *xx=NULL;*xx=0; } } if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; } if ( ! m_datesValid ) { char *xx=NULL;*xx=0; } // why do we need valid sections for a titlerec? we no longer user // ptr_sectiondbData... //if ( ! m_sectionsValid ) { char *xx=NULL;*xx=0; } //if ( ! m_sectionsReplyValid ) { char *xx=NULL;*xx=0; } //if ( ! m_addressReplyValid ) { char *xx=NULL;*xx=0; } if ( ! m_siteValid ) { char *xx=NULL;*xx=0; } if ( ! m_linkInfo1Valid ) { char *xx=NULL;*xx=0; } if ( ! m_linkInfo2Valid ) { char *xx=NULL;*xx=0; } //if ( ! m_sectiondbDataValid ) { char *xx=NULL;*xx=0; } //if ( ! m_placedbDataValid ) { char *xx=NULL;*xx=0; } if ( ! m_clockCandidatesDataValid ) { char *xx=NULL;*xx=0; } // do we need these? if ( ! m_hostHash32aValid ) { char *xx=NULL;*xx=0; } if ( ! m_contentHash32Valid ) { char *xx=NULL;*xx=0; } //if ( ! m_tagHash32Valid ) { char *xx=NULL;*xx=0; } if ( ! m_tagPairHash32Valid ) { char *xx=NULL;*xx=0; } // sanity checks if ( ! m_addressesValid ) { char *xx=NULL;*xx=0; } // breathe QUICKPOLL( m_niceness ); setStatus ( "compressing into final title rec"); int64_t uh48 = getFirstUrlHash48(); int64_t *docId = getDocId(); // time it int64_t startTime = gettimeofdayInMilliseconds(); ////// // // fill in m_titleRecBuf // ////// // we need docid and uh48 for making the key of the titleRec if ( ! setTitleRecBuf ( &m_titleRecBuf , *docId , uh48 ) ) return NULL; // set this member down here because we can't set it in "xd" // because it is too int16_t of an xmldoc stub m_versionValid = true; // breathe QUICKPOLL( m_niceness ); // . add the stat // . use white for the stat g_stats.addStat_r ( 0 , startTime , gettimeofdayInMilliseconds(), 0x00ffffff ); QUICKPOLL( m_niceness ); char *cbuf = m_titleRecBuf.getBufStart(); m_titleRecKey = *(key_t *)cbuf; m_titleRecKeyValid = true; // we are legit //m_freeTitleRec = true; //m_titleRec = cbuf; // key + dataSize + ubufSize + compressedData //m_titleRecSize = sizeof(key_t)+ 4 + 4 + size; //m_titleRecAllocSize = need2; // now valid. congratulations! m_titleRecBufValid = true; return &m_titleRecBuf; } // . an "id" of 2 means very indicative of a dirty doc // . an "id" of 1 means it must be joined with another dirty word to indicate // . taken mostly from Url.cpp // . see matches2.h for Needle class definition static Needle s_dirtyWords [] = { {"upskirt" ,0,2,0,0,NULL,0,NULL}, {"downblouse" ,0,2,0,0,NULL,0,NULL}, {"shemale" ,0,1,0,0,NULL,0,NULL}, {"spank" ,0,1,0,0,NULL,0,NULL}, {"dildo" ,0,2,0,0,NULL,0,NULL}, {"bdsm" ,0,2,0,0,NULL,0,NULL}, {"voyeur" ,0,2,0,0,NULL,0,NULL}, {"fisting" ,0,2,0,0,NULL,0,NULL}, {"vibrator" ,0,2,0,0,NULL,0,NULL}, {"ejaculat" ,0,2,0,0,NULL,0,NULL}, {"rgasm" ,0,2,0,0,NULL,0,NULL}, {"orgy" ,0,2,0,0,NULL,0,NULL}, {"orgies" ,0,2,0,0,NULL,0,NULL}, {"stripper" ,0,1,0,0,NULL,0,NULL}, {"softcore" ,0,2,0,0,NULL,0,NULL}, {"whore" ,0,2,0,0,NULL,0,NULL}, // gary slutkin on ted.com. make this just 1 point. 
{"slut" ,0,1,0,0,NULL,0,NULL}, {"smut" ,0,2,0,0,NULL,0,NULL}, {"tits" ,0,2,0,0,NULL,0,NULL}, {"lesbian" ,0,2,0,0,NULL,0,NULL}, {"swinger" ,0,2,0,0,NULL,0,NULL}, {"fetish" ,0,2,0,0,NULL,0,NULL}, {"nude" ,0,1,0,0,NULL,0,NULL}, {"centerfold" ,0,2,0,0,NULL,0,NULL}, {"incest" ,0,2,0,0,NULL,0,NULL}, {"pedophil" ,0,2,0,0,NULL,0,NULL}, {"pedofil" ,0,2,0,0,NULL,0,NULL}, {"horny" ,0,2,0,0,NULL,0,NULL}, // horny toad {"pussy" ,0,2,0,0,NULL,0,NULL}, // pussy willow pussy cat {"pussies" ,0,2,0,0,NULL,0,NULL}, {"penis" ,0,2,0,0,NULL,0,NULL}, {"vagina" ,0,2,0,0,NULL,0,NULL}, {"phuck" ,0,2,0,0,NULL,0,NULL}, {"blowjob" ,0,2,0,0,NULL,0,NULL}, {"blow job" ,0,2,0,0,NULL,0,NULL}, {"gangbang" ,0,2,0,0,NULL,0,NULL}, {"xxx" ,0,1,0,0,NULL,0,NULL}, // yahoo.com has class="fz-xxxl" {"porn" ,0,2,0,0,NULL,0,NULL}, {"felch" ,0,2,0,0,NULL,0,NULL}, {"cunt" ,0,2,0,0,NULL,0,NULL}, {"bestial" ,0,2,0,0,NULL,0,NULL}, {"beastial" ,0,2,0,0,NULL,0,NULL}, {"kink" ,0,2,0,0,NULL,0,NULL}, // . "sex" is often substring in tagids. // . too many false positives, make "1" not "2" {"sex" ,0,1,0,0,NULL,0,NULL}, {"anal" ,0,2,0,0,NULL,0,NULL}, {"cum" ,0,2,0,0,NULL,0,NULL}, // often used for cumulative {"clit" ,0,2,0,0,NULL,0,NULL}, {"fuck" ,0,2,0,0,NULL,0,NULL}, {"boob" ,0,1,0,0,NULL,0,NULL}, {"wank" ,0,2,0,0,NULL,0,NULL}, {"fick" ,0,2,0,0,NULL,0,NULL}, {"eroti" ,0,2,0,0,NULL,0,NULL}, {"gay" ,0,1,0,0,NULL,0,NULL}, // make 1 pt. 'marvin gay' // new stuff not in Url.cpp {"thong" ,0,1,0,0,NULL,0,NULL}, {"masturbat" ,0,2,0,0,NULL,0,NULL}, {"bitch" ,0,1,0,0,NULL,0,NULL}, {"hell" ,0,1,0,0,NULL,0,NULL}, {"damn" ,0,1,0,0,NULL,0,NULL}, {"rimjob" ,0,2,0,0,NULL,0,NULL}, {"cunnilingu" ,0,2,0,0,NULL,0,NULL}, {"felatio" ,0,2,0,0,NULL,0,NULL}, {"fellatio" ,0,2,0,0,NULL,0,NULL}, {"dick" ,0,1,0,0,NULL,0,NULL}, {"cock" ,0,1,0,0,NULL,0,NULL}, {"rape" ,0,2,0,0,NULL,0,NULL}, {"raping" ,0,2,0,0,NULL,0,NULL}, {"bukake" ,0,2,0,0,NULL,0,NULL}, {"shit" ,0,2,0,0,NULL,0,NULL}, {"naked" ,0,1,0,0,NULL,0,NULL}, {"nympho" ,0,2,0,0,NULL,0,NULL}, {"hardcore" ,0,1,0,0,NULL,0,NULL}, // hardcore gamer, count as 1 {"sodom" ,0,2,0,0,NULL,0,NULL}, {"titties" ,0,2,0,0,NULL,0,NULL}, // re-do {"twat" ,0,2,0,0,NULL,0,NULL}, {"bastard" ,0,1,0,0,NULL,0,NULL}, {"erotik" ,0,2,0,0,NULL,0,NULL}, // EXCEPTIONS // smut {"transmut" ,0,-2,0,0,NULL,0,NULL}, {"bismuth" ,0,-2,0,0,NULL,0,NULL}, // sex {"middlesex" ,0,-1,0,0,NULL,0,NULL}, {"sussex" ,0,-1,0,0,NULL,0,NULL}, {"essex" ,0,-1,0,0,NULL,0,NULL}, {"deusex" ,0,-1,0,0,NULL,0,NULL}, {"sexchange" ,0,-1,0,0,NULL,0,NULL}, {"sexpress" ,0,-1,0,0,NULL,0,NULL}, {"sexpert" ,0,-1,0,0,NULL,0,NULL}, // EXCEPTIONS // sex {"middlesex" ,0,-1,0,0,NULL,0,NULL}, {"sussex" ,0,-1,0,0,NULL,0,NULL}, {"essex" ,0,-1,0,0,NULL,0,NULL}, {"deusex" ,0,-1,0,0,NULL,0,NULL}, {"sexchange" ,0,-1,0,0,NULL,0,NULL}, {"sexpress" ,0,-1,0,0,NULL,0,NULL}, {"sexpert" ,0,-1,0,0,NULL,0,NULL}, {"sexcel" ,0,-1,0,0,NULL,0,NULL}, {"sexist" ,0,-1,0,0,NULL,0,NULL}, {"sexile" ,0,-1,0,0,NULL,0,NULL}, {"sexperi" ,0,-1,0,0,NULL,0,NULL}, {"sexual" ,0,-1,0,0,NULL,0,NULL}, {"sexpose" ,0,-1,0,0,NULL,0,NULL}, {"sexclu" ,0,-1,0,0,NULL,0,NULL}, {"sexo" ,0,-1,0,0,NULL,0,NULL}, {"sexism" ,0,-1,0,0,NULL,0,NULL}, {"sexpan" ,0,-1,0,0,NULL,0,NULL}, // buttonsexpanion {"same-sex" ,0,-1,0,0,NULL,0,NULL}, {"opposite sex",0,-1,0,0,NULL,0,NULL}, // anal {"analog" ,0,-2,0,0,NULL,0,NULL}, {"analy" ,0,-2,0,0,NULL,0,NULL}, {"canal" ,0,-2,0,0,NULL,0,NULL}, {"kanal" ,0,-2,0,0,NULL,0,NULL}, {"banal" ,0,-2,0,0,NULL,0,NULL}, {"ianalbert" ,0,-2,0,0,NULL,0,NULL}, // ian albert // cum {"circum" ,0,-2,0,0,NULL,0,NULL}, {"cum 
laude" ,0,-2,0,0,NULL,0,NULL}, {"succum" ,0,-2,0,0,NULL,0,NULL}, {"cumber" ,0,-2,0,0,NULL,0,NULL}, {"docum" ,0,-2,0,0,NULL,0,NULL}, {"cumul" ,0,-2,0,0,NULL,0,NULL}, {"acumen" ,0,-2,0,0,NULL,0,NULL}, {"incum" ,0,-2,0,0,NULL,0,NULL}, {"capsicum" ,0,-2,0,0,NULL,0,NULL}, {"modicum" ,0,-2,0,0,NULL,0,NULL}, {"locum" ,0,-2,0,0,NULL,0,NULL}, {"scum" ,0,-2,0,0,NULL,0,NULL}, {"accum" ,0,-2,0,0,NULL,0,NULL}, {"cumbre" ,0,-2,0,0,NULL,0,NULL}, {"swank" ,0,-2,0,0,NULL,0,NULL}, {"fickle" ,0,-2,0,0,NULL,0,NULL}, {"traffick" ,0,-2,0,0,NULL,0,NULL}, {"scleroti" ,0,-2,0,0,NULL,0,NULL}, {"gaylor" ,0,-2,0,0,NULL,0,NULL}, {"gaynor" ,0,-2,0,0,NULL,0,NULL}, {"gayner" ,0,-2,0,0,NULL,0,NULL}, {"gayton" ,0,-2,0,0,NULL,0,NULL}, {"dipthong" ,0,-1,0,0,NULL,0,NULL}, // hell {"hellen" ,0,-1,0,0,NULL,0,NULL}, {"hellman" ,0,-1,0,0,NULL,0,NULL}, {"shell" ,0,-1,0,0,NULL,0,NULL}, {"mitchell" ,0,-1,0,0,NULL,0,NULL}, {"chelle" ,0,-1,0,0,NULL,0,NULL}, // me/michelle {"hello" ,0,-1,0,0,NULL,0,NULL}, {"moschella" ,0,-1,0,0,NULL,0,NULL}, {"othello" ,0,-1,0,0,NULL,0,NULL}, {"schelling" ,0,-1,0,0,NULL,0,NULL}, {"seychelles" ,0,-1,0,0,NULL,0,NULL}, {"wheller" ,0,-1,0,0,NULL,0,NULL}, {"winchell" ,0,-1,0,0,NULL,0,NULL}, // dick {"dicker" ,0,-1,0,0,NULL,0,NULL}, {"dickins" ,0,-1,0,0,NULL,0,NULL}, {"dickies" ,0,-1,0,0,NULL,0,NULL}, {"dickran" ,0,-1,0,0,NULL,0,NULL}, // cock {"babcock" ,0,-1,0,0,NULL,0,NULL}, {"cocked" ,0,-1,0,0,NULL,0,NULL}, {"cocking" ,0,-1,0,0,NULL,0,NULL}, {"cockpit" ,0,-1,0,0,NULL,0,NULL}, {"cockroach" ,0,-1,0,0,NULL,0,NULL}, {"cocktail" ,0,-1,0,0,NULL,0,NULL}, {"cocky" ,0,-1,0,0,NULL,0,NULL}, {"hancock" ,0,-1,0,0,NULL,0,NULL}, {"hitchcock" ,0,-1,0,0,NULL,0,NULL}, {"peacock" ,0,-1,0,0,NULL,0,NULL}, {"shuttlecock" ,0,-1,0,0,NULL,0,NULL}, {"stopcock" ,0,-1,0,0,NULL,0,NULL}, {"weathercock" ,0,-1,0,0,NULL,0,NULL}, {"woodcock" ,0,-1,0,0,NULL,0,NULL}, {"cockburn" ,0,-1,0,0,NULL,0,NULL}, // kink {"kinko" ,0,-2,0,0,NULL,0,NULL}, {"ukink" ,0,-2,0,0,NULL,0,NULL}, // ink shop in uk // naked {"snaked" ,0,-1,0,0,NULL,0,NULL}, // rape {"drape" ,0,-2,0,0,NULL,0,NULL}, {"grape" ,0,-2,0,0,NULL,0,NULL}, {"scrape" ,0,-2,0,0,NULL,0,NULL}, {"therape" ,0,-2,0,0,NULL,0,NULL}, {"trapez" ,0,-2,0,0,NULL,0,NULL}, {"parapet" ,0,-2,0,0,NULL,0,NULL}, {"scraping" ,0,-2,0,0,NULL,0,NULL}, {"draping" ,0,-2,0,0,NULL,0,NULL}, // twat {"twatch" ,0,-2,0,0,NULL,0,NULL}, // courtwatch -- cspan.org // clit {"heraclitus" ,0,-2,0,0,NULL,0,NULL}, // boob {"booboo" ,0,-1,0,0,NULL,0,NULL}, // shit {"shitak" ,0,-2,0,0,NULL,0,NULL} }; //// //// New stuff from sex.com adult word list //// //// //// make it a 2nd part because of performance limits on matches2.cpp algo //// static Needle s_dirtyWordsPart2 [] = { {"amateurfoto" ,0,2,0,0,NULL,0,NULL}, {"amateurhardcore" ,0,2,0,0,NULL,0,NULL}, {"amateurindex" ,0,2,0,0,NULL,0,NULL}, {"amateurnaked" ,0,2,0,0,NULL,0,NULL}, {"amatuerhardcore" ,0,2,0,0,NULL,0,NULL}, {"ampland" ,0,2,0,0,NULL,0,NULL}, //{"animehentai" ,0,2,0,0,NULL,0,NULL}, dup {"anitablonde" ,0,2,0,0,NULL,0,NULL}, {"asiacarrera" ,0,2,0,0,NULL,0,NULL}, {"asshole" ,0,2,0,0,NULL,0,NULL}, {"asslick" ,0,2,0,0,NULL,0,NULL}, {"asspic" ,0,2,0,0,NULL,0,NULL}, {"assworship" ,0,2,0,0,NULL,0,NULL}, //{"badgirl" ,0,2,0,0,NULL,0,NULL}, not necessarily bad {"bareceleb" ,0,2,0,0,NULL,0,NULL}, {"barenaked" ,0,2,0,0,NULL,0,NULL}, {"beaverboy" ,0,2,0,0,NULL,0,NULL}, {"beavershot" ,0,2,0,0,NULL,0,NULL}, // was beavershots //{"bigball" ,0,2,0,0,NULL,0,NULL}, // not necessarily bad {"bigbreast" ,0,2,0,0,NULL,0,NULL}, //{"bigbutt" ,0,2,0,0,NULL,0,NULL}, // not necessarily bad 
{"bigcock" ,0,2,0,0,NULL,0,NULL}, {"bigdick" ,0,2,0,0,NULL,0,NULL}, {"biggestdick" ,0,2,0,0,NULL,0,NULL}, {"biggesttit" ,0,2,0,0,NULL,0,NULL}, {"bighairyball" ,0,2,0,0,NULL,0,NULL}, {"bighooter" ,0,2,0,0,NULL,0,NULL}, {"bignipple" ,0,2,0,0,NULL,0,NULL}, {"bigtit" ,0,2,0,0,NULL,0,NULL}, {"blackbooty" ,0,2,0,0,NULL,0,NULL}, {"blackbutt" ,0,2,0,0,NULL,0,NULL}, {"blackcock" ,0,2,0,0,NULL,0,NULL}, {"blackdick" ,0,2,0,0,NULL,0,NULL}, {"blackhardcore" ,0,2,0,0,NULL,0,NULL}, {"blackonblonde" ,0,2,0,0,NULL,0,NULL}, {"blacksonblonde" ,0,2,0,0,NULL,0,NULL}, {"blacktit" ,0,2,0,0,NULL,0,NULL}, {"blacktwat" ,0,2,0,0,NULL,0,NULL}, {"boner" ,0,1,0,0,NULL,0,NULL}, // softcore, someone's lastname? {"bordello" ,0,2,0,0,NULL,0,NULL}, {"braless" ,0,2,0,0,NULL,0,NULL}, {"brothel" ,0,2,0,0,NULL,0,NULL}, {"bukake" ,0,2,0,0,NULL,0,NULL}, {"bukkake" ,0,2,0,0,NULL,0,NULL}, {"bustyblonde" ,0,2,0,0,NULL,0,NULL}, {"bustyceleb" ,0,2,0,0,NULL,0,NULL}, {"butthole" ,0,2,0,0,NULL,0,NULL}, {"buttman" ,0,2,0,0,NULL,0,NULL}, {"buttpic" ,0,2,0,0,NULL,0,NULL}, {"buttplug" ,0,2,0,0,NULL,0,NULL}, {"buttthumbnails" ,0,2,0,0,NULL,0,NULL}, {"callgirl" ,0,2,0,0,NULL,0,NULL}, {"celebritiesnaked" ,0,2,0,0,NULL,0,NULL}, {"celebritybush" ,0,2,0,0,NULL,0,NULL}, {"celebritybutt" ,0,2,0,0,NULL,0,NULL}, {"chaseylain" ,0,2,0,0,NULL,0,NULL}, {"chickswithdick" ,0,2,0,0,NULL,0,NULL}, {"christycanyon" ,0,2,0,0,NULL,0,NULL}, {"cicciolina" ,0,2,0,0,NULL,0,NULL}, //{"cunilingus" ,0,2,0,0,NULL,0,NULL}, {"cunniling" ,0,2,0,0,NULL,0,NULL}, // abbreviate {"cyberlust" ,0,2,0,0,NULL,0,NULL}, {"danniashe" ,0,2,0,0,NULL,0,NULL}, {"dicksuck" ,0,2,0,0,NULL,0,NULL}, {"dirtymind" ,0,2,0,0,NULL,0,NULL}, {"dirtypicture" ,0,2,0,0,NULL,0,NULL}, {"doggiestyle" ,0,2,0,0,NULL,0,NULL}, {"doggystyle" ,0,2,0,0,NULL,0,NULL}, {"domatrix" ,0,2,0,0,NULL,0,NULL}, {"dominatrix" ,0,2,0,0,NULL,0,NULL}, //{"dyke" ,0,2,0,0,NULL,0,NULL}, // dick van dyke! {"ejaculation" ,0,2,0,0,NULL,0,NULL}, {"erosvillage" ,0,2,0,0,NULL,0,NULL}, {"facesit" ,0,2,0,0,NULL,0,NULL}, {"fatass" ,0,2,0,0,NULL,0,NULL}, {"feetfetish" ,0,2,0,0,NULL,0,NULL}, {"felatio" ,0,2,0,0,NULL,0,NULL}, {"fellatio" ,0,2,0,0,NULL,0,NULL}, {"femdom" ,0,2,0,0,NULL,0,NULL}, {"fetishwear" ,0,2,0,0,NULL,0,NULL}, {"fettegirl" ,0,2,0,0,NULL,0,NULL}, {"fingerbang" ,0,2,0,0,NULL,0,NULL}, {"fingering" ,0,1,0,0,NULL,0,NULL}, // fingering the keyboard? 
use 1 {"flesh4free" ,0,2,0,0,NULL,0,NULL}, {"footfetish" ,0,2,0,0,NULL,0,NULL}, {"footjob" ,0,2,0,0,NULL,0,NULL}, {"footlicking" ,0,2,0,0,NULL,0,NULL}, {"footworship" ,0,2,0,0,NULL,0,NULL}, {"fornication" ,0,2,0,0,NULL,0,NULL}, {"freeass" ,0,2,0,0,NULL,0,NULL}, {"freebigtit" ,0,2,0,0,NULL,0,NULL}, {"freedick" ,0,2,0,0,NULL,0,NULL}, {"freehardcore" ,0,2,0,0,NULL,0,NULL}, //{"freehentai" ,0,2,0,0,NULL,0,NULL}, dup {"freehooter" ,0,2,0,0,NULL,0,NULL}, {"freelargehooter" ,0,2,0,0,NULL,0,NULL}, {"freenakedpic" ,0,2,0,0,NULL,0,NULL}, {"freenakedwomen" ,0,2,0,0,NULL,0,NULL}, {"freetit" ,0,2,0,0,NULL,0,NULL}, {"freevoyeur" ,0,2,0,0,NULL,0,NULL}, {"gratishardcoregalerie" ,0,2,0,0,NULL,0,NULL}, {"hardcorecelebs" ,0,2,0,0,NULL,0,NULL}, {"hardcorefree" ,0,2,0,0,NULL,0,NULL}, {"hardcorehooter" ,0,2,0,0,NULL,0,NULL}, {"hardcorejunkie" ,0,2,0,0,NULL,0,NULL}, {"hardcorejunky" ,0,2,0,0,NULL,0,NULL}, {"hardcoremovie" ,0,2,0,0,NULL,0,NULL}, {"hardcorepic" ,0,2,0,0,NULL,0,NULL}, {"hardcorepix" ,0,2,0,0,NULL,0,NULL}, {"hardcoresample" ,0,2,0,0,NULL,0,NULL}, {"hardcorestories" ,0,2,0,0,NULL,0,NULL}, {"hardcorethumb" ,0,2,0,0,NULL,0,NULL}, {"hardcorevideo" ,0,2,0,0,NULL,0,NULL}, {"harddick" ,0,2,0,0,NULL,0,NULL}, {"hardnipple" ,0,2,0,0,NULL,0,NULL}, {"hardon" ,0,2,0,0,NULL,0,NULL}, {"hentai" ,0,2,0,0,NULL,0,NULL}, {"interacialhardcore" ,0,2,0,0,NULL,0,NULL}, {"intercourseposition" ,0,2,0,0,NULL,0,NULL}, {"interracialhardcore" ,0,2,0,0,NULL,0,NULL}, {"ittybittytitty" ,0,2,0,0,NULL,0,NULL}, {"jackoff" ,0,2,0,0,NULL,0,NULL}, {"jennajameson" ,0,2,0,0,NULL,0,NULL}, {"jennicam" ,0,2,0,0,NULL,0,NULL}, {"jerkoff" ,0,2,0,0,NULL,0,NULL}, {"jism" ,0,2,0,0,NULL,0,NULL}, {"jiz" ,0,2,0,0,NULL,0,NULL}, {"justhardcore" ,0,2,0,0,NULL,0,NULL}, {"karasamateurs" ,0,2,0,0,NULL,0,NULL}, {"kascha" ,0,2,0,0,NULL,0,NULL}, {"kaylakleevage" ,0,2,0,0,NULL,0,NULL}, {"kobetai" ,0,2,0,0,NULL,0,NULL}, {"lapdance" ,0,2,0,0,NULL,0,NULL}, {"largedick" ,0,2,0,0,NULL,0,NULL}, {"largehooter" ,0,2,0,0,NULL,0,NULL}, {"largestbreast" ,0,2,0,0,NULL,0,NULL}, {"largetit" ,0,2,0,0,NULL,0,NULL}, {"lesben" ,0,2,0,0,NULL,0,NULL}, {"lesbo" ,0,2,0,0,NULL,0,NULL}, {"lickadick" ,0,2,0,0,NULL,0,NULL}, {"lindalovelace" ,0,2,0,0,NULL,0,NULL}, {"longdick" ,0,2,0,0,NULL,0,NULL}, {"lovedoll" ,0,2,0,0,NULL,0,NULL}, {"makinglove" ,0,2,0,0,NULL,0,NULL}, {"mangax" ,0,2,0,0,NULL,0,NULL}, {"manpic" ,0,2,0,0,NULL,0,NULL}, {"marilynchambers" ,0,2,0,0,NULL,0,NULL}, {"massivecock" ,0,2,0,0,NULL,0,NULL}, {"masterbating" ,0,2,0,0,NULL,0,NULL}, {"mensdick" ,0,2,0,0,NULL,0,NULL}, {"milf" ,0,2,0,0,NULL,0,NULL}, {"minka" ,0,2,0,0,NULL,0,NULL}, {"monstercock" ,0,2,0,0,NULL,0,NULL}, {"monsterdick" ,0,2,0,0,NULL,0,NULL}, {"muffdiving" ,0,2,0,0,NULL,0,NULL}, {"nacktfoto" ,0,2,0,0,NULL,0,NULL}, {"nakedblackwomen" ,0,2,0,0,NULL,0,NULL}, {"nakedceleb" ,0,2,0,0,NULL,0,NULL}, {"nakedcelebrity" ,0,2,0,0,NULL,0,NULL}, {"nakedcheerleader" ,0,2,0,0,NULL,0,NULL}, {"nakedchick" ,0,2,0,0,NULL,0,NULL}, {"nakedgirl" ,0,2,0,0,NULL,0,NULL}, {"nakedguy" ,0,2,0,0,NULL,0,NULL}, {"nakedladies" ,0,2,0,0,NULL,0,NULL}, {"nakedlady" ,0,2,0,0,NULL,0,NULL}, {"nakedman" ,0,2,0,0,NULL,0,NULL}, {"nakedmen" ,0,2,0,0,NULL,0,NULL}, {"nakedness" ,0,2,0,0,NULL,0,NULL}, {"nakedphoto" ,0,2,0,0,NULL,0,NULL}, {"nakedpic" ,0,2,0,0,NULL,0,NULL}, {"nakedstar" ,0,2,0,0,NULL,0,NULL}, {"nakedwife" ,0,2,0,0,NULL,0,NULL}, {"nakedwoman" ,0,2,0,0,NULL,0,NULL}, {"nakedwomen" ,0,2,0,0,NULL,0,NULL}, {"nastychat" ,0,2,0,0,NULL,0,NULL}, {"nastythumb" ,0,2,0,0,NULL,0,NULL}, {"naughtylink" ,0,2,0,0,NULL,0,NULL}, {"naughtylinx" 
,0,2,0,0,NULL,0,NULL}, {"naughtylynx" ,0,2,0,0,NULL,0,NULL}, {"naughtynurse" ,0,2,0,0,NULL,0,NULL}, {"niceass" ,0,2,0,0,NULL,0,NULL}, {"nikkinova" ,0,2,0,0,NULL,0,NULL}, {"nikkityler" ,0,2,0,0,NULL,0,NULL}, {"nylonfetish" ,0,2,0,0,NULL,0,NULL}, {"nympho" ,0,2,0,0,NULL,0,NULL}, {"openleg" ,0,2,0,0,NULL,0,NULL}, {"oral4free" ,0,2,0,0,NULL,0,NULL}, {"pantyhosefetish" ,0,2,0,0,NULL,0,NULL}, {"peepcam" ,0,2,0,0,NULL,0,NULL}, {"persiankitty" ,0,2,0,0,NULL,0,NULL}, {"perverted" ,0,2,0,0,NULL,0,NULL}, {"pimpserver" ,0,2,0,0,NULL,0,NULL}, {"pissing" ,0,2,0,0,NULL,0,NULL}, {"poontang" ,0,2,0,0,NULL,0,NULL}, {"privatex" ,0,2,0,0,NULL,0,NULL}, {"prono" ,0,2,0,0,NULL,0,NULL}, {"publicnudity" ,0,2,0,0,NULL,0,NULL}, {"puffynipple" ,0,2,0,0,NULL,0,NULL}, {"racqueldarrian" ,0,2,0,0,NULL,0,NULL}, //{"rape" ,0,2,0,0,NULL,0,NULL}, // dup! {"rawlink" ,0,2,0,0,NULL,0,NULL}, {"realhardcore" ,0,2,0,0,NULL,0,NULL}, {"rubberfetish" ,0,2,0,0,NULL,0,NULL}, {"seka" ,0,2,0,0,NULL,0,NULL}, {"sheboy" ,0,2,0,0,NULL,0,NULL}, {"showcam" ,0,2,0,0,NULL,0,NULL}, {"showercam" ,0,2,0,0,NULL,0,NULL}, {"smallbreast" ,0,2,0,0,NULL,0,NULL}, {"smalldick" ,0,2,0,0,NULL,0,NULL}, {"spycamadult" ,0,2,0,0,NULL,0,NULL}, {"strapon" ,0,2,0,0,NULL,0,NULL}, {"stripclub" ,0,2,0,0,NULL,0,NULL}, {"stripshow" ,0,2,0,0,NULL,0,NULL}, {"striptease" ,0,2,0,0,NULL,0,NULL}, {"strokeit" ,0,2,0,0,NULL,0,NULL}, {"strokeme" ,0,2,0,0,NULL,0,NULL}, {"suckdick" ,0,2,0,0,NULL,0,NULL}, {"sylviasaint" ,0,2,0,0,NULL,0,NULL}, {"teenhardcore" ,0,2,0,0,NULL,0,NULL}, {"teenie" ,0,2,0,0,NULL,0,NULL}, {"teenpic" ,0,2,0,0,NULL,0,NULL}, {"teensuck" ,0,2,0,0,NULL,0,NULL}, {"tgp" ,0,2,0,0,NULL,0,NULL}, {"threesome" ,0,2,0,0,NULL,0,NULL}, {"thumblord" ,0,2,0,0,NULL,0,NULL}, {"thumbzilla" ,0,2,0,0,NULL,0,NULL}, {"tiffanytowers" ,0,2,0,0,NULL,0,NULL}, {"tinytitties" ,0,2,0,0,NULL,0,NULL}, //{"tities" ,0,2,0,0,NULL,0,NULL}, // entities {"titman" ,0,2,0,0,NULL,0,NULL}, {"titsandass" ,0,2,0,0,NULL,0,NULL}, {"titties" ,0,2,0,0,NULL,0,NULL}, {"titts" ,0,2,0,0,NULL,0,NULL}, {"titty" ,0,2,0,0,NULL,0,NULL}, {"tokyotopless" ,0,2,0,0,NULL,0,NULL}, {"tommysbookmark" ,0,2,0,0,NULL,0,NULL}, {"toplesswomen" ,0,2,0,0,NULL,0,NULL}, {"trannies" ,0,2,0,0,NULL,0,NULL}, {"twinks" ,0,2,0,0,NULL,0,NULL}, {"ultradonkey" ,0,2,0,0,NULL,0,NULL}, {"ultrahardcore" ,0,2,0,0,NULL,0,NULL}, {"uncutcock" ,0,2,0,0,NULL,0,NULL}, {"vividtv" ,0,2,0,0,NULL,0,NULL}, {"wendywhoppers" ,0,2,0,0,NULL,0,NULL}, {"wetdick" ,0,2,0,0,NULL,0,NULL}, {"wetpanties" ,0,2,0,0,NULL,0,NULL}, {"wifesharing" ,0,2,0,0,NULL,0,NULL}, {"wifeswapping" ,0,2,0,0,NULL,0,NULL}, {"xrated" ,0,2,0,0,NULL,0,NULL} }; // . store this in clusterdb rec so family filter works! // . check content for adult words char *XmlDoc::getIsAdult ( ) { if ( m_isAdultValid ) return &m_isAdult2; // call that setStatus ("getting is adult bit"); int32_t **pici = getIndCatIds(); if ( ! pici || pici == (void *)-1 ) return (char *)pici; // check categories for ( int32_t i = 0 ; i < size_indCatIds / 4 ; i++ ) { int32_t ic = ptr_indCatIds[i]; // skip if not an adult category if ( ! g_categories->isIdAdult ( ic ) ) continue; // got it m_isAdult = true; m_isAdult2 = true; m_isAdultValid = true; return &m_isAdult2; } // . if any of the wiki docids we are in are adult.... then we are // . we set the top bit of wiki docids to indicate if adult //for ( int32_t i = 0 ; i < size_wikiDocIds / 8 ; i++ ) { // int64_t d = ptr_wikiDocIds[i]; // if ( ! 
	//	( d & 0x8000000000000000 ) ) continue;
	//	// got it
	//	m_isAdult = true;
	//	m_isAdultValid = true;
	//	return &m_isAdult;
	//}
	// need the content
	char **u8 = getUtf8Content();
	if ( ! u8 || u8 == (char **)-1) return (char *)u8;
	// time it
	int64_t start = gettimeofdayInMilliseconds();
	// score that up
	int32_t total = getDirtyPoints ( ptr_utf8Content,
					 size_utf8Content - 1 ,
					 m_niceness ,
					 m_firstUrl.m_url );
	// then the url
	//char *u = getFirstUrl()->getUrl();
	//total += getDirtyPoints ( u , gbstrlen(u) );
	// and redir url
	//char *r = getRedirUrl()->getUrl();
	//total += getDirtyPoints ( r , gbstrlen(r) );
	// debug msg
	int64_t took = gettimeofdayInMilliseconds() - start;
	if ( took > 10 )
		logf(LOG_DEBUG,
		     "build: Took %"INT64" ms to check doc of %"INT32" bytes for "
		     "dirty words.",took,size_utf8Content-1);
	m_isAdult = false;
	// adult?
	if ( total >= 2 ) m_isAdult = true;
	// set shadow member
	m_isAdult2 = (bool)m_isAdult;
	// validate
	m_isAdultValid = true;
	// note it
	if ( m_isAdult2 && g_conf.m_logDebugDirty )
		log("dirty: %s points = %"INT32"",m_firstUrl.m_url,total);
	// no dirty words found
	return &m_isAdult2;
}

int32_t getDirtyPoints ( char *s , int32_t slen , int32_t niceness , char *url ) {
	// . use the matches function to get all the matches
	// . then check each match to see if it is actually a legit word
	// . actually match the dirty words, then match the clean words
	//   then we can subtract counts.
	int32_t numDirty = sizeof(s_dirtyWords) / sizeof(Needle);
	getMatches2 ( s_dirtyWords ,
		      numDirty ,
		      s ,
		      slen ,
		      NULL , // linkPos
		      NULL , // needleNum
		      false , // stopAtFirstMatch?
		      NULL , // hadPreMatch ptr
		      true , // saveQuickTables?
		      niceness );
	int32_t points = 0;
	// each needle has an associated score
	for ( int32_t i = 0 ; i < numDirty ; i++ ) {
		// skip if no match
		if ( s_dirtyWords[i].m_count <= 0 ) continue;
		// . the "id", is positive for dirty words, - for clean
		// . uses +2/-2 for really dirty words
		// . uses +1/-1 for borderline dirty words
		points += s_dirtyWords[i].m_id;
		// log debug
		if ( ! g_conf.m_logDebugDirty ) continue;
		// show it in the log
		log("dirty: %s %"INT32" %s"
		    ,s_dirtyWords[i].m_string
		    ,(int32_t)s_dirtyWords[i].m_id
		    ,url );
	}
	////
	//
	// repeat for part2
	//
	// we have to do two separate parts otherwise the algo in
	// matches2.cpp gets really slow. it was not meant to match
	// so many needles in one haystack.
	//
	///
	int32_t numDirty2 = sizeof(s_dirtyWordsPart2) / sizeof(Needle);
	// . disable this for now. most of these are phrases and they
	//   will not be detected.
	// . TODO: hash the dirty words and phrases and just lookup
	//   words in that table like we do for isStopWord(), but use
	//   isDirtyWord(). Then replace the code in Speller.cpp
	//   with isDirtyUrl() which will split the string into words
	//   and call isDirtyWord() on each one. also use bi and tri grams
	//   in the hash table.
	numDirty2 = 0;
	getMatches2 ( s_dirtyWordsPart2 ,
		      numDirty2 ,
		      s ,
		      slen ,
		      NULL , // linkPos
		      NULL , // needleNum
		      false , // stopAtFirstMatch?
		      NULL , // hadPreMatch ptr
		      true , // saveQuickTables?
		      niceness );
	// each needle has an associated score
	for ( int32_t i = 0 ; i < numDirty2 ; i++ ) {
		// skip if no match
		if ( s_dirtyWordsPart2[i].m_count <= 0 ) continue;
		// . the "id", is positive for dirty words, - for clean
		// . uses +2/-2 for really dirty words
		// . uses +1/-1 for borderline dirty words
		points += s_dirtyWordsPart2[i].m_id;
		// log debug
		if ( !
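		// (note on scoring: each matched needle adds its m_id to
		//  "points". the negative-id EXCEPTIONS entries in the table
		//  cancel false positives, e.g. a hit on "sex" inside
		//  "sussex" nets out to zero. getIsAdult() above then flags
		//  the doc as adult once the total reaches 2 or more. this
		//  part-2 loop is currently a no-op since numDirty2 is
		//  forced to 0 above.)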
g_conf.m_logDebugDirty ) continue; // show it in the log log("dirty: %s %"INT32" %s" ,s_dirtyWordsPart2[i].m_string ,(int32_t)s_dirtyWordsPart2[i].m_id ,url ); } return points; } int32_t **XmlDoc::getIndCatIds ( ) { // if XmlDoc was set from a titleRec it should validate this if ( m_indCatIdsValid ) return &ptr_indCatIds; // otherwise, we must compute them! CatRec *cat = getCatRec (); // blocked or error? if ( ! cat || cat == (CatRec *)-1 ) return (int32_t **)cat; // set this ptr_indCatIds = cat->m_indCatids; size_indCatIds = cat->m_numIndCatids * 4; m_indCatIdsValid = true; // parse that up return &ptr_indCatIds; } int32_t **XmlDoc::getCatIds ( ) { // if XmlDoc was set from a titleRec it should validate this if ( m_catIdsValid ) return &ptr_catIds; // otherwise, we must compute them! CatRec *cat = getCatRec (); // blocked or error? if ( ! cat || cat == (CatRec *)-1 ) return (int32_t **)cat; // set this ptr_catIds = cat->m_catids; size_catIds = cat->m_numCatids * 4; m_catIdsValid = true; // parse that up return &ptr_catIds; } CatRec *XmlDoc::getCatRec ( ) { // return what we got if ( m_catRecValid ) return &m_catRec; // call that setStatus ("getting dmoz cat rec"); // callback? if ( m_calledMsg8b ) { // return NULL on error if ( g_errno ) return NULL; // otherwise, success m_catRecValid = true; return &m_catRec; } // consider it called m_calledMsg8b = true; // assume empty and skip the call for now m_catRec.reset(); m_catRecValid = true; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // let's bring dmoz back //return &m_catRec; // compute it otherwise if ( ! m_msg8b.getCatRec ( &m_firstUrl , cr->m_coll , gbstrlen(cr->m_coll) , true , // use canonical name? m_niceness , &m_catRec , // store here m_masterState , // state m_masterLoop )) // callback // return -1 if we blocked return (CatRec *)-1; // error? if ( g_errno ) return NULL; // we got it somehow without blocking... local cached lookup? m_catRecValid = true; return &m_catRec; } void gotWikiResultsWrapper ( void *state , UdpSlot *slot ) { XmlDoc *THIS = (XmlDoc *)state; THIS->gotWikiResults ( slot ); THIS->m_masterLoop ( THIS->m_masterState ); } // . get the wiki pages that this page matches // . use the docids of the wiki pages to represent them // . use an independent 32-node cluster to index all of wikipedia so it is all // in ram. do not need datedb, etc. // . get the gigabits for this page, up to 50 of them, and use that as a rat=0 // query on the wiki cluster // . score each wiki docid too, based on match // . normalize scores so they range from 10% to 100%, based on # of gigabits // that the wiki page matches // . index these as gbwiki: with the score given (8-bit) mapped // to 32 bits using score8to32() so the score itself is preserved // . WE CAN ALSO call this at QUERY TIME, using the actual query of the // searcher instead of the string of gigabits // . BUT i will probably just look at the wiki topics of the search results, // that will be faster and maybe more accurate... int64_t **XmlDoc::getWikiDocIds ( ) { if ( m_wikiDocIdsValid ) return (int64_t **)&ptr_wikiDocIds; setStatus ( "getting wiki docids" ); // . get our gigabit vector // . consists of array of 32-bit hashes // . followed by 1-1 array of 16-bit scores // . TODO: restrict gigabits to capitalized words and phrases, and // also to 2+ word wiki titles char *gq = getGigabitQuery ( ); if ( ! gq || gq == (char *)-1 ) return (int64_t **)gq; // empty? 
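	// note: the "|| 1 == 1" in the check below makes this an
	// unconditional early return, so the wiki lookup that follows
	// (building a /search?...&c=wiki&q=<gigabits> request and sending
	// it to the wiki proxy via msg 0xfd) is effectively disabled and
	// we always store an empty wiki docid list.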
then no wiki match i guess //logf(LOG_DEBUG,"FIX ME FIX ME - getWikiDocIds"); // MDW: for now bail here too! if ( ! gq[0] || 1 == 1 ) { ptr_wikiDocIds = m_wikiDocIds; ptr_wikiScores = m_wikiScores; size_wikiDocIds = 0; size_wikiScores = 0; m_wikiDocIdsValid = true; return (int64_t **)&ptr_wikiDocIds; } // set our query to these gigabits // re-enable this later //if ( ! m_calledMsg40 ) m_wq.set ( gq ); int32_t need = 200 + gbstrlen(gq); // make buf m_wikiqbuf = (char *)mmalloc ( need , "wikiqbuf"); // error? if ( ! m_wikiqbuf ) return NULL; // save size m_wikiqbufSize = need; // use large single tier for speed char *p = m_wikiqbuf; p += sprintf ( p , "GET /search?raw=9&n=%"INT32"&sc=0&dr=0&"//dio=1&" "t0=1000000&rat=0&" "c=wiki&q=%s", (int32_t)MAX_WIKI_DOCIDS, gq ); // terminate it *p++ = '\0'; // then put in the ip *(int32_t *)p = g_hostdb.m_myHost->m_ip; // skip over ip p += 4; // sanity check if ( p - m_wikiqbuf > need ) { char *xx=NULL;*xx=0; } int32_t ip = g_conf.m_wikiProxyIp; // if not given, make it gf1 for now if ( ! ip ) ip = atoip ( "10.5.62.11" , 10 ); int32_t port = g_conf.m_wikiProxyPort; // port default too to gf1 if ( ! port ) port = 9002; // send it using msg 0xfd to the wiki cluster's proxy if ( ! g_udpServer.sendRequest ( m_wikiqbuf , p - m_wikiqbuf , 0xfd , ip , port , -1 , // hostId NULL , // retSlot this , // state gotWikiResultsWrapper , 1000 ) ) // we had an error, g_errno should be set return NULL; // got without blocking? no way! return (int64_t **)-1; } void XmlDoc::gotWikiResults ( UdpSlot *slot ) { setStatus ( "got wiki docids" ); // do not free our request in slot slot->m_sendBufAlloc = NULL; // free request buf mfree ( m_wikiqbuf , m_wikiqbufSize , "wikiqbuf" ); // error getting the wiki results? if ( g_errno ) return; // TODO: normalize all scores with each other some how. i think // they are fairly absolute, but now sure with a lot of rat=0 terms! logf(LOG_DEBUG,"wiki: fix my scoring stuff. have a min score... " " and somehow normalize scores to be in [0,1.0]"); // . force this reply to be NULL terminated // . i can't fix in the code now because the reply is coming from // a different cluster running an older version of gb char *s = slot->m_readBuf; char *end = s + slot->m_readBufSize - 1; // overwrite the last '>', who cares! *end = '\0'; // make our xml Xml xml; if ( ! xml.set ( s , end - s , false , // ownData? 0 , false , TITLEREC_CURRENT_VERSION , false , // setParents? m_niceness , CT_HTML )) // return if g_errno got set return; // grab docids int32_t nd = 0; int32_t nn = xml.getNumNodes(); XmlNode *nodes = xml.getNodes(); float score = 0.0; int64_t docId = 0LL; for ( int32_t i = 0 ; i + 1 < nn ; i++ ) { if ( nodes[i].m_nodeId != 1 ) continue; // tagname is ? if ( nodes[i].m_tagNameLen == 5 && nodes[i].m_tagName[0] == 'd' && ! strncmp(nodes[i].m_tagName,"docId",5) ) docId = atoll ( nodes[i].m_tagName ); // is ? (after docid tag) if ( nodes[i].m_tagNameLen == 8 && nodes[i].m_tagName[0] == 'a' && ! 
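		// (this loop scans the reply's tag nodes for a <docId> tag
		//  followed by an <absScore> tag. note that atoll()/atof()
		//  are applied to m_tagName, the tag name itself, which
		//  presumably was meant to be the tag's text payload; since
		//  getWikiDocIds() above always returns early, this path is
		//  not reached in practice.)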
strncmp(nodes[i].m_tagName,"absScore",8) ) { score = atof ( nodes[i].m_tagName ); // add it m_wikiDocIds [ nd ] = docId; m_wikiScores [ nd ] = score; nd++; // do not overflow if ( nd >= MAX_WIKI_DOCIDS ) break; } } // point to them ptr_wikiDocIds = m_wikiDocIds; ptr_wikiScores = m_wikiScores; size_wikiDocIds = nd * 8; size_wikiScores = nd * sizeof(rscore_t); log ( LOG_DEBUG , "build: got %"INT32" wiki docids",nd); m_wikiDocIdsValid = true; } int32_t *XmlDoc::getPubDate ( ) { if ( m_pubDateValid ) return (int32_t *)&m_pubDate; // get date parse Dates *dp = getDates(); if ( ! dp || dp == (Dates *)-1 ) return (int32_t *)dp; // got it m_pubDateValid = true; m_pubDate = dp->getPubDate(); // print it once for page parser. we now do this in XmlDoc::print() //if ( m_pbuf ) m_dates.printPubDates ( m_pbuf ); // set m_ageInDays if ( m_pubDate == (uint32_t)-1 ) return (int32_t *)&m_pubDate; // for parsing date //int32_t currentTime = getTimeGlobal(); // this must be valid //if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; } int32_t spideredTime = getSpideredTime(); // get doc age //float age = currentTime - m_pubDate; float age = spideredTime - m_pubDate; // convert to days (could be negative if in the future) m_ageInDays = age / (3600*24.0); // fix it if negative if ( m_ageInDays < 0.0 ) m_ageInDays = 0.0; return (int32_t *)&m_pubDate; } Dates *XmlDoc::getDates ( ) { if ( m_datesValid ) return &m_dates; // skip for now m_datesValid = true; return &m_dates; // set status. we can time status changes with this routine! setStatus ( "getting dates"); Dates *dd = getSimpleDates(); // bail on error if ( ! dd ) { if ( ! g_errno ) { char *xx=NULL;*xx=0; } return NULL; } // need addresses Addresses *aa = getAddresses (); if ( ! aa || aa == (void *)-1 ) return (Dates *)aa; char *isRoot = getIsSiteRoot(); if ( ! isRoot || isRoot == (char *)-1 ) return (Dates *)isRoot; // . get root doc, from titlerec is ok ( TODO: make sure from titlerec) // . TODO: make sure to save in titledb too??? // . we need this now too // . now set DF_IN_ROOTDOC on dates that were in the same section but // in the root doc. // . if we are not the root, we use the root title rec to see if // the website repeats the store hours on every page. in that case // . TODO: a special cache just fo rholding "svt" for root pages. // should be highly efficient!!! //XmlDoc *rd = NULL; // setPart2() needs the implied sections set, so set them Sections *sections = getSections(); if ( !sections ||sections==(Sections *)-1) return(Dates *)sections; //SectionVotingTable *osvt = getOldSectionVotingTable(); //if ( ! osvt || osvt == (void *)-1 ) return (Dates *)osvt; // table should be empty if we are the root! //HashTableX *rvt = getRootVotingTable(); //if ( ! rvt || rvt == (void *)-1 ) return (Dates *)rvt; char *isRSS = getIsRSS(); if ( ! isRSS || isRSS == (void *)-1 ) return (Dates *)isRSS; uint8_t *ctype = getContentType(); if ( ! ctype || ctype == (void *)-1 ) return (Dates *)ctype; bool isXml = false; if ( *isRSS ) isXml = true; if ( *ctype == CT_XML ) isXml = true; int32_t minPubDate = -1; int32_t maxPubDate = -1; // parentPrevSpiderTime is 0 if that was the first time that the // parent was spidered, in which case isNewOutlink will always be set // for every outlink it had! if ( m_sreqValid && m_sreq.m_isNewOutlink && m_sreq.m_parentPrevSpiderTime ) { // pub date is somewhere between these two times minPubDate = m_sreq.m_parentPrevSpiderTime; maxPubDate = m_sreq.m_addedTime; } // now set part2 , returns false and sets g_errno on error if ( ! 
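	// (setPart2() below finishes what getSimpleDates()/setPart1()
	//  started: it gets the addresses, the isXml flag (RSS or CT_XML
	//  content), the site-root flag, and the [minPubDate,maxPubDate]
	//  window taken from the spider request when this url was a new
	//  outlink of its parent.)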
m_dates.setPart2 ( aa , minPubDate, maxPubDate,//osvt, isXml , *isRoot )) { if ( ! g_errno ) { char *xx=NULL;*xx=0; } // note it log("doc: dates2: %s",mstrerror(g_errno)); // this just means we ran out of stack space to parse // out all the dates, so ignore and continue... that way // Spider.cpp does not give up and keep retrying us over // and over again //if ( g_errno == EBUFOVERFLOW ) g_errno = 0; // on all other errors, return NULL if ( g_errno ) return NULL; } // debug EBADENGINEER error if ( g_errno ) { char *xx=NULL;*xx=0; } // overflow? does not set g_errno. at least clear all so we do not // get a messed up partial representation. //if ( m_dates.m_overflowed ) { // log("doc: date overflow for %s",m_firstUrl.m_url); // m_dates.reset(); //} // only call it once m_datesValid = true; // return it return &m_dates; } Dates *XmlDoc::getSimpleDates ( ) { if ( m_simpleDatesValid ) return &m_dates; // note that setStatus("get dates part 1"); // try the current url Url *u = getCurrentUrl(); // and ip int32_t *ip = getIp(); if ( ! ip || ip == (int32_t *)-1 ) return (Dates *)ip; // the docid int64_t *d = getDocId(); if ( ! d || d == (int64_t *)-1 ) return (Dates *)d; // the site hash int32_t *sh32 = getSiteHash32(); if ( ! sh32 || sh32 == (int32_t *)-1 ) return (Dates *)sh32; // words Words *words = getWords(); if ( ! words || words == (Words *)-1 ) return (Dates *)words; // we set the D_IS_IN_DATE flag for these bits Bits *bits = getBits(); if ( ! bits ) return NULL; // sections. is it ok that these do not include implied sections? Sections *sections = getExplicitSections(); if (!sections||sections==(Sections *)-1) return (Dates *)sections; // link info (this is what we had the problem with) LinkInfo *info1 = getLinkInfo1(); if ( ! info1 || info1 == (LinkInfo *)-1 ) return (Dates *)info1; //int32_t *sv = getPageSampleVector(); //if ( ! sv || sv == (int32_t *)-1 ) return (Dates *)sv; Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (Dates *)xml; // this must be valid, cuz Dates.cpp uses it! //if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0;} // . get the xml doc of the previously stored title rec // . Dates will compare the two docs to check for clocks, etc. XmlDoc **pod = getOldXmlDoc ( ); if ( ! pod || pod == (XmlDoc **)-1 ) return (Dates *)pod; Url **redir = getRedirUrl(); if ( ! redir || redir == (Url **)-1 ) return (Dates *)redir; //char *ru = NULL; //if ( *redir ) ru = (*redir)->getUrl(); // this should deserialize from its title rec data //Dates *odp = NULL; //if ( *pod ) odp = (*pod)->getDates (); // the key in this table is the date tagHash and occNum, and the // value is the timestamp of the date. this is used by the clock // detection algorithm to compare a date in the previous version // of this web page to see if it changed and is therefore a clock then. HashTableX *cct = NULL; if ( *pod ) cct = (*pod)->getClockCandidatesTable(); // this should be valid uint8_t ctype = *getContentType(); CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // this now returns false and sets g_errno on error, true on success if ( ! m_dates.setPart1 ( u , //->getUrl(), *redir, // ru , ctype , *ip , *d , *sh32 , xml , words , // set D_IS_IN_DATE flag so Address.cpp // can avoid such word in addresses! bits , sections , info1 , //sv , //odp , // old dates cct , this , // us *pod , // old XmlDoc cr->m_coll , m_niceness )) { // sanity check if ( ! 
g_errno ) { char *xx=NULL;*xx=0; } // note it log("doc: dates1: %s",mstrerror(g_errno)); // this just means we ran out of stack space to parse // out all the dates, so ignore and continue... that way // Spider.cpp does not give up and keep retrying us over // and over again //if ( g_errno == EBUFOVERFLOW ) g_errno = 0; // on all other errors, return NULL if ( g_errno ) return NULL; } // only call it once m_simpleDatesValid = true; // return it return &m_dates; } // returns NULL and sets g_errno on error, returns -1 if blocked HashTableX *XmlDoc::getClockCandidatesTable ( ) { // return if valid if ( m_clockCandidatesTableValid ) return &m_clockCandidatesTable; // otherwise, deserialize? if ( m_clockCandidatesDataValid ) { // and table is now valid m_clockCandidatesTableValid = true; // return empty table if ptr is NULL if (! ptr_clockCandidatesData ) return &m_clockCandidatesTable; // otherwise, deserialize m_clockCandidatesTable.deserialize ( ptr_clockCandidatesData , size_clockCandidatesData, m_niceness ); // and return that return &m_clockCandidatesTable; } // otherwise, get our dates Dates *dp = getDates(); if ( ! dp || dp == (Dates *)-1 ) return (HashTableX *)dp; // reset table just in case m_clockCandidatesTable.reset(); // if no dates, bail if ( dp->m_numDatePtrs == 0 ) { m_clockCandidatesTableValid = true; m_clockCandidatesDataValid = true; ptr_clockCandidatesData = NULL; size_clockCandidatesData = 0; return &m_clockCandidatesTable; } // and set size to 32 buckets to start if ( ! m_clockCandidatesTable.set (8,4,32,NULL,0,false,m_niceness, "clockcands") ) return NULL; // now stock the table for ( int32_t i = 0 ; i < dp->m_numDatePtrs ; i++ ) { // breathe QUICKPOLL ( m_niceness ); // get date Date *di = dp->m_datePtrs[i]; // skip if got nuked if ( ! di ) continue; // make the key int64_t key ; // lower 32 bits is taghash key = di->m_tagHash; // upper 32 bits is occNum key |= ((int64_t)(di->m_occNum)) << 32; // timestamp is the val int32_t val = di->m_timestamp; // then store it if ( ! m_clockCandidatesTable.addKey ( &key , &val ) ) return NULL; } // that is now valid m_clockCandidatesTableValid = true; // how many bytes to serialize? int32_t need = m_clockCandidatesTable.getStoredSize(); // now make the ptr valid if ( ! m_cctbuf.reserve ( need ) ) return NULL; // store it in there m_clockCandidatesTable.serialize ( &m_cctbuf ); // point to it ptr_clockCandidatesData = m_cctbuf.getBufStart(); size_clockCandidatesData = need; // that is valid now m_clockCandidatesDataValid = true; return &m_clockCandidatesTable; } // a date of -1 means not found or unknown int32_t XmlDoc::getUrlPubDate ( ) { if ( m_urlPubDateValid ) return m_urlPubDate; // need a first url. caller should have called setFirstUrl() if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; } // use Dates //Dates dp; // -1 means unknown m_urlPubDate = -1; //m_urlAge = -1; // try the FIRST url Url *u = getFirstUrl(); // get last url we redirected to Url **redir = getRedirUrl(); if ( ! redir || redir == (Url **)-1 ) {char *xx=NULL;*xx=0;} subloop: // . try to get the date just from the url // . 
this will be zero if none found m_urlPubDate = parseDateFromUrl ( u->getUrl() ); // we are kosher m_urlPubDateValid = true; // if we are unknown try last/redir url, if any if ( m_urlPubDate == 0 && *redir && u != *redir ) { u = *redir; goto subloop; } // if we got a valid pub date from the url, set "m_urlAge" if ( m_urlPubDate == 0 ) return m_urlPubDate; // note it log ( LOG_DEBUG, "date: Got url pub date: %"UINT32"", (uint32_t)m_urlPubDate ); // set the age //m_urlAge = getTimeGlobal() - m_urlPubDate; //if ( m_urlAge < 0 ) m_urlAge = 0; return m_urlPubDate; } // . use Dates to extract pub date from the url itself if pub date exists // . an age of "-1" means unknown /* int32_t XmlDoc::getOutlinkAge ( int32_t outlinkNum ) { // use Dates Dates dp; // sanity if ( outlinkNum < 0 ) { char *xx=NULL;*xx=0; } // get it char *us = m_links.getLinkPtr(outlinkNum); // for now set this, until we mod Dates to use normalized // string urls Url u; u.set ( us ); // try to get the date just from the url if ( ! dp.set ( &u , 0 , // ip 0LL , // m_newDocId 0 , // siteHash NULL , // Xml NULL , // Words NULL , // Bits NULL , // Sections NULL , // LinkInfo NULL , // pageSampleVec NULL , // old date parse2 NULL , // m_newDoc NULL , // m_oldDoc m_coll , 0 , // defaultTimeZone m_niceness )){ // should never block! char *xx=NULL; *xx= 0; } // this will be -1 if no date was found in the url int32_t urlPubDate = dp.getPubDate(); // if we got a valid pub date from the url, set "m_urlAge" if ( urlPubDate == -1 ) return -1; // note it //log ( LOG_DEBUG, "date: Got url pub date: %"UINT32"", m_urlDate ); // set the age int32_t age = getTimeGlobal() - urlPubDate; // keep positive if ( age < 0 ) age = 0; // return it return age; } */ // . sets g_errno on error and returns NULL // . now returns a ptr to it so we can return NULL to signify error, that way // all accessors have equivalent return values // . an acessor function returns (char *)-1 if it blocked! char *XmlDoc::getIsPermalink ( ) { if ( m_isPermalinkValid ) return &m_isPermalink2; Url *url = getCurrentUrl(); if ( ! url ) return NULL; char *isRSS = getIsRSS(); // return NULL with g_errno set, -1 if blocked if ( ! isRSS || isRSS == (char *)-1 ) return isRSS; Links *links = getLinks(); // return NULL with g_errno set, -1 if blocked if ( ! links || links == (Links *)-1 ) return (char *)links; uint8_t *ct = getContentType(); // return NULL with g_errno set, -1 if blocked if ( ! ct || ct == (uint8_t *)-1 ) return (char *)ct; // GUESS if it is a permalink by the format of the url int32_t p = ::isPermalink ( links , // Links ptr url , *ct , // CT_HTML default? NULL , // LinkInfo ptr *isRSS );// isRSS? m_isPermalink = p; m_isPermalink2 = p; m_isPermalinkValid = true; return &m_isPermalink2; } // guess based on the format of the url if this is a permalink char *XmlDoc::getIsUrlPermalinkFormat ( ) { if ( m_isUrlPermalinkFormatValid ) return &m_isUrlPermalinkFormat; setStatus ( "getting is url permalink format" ); Url *url = getCurrentUrl(); if ( ! url ) return NULL; // just guess if we are rss here since we most likely do not have // access to the url's content... bool isRSS = false; char *ext = url->getExtension(); if ( ext && strcasecmp(ext,"rss") == 0 ) isRSS = true; // GUESS if it is a permalink by the format of the url int32_t p = ::isPermalink ( NULL , // Links ptr url , CT_HTML , NULL , // LinkInfo ptr isRSS );// we guess this... 
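	// (unlike getIsPermalink() above, this variant only has the url
	//  format to go on: no Links or LinkInfo, the content type is
	//  assumed to be CT_HTML, and RSS is guessed from a ".rss"
	//  extension.)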
m_isUrlPermalinkFormat = p; m_isUrlPermalinkFormatValid = true; return &m_isUrlPermalinkFormat; } char *XmlDoc::getIsRSS ( ) { if ( m_isRSSValid ) return &m_isRSS2; // the xml tells us for sure Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (char *)xml; m_isRSS = xml->isRSSFeed(); m_isRSS2 = (bool)m_isRSS; m_isRSSValid = true; return &m_isRSS2; } // . this function should really be called getTagTokens() because it mostly // works on HTML documents, not XML, and just sets an array of ptrs to // the tags in the document, including ptrs to the text in between // tags. Xml *XmlDoc::getXml ( ) { // return it if it is set if ( m_xmlValid ) return &m_xml; // get the filtered content char **u8 = getUtf8Content(); if ( ! u8 || u8 == (char **)-1 ) return (Xml *)u8; int32_t u8len = size_utf8Content - 1; uint8_t *ct = getContentType(); if ( ! ct || ct == (void *)-1 ) return (Xml *)ct; // note it setStatus ( "getting xml"); // set it if ( ! m_xml.set ( *u8 , u8len , false , // ownData? 0 , // allocSize false , // pure xml? m_version , false , // setParentsArg? m_niceness , *ct ) ) // return NULL on error with g_errno set return NULL; // set just once m_xmlValid = true; // all done return &m_xml; } // Language support static stuff enum { METHOD_TAG = 0, METHOD_DMOZ, METHOD_URL, METHOD_OUTLINKS, METHOD_INLINKS, METHOD_FREQ, METHOD_DEFAULT, METHOD_IP, METHOD_ROOT, METHOD_CAP }; bool setLangVec ( Words *words , SafeBuf *langBuf , Sections *ss , int32_t niceness ) { int64_t *wids = words->getWordIds (); char **wptrs = words->m_words; int32_t nw = words->getNumWords (); // allocate if ( ! langBuf->reserve ( nw ) ) return false; uint8_t *langVector = (uint8_t *)langBuf->getBufStart(); // now set the langid for ( int32_t i = 0 ; i < nw ; i++ ) { // breathe QUICKPOLL ( niceness ); // default langVector[i] = langUnknown; // add the word if ( wids[i] == 0LL ) continue; // skip if number if ( is_digit(wptrs[i][0]) ) { langVector[i] = langTranslingual; continue; } // get the lang bits. does not include langTranslingual // or langUnknown int64_t bits = g_speller.getLangBits64 ( &wids[i] ); // skip if not unique char count = getNumBitsOn64 ( bits ) ; // if we only got one lang we could be, assume that if ( count == 1 ) { // get it. bit #0 is english, so add 1 char langId = getBitPosLL((uint8_t *)&bits) + 1; //langVector[i] = g_wiktionary.getLangId(&wids[i]); langVector[i] = langId; continue; } // ambiguous? set it to unknown then if ( count >= 2 ) { langVector[i] = langUnknown; continue; } // try setting based on script. greek. russian. etc. // if the word was not in the wiktionary. // this will be langUnknown if not definitive. langVector[i] = getCharacterLanguage(wptrs[i]); } // . now go sentence by sentence // . get the 64 bit vector for each word in the sentence // . then intersect them all // . if the result is a unique langid, assign that langid to // all words in the sentence // get first sentence in doc Section *si = NULL; if ( ss ) si = ss->m_firstSent; // scan the sentence sections and or in the bits we should for ( ; si ; si = si->m_nextSent ) { // breathe QUICKPOLL ( niceness ); // reset vec int64_t bits = LANG_BIT_MASK; // get lang 64 bit vec for each wid in sentence for ( int32_t j = si->m_senta ; j < si->m_sentb ; j++ ) { // breathe QUICKPOLL ( niceness ); // skip if not alnum word if ( ! wids[j] ) continue; // skip if starts with digit if ( is_digit(wptrs[j][0]) ) continue; // get 64 bit lang vec. 
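			// (each word contributes a 64-bit mask of the
			//  languages its wordid belongs to; ANDing the masks
			//  over the sentence leaves a single bit only when
			//  every word agrees on one language, and that langid
			//  is then assigned to the whole sentence below. the
			//  5-word sliding window pass further down catches
			//  short menu items the same way.)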
does not include // langUnknown or langTransligual bits bits &= g_speller.getLangBits64 ( &wids[j] ); } // bail if none if ( ! bits ) continue; // skip if more than one language in intersection if ( getNumBitsOn64(bits) != 1 ) continue; // get it. bit #0 is english, so add 1 char langId = getBitPosLL((uint8_t *)&bits) + 1; // ok, must be this language i guess for ( int32_t j = si->m_senta ; j < si->m_sentb ; j++ ) { // breathe QUICKPOLL ( niceness ); // skip if not alnum word if ( ! wids[j] ) continue; // skip if starts with digit if ( is_digit(wptrs[j][0]) ) continue; // set it langVector[j] = langId; } } // try the same thing but do not use sentences. use windows of // 5 words. this will pick up pages that have an english menu // where each menu item is an individual sentence and only // one word. // http://www.topicexchange.com/ int64_t window[5]; int32_t wpos[5]; memset ( window , 0 , 8*5 ); int32_t wp = 0; int32_t total = 0; // now set the langid for ( int32_t i = 0 ; i < nw ; i++ ) { // breathe QUICKPOLL ( niceness ); // must be alnum if ( ! wids[i] ) continue; // skip if starts with digit if ( is_digit(wptrs[i][0]) ) continue; // skip if lang already set to a language //if ( langVector[i] != langUnknown && // langVector[i] != langTranslingual ) // continue; // get last 5 window[wp] = g_speller.getLangBits64 ( &wids[i] ); // skip if not in dictionary! if ( window[wp] == 0 ) continue; // otherwise, store it wpos [wp] = i; if ( ++wp >= 5 ) wp = 0; // need at least 3 samples if ( ++total <= 2 ) continue; // intersect them all together int64_t bits = LANG_BIT_MASK; for ( int32_t j = 0 ; j < 5 ; j++ ) { // skip if uninitialized, like if we have 3 // or only 4 samples if ( ! window[j] ) continue; // otherwise, toss it in the intersection bits &= window[j]; } // skip if intersection empty if ( ! bits ) continue; // skip if more than one language in intersection if ( getNumBitsOn64(bits) != 1 ) continue; // get it. bit #0 is english, so add 1 char langId = getBitPosLL((uint8_t *)&bits) + 1; // set all in window to this language for ( int32_t j = 0 ; j < 5 ; j++ ) { // skip if unitialized if ( ! window[j] ) continue; // otherwise, set it langVector[wpos[j]] = langId; } } return true; } // 1-1 with the words! uint8_t *XmlDoc::getLangVector ( ) { if ( m_langVectorValid ) { // can't return NULL, that means error! uint8_t *v = (uint8_t *)m_langVec.getBufStart(); if ( ! v ) return (uint8_t *)0x01; return v; } // words Words *words = getWords(); if ( ! words || words == (Words *)-1 ) return (uint8_t *)words; // get the sections without implied sections Sections *ss = getImpliedSections(); if ( ! ss || ss==(void *)-1) return (uint8_t *)ss; if ( ! setLangVec ( words , &m_langVec , ss , m_niceness) ) return NULL; m_langVectorValid = true; // can't return NULL, that means error! uint8_t *v = (uint8_t *)m_langVec.getBufStart(); if ( ! v ) return (uint8_t *)0x01; return v; } // returns -1 and sets g_errno on error uint8_t *XmlDoc::getLangId ( ) { if ( m_langIdValid ) return &m_langId; setStatus ( "getting lang id"); // debu ghack //m_langId = langRussian; //m_langIdValid = true; //return &m_langId; // get the stuff we need int32_t *ip = getIp(); if ( ! ip || ip == (int32_t *)-1 ) return (uint8_t *)ip; // . if we got no ip, we can't get the page... // . also getLinks() will call getSiteNumInlinks() which will // call getSiteLinkInfo() and will core if ip is 0 or -1 if ( *ip == 0 || *ip == -1 ) { m_langId = langUnknown; m_langIdValid = true; return &m_langId; } //Xml *xml = getXml (); //if ( ! 
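	// (detection order in this function: build the per-word language
	//  vector from the body words and sections, take the majority vote
	//  in computeLangId(), and if that is still langUnknown fall back
	//  to the meta description and then the meta keywords.)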
xml || xml == (Xml *)-1 ) return (uint8_t *)xml; Words *words = getWords (); if ( ! words || words == (Words *)-1 ) return (uint8_t *)words; // do not get regular sections, getSections() which will call // getImpliedSections(), because then that will need to set addresses // and dates, etc. the addresses could return NULL with EBUFOVERFLOW // from a static buffer overflow causing us some problems here and // since that g_errno is only really handled well in getIndexCode() // it will log that CRITICAL CRITICAL message. and we really only // need the section sot avoid looking at script tag sections, etc. // when calling Words::getLanguage() Sections *sections = getExplicitSections(); // did it block? if ( sections==(Sections *)-1) return(uint8_t *)sections; // well, it still calls Dates::parseDates which can return g_errno // set to EBUFOVERFLOW... if ( ! sections && g_errno != EBUFOVERFLOW ) return NULL; // if sectinos is still NULL - try lang id without sections then, // reset g_errno g_errno = 0; //Links *links = getLinks(); //if ( ! links || links == (Links *)-1 ) return (uint8_t *)links; //LinkInfo *info1 = getLinkInfo1(); //if ( ! info1 || info1 == (LinkInfo *)-1 ) return (uint8_t *)info1; //CatRec *cat = getCatRec (); //if ( ! cat || cat == (CatRec *)-1) return (uint8_t *)cat; uint8_t *lv = getLangVector(); if ( ! lv || lv == (void *)-1 ) return (uint8_t *)lv; setStatus ( "getting lang id"); // compute langid from vector m_langId = computeLangId ( sections , words, (char *)lv ); if ( m_langId != langUnknown ) { m_langIdValid = true; return &m_langId; } // . try the meta description i guess // . 99% of the time we don't need this because the above code // captures the language int32_t mdlen; char *md = getMetaDescription( &mdlen ); Words mdw; mdw.setx ( md , mdlen , m_niceness ); SafeBuf langBuf; setLangVec ( &mdw,&langBuf,NULL,m_niceness); char *tmpLangVec = langBuf.getBufStart(); m_langId = computeLangId ( NULL , &mdw , tmpLangVec ); if ( m_langId != langUnknown ) { m_langIdValid = true; return &m_langId; } // try meta keywords md = getMetaKeywords( &mdlen ); mdw.setx ( md , mdlen , m_niceness ); langBuf.purge(); setLangVec ( &mdw,&langBuf,NULL,m_niceness); tmpLangVec = langBuf.getBufStart(); m_langId = computeLangId ( NULL , &mdw , tmpLangVec ); m_langIdValid = true; return &m_langId; } // lv = langVec char XmlDoc::computeLangId ( Sections *sections , Words *words, char *lv ) { Section **sp = NULL; if ( sections ) sp = sections->m_sectionPtrs; // this means null too if ( sections && sections->m_numSections == 0 ) sp = NULL; int32_t badFlags = SEC_SCRIPT|SEC_STYLE;//|SEC_SELECT; int32_t counts [ MAX_LANGUAGES ]; memset ( counts , 0 , MAX_LANGUAGES * 4); int32_t nw = words->getNumWords (); char **wptrs = words->m_words; int32_t *wlens = words->m_wordLens; // now set the langid for ( int32_t i = 0 ; i < nw ; i++ ) { // breathe QUICKPOLL(m_niceness); // skip if in script or style section if ( sp && (sp[i]->m_flags & badFlags) ) continue; // // skip if in a url // // blah/ if ( wptrs[i][wlens[i]] == '/' ) continue; // blah.blah or blah?blah if ( (wptrs[i][wlens[i]] == '.' || wptrs[i][wlens[i]] == '?' 
) && is_alnum_a(wptrs[i][wlens[i]+1]) ) continue; // /blah or ?blah if ( (i>0 && wptrs[i][-1] == '/') || (i>0 && wptrs[i][-1] == '?') ) continue; // add it up counts[(unsigned char)lv[i]]++; } // get the majority count int32_t max = 0; int32_t maxi = 0; // skip langUnknown by starting at 1, langEnglish for ( int32_t i = 1 ; i < MAX_LANGUAGES ; i++ ) { // skip translingual if ( i == langTranslingual ) continue; if ( counts[i] <= max ) continue; max = counts[i]; maxi = i; } return maxi; //m_langId = maxi; //m_langIdValid = true; //return &m_langId; /* int32_t freqScore = 0; int32_t lang; if ( ! m_processedLang ) { // do not repeat this call for this document m_processedLang = true; lang = words->getLanguage( sections , 1000 , // sampleSize , m_niceness, &freqScore); // return NULL on error with g_errno set if ( lang == -1 ) return NULL; // we got it from words, return if ( lang != 0 ) { m_langId = lang; m_langIdValid = true; return &m_langId; } } m_langId = 0; // try from charset uint16_t *charset = getCharset ( ); if ( ! charset || charset == (uint16_t *)-1 )return (uint8_t *)charset; // do based on charset if ( *charset == csGB18030 ) m_langId = langChineseTrad; if ( *charset == csGBK ) m_langId = langChineseSimp; if ( m_langId ) { m_langIdValid = true; return &m_langId; } // are we a root? char *isRoot = getIsSiteRoot(); if ( ! isRoot || isRoot == (char *)-1 ) return (uint8_t *)isRoot; // this lookup here might be unnecessary uint8_t *rl = NULL; if ( ! *isRoot ) { rl = getRootLangId(); if ( ! rl || rl == (void *)-1 ) return (uint8_t *)rl; } //Url *u = getCurrentUrl(); Url *u = getFirstUrl(); uint8_t gs[METHOD_CAP]; // reset language method vector memset( gs , 0, sizeof(uint8_t) * METHOD_CAP ); // Let the site tell us what language it's in gs [METHOD_TAG] = g_langId.guessLanguageFromTag( xml ); // Guess from the FIRST URL (unredirected url) gs [METHOD_URL] = g_langId.guessLanguageFromUrl( u->getUrl() ); // Guess from the outlinks gs [METHOD_OUTLINKS] = g_langId.guessLanguageFromOutlinks( links ); // Guess from the inlinks gs [METHOD_INLINKS] = g_langId.guessLanguageFromInlinks(info1, *ip); // root page's language, if there was one if ( ! *isRoot ) gs [METHOD_ROOT] = *rl; int32_t scores[MAX_LANGUAGES]; memset( scores, 0, sizeof(int32_t) * MAX_LANGUAGES ); // weights for the 10 methods char cw[] = { 8,9,4,7,6,7,8,1,2}; // add up weighted scores for(int i = 0; i < METHOD_CAP; i++ ) scores[gs[i]] += cw[i]; // reset the "lang" to langUnknown which is 0 lang = langUnknown ; int max, oldmax; max = oldmax = 0; // find best language for ( int32_t i = MAX_LANGUAGES - 1; i > 0 ; i-- ) { if ( scores[i] < max) continue; oldmax = max; max = scores[i]; lang = i; } // give up if not too conclusive if( (max - oldmax) < 3 ) { // cr->m_languageThreshold) { //log(LOG_DEBUG, "build: Language: Threshold, score " // "(%"INT32" - %"INT32") %"INT32" vs. %"INT32".\n", // (int32_t)max, // (int32_t)oldmax, // (int32_t)max - oldmax, // (int32_t)3);//(int32_t)cr->m_languageThreshold); lang = langUnknown; } // Make sure we're over the bailout value, this // keeps low scoring methods like TLD from being // the decider if it was the only successful method. if ( max < 5 ) { // cr->m_languageBailout ) { //log(LOG_DEBUG, "build: Language: Bailout, " // "score %"INT32" vs. %"INT32".", // (int32_t)max, (int32_t)5);//cr->m_languageBailout); lang = langUnknown; } // If the language is still not known, // use the language detected from the frames. //if(lang == langUnknown) lang = frameFoundLang; // . 
try dmoz if still unknown // . limit to 10 of them // all done, do not repeat m_langIdValid = true; m_langId = lang; m_langIdScore = max; return &m_langId; */ } Words *XmlDoc::getWords ( ) { // return it if it is set if ( m_wordsValid ) return &m_words; // this will set it if necessary Xml *xml = getXml(); // returns NULL on error, -1 if blocked if ( ! xml || xml == (Xml *)-1 ) return (Words *)xml; // note it setStatus ( "getting words"); // now set what we need if ( ! m_words.set ( xml , true , // computeWordIds? m_niceness )) return NULL; // we got it m_wordsValid = true; return &m_words; } Bits *XmlDoc::getBits ( ) { // return it if it is set if ( m_bitsValid ) return &m_bits; // this will set it if necessary Words *words = getWords(); // returns NULL on error, -1 if blocked if ( ! words || words == (Words *)-1 ) return (Bits *)words; // now set what we need if ( ! m_bits.set ( words , m_version , m_niceness ) ) return NULL; // we got it m_bitsValid = true; return &m_bits; } Bits *XmlDoc::getBitsForSummary ( ) { // return it if it is set if ( m_bits2Valid ) return &m_bits2; // this will set it if necessary Words *words = getWords(); // returns NULL on error, -1 if blocked if ( ! words || words == (Words *)-1 ) return (Bits *)words; // now set what we need if ( ! m_bits2.setForSummary ( words ) ) return NULL; // we got it m_bits2Valid = true; return &m_bits2; } Pos *XmlDoc::getPos ( ) { // return it if it is set if ( m_posValid ) return &m_pos; // this will set it if necessary Words *ww = getWords(); if ( ! ww || ww == (Words *)-1 ) return (Pos *)ww; //Sections *sections = getSections(); //if ( !sections ||sections==(Sections *)-1) return(Pos *)sections; // now set what we need //if ( ! m_pos.set ( ww , sections ) ) return NULL; if ( ! m_pos.set ( ww , NULL ) ) return NULL; // we got it m_posValid = true; return &m_pos; } Phrases *XmlDoc::getPhrases ( ) { // return it if it is set if ( m_phrasesValid ) return &m_phrases; // this will set it if necessary Words *words = getWords(); // returns NULL on error, -1 if blocked if ( ! words || words == (Words *)-1 ) return (Phrases *)words; // get this Bits *bits = getBits(); // bail on error if ( ! bits ) return NULL; // now set what we need if ( ! m_phrases.set ( words , bits , true , // use stop words false , // use stems m_version , m_niceness ) ) return NULL; // we got it m_phrasesValid = true; return &m_phrases; } /* Synonyms *XmlDoc::getSynonyms ( ) { // return if already set if ( m_synonymsValid ) return &m_synonyms; // this will set it if necessary Words *words = getWords(); if ( ! words || words == (Words *)-1 ) return (Synonyms *)words; Phrases *phrases = getPhrases (); if ( ! phrases || phrases == (void *)-1 ) return (Synonyms *)phrases; uint8_t *lv = getLangVector(); if ( ! lv || lv == (void *)-1 ) return (Synonyms *)lv; // primary language of the document uint8_t *langId = getLangId(); if ( ! langId || langId == (uint8_t *)-1 ) return (Synonyms *) langId; // . now set what we need // . provide a buf for which synonyms can be stored if we need to SafeBuf *synBuf = NULL; if ( m_pbuf || m_storeTermListInfo ) synBuf = &m_synBuf; // force on for printing out the synonyms in the loop below //synBuf = &m_synBuf; if ( ! 
m_synonyms.set ( words, (char *)lv, (char)*langId,phrases, m_niceness,synBuf) ) return NULL; // we got it m_synonymsValid = true; return &m_synonyms; } */ Sections *XmlDoc::getExplicitSections ( ) { // these sections might or might not have the implied sections in them if ( m_explicitSectionsValid ) return &m_sections; // if json forget this it is only html //uint8_t *ct = getContentType(); //if ( ! ct || ct == (void *)-1 ) return (Sections *)ct; //if ( *ct != CT_HTML && *ct != CT_TEXT && *ct != CT_XML ) { // m_sectionsValid = true; // return &m_sections; //} setStatus ( "getting explicit sections" ); // use the old title rec to make sure we parse consistently! XmlDoc **pod = getOldXmlDoc ( ); if ( ! pod || pod == (XmlDoc **)-1 ) return (Sections *)pod; // int16_tcut //XmlDoc *od = *pod; // if the serialized section is valid, use that //char *sd = NULL; //bool valid = false; //if ( od && od->m_sectionsReplyValid ) valid = true; //if ( valid ) sd = od->ptr_sectionsReply; // shouldn't we use the section data in ptr_sections for this??? //bool valid = m_sectionsReplyValid ; //char *sd = NULL; //if ( valid ) sd = ptr_sectionsReply; // this will set it if necessary Words *words = getWords(); // returns NULL on error, -1 if blocked if ( ! words || words == (Words *)-1 ) return (Sections *)words; // need these too now Phrases *phrases = getPhrases(); if ( ! phrases || phrases == (void *)-1 ) return (Sections *)phrases; // get this Bits *bits = getBits(); // bail on error if ( ! bits ) return NULL; // the site hash int64_t *sh64 = getSiteHash64(); // sanity check if ( ! sh64 && ! g_errno ) { char *xx=NULL; *xx=0; } if ( ! sh64 || sh64 == (void *)-1 ) return (Sections *)sh64; // the docid int64_t *d = getDocId(); if ( ! d || d == (int64_t *)-1 ) return (Sections *)d; // get the content type uint8_t *ct = getContentType(); if ( ! ct ) return NULL; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; setStatus ( "getting sections"); //char *sv = NULL; //if ( m_setFromTitleRec ) sv = ptr_sectionsVotes; // debug time to find a slow url int64_t start = gettimeofdayInMillisecondsLocal(); // this uses the sectionsReply to see which sections are "text", etc. // rather than compute it expensively if ( ! m_calledSections && // we get malformed sections error for some diffbot replies //*ct != CT_JSON && ! m_sections.set ( &m_words , &m_phrases , bits , getFirstUrl() , *d , *sh64 , // 64 bits cr->m_coll , m_niceness , m_masterState , // state m_masterLoop , // callback *ct , &m_dates , NULL , // sd // sections data true , // sections data valid? NULL , // sv // for m_nsvt //*tph , NULL , // buf 0 )) { // bufSize m_calledSections = true; // sanity check, this should not block, we are setting // exclusively from the titleRec //if ( sd ) { char *xx=NULL;*xx=0; } // it blocked, return -1 return (Sections *) -1; } int64_t end = gettimeofdayInMillisecondsLocal(); if ( end - start > 1000 ) log("build: %s section set took %"INT64" ms", m_firstUrl.m_url,end -start); // error? ETAGBREACH for example... or maybe ENOMEM if ( g_errno ) return NULL; // set inlink bits m_bits.setInLinkBits ( &m_sections ); // we got it m_explicitSectionsValid = true; return &m_sections; } Sections *XmlDoc::getImpliedSections ( ) { if ( m_impliedSectionsValid ) return &m_sections; // get the sections without implied sections Sections *sections = getExplicitSections(); if ( ! sections || sections==(void *)-1) return (Sections *)sections; // just use that for now if not doing events to save time! 
because // adding implied sections really sucks the resources. m_impliedSectionsValid = true; return &m_sections; // this will set it if necessary Words *words = getWords(); // returns NULL on error, -1 if blocked if ( ! words || words == (Words *)-1 ) return (Sections *)words; // get this Bits *bits = getBits(); // bail on error if ( ! bits ) return NULL; // get the content type uint8_t *ct = getContentType(); if ( ! ct ) return NULL; if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; } // now we need basic date types to add implied sections that // have a dow/dom header and tod brother sections // THIS WAS in getExplicitSections() but now m_wids is NULL. // m_wids is set in setPart1() called by XmlDoc::getSimpleDates(), // which calls getExplicitSections(). // . This was called for the benefit of Sections::addImpliedSections() // but now getAddresses() which we call below ends up calling // getSimpleDates() which calls m_dates.setPart1() which calls // m_dates.parseDates() so this is no longer needed i guess. /* if ( ! m_dates.parseDates ( words , DF_FROM_BODY , bits, sections, m_niceness , &m_firstUrl , *ct )) { // sanity check if ( ! g_errno ) { char *xx=NULL;*xx=0; } // note it log("doc: dates3: %s",mstrerror(g_errno)); // this just means we ran out of stack space to parse // out all the dates, so ignore and continue... that way // Spider.cpp does not give up and keep retrying us over // and over again //if ( g_errno == EBUFOVERFLOW ) g_errno = 0; // on all other errors, return NULL if ( g_errno ) return NULL; } */ // if we got no sections it was bad html. so don't go any further // lest we core in other code.. // it might have also just been an empty doc. // either way we'll core in getAddresses cuz it calls getSimpleDates // which will core in Dates::setPart1() trying to use m_sectionPtrs if ( sections->m_numSections == 0 ) { m_impliedSectionsValid = true; // hack to avoid core for empty docs like www.mini-polis.com sections->m_addedImpliedSections = true; return &m_sections; } // . now set addresses so we can use those to add implied sections // . this calls getSimpleDates() which calles m_dates.setPart1() // which calls parseDates again Addresses *aa = getAddresses (); if ( ! aa || aa == (void *)-1 ) return (Sections *)aa; // . now add implied sections // . return NULL with g_errno set on error if ( ! m_sections.addImpliedSections ( aa ) ) return NULL; // we got it m_impliedSectionsValid = true; return &m_sections; } // add in Section::m_sentFlags bits having to do with our voting tables Sections *XmlDoc::getSections ( ) { // get the sections without implied sections Sections *ss = getImpliedSections(); if ( ! ss || ss==(void *)-1) return (Sections *)ss; // hash the turk votes (each vote maps a contenthash or taghash to // a value) and use these to set sections sentence flags, etc. //HashTableX *tvt = getTurkVotingTable (); //if ( ! tvt || tvt == (void *)-1 ) return (Sections *)tvt; // returns NULL if our url is root! //HashTableX *rvt = getRootVotingTable(); //if ( ! rvt || rvt == (void *)-1 ) return (Sections *)rvt; SectionVotingTable *osvt = getOldSectionVotingTable(); if ( ! osvt || osvt == (void *)-1 ) return (Sections *)osvt; uint32_t *tph = getTagPairHash32(); if ( ! tph || tph == (uint32_t *)-1 ) return (Sections *)tph; // need a getUseSectiondb() function... if ( ! 
m_useSectiondb ) { m_sectionsValid = true; return &m_sections; } // start here Section *si; /* // get first sentence in doc si = ss->m_firstSent; // do not bother scanning if no votes if ( osvt->getNumVotes() <= 0 ) si = NULL; // scan the sentence sections and or in the bits we should for ( ; si ; si = si->m_nextSent ) { // breathe QUICKPOLL(m_niceness); // combine section tagHash with contentHashAll to get // the "modified tagHash" int32_t modified = si->m_tagHash ^ si->m_contentHash; // save this float dups = osvt->getNumSampled (modified,SV_TAGCONTENTHASH); // . getNumSampled() combines both m_nsvt and m_osvt so it // includes ourselves... NO!... let's change this! // the osvt should not include votes from us! // it strips those outin SectionVotingTable::addListOfVotes() // . if it is a print-friendly version of the same page then // one of the two should have been deduped and not indexed, // so be strict with adhering to no more than 1! if ( dups > 0 ) si->m_flags |= SEC_DUP; // . content hash must be unique! // . can detect texty bios repeated throughout the site // . this is the hash of the words directly in the section // . HACK: the contentHash is the "tagHash" for this call // . SectionVote::m_numSampled is how many sections over all // docs we indexed from this site have this m_contentHash // . note that it is not restricted to pages with the same // tagPairHash as us (i.e. pages with similar layouts) // therefore it is very flexible!!! it is only restricted // to pages with our same site hash. // . getNumSampled() combines both m_nsvt and m_osvt so it // includes ourselves // . if it is a print-friendly version of the same page then // one of the two should have been deduped and not indexed, // so be strict with adhering to no more than 1! if ( dups > 0 ) continue; // . must be in a unique section // . if the section has siblings, skip it! if ( si->m_numOccurences > 1 ) continue; // . eliminate dynamic menus // . like "related posts" menus // . therefore require that we must be "texty" ... // . i.e. be like 80% plain text and no more than 20% link text // . vote on this since in some cases article may be mostly // just all in anchor text on a few article pages, but on // other pages it is well-behaved if ( osvt->getScore ( si->m_tagHash, SV_TEXTY) < .80 ) continue; // . check for comment sections // . these are text and the content is unique // . BUT the section tagHash is typically repeated at least // once on some other pages (HOPEFULLY!!!!) // . if we only require there be X other pages from this site // with the same layout, we might get unlucky in that each // page has 1 or less comments!!! how to fix??? // . anyway, we ask for the max # sampled from all of the votes // here because if just one page has 2+ copies of this // section enum tag hash, that is enough to be a comment // section // . SV_TEXTY_MAX_SAMPLED is a statistic compiled from the // voters and does not actually exist in sectiondb per se. // we add this statistic transparently in addVote() below // . it just gets the num sampled from the voter that had the // maximum m_numSampled value, because we don't want an // average in this case if ( osvt->getNumSampled(si->m_tagHash,SV_TEXTY_MAX_SAMPLED)>0) continue; // set it si->m_flags |= SEC_ARTICLE; // tally it up //m_numAlnumWordsInArticle += si->m_exclusive; // and another flag //m_hadArticle = true; } */ // // . how many other pages from this site have our tagpairhash? // . that is all the unique adjacent tag pair hashes xor'd together // . 
kind of represents the template of the webpage, ideally // //int32_t numSimLayouts = osvt->getNumSampled ( *tph , SV_TAGPAIRHASH ); /////////////////////////////////////// // // set m_dupVotes and m_notDupVotes for each section // // answers the question... out of all the pages with this taghash, // from this site, how often is this content repeated? // // trumba.com often repeats an event on its various feeds, but // not on EVERY page. so we should adjust the event title penalties // based on the ratio of repeated to not-repeated from the various // pages on the site that have the same *taghash* // /////////////////////////////////////// // get first sentence in doc si = ss->m_firstSent; // do not bother scanning if no votes if ( osvt->getNumVotes() <= 0 ) si = NULL; // assume no dups m_maxVotesForDup = 0; // scan the sentence sections and or in the bits we should for ( ; si ; si = si->m_nextSent ) { // breathe QUICKPOLL ( m_niceness ); // sanity check if ( ! si->m_sentenceContentHash64 ) { char *xx=NULL;*xx=0; } // how many pages from this site have this taghash for // a sentence float nt; nt = osvt->getNumSampled(si->m_turkTagHash32,SV_TURKTAGHASH); // skip if nobody! (except us) if ( nt <= 0.0 ) continue; // . get out tag content hash // . for some reason m_contentHash is 0 for like menu-y sectns int32_t modified =si->m_turkTagHash32^si->m_sentenceContentHash64; // . now how many pages also had same content in that tag? // . TODO: make sure numsampled only counts a docid once! // and this is not each time it occurs on that page. float nsam = osvt->getNumSampled(modified,SV_TAGCONTENTHASH); // cast it to a int32_t int32_t votes1 = (int32_t)nsam; // by default, complement int32_t votes2 = (int32_t)nt - votes1; // store votes si->m_votesForDup = votes1; si->m_votesForNotDup = votes2; // what's the most dup votes we had... if ( votes1 > m_maxVotesForDup ) m_maxVotesForDup = votes1; // set it //if ( si->m_votesForDup > 2 * si->m_votesForNotDup && // si->m_votesForDup >= 1 && // ! (si->m_flags & SEC_HAS_NONFUZZYDATE) ) // si->m_sentFlags |= SENT_DUP_SECTION; } m_sectionsValid = true; return &m_sections; } SectionVotingTable *XmlDoc::getNewSectionVotingTable ( ) { if ( m_nsvtValid ) return &m_nsvt; // need sections Sections *ss = getSections(); if ( ! ss || ss==(Sections *)-1 ) return (SectionVotingTable *)ss; // and dates Dates *dp = getDates(); if ( ! dp || dp == (Dates *)-1 ) return (SectionVotingTable *)dp; // hash of all adjacent tag pairs uint32_t *tph = getTagPairHash32 ( ) ; if ( ! tph || tph == (uint32_t *)-1 ) return (SectionVotingTable *)tph; // are we a site root url? //char *isRoot = getIsSiteRoot(); //if ( ! isRoot || isRoot == (char *)-1 ) // return (SectionVotingTable *)isRoot; // init table if ( ! m_nsvt.init ( 4096,"nsvt",m_niceness) ) return NULL; // . tally the section votes from the sections class // . only add the date votes, not the taghash/contenthash keys // from the root, since we add those from the root voting table // into m_osvt directly! // . we no longer have root voting table! // . this adds keys of the hash of each tag xpath // . and it adds keys of the hash of each tag path PLUS its innerhtml if ( ! ss->addVotes ( &m_nsvt , *tph ) ) return NULL; // tally the section votes from the dates if ( ! dp->addVotes ( &m_nsvt ) ) return NULL; // our new section voting table is now valid, and ready to be added // to sectiondb by calling SectionVotingTable::hash() m_nsvtValid = true; return &m_nsvt; } // . 
scan every section and look up its tag and content hashes in // sectiondb to find out how many pages and sites have the same hash // . use the secondary sectiondb key, key2 // . then store the stats in the Sections::m_stats class Sections *XmlDoc::getSectionsWithDupStats ( ) { Sections *ss = getSections(); if ( !ss ||ss==(Sections *)-1) return(Sections *)ss; if ( m_gotDupStats ) return ss; int32_t *sh32 = getSiteHash32(); if ( ! sh32 || sh32 == (int32_t *)-1 ) return (Sections *)sh32; uint32_t siteHash32 = (uint32_t)*sh32; //int64_t *shp64 = getSiteHash64(); //if ( ! shp64 || shp64 == (void *)-1 ) return (Sections *)shp64; //int64_t siteHash48 = *shp64 & 0x0000ffffffffffffLL; // first time called? then init m_nextSection. //Section *si = m_si; // if this is -1, we are called for the first time if ( m_si == (void *)-1 ) { m_si = ss->m_rootSection; m_mcastRequestsIn = 0; m_mcastRequestsOut = 0; m_secStatsErrno = 0; } //sec_t menuFlags = SEC_MENU | SEC_MENU_SENTENCE | SEC_MENU_HEADER ; for ( ; m_si ; m_si = m_si->m_next ) { // breathe QUICKPOLL(m_niceness); // don't bother with the section if it doesn't have this set // because this eliminates parent dupage to reduce amount // of gbxpathsitehash123456 terms we index. if ( ! ( m_si->m_flags & SEC_HASHXPATH ) ) continue; // skip if sentence, only hash tags now i guess for diffbot //if ( m_si->m_sentenceContentHash64 ) // continue; // get hash of sentences this tag contains indirectly uint32_t val32 = (uint32_t)m_si->m_indirectSentHash64; if ( ! val32 ) continue; // skip if menu! //if ( m_si->m_flags & menuFlags ) continue; // get section xpath hash combined with sitehash uint32_t secHash32 = m_si->m_turkTagHash32 ^ siteHash32; // convert this to 32 bits uint32_t innerHash32 ; //sentHash32 = (uint32_t)m_si->m_sentenceContentHash64; innerHash32 = (uint32_t)m_si->m_indirectSentHash64; // save in case we need to read more than 5MB //m_lastSection = si; // . does a gbfacets:gbxpathsitehashxxxxxx query on secHash32 // . we hack the "sentContentHash32" into each posdb key // as the "value" so we can do a facet-like histogram // over all the possible values this xpath has for this site SectionStats *stats = getSectionStats ( secHash32, innerHash32, false ); // cache only? // it returns -1 if would block if ( stats == (void *)-1 ) { // count it as outstanding //m_mcastRequestsOut++; // launch more if we have room // UdpServer.cpp has a limit of 10 on 0x39 requests if ( m_mcastRequestsOut - m_mcastRequestsIn < 10) continue; // advance m_si so we do not repeat m_si = m_si->m_next; // otherwise, return -1 to indicate blocked return (Sections *)-1; } // NULL means g_errno if ( ! stats ) { // ensure g_errno is set if ( ! g_errno ) { char *xx=NULL;*xx=0; } // save it m_secStatsErrno = g_errno; // clear it g_errno = 0; // if still waiting though return -1 if ( m_mcastRequestsOut > m_mcastRequestsIn ) return (Sections *)-1; // otherwise, all done i guess return NULL; } // if already in the table, skip it! } // waiting for more replies to come back? if ( m_mcastRequestsOut > m_mcastRequestsIn ) return (Sections *) -1; // now scan the sections and copy the stats from the table // into Section::m_stats of each sentence section. // use the key hash as the the hash of the tag/xpath and the innerhtml // and the val instead of being site hash will be hash of the // content. then we can get the histogram of our content hash // for this xpath on our site. 
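	// . (illustrative sketch, not part of this code path: the histogram
	//   described above is conceptually just a map from a 32-bit
	//   innerHTML hash to how many docs on the site carried that value
	//   for this xpath. the struct and names below are hypothetical;
	//   only the arithmetic mirrors the SectionStats fields copied in
	//   the loop that follows.)
	/*
	#include <cstdint>
	#include <unordered_map>
	struct XPathHistogram {
		// facet value (innerHTML hash) -> # of docs with that value
		std::unordered_map<uint32_t,int32_t> counts;
		// how many docs had OUR innerHTML hash (m_totalMatches)
		int32_t matchesFor ( uint32_t myVal32 ) const {
			std::unordered_map<uint32_t,int32_t>::const_iterator
				it = counts.find ( myVal32 );
			return ( it == counts.end() ) ? 0 : it->second;
		}
		// total # of entries for this xpath (m_totalEntries)
		int32_t totalEntries ( ) const {
			int32_t sum = 0;
			for ( std::unordered_map<uint32_t,int32_t>::
				      const_iterator it = counts.begin() ;
			      it != counts.end() ; ++it )
				sum += it->second;
			return sum;
		}
		// # of distinct innerHTML hashes seen (m_numUniqueVals)
		int32_t numUniqueVals ( ) const {
			return (int32_t)counts.size();
		}
	};
	*/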
Section *si = ss->m_rootSection; for ( ; si ; si = si->m_next ) { // breathe QUICKPOLL(m_niceness); // skip if no content to hash //if ( ! si->m_sentenceContentHash64 ) continue; // don't bother with the section if it doesn't have this set // because this eliminates parent dupage to reduce amount // of gbxpathsitehash123456 terms we index if ( ! ( si->m_flags & SEC_HASHXPATH ) ) continue; // skip if sentence, only hash tags now i guess for diffbot //if ( si->m_sentenceContentHash64 ) // continue; // get hash of sentences this tag contains indirectly uint32_t val32 = (uint32_t)si->m_indirectSentHash64; if ( ! val32 ) continue; // skip if menu! //if ( si->m_flags & menuFlags ) continue; // get section xpath hash combined with sitehash uint32_t secHash32 = si->m_turkTagHash32 ^ siteHash32; // convert this to 32 bits uint32_t innerHash32 ; innerHash32 = (uint32_t)si->m_indirectSentHash64; // the "stats" class should be in the table from // the lookups above!! SectionStats *stats = getSectionStats ( secHash32, innerHash32, true ); // cache only? // sanity //if ( ! stats || stats == (void *)-1 ) { char *xx=NULL;*xx=0;} // must have had a network error or something if ( ! stats ) continue; // copy gbmemcpy ( &si->m_stats , stats, sizeof(SectionStats) ); } // // now if a section has no stats but has the same // m_indirectSentHash64 as a kid, take his stats // Section *sx = ss->m_rootSection; for ( ; sx ; sx = sx->m_next ) { // breathe QUICKPOLL(m_niceness); // don't bother with the section if it doesn't have this set // because this eliminates parent dupage to reduce amount // of gbxpathsitehash123456 terms we index if ( ! ( sx->m_flags & SEC_HASHXPATH ) ) continue; // scan up parents and set their stats to ours as int32_t as // they have the same indirect sent hash64 Section *p = sx->m_parent; for ( ; p ; p = p->m_parent ) { // if parent is like an img tag, skip it if ( p->m_tagId == TAG_IMG ) continue; if ( p ->m_indirectSentHash64 != sx->m_indirectSentHash64 ) break; // copy it to parent with the same inner html hash gbmemcpy (&p->m_stats,&sx->m_stats,sizeof(SectionStats)); } } // now free the table's mem m_sectionStatsTable.reset(); m_gotDupStats = true; return ss; } static void gotReplyWrapper39 ( void *state1 , void *state2 ) { //XmlDoc *THIS = (XmlDoc *)state; XmlDoc *THIS = (XmlDoc *)state1; Multicast *mcast = (Multicast *)state2; THIS->gotSectionFacets ( mcast ); // this will end up calling getSectionsWithDupStats() again // which will call getSectionStats() some more on new sections // until m_gotDupStats is set to true. THIS->m_masterLoop ( THIS->m_masterState ); } // . launch a single msg3a::getDocIds() for a section hash, secHash32 SectionStats *XmlDoc::getSectionStats ( uint32_t secHash32 , uint32_t innerHash32 , bool cacheOnly ) { // init cache? if ( m_sectionStatsTable.m_numSlots == 0 && ! m_sectionStatsTable.set(4, sizeof(SectionStats), 32, NULL, 0, false, m_niceness, "secstatsch")) return NULL; // check in cache... SectionStats *stats ; stats = (SectionStats *)m_sectionStatsTable.getValue ( &secHash32 ); // if there, return it if ( stats ) return stats; // if cache only do not launch if ( cacheOnly ) return NULL; // // TODO: shard gbxpathsitehashxxxxx by termid // and make sure msg3a only sends to that single shard and sends // the stats back. should make us much faster to sectionize // a web page. but for now try without it... // //int32_t *sh32 = getSiteHash32(); //if ( ! sh32 || sh32 == (int32_t *)-1 ) return (SectionStats *)sh32; int32_t maxOut = 32; // . 
need to make new msg39Request and a new Multicast arrays // . only need multicast since these gbfacetstr:gbxpathsitehash123456 // terms are sharded by termid, otherwise we'd have to use msg3a if ( ! m_mcastArray ) { // how much mem to alloc? int32_t need = 0; need += sizeof(Multicast); need += sizeof(Msg39Request); // query buf str need += 100; need *= maxOut; // a single query now to be shared //need += sizeof(Query); // just in case we are being re-used m_mcastBuf.reset(); // alloc space if ( ! m_mcastBuf.reserve(need) ) return NULL; // point to buf char *p = m_mcastBuf.getBufStart(); // set them up m_mcastArray = (Multicast *)p; p += sizeof(Multicast) * maxOut; m_msg39RequestArray = (Msg39Request *)p; p += sizeof(Msg39Request) * maxOut; //m_queryArray = (Query *)p; //p += sizeof(Query) * maxOut; //m_sharedQuery = (Query *)p; //p += sizeof(Query); // for holding the query string // assume query will not exceed 100 bytes incuding \0 m_queryBuf = p; p += 100 * maxOut; // initialize all! for ( int32_t i = 0 ; i < maxOut ; i++ ) { m_mcastArray [i].constructor(); m_msg39RequestArray[i].reset();//constructor(); //m_queryArray [i].constructor(); m_queryBuf[100*i] = '\0'; //m_inUse[i] = 0; } } // get first available int32_t i; for ( i = 0 ; i < maxOut ; i++ ) if ( ! m_mcastArray[i].m_inUse ) break; // wtf? if ( i >= maxOut ) { char *xx=NULL;*xx=0; } // and our vehicle Multicast *mcast = &m_mcastArray[i]; // mark as in use up here in case we quickpoll into this same code?! // yeah, i guess set2() calls quickpoll? //mcast->m_inUse = 1; // save this for reply //mcast->m_hack = this; char *qbuf = m_queryBuf + 100 * i; // . hash this special term (was gbsectionhash) // . the wordbits etc will be a number though, the hash of the content // of the xpath, the inner html hash // . preceeding this term with gbfacet: will make gigablast return // the statistics for all the values in the posdb keys of this // termlist, which happen to be innerHTML hashes for all pages // with this same xpath and on this same site. sprintf(qbuf,"gbfacetstr:gbxpathsitehash%"UINT32"", (uint32_t)secHash32); CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // set the msg39 request Msg39Request *r = &m_msg39RequestArray[i]; // reset all to defaults r->reset(); //r-> ptr_coll = cr->m_coll; //r->size_coll = gbstrlen(cr->m_coll)+1; r->m_collnum = cr->m_collnum; r->m_maxAge = 60; // cache timeout? r->m_addToCache = true; r->m_docsToGet = 0; // just calc stats r->m_niceness = m_niceness; r->m_debug = 0; r->m_doSiteClustering = false; //r->m_doIpClustering = false; r->m_doDupContentRemoval = false; r->m_boolFlag = 2; r->m_familyFilter = 0; r->m_language = 0; r->ptr_query = qbuf;//m_sectionHashQueryBuf; r->size_query = gbstrlen(r->ptr_query)+1; r->m_timeout = 3600; //-1;// auto-determine based on #terms r->m_maxQueryTerms = 10; // how much of each termlist to read in bytes int32_t readList = 10000; r-> ptr_readSizes = (char *)&readList; r->size_readSizes = 4; // term freqs float tfw = 1.0; r-> ptr_termFreqWeights = (char *)&tfw; r->size_termFreqWeights = 4; // speed it up some with this flag r->m_forSectionStats = true; // only do a single read of docids... do not split up r->m_numDocIdSplits = 1; // 1 query term r->m_nqt = 1; /////////////////////// // // this tells msg3a/msg39/posdbtable its a hack! no need to do this // because it's implied by the query. // BUT REALLY let's eliminate this and just make our queries like // gbfacet:gbxpathsitehash1234567 where 1234567 is the hash of // the section's xpath with the site. 
the values of that term in // the posdb key will be 32-bit hashes of the innerHtml for such // sections from all pages with the same xpath on the same site. // so no need for this now, comment out. // //r->m_getFacetStats = true; // ///////////////////////// // we need to know what site is the base site so the section stats // can set m_onSiteDocIds and m_offSiteDocIds correctly //r->m_siteHash32 = *sh32; // . now we use the hash of the innerHtml of the xpath // . this is our value for the facet field of gbxpathsitehash12345678 // which is the hash of the innerHTML for that xpath on this site. // 12345678 is the hash of the xpath and the site. //r->m_myFacetVal32 = sentHash32; //Query *qq = &m_queryArray[i]; // set query for msg3a. queryExpansion=false //qq->set2 ( r->ptr_query , langUnknown , false ); Query qq; qq.set2 ( r->ptr_query , langUnknown , false ); // TODO: ensure this just hits the one host since it is sharded // by termid... // what shard owns this termlist. we shard these // gbfacetstr:gbxpathsitehash123456 terms by termid. int64_t termId = qq.getTermId(0); int32_t shardNum = getShardNumFromTermId ( termId ); // hack in our inner html content hash for this xpath mcast->m_hack32 = innerHash32; mcast->m_hack64 = secHash32; // malloc and store the request. mcast will free it when done. int32_t reqSize; char *req = serializeMsg ( sizeof(Msg39Request), &r->size_readSizes, &r->size_whiteList, &r->ptr_readSizes, r, &reqSize, NULL, 0, false); // . send out a msg39 request to each shard // . multicasts to a host in group "groupId" // . we always block waiting for the reply with a multicast // . returns false and sets g_errno on error // . sends the request to fastest host in group "groupId" // . if that host takes more than about 5 secs then sends to // next host // . key should be largest termId in group we're sending to bool status; status = mcast->send ( req , // m_rbufPtr , reqSize , // request size 0x39 , // msgType 0x39 true , // mcast owns m_request? shardNum , // group to send to false , // send to whole group? 0,//(int32_t)qh , // 0 // startKey.n1 this , // state1 data mcast , // state2 data gotReplyWrapper39 , 30 , //timeout in secs m_niceness,//m_r->m_niceness , false , // realtime? -1, // firstHostId, // -1// bestHandlingHostId , NULL , // m_replyBuf , 0 , // MSG39REPLYSIZE, // this is true if multicast should free the // reply, otherwise caller is responsible // for freeing it after calling // getBestReply(). // actually, this should always be false, // there is a bug in Multicast.cpp. // no, if we error out and never steal // the buffers then they will go unfreed // so they are freed by multicast by default // then we steal control explicitly true ); m_mcastRequestsOut++; // if successfully launch, wait... if ( status ) return (SectionStats *) -1; // error? if ( g_errno ) return NULL;//{ mcast->m_inUse = 0; return NULL; } // sets &m_sectionStats and adds to the table gotSectionFacets ( mcast ); // i guess did not block... //return &msg3a->m_sectionStats; return &m_sectionStats; } // . come here when msg39 got the ptr_faceHashList for our single // gbfacet:gbxpathsitehash // . returns false and sets g_errno on error bool XmlDoc::gotSectionFacets ( Multicast *mcast ) { //SectionStats *stats = &msg39->m_sectionStats; if ( mcast->m_inUse ) { char *xx=NULL;*xx=0;} // count it as returned m_mcastRequestsIn++; // mark it as available now int32_t num = mcast - m_mcastArray; // sanity //if ( ! 
msg39->m_inUse ) { char *xx=NULL;*xx=0; } // grab the xpath/site hash uint32_t secHash32 = mcast->m_hack64; // and our innher html for that xpath int32_t myFacetVal32 = mcast->m_hack32; // sanity. should only be a gbfacet:gbxpathsitehash12345567 term. //if ( mcast->m_q->m_numTerms != 1 ) { char *xx=NULL;*xx=0; } // reset all counts to 0 m_sectionStats.reset(); ////// // // compile m_sectionStats // /////// // set m_sectionStats from the list of facet values for this // gbfacet:xpathsitehash term... // Query::m_queryTerm.m_facetHashTable has the facets merged // from all the shards. so now compute the stats from them. // set the section stats. //QueryTerm *qt = &msg3a->m_q->m_qterms[0]; //HashTableX *ft = &qt->m_facetHashTable; // . get the list of facet field/value pairs. // . see how Msg3a.cpp merges these to see how they are stored Msg39Reply *mr = (Msg39Reply *)mcast->m_readBuf;//getBestReply(); // this is NULL with g_errno set on error if ( ! mr ) { log("xmldoc: got error from sec stats mcast: %s", mstrerror(g_errno)); return false; } deserializeMsg ( sizeof(Msg39Reply) , &mr->size_docIds, &mr->size_clusterRecs, &mr->ptr_docIds, mr->m_buf ); char *p = (char *)(mr->ptr_facetHashList); //char *pfinal = p + mr->size_facetHashList; // // should only be one termid of facets in here, so no need to re-loop // int32_t nh = 0; // "matches" is how many docids with this facet field had our facet val int32_t matches = 0; // "totalDocIds" is how many docids had this facet field int32_t totalFields = 0; if ( p ) { // first is the termid //int64_t termId = *(int64_t *)p; // skip that p += 8; // the # of unique 32-bit facet values nh = *(int32_t *)p; p += 4; // the end point char *pend = p + (8 * nh); // now compile the facet hash list into there for ( ; p < pend ; ) { // does this facet value match ours? // (i.e. same inner html?) if ( *(int32_t *)p == myFacetVal32 ) matches += *(int32_t *)(p+4); p += 4; // now how many docids had this facet value? totalFields += *(int32_t *)p; p += 4; } } // how many unique inner html content hashes for this xpath/site // hash were there? m_sectionStats.m_numUniqueVals = nh;//ft->m_numSlotsUsed; // how many xpaths existsed over all docs. doc can have multiple. m_sectionStats.m_totalEntries = totalFields; // total # unique docids that had this facet m_sectionStats.m_totalDocIds = mr->m_estimatedHits;//totalHits; // how many had the same inner html content hash for // this xpath/site as we did? m_sectionStats.m_totalMatches = matches; //////// // // store m_sectionStats in cache // //////// // cache them. this does a copy of m_sectionStats if ( ! m_sectionStatsTable.addKey ( &secHash32 , &m_sectionStats ) ) log("xmldoc: failed to add sections stats: %s", mstrerror(g_errno)); // reset that msg39 to free its data //msg39->reset(); if ( mcast != &m_mcastArray[num] ) { char *xx=NULL;*xx=0; } // . make it available again // . do this after all in case we were in quickpoll interruptting // the getSectionStats() function below //mcast->m_inUse = 0; // free query Query::m_qwords array etc. to stop mem leaks m_mcastArray [num].reset(); m_msg39RequestArray[num].reset(); //m_queryArray [num].reset(); // now when the master loop calls getSectionsWithDupStats() it // should find the stats class in the cache! return true; } // . for all urls from this subdomain... // . EXCEPT root url since we use msg17 to cache that, etc. 
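// . (a rough sketch of the ptr_sectiondbData layout that the
//   m_setFromTitleRec branch below reads back: a 4-byte voter count
//   followed by the serialized vote table. the variable names in this
//   snippet are only for illustration; the real buffer is built at the
//   bottom of this function with SafeBuf::pushLong() and
//   HashTableX::serialize().)
/*
	// [ int32_t totalSiteVoters ][ serialized HashTableX of votes ]
	char    *p          = ptr_sectiondbData;
	int32_t  nVoters    = *(int32_t *)p;            // first 4 bytes
	char    *tableData  = p + 4;                    // the vote table
	int32_t  tableBytes = size_sectiondbData - 4;   // bytes remaining
*/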
SectionVotingTable *XmlDoc::getOldSectionVotingTable ( ) { if ( m_osvtValid ) return &m_osvt; // do not consult sectiondb if we are set from the title rec, // that way we avoid parsining inconsistencies since sectiondb changes! if ( m_setFromTitleRec ) { char *p = ptr_sectiondbData; m_osvtValid = true; m_osvt.m_totalSiteVoters = 0; if ( size_sectiondbData <= 4 ) return &m_osvt; m_osvt.m_totalSiteVoters = *(int32_t *)p; p += 4; int32_t remaining = size_sectiondbData - 4; m_osvt.m_svt.deserialize(p,remaining,m_niceness); return &m_osvt; } // returns empty table if WE are the site root url! //HashTableX *rvt = getRootVotingTable(); //if ( ! rvt || rvt == (void *)-1 ) return (Sections *)rvt; // need sections //Sections *ss = getSections(); //if ( ! ss || ss==(Sections *)-1 ) return (SectionVotingTable *)ss; // hash of all adjacent tag pairs uint32_t *tph = getTagPairHash32 ( ) ; if ( ! tph || tph == (uint32_t *)-1 ) return (SectionVotingTable *)tph; int64_t *siteHash64 = getSiteHash64(); if ( ! siteHash64 || siteHash64 == (void *)-1 ) return (SectionVotingTable *)siteHash64; // the docid int64_t *d = getDocId(); if ( ! d || d == (int64_t *)-1 ) return (SectionVotingTable *)d; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // . for us, dates are really containers of the flags and tag hash // . init this up here, it is re-set if we re-call getSectiondbList() // because there were too many records in it to handle in one read if ( m_numSectiondbReads == 0 ) { // init table if ( ! m_osvt.init ( 8192,"osvt",m_niceness) ) return NULL; // use site hash as the main thing int64_t termId = *siteHash64 & TERMID_MASK; // . start key for reading list from sectiondb // . read all the section votes for this site m_sectiondbStartKey = g_datedb.makeStartKey(termId,0xffffffff); // how many reads we have to do... m_numSectiondbNeeds = 1; } //bool skipRecall = false; // always read 5MB at a time from sectiondb int32_t minRecSizes = 5000000; // crap! host #28 is being totall slammed!!!!! // why?????? in the meantime do this //minRecSizes = 100000; //skipRecall = true; // is it facebook? bool limitSectiondb = false; // limit now to speed up repair rebuild // limit now to speed up injection! limitSectiondb = true; // facebook lists often clog the tree, and when we read 2MB worth of // it, it takes 100ms, so reduce to 50k to so it takes 2.5ms... // because facebook is a well structured xml feed so why read any // really! 
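	// (the chunked-read pattern used below, sketched standalone and
	//  only for illustration: read up to minRecSizes bytes, and if the
	//  list came back essentially full assume it was truncated, advance
	//  the start key just past the last key returned and read again.
	//  readChunk() and tallyVotes() are hypothetical stand-ins for
	//  Msg0::getList() and m_osvt.addListOfVotes() below.)
	/*
	key128_t startKey = m_sectiondbStartKey;
	for ( ;; ) {
		RdbList  list;
		key128_t lastKey = startKey;
		readChunk  ( startKey , endKey , minRecSizes ,
			     &list , &lastKey );
		tallyVotes ( &list );
		// a short list means the key range is exhausted
		if ( list.m_listSize + 24 < minRecSizes ) break;
		// resume just past the last key we already have
		startKey  = lastKey;
		startKey += 2;
	}
	*/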
if ( limitSectiondb ) minRecSizes = 50000; key128_t *lastKey = NULL; // if msg0 blocked and came back with g_errno set, like // in preparing to merge it got an OOM if ( g_errno ) { log("build: sectiondb read2: %s",mstrerror(g_errno)); return NULL; } readLoop: // before looking up TitleRecs using Msg20, let's first consult // datedb to see if we got adequate data as to what sections // are the article sections // only get the list once if ( m_numSectiondbReads < m_numSectiondbNeeds ) { // only do this once m_numSectiondbReads++; // make the termid uint64_t termId = *siteHash64 & TERMID_MASK; // end key is always the same key128_t end = g_datedb.makeEndKey ( termId , 0 ); // int16_tcut Msg0 *m = &m_msg0; // get the group this list is in (split = false) uint32_t shardNum; shardNum = getShardNum ( RDB_SECTIONDB,(char *)&m_sectiondbStartKey); // we need a group # from the groupId //int32_t split = g_hostdb.getGroupNum ( gid ); // note it //logf(LOG_DEBUG,"sections: " // "reading list from sectiondb: " // "sk.n1=0x%"XINT64" sk.n0=0x%"XINT64" " // "ek.n1=0x%"XINT64" ek.n0=0x%"XINT64" " // ,m_sectiondbStartKey.n1 // ,m_sectiondbStartKey.n0 // ,end.n1 // ,end.n0 // ); // . get the list // . gets all votes for one particular site if ( ! m->getList ( -1 , // hostId 0 , // ip 0 , // port 0 , // maxCacheAge false , // addToCache RDB_SECTIONDB , // was RDB_DATEDB cr->m_collnum , &m_secdbList , (char *)&m_sectiondbStartKey , (char *)&end , minRecSizes , m_masterState , m_masterLoop , m_niceness , // MAX_NICENESS // default parms follow true , // doErrorCorrection? true , // includeTree? true , // doMerge? -1 , // firstHostId 0 , // startFileNum -1 , // numFiles 30 , // timeout -1 , // syncPoint -1 , // preferLocalReads NULL , // msg5 NULL , // msg5b false , // isrealmerge? true , // allowpagecache? false , // forceLocalIndexdb? false , // doIndexdbSplit? shardNum ) )//split )) // return -1 if blocks return (SectionVotingTable *)-1; // error? if ( g_errno ) { log("build: sectiondb read: %s",mstrerror(g_errno)); return NULL; } } // it also returns the lastKey in the list so we can use that to // set the startKey for a re-call if we read >= 5MB lastKey = NULL; //logf(LOG_DEBUG,"sections: read list of %"INT32" bytes", // m_secdbList.m_listSize); bool recall = true; if ( m_secdbList.m_listSize + 24 < minRecSizes ) recall = false; // . unless it had special byte set in Msg0.cpp HACK // . we send back a compressed list and tack on an extra 0 byte at // the end so that we know we had a full list! if ( (m_secdbList.m_listSize % 2) == 1 ) { m_secdbList.m_listSize--; m_secdbList.m_listEnd --; recall = true; } // no longer bother re-calling, because facebook is way slow... if ( limitSectiondb ) recall = false; // . returns false and sets g_errno on error // . compile the votes from sectiondb for this site into a hashtable // . m_osvt is a SectionVotingTable and each entry in the hashtable // is a SectionVote class. // . the taghash is the key of the vote and is a hash of all the // nested tags the section is in. // . another vote uses the tag hash hashed with the hash of the // content contained by the section // . using these two vote counts we set Section::m_votesForDup // or Section::m_votesForNotDup counts which let us know how the // section is repeated or not repeated on the site // . SectionVote::m_score is always 1.0 from what i can tell // cuz it seems like addVote*() always uses a score of 1.0 // . SectionVote::m_numSampled is how many times that tagHash // occurs in the document. if ( ! 
m_osvt.addListOfVotes(&m_secdbList, &lastKey, *tph, *d , // docid m_niceness)) return NULL; // why is this always zero it seems? if ( g_conf.m_logDebugBuild ) log("xmldoc: added sectiondblist size=%"INT32" recall=%"INT32"", m_secdbList.m_listSize,(int32_t)recall); // . recall? yes if we had to truncate our list... // . we need to be able to scan all votes for the website... that is // why we recall here // . limit votes by a special sectiondb key then that is a vote... if ( recall ) { // another debug //logf(LOG_DEBUG,"sections: recallling read"); // just note it for now //if ( m_sectiondbRecall > 5 ) if ( m_numSectiondbNeeds > 5 ) logf(LOG_DEBUG,"sect: msg0 sectiondb recall #%"INT32"", m_sectiondbRecall++); // we should really limit voting per site! we do now! //if ( m_recall > 5 ) { char *xx=NULL;*xx=0; } // update our start key if ( lastKey ) m_sectiondbStartKey = *lastKey; // inc by 2 since we already had this key m_sectiondbStartKey += 2; // unflag m_numSectiondbNeeds++; // and repeat goto readLoop; } // // set ptr_sectiondbData so this can be set from a title rec without // having to lookup in sectiondb again which might have changed! // m_sectiondbData.purge(); // alloc int32_t need = m_osvt.m_svt.getStoredSize() + 4; if ( ! m_sectiondbData.reserve(need) ) // oom error? return NULL; // serialize this number m_sectiondbData.pushLong(m_osvt.m_totalSiteVoters); // serialize the hashtablex m_osvt.m_svt.serialize ( &m_sectiondbData ); // reference it for title rec serialization ptr_sectiondbData = m_sectiondbData.getBufStart(); size_sectiondbData = m_sectiondbData.length(); m_osvtValid = true; return &m_osvt; } int32_t *XmlDoc::getLinkSiteHashes ( ) { if ( m_linkSiteHashesValid ) return (int32_t *)m_linkSiteHashBuf.getBufStart(); // get the outlinks Links *links = getLinks(); if ( ! links || links == (Links *)-1 ) return (int32_t *)links; // . get the outlink tag rec vector // . each link's tagrec may have a "site" tag that is basically // the cached SiteGetter::getSite() computation TagRec ***grv = NULL; if ( ! m_setFromTitleRec ) { grv = getOutlinkTagRecVector(); if ( ! grv || grv == (void *)-1 ) return (int32_t *)grv; } // how many outlinks do we have on this page? int32_t n = links->getNumLinks(); // reserve space m_linkSiteHashBuf.purge(); if ( ! m_linkSiteHashBuf.reserve ( n * 4 ) ) return NULL; if ( n == 0 ) { ptr_linkdbData = NULL; size_linkdbData = 0; return (int32_t *)0x1234; } // if set from titlerec then assume each site is the full hostname // of the link, unless its specified explicitly in the hashtablex // serialized in ptr_linkdbData if ( m_setFromTitleRec ) { // this holds the sites that are not just the hostname int32_t *p = (int32_t *)ptr_linkdbData; int32_t *pend = (int32_t *)(ptr_linkdbData + size_linkdbData); // loop over links for ( int32_t i = 0 ; i < n ; i++ ) { // breathe QUICKPOLL ( m_niceness ); // get the link char *u = links->getLinkPtr(i); // assume site is just the host int32_t hostLen = 0; char *host = ::getHost ( u , &hostLen ); int32_t siteHash32 = hash32 ( host , hostLen , 0 ); // unless give as otherwise if ( p < pend && *p == i ) { p++; siteHash32 = *p; p++; } // store that then. should not fail since we allocated // right above if ( ! 
m_linkSiteHashBuf.pushLong(siteHash32) ) { char *xx=NULL;*xx=0; } } // return ptr of array, which is a safebuf return (int32_t *)m_linkSiteHashBuf.getBufStart(); } // ptr_linkdbData will point into this buf m_linkdbDataBuf.purge(); // loop through them for ( int32_t i = 0 ; i < n ; i++ ) { // breathe QUICKPOLL ( m_niceness ); // get the link char *u = links->getLinkPtr(i); // get full host from link int32_t hostLen = 0; char *host = ::getHost ( u , &hostLen ); int32_t hostHash32 = hash32 ( host , hostLen , 0 ); // get the site TagRec *gr = (*grv)[i]; char *site = NULL; int32_t siteLen = 0; if ( gr ) { int32_t dataSize = 0; site = gr->getString("site",NULL,&dataSize); if ( dataSize ) siteLen = dataSize - 1; } // otherwise, make it the host or make it cut off at // a "/user/" or "/~xxxx" or whatever path component if ( ! site ) { // GUESS link site... like /~xxx site = host; siteLen = hostLen; } int32_t linkeeSiteHash32 = hash32 ( site , siteLen , 0 ); // only store if different form host itself if ( linkeeSiteHash32 != hostHash32 ) { if ( ! m_linkdbDataBuf.pushLong(i) ) return NULL; if ( ! m_linkdbDataBuf.pushLong(linkeeSiteHash32) ) return NULL; } // store it always in this buf if ( ! m_linkSiteHashBuf.pushLong(linkeeSiteHash32) ) { // space should have been reserved above! char *xx=NULL;*xx=0; } } // set ptr_linkdbData ptr_linkdbData = m_linkdbDataBuf.getBufStart(); size_linkdbData = m_linkdbDataBuf.length(); m_linkSiteHashesValid = true; return (int32_t *)m_linkSiteHashBuf.getBufStart(); } Links *XmlDoc::getLinks ( bool doQuickSet ) { if ( m_linksValid ) return &m_links; // set status setStatus ( "getting outlinks"); // this will set it if necessary Xml *xml = getXml(); // bail on error if ( ! xml || xml == (Xml *)-1 ) return (Links *)xml; // can't call getIsPermalink() here without entering a dependency loop char *pp = getIsUrlPermalinkFormat(); if ( !pp || pp == (char *)-1 ) return (Links *)pp; // use the old xml doc XmlDoc **od = getOldXmlDoc ( ); if ( ! od || od == (XmlDoc **)-1 ) return (Links *)od; // get Links class of the old title rec Links *oldLinks = NULL; // if we were set from a title rec, do not do this if ( *od ) { oldLinks = (*od)->getLinks(); if (!oldLinks||oldLinks==(Links *)-1) return (Links *)oldLinks; } Url *baseUrl = getBaseUrl(); if ( ! baseUrl || baseUrl==(Url *)-1) return (Links *)baseUrl; int32_t *ip = getIp(); if ( ! ip || ip == (int32_t *)-1 ) return (Links *)ip; // this ensures m_contentLen is set //char **content = getContent(); //if ( ! content || content == (char **)-1 ) return (Links *)content; // this will set ptr_indCatIds and size_indCatIds int32_t **pici = getIndCatIds(); if ( ! pici || pici == (void *)-1 ) return (Links *)pici; char *ict = getIsContentTruncated(); if ( ! ict || ict == (char *)-1 ) return (Links *)ict; int32_t *sni = getSiteNumInlinks(); if ( ! sni || sni == (int32_t *)-1 ) return (Links *)sni; // get the latest url we are on Url *u = getCurrentUrl(); // // if we had a EDOCSIMPLIFIEDREDIR error, pretend it is a link // so addOutlinkSpiderRecsToMetaList() will add it to spiderdb // if ( m_indexCodeValid && m_indexCode == EDOCSIMPLIFIEDREDIR ) { m_links.set ( m_redirUrl.getUrl(),m_redirUrl.getUrlLen() ); m_linksValid = true; return &m_links; } if ( m_indexCodeValid && m_indexCode == EDOCNONCANONICAL ) { m_links.set(m_canonicalRedirUrl.getUrl(), m_canonicalRedirUrl.getUrlLen()); m_linksValid = true; return &m_links; } // . set it // . 
if parent is a permalink we can avoid its suburl outlinks // containing "comment" from being classified as permalinks if ( ! m_links.set ( true , // useRelNoFollow? xml , u , true , // setLinkHashes? baseUrl , m_version , m_niceness , *pp , // parent url in permalink format? oldLinks ,// oldLinks, might be NULL! doQuickSet )) return NULL; m_linksValid = true; // do not bother setting that bit if we are being called for link // text because that bit was already in the linkdb key, and it // was set to zero! so if getting msg20 reply.... bail now if ( m_req ) return &m_links; // . apply link spam settings // . set the "spam bits" in the Links class setLinkSpam ( *ip , ptr_indCatIds , size_indCatIds / 4 , u , // linker url *sni , xml , &m_links , *ict , m_niceness ); // we got it return &m_links; } HashTableX *XmlDoc::getCountTable ( ) { // return it if we got it if ( m_countTableValid ) return &m_countTable; setStatus ("getting count table"); // get the stuff we need Xml *xml = getXml (); if ( ! xml || xml == (Xml *)-1 ) return (HashTableX *)xml; Words *words = getWords (); if ( ! words || words == (Words *)-1 ) return (HashTableX *)words; Phrases *phrases = getPhrases (); if ( ! phrases || phrases==(Phrases *)-1) return (HashTableX *)phrases; Bits *bits = getBits (); if ( ! bits || bits == (Bits *)-1 ) return (HashTableX *)bits; Sections *sections = getSections(); if ( !sections||sections==(Sections *)-1) return(HashTableX *)sections; LinkInfo *info1 = getLinkInfo1(); if ( ! info1 || info1 == (LinkInfo *)-1 ) return (HashTableX *)info1; // . reduce score of words in badly repeated fragments to 0 so we do // not count them here! // . ff[i] will have score of 0 if in repeated frag // . make sure this is stored for whole doc... since we only use it // for the body char *fv = getFragVec(); if ( ! fv || fv == (void *)-1 ) return (HashTableX *)fv; //LinkInfo *info2 = getLinkInfo2(); //if ( ! info2 || info2 == (LinkInfo *)-1 ) return (HashTableX *)info2; // init our count table otherwise //if(! m_countTable.set( 8,4,1024,NULL,0,false,m_niceness,"xmlcnttbl")) // return NULL; // breathe QUICKPOLL ( m_niceness ); // // this was in Weights.cpp, but now it is here... // // int16_tcut HashTableX *ct = &m_countTable; // reset the counts, just in case set() below does not //ct->reset(); // ez var int64_t *wids = words->getWordIds (); nodeid_t *tids = words->getTagIds (); int32_t nw = words->getNumWords (); char **wptrs = words->m_words; int32_t *wlens = words->m_wordLens; int64_t *pids = phrases->getPhraseIds2(); // add 5000 slots for inlink text in hashString_ct() calls below int32_t numSlots = nw * 3 + 5000; // only alloc for this one if not provided if (!ct->set(8,4,numSlots,NULL,0,false,m_niceness,"xmlct")) return (HashTableX *)NULL; //char *ff = getFragVec ( ) ; //if ( ! ff ) return false; // . now hash all the phrase ids we have in order to see if the phrase // is unique or not. if phrase is repeated a lot we punish the scores // of the individual words in the phrase and boost the score of the // phrase itself. We check for uniqueness down below. for ( int32_t i = 0 ; i < nw ; i++ ) { // breathe QUICKPOLL ( m_niceness ); // add the word if ( wids[i] == 0LL ) continue; //if ( wids[i] == 708411945052722517LL ) // log("hey4 got new pid=%"INT64" i=%"INT32"",pids[i],i); // . skip if in repeated fragment // . 
unfortunately we truncate the frag vec to like // the first 80,000 words for performance reasons if ( i < MAXFRAGWORDS && fv[i] == 0 ) continue; // accumulate the wid with a score of 1 each time it occurs if ( ! ct->addTerm ( &wids[i] ) ) return (HashTableX *)NULL; // skip if word #i does not start a phrase if ( ! pids [i] ) continue; // if phrase score is less than 100% do not consider as a // phrase so that we do not phrase "albuquerque, NM" and stuff // like that... in fact, we can only have a space here... if ( wptrs[i+1][0] == ',' ) continue; if ( wptrs[i+1][1] == ',' ) continue; if ( wptrs[i+1][2] == ',' ) continue; // put it in, accumulate, max score is 0x7fffffff if ( ! ct->addTerm ( &pids[i] ) ) return (HashTableX *)NULL; } // now add each meta tag to the pot for ( int32_t i = 0 ; i < nw ; i++ ) { // breathe QUICKPOLL ( m_niceness ); // skip if not a meta tag if ( tids[i] != 68 ) continue; // find the "content=" word char *w = wptrs[i]; int32_t wlen = wlens[i]; char *wend = w + wlen; char *p ; p = strncasestr (w,wlen,"content="); // skip if we did not have any content in this meta tag if ( ! p ) continue; // skip the "content=" p += 8; // skip if empty meta content if ( wend - p <= 0 ) continue; // our own hash if ( ! hashString_ct ( ct , p , wend - p ) ) return (HashTableX *)NULL; } // add each incoming link text for ( Inlink *k=NULL ; info1 && (k=info1->getNextInlink(k)) ; ) { // breathe QUICKPOLL ( m_niceness ); // shortcuts char *p; int32_t plen; // hash link text (was hashPwids()) p = k-> getLinkText(); plen = k->size_linkText - 1; if ( ! verifyUtf8 ( p , plen ) ) { log("xmldoc: bad link text 3 from url=%s for %s", k->getUrl(),m_firstUrl.m_url); continue; } if ( ! hashString_ct ( ct , p , plen ) ) return (HashTableX *)NULL; // hash this stuff (was hashPwids()) p = k->getSurroundingText(); plen = k->size_surroundingText - 1; if ( ! hashString_ct ( ct , p , plen ) ) return (HashTableX *)NULL; } // we got it m_countTableValid = true; return &m_countTable; } // . a special function used by XmlDoc::getCountTable() above // . kinda similar to XmlDoc::hashString() bool XmlDoc::hashString_ct ( HashTableX *ct , char *s , int32_t slen ) { Words words; Bits bits; Phrases phrases; if ( ! words.set ( s , slen , m_version , true , m_niceness ) ) return false; if ( ! bits.set ( &words , m_version , m_niceness ) ) return false; if ( ! phrases.set(&words,&bits,true,false,m_version,m_niceness)) return false; int32_t nw = words.getNumWords(); int64_t *wids = words.getWordIds(); int64_t *pids = phrases.m_phraseIds2; char **wptrs = words.m_words; int32_t *wlens = words.m_wordLens; for ( int32_t i = 0 ; i < nw ; i++ ) { // breathe QUICKPOLL ( m_niceness ); // add the word if ( wids[i] == 0LL ) continue; // skip if in repeated fragment // . NO, we do not use this for these short strings //if ( ww[i] == 0 ) continue; // accumulate the wid with a score of 1 each time it occurs if ( ! ct->addTerm ( &wids[i] ) ) return false; // skip if word #i does not start a phrase if ( ! pids [i] ) continue; // if phrase score is less than 100% do not consider as a // phrase so that we do not phrase "albuquerque, NM" and stuff // like that... in fact, we can only have a space here... if ( i+1<nw ) { if ( wptrs[i+1][0] == ',' ) continue; if ( wlens[i+1]>=2 && wptrs[i+1][1] == ',' ) continue; if ( wlens[i+1]>=3 && wptrs[i+1][2] == ',' ) continue; } // put it in, accumulate, max score is 0x7fffffff if ( !
ct->addTerm ( &pids[i] ) ) return false; } return true; } uint8_t *XmlDoc::getSummaryLangId ( ) { // return if we got it already if ( m_summaryLangIdValid ) return &m_summaryLangId; Summary *s = getSummary(); if ( ! s || s == (void *)-1 ) return (uint8_t *)s; char *sum = s->getSummary(); // now set the words class Words ww; if ( ! ww.set9 ( sum , m_niceness ) ) return NULL; // check it out. 0 means langUnknown. -1 means error. int32_t ret = ww.getLanguage ( NULL , 100 , m_niceness , NULL ); // -1 means error! g_errno should be set if ( ret < 0 ) return NULL; // set it m_summaryLangId = (uint8_t)ret; // assume valid m_summaryLangIdValid = true; // return it return &m_summaryLangId; } int cmp ( const void *h1 , const void *h2 ) ; // vector components are 32-bit hashes int32_t *XmlDoc::getTagPairHashVector ( ) { if ( m_tagPairHashVecValid ) return m_tagPairHashVec; Xml *xml = getXml (); if ( ! xml || xml == (Xml *)-1 ) return (int32_t *)xml; // store the hashes here uint32_t hashes [ 2000 ]; int32_t nh = 0; // go through each node XmlNode *nodes = xml->getNodes (); int32_t n = xml->getNumNodes (); // start with the ith node int32_t i = 0; uint32_t saved = 0; uint32_t lastHash = 0; // loop over the nodes for ( ; i < n ; i++ ) { // breathe a little QUICKPOLL ( m_niceness ); // skip NON tags if ( ! nodes[i].isTag() ) continue; // use the tag id as the hash, its unique uint32_t h = hash32h ( nodes[i].getNodeId() , 0 ); // ensure hash is not 0, that has special meaning if ( h == 0 ) h = 1; // store in case we have only one hash saved = h; // if we are the first, set this if ( ! lastHash ) { lastHash = h; continue; } // if they were the same do not xor, they will zero out if ( h == lastHash ) hashes[nh++] = h; // incorporate it into the last hash else hashes[nh++] = h ^ lastHash; // we are the new last hash lastHash = h; // bust out if no room if ( nh >= 2000 ) break; } // if only had one tag after, use that if ( nh == 0 && saved ) hashes[nh++] = saved; // breathe QUICKPOLL ( m_niceness ) ; // . TODO: remove the link text hashes here? // . because will probably be identical.. // . now sort hashes to get the top MAX_PAIR_HASHES gbsort ( hashes , nh , 4 , cmp ); // breathe QUICKPOLL ( m_niceness ) ; // uniquify them int32_t d = 0; for ( int32_t j = 1 ; j < nh ; j++ ) { if ( hashes[j] == hashes[d] ) continue; hashes[++d] = hashes[j]; } // breathe QUICKPOLL ( m_niceness ) ; // how many do we got? nh = d; // truncate to MAX_PAIR_HASHES MINUS 1 so we can put a 0 at the end if ( nh > MAX_TAG_PAIR_HASHES-1 ) nh = MAX_TAG_PAIR_HASHES-1; // store the top MAX_PAIR_HASHES gbmemcpy ( m_tagPairHashVec , hashes , nh * 4 ); // null term it. all vectors need this so computeSimilarity() works m_tagPairHashVec [ nh++ ] = 0; m_tagPairHashVecValid = true; m_tagPairHashVecSize = nh * 4; return m_tagPairHashVec; } // sort in descending order int cmp ( const void *h1 , const void *h2 ) { return *(uint32_t *)h2 - *(uint32_t *)h1; } // . m_tagVector.setTagPairHashes(&m_xml, niceness); // . Sections.cpp and getIsDup() both use this hash // . returns NULL and sets g_errno on error // . xors all the unique adjacent tag hashes together // . kind of represents the template the web pages uses // . we add this to sectiondb as a vote in Sections::addVotes() uint32_t *XmlDoc::getTagPairHash32 ( ) { // only compute once if ( m_tagPairHash32Valid ) return &m_tagPairHash32; Words *words = getWords(); if ( ! 
words || words == (Words *)-1 ) return (uint32_t *)words; // int16_tcuts //int64_t *wids = words->getWordIds (); nodeid_t *tids = words->getTagIds (); int32_t nw = words->getNumWords (); int32_t nt = words->m_numTags; // . get the hash of all the tag pair hashes! // . we then combine that with our site hash to get our site specific // html template termid // . put all tag pairs into a hash table // . similar to Vector::setTagPairHashes() but we do not compute a // vector, just a single scalar/hash of 32 bits, m_termId HashTableX tp; // T tp; if ( ! tp.set ( 4 , 1 , nt * 4 , NULL , 0 , true,m_niceness,"xmltp")) return 0LL; uint32_t lastTid = 0; char val = 1; for ( int32_t i = 0 ; i < nw ; i++ ) { // skip if not tag if ( tids[i] == 0LL ) continue; // skip if back tag if ( tids[i] & BACKBIT ) continue; // get last tid uint32_t h = hash32h ( tids[i] , lastTid ); //logf(LOG_DEBUG,"build: tph %"INT32" h=%"UINT64"",i,(int64_t)h); // . add to table (skip if 0, means empty bucket) // . return NULL and set g_errno on error if ( h && ! tp.addKey ( &h , &val ) ) return NULL; // update this lastTid = h; } // linear scan on hash table to get all the hash, XOR together uint32_t hx = 0; int32_t nb = tp.getNumSlots(); char *flags = tp.m_flags; // get keys uint32_t *keys = (uint32_t *)tp.m_keys; for ( int32_t i = 0 ; i < nb ; i++ ) { // skip if empty if ( flags[i] == 0 ) continue; // skip if empty //if ( keys[i] == 0LL ) continue; // incorporate hx ^= keys[i]; } // never return 0, make it 1. 0 means an error if ( hx == 0 ) hx = 1; // set the hash m_tagPairHash32 = hx ; // it is now valid m_tagPairHash32Valid = true; return &m_tagPairHash32; } // . used for deduping search results // . also uses the title int32_t *XmlDoc::getSummaryVector ( ) { if ( m_summaryVecValid ) return (int32_t *)m_summaryVec; Summary *s = getSummary(); if ( ! s || s == (Summary *)-1 ) return (int32_t *)s; Title *ti = getTitle(); if ( ! ti || ti == (Title *)-1 ) return (int32_t *)ti; // store title and summary into "buf" so we can call words.set() //char buf[5000]; SafeBuf sb; //char *p = buf; //int32_t avail = 5000; //int32_t len; // put title into there int32_t tlen = ti->m_titleBytes - 1; //if ( len > avail ) len = avail - 10; if ( tlen < 0 ) tlen = 0; // put summary into there int32_t slen = s->m_summaryLen; // allocate space int32_t need = tlen + 1 + slen + 1; if ( ! sb.reserve ( need ) ) return NULL; //gbmemcpy ( p , ti->m_title , len ); //p += len; sb.safeMemcpy ( ti->m_title , tlen ); // space separting the title from summary if ( tlen > 0 ) sb.pushChar(' '); //if ( len > avail ) len = avail - 10; //gbmemcpy ( p , s->m_summary , len ); //p += len; sb.safeMemcpy ( s->m_summary , slen ); // null terminate it //*p = '\0'; sb.nullTerm(); // word-ify it Words words; if ( ! words.set9 ( sb.getBufStart() , m_niceness ) ) return NULL; // . now set the dedup vector from big summary and title // . store sample vector in here // . returns size in bytes including null terminating int32_t m_summaryVecSize = computeVector ( NULL , &words , (uint32_t *)m_summaryVec ); m_summaryVecValid = true; return m_summaryVec; } bool getWordVector ( char *s , HashTableX *ht , uint32_t *d , int32_t *nd , int32_t ndmax ) { // utf8 char size char size; // grab each word and hash it for ( ; *s ; s += size ) { // get size size = getUtf8CharSize(s); // skip if tag if ( *s == '<' ) { while ( *s && *s!='>' ) s += getUtf8CharSize(s); continue; } // skip if other type of punct if ( ! 
is_alnum_utf8(s) ) continue; // ok, we got a word then char *start = s; // see how long the word is for ( ; *s && is_alnum_utf8(s);s+=getUtf8CharSize(s)); // get wordid, a simple hash, just like Words.cpp does uint64_t h = hash64Lower_utf8(start,s - start); // do not inc this time size = 0; // breathe //QUICKPOLL ( m_niceness ); // make 32 bit uint32_t wid32 = (uint32_t)h; // // TODO: ignore if it is a day name or month name or // number because those are like dates // if ( ht ) { // do not add if we already got it if ( ht->getSlot ( &wid32 ) >= 0 ) continue; // add to hash table. return NULL and set g_errno onerr if ( ! ht->addKey (&wid32 )) return false; } // add it to our vector d[*nd] = (uint32_t)wid32; // inc it *nd = *nd + 1; // stop after 3000 for sure if ( *nd >= ndmax ) return true; } return true; } // used by getIsDup() and Dates.cpp for detecting dups and for // seeing if the content changed respectively int32_t *XmlDoc::getPageSampleVector ( ) { if ( m_pageSampleVecValid ) return m_pageSampleVec; Words *ww = getWords(); if ( ! ww || ww == (Words *)-1 ) return (int32_t *)ww; Sections *ss = NULL; //if ( m_eliminateMenus ) { //ss = getSections(); //if ( ! ss || ss == (Sections *)-1) return (int32_t *)ss; //} m_pageSampleVecSize = computeVector ( ss, ww, (uint32_t *)m_pageSampleVec ); m_pageSampleVecValid = true; return m_pageSampleVec; } // . this is the vector of the words right after the hypertext for the link // we are voting on. // . it is used to dedup voters in Msg25.cpp int32_t *XmlDoc::getPostLinkTextVector ( int32_t linkNode ) { if ( m_postVecValid ) return m_postVec; // assume none m_postVecSize = 0; // set up Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (int32_t *)xml; Words *ww = getWords(); if ( ! ww || ww == (Words *)-1 ) return (int32_t *)ww; // sanity check if ( linkNode < 0 ) { char *xx=NULL;*xx=0; } // linkNode starts pointing to a tag so skip over that! linkNode++; // limit int32_t nn = xml->getNumNodes(); XmlNode *nodes = xml->getNodes(); // and advance i to the next anchor tag thereafter, we do not // want to include link text in this vector because it is usually // repeated and will skew our "similarities" for ( ; linkNode < nn ; linkNode++ ) { // stop if we hit or if ( (nodes[linkNode].m_nodeId & BACKBITCOMP) != 2 ) continue; // advance over the or linkNode++; // then stop, we will start gathering link text here break; } // if we hit end of the doc, we got not vector then if ( linkNode >= nn ) return m_postVec; // now convert the linkNode # to a word #, "start" int32_t nw = ww->getNumWords (); int64_t *wids = ww->getWordIds (); nodeid_t *tids = ww->getTagIds (); int32_t *wn = ww->m_nodes; int32_t i = 0; for ( ; i < nw ; i++ ) { // breathe QUICKPOLL ( m_niceness ); // stop when we got the first word in this node # if ( wn[i] == linkNode ) break; } // if none, bail now, size is 0 if ( i >= nw ) return m_postVec; // save that int32_t start = i; // likewise, set the end of it int32_t end = nw; // count alnum words int32_t count = 0; // limit it for ( i = start ; i < nw && count < 35 ; i++ ) { // get tag id nodeid_t tid = tids[i] & BACKBITCOMP; // stop if certain ones if ( tid == TAG_TABLE ) break; if ( tid == TAG_UL ) break; // , is ok if ( tids[i] == TAG_A ) break; // only up to 35 words allowed in the hash if ( wids[i] ) count++; } // set the end of the words to hash end = i; // specify starting node # now m_postVecSize = computeVector(NULL,ww,(uint32_t *)m_postVec,start,end); // return what we got return m_postVec; } // . 
was kinda like "m_tagVector.setTagPairHashes(&m_xml, niceness);" // . this is used by getIsDup() (below) // . this is used by Dates.cpp to see how much a doc has changed // . this is also now used for getting the title/summary vector for deduping // search results // . if we couldn't extract a good pub date for the doc, and it has changed // since last spidered, use the bisection method to come up with our own // "last modified date" which we use as the pub date. // . this replaces the clusterdb.getSimilarity() logic in Msg14.cpp used // to do the same thing. but we call Vector::setForDates() from // Dates.cpp. that way the logic is more contained in Dates! // . doesn't Msg14 already do that? // . yes, but it uses two TermTables and calls Clusterdb::getSimilarity() // . returns false and sets g_errno on error // . these words classes should have been set by a call to Words::set(Xml *...) // so that we have "tids1" and "tids2" // . returns NULL and sets g_errno on error // . TODO: if our title rec is non-empty consider getting it from that // . we use this vector to compare two docs to see how similar they are int32_t XmlDoc::computeVector ( Sections *sections, Words *words, uint32_t *vec , int32_t start , int32_t end ) { // assume empty vector vec[0] = 0; // skip if no article section. then we have no vector. if ( sections && ! sections->m_hadArticle ) return 0; // int16_tcuts int32_t nw = words->getNumWords(); //int32_t nt = words->m_numTags; int64_t *wids = words->getWordIds(); // set the end to the real end if it was specified as less than zero if ( end < 0 ) end = nw; // # of alnum words, about... minus the tags, then the punct words // are half of what remains... int32_t count = words->m_numAlnumWords; // if we got sections, how many good words? if ( sections ) count = sections->m_numAlnumWordsInArticle; // google seems to index SEC_MARQUEE so i took that out //int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT; // these Section ptrs are 1-1 with the words Section **sp = NULL; if ( sections ) sp = sections->m_sectionPtrs; // . Get sample vector from content section only. // . This helps remove duplicate menu/ad from vector // 4 bytes per hash, save the last one for a NULL terminator, 0 hash int32_t maxTerms = SAMPLE_VECTOR_SIZE / 4 - 1; // what portion of them do we want to mask out from the rest? int32_t ratio = count / maxTerms ; // a mask of 0 means to get them all unsigned char mask = 0x00; // if we got twice as many terms as we need, then set mask to 0x01 // to filter out half of them! but actually, let's aim for twice // as many as we need to ensure we really get as many as we need. // so if we got 4 or more than we need then cut in half... while ( ratio >= 4 ) { // shift the mask down, ensure hi bit is set mask >>= 1; mask |= 0x80; ratio >>= 1; // /2 } // store vector into "d" for now. will sort below uint32_t d [ 3000 ]; // dedup our vector using this hashtable, "ht" char hbuf[3000*6*2]; HashTableX ht; if ( ! ht.set(4,0,3000,hbuf,3000*6*2,false,m_niceness,"xmlvecdedup")){ char*xx=NULL;*xx=0;} again: // a buffer to hold the top termIds int32_t nd = 0; // count how many we mask out int32_t mo = 0; // . buffer should have at least "maxTerms" in it // . 
these should all be 12 byte keys for ( int32_t i = start ; i < end ; i++ ) { // breathe QUICKPOLL ( m_niceness ); // skip if not alnum word if ( wids[i] == 0 ) continue; // skip if mask filters it if ( ((wids[i]>>(NUMTERMIDBITS-8)) & mask)!=0) {mo++;continue;} // skip if in select, style, script or marquee tag section if ( sp && (sp[i]->m_flags & NOINDEXFLAGS) ) continue; // make 32 bit uint32_t wid32 = (uint32_t)wids[i]; // do not add if we already got it if ( ht.getSlot ( &wid32 ) >= 0 ) continue; // add to hash table. return NULL and set g_errno on error if ( ! ht.addKey (&wid32 )){char*xx=NULL;*xx=0; } // add it to our vector d[nd] = (uint32_t)wids[i]; // stop after 3000 for sure if ( ++nd < 3000 ) continue; // bitch and break out on error log(LOG_INFO,"build: Sample vector overflow. Slight " "performance hit."); break; } // . if nd was too small, don't use a mask to save time // . well just make the mask less restrictive if ( nd < maxTerms && mask && mo ) { // shift the mask UP, allow more termIds to pass through mask <<= 1; // reset hash table since we are starting over ht.clear(); goto again; } // bubble sort them bool flag = true; while ( flag ) { // breathe QUICKPOLL ( m_niceness ); flag = false; for ( int32_t i = 1 ; i < nd ; i++ ) { if ( d[i-1] <= d[i] ) continue; uint32_t tmp = d[i-1]; d[i-1] = d[i]; d[i] = tmp; flag = true; } } // truncate if ( nd > maxTerms ) nd = maxTerms; // null terminate d [ nd++ ] = 0; // store in our sample vector gbmemcpy ( vec , d , nd * 4 ); // return size in bytes return nd * 4; } float *XmlDoc::getTagSimilarity ( XmlDoc *xd2 ) { int32_t *tv1 = getTagPairHashVector(); if ( ! tv1 || tv1 == (int32_t *)-1 ) return (float *)tv1; int32_t *tv2 = xd2->getTagPairHashVector(); if ( ! tv2 || tv2 == (int32_t *)-1 ) return (float *)tv2; m_tagSimilarity = computeSimilarity ( tv1, tv2, NULL, NULL, NULL , m_niceness ); // this means error, g_errno should be set if ( m_tagSimilarity == -1.0 ) return NULL; return &m_tagSimilarity; } float *XmlDoc::getGigabitSimilarity ( XmlDoc *xd2 ) { int32_t **gv1 = getGigabitHashes(); if ( ! gv1 || gv1 == (int32_t **)-1 ) return (float *)gv1; int32_t **gv2 = xd2->getGigabitHashes(); if ( ! gv2 || gv2 == (int32_t **)-1 ) return (float *)gv2; // *gv1 could be NULL if vec was empty in titlerec's ptr_gigabitHashes m_gigabitSimilarity = computeSimilarity ( *gv1, *gv2, NULL, NULL, NULL, m_niceness ); // this means error, g_errno should be set if ( m_gigabitSimilarity == -1.0 ) return NULL; return &m_gigabitSimilarity; } float *XmlDoc::getPageSimilarity ( XmlDoc *xd2 ) { int32_t *sv1 = getPageSampleVector(); if ( ! sv1 || sv1 == (int32_t *)-1 ) return (float *)sv1; int32_t *sv2 = xd2->getPageSampleVector(); if ( ! sv2 || sv2 == (int32_t *)-1 ) return (float *)sv2; m_pageSimilarity = computeSimilarity ( sv1, sv2, NULL, NULL, NULL, m_niceness ); // this means error, g_errno should be set if ( m_pageSimilarity == -1.0 ) return NULL; return &m_pageSimilarity; } // . compare old page vector with new // . returns ptr to a float from 0.0 to 100.0 float *XmlDoc::getPercentChanged ( ) { // if we got it if ( m_percentChangedValid ) return &m_percentChanged; // get the old doc XmlDoc **od = getOldXmlDoc ( ); if ( ! od || od == (XmlDoc **)-1 ) return (float *)od; // if empty, assume 0% changed if ( ! *od ) { m_percentChanged = 0; m_percentChangedValid = true; return &m_percentChanged; } // get its page c float *ps = getPageSimilarity ( *od ); if ( ! 
ps || ps == (float *)-1 ) return (float *)ps; // got it m_percentChanged = *ps; m_percentChangedValid = true; // just return it return &m_percentChanged; } // . Address.cpp converts a place name into a vector for comparing via a // call to computeSimilarity() below // . returns -1 and set g_errno on error // . "vbufSize" is in BYTES! // . returns length of word vector in int32_ts (# components stored) int32_t makeSimpleWordVector (char *s,int32_t *vbuf,int32_t vbufSize,int32_t niceness ) { // nonsense? if ( vbufSize < 4 ) { char *xx=NULL;*xx=0; } // empty it *vbuf = 0; // no words, no vector if ( ! s ) return 0; // set them Words w; // return -1 with g_errno set on error if ( ! w.set9 ( s , niceness ) ) return -1; // skip if no words if ( w.m_numWords == 0 ) return 0; // int16_t cut int64_t *wids = w.m_wordIds; int64_t pid = 0LL; // count insertions int32_t count = 0; // ptr int32_t *vbufPtr = vbuf; int32_t *vbufEnd = vbuf + vbufSize/4; // put words into a vector for ( int32_t i = 0 ; i < w.m_numWords ; i++ ) { // skip if not alnum word if ( ! wids[i] ) continue; // if no room stop. need room for NULL terminator if ( vbufPtr + 2 >= vbufEnd ) return count; // put it in //*vbufPtr = (int32_t)wids[i]; // . use the synonym instead if it had one // . maps "theatre" to "theater", "4th" to "fourth", etc. // . false = is street name? int64_t *p = getSynonymWord ( &wids[i] , &pid , false ); // set this pid = wids[i]; //int64_t *p = (int64_t *)synTable->getValue64( wids[i] ); // 0 means to ignore it if ( *p == 0LL ) continue; // otherwise add into our vector *vbufPtr = *p; // advance vbufPtr++; // NULL termination *vbufPtr = 0; // count it count++; } // all done return count; } // . compare two vectors // . components in vectors are int32_ts // . last component is a zero, to mark EOV = end of vector // . discount any termIds that are in the query vector, qvec, which may be NULL // . returns -1 and sets g_errno on error // . vector components are 32-bit hashes of the words (hash32())??? // i would say they should be the lower 32 bits of the 64-bit hashes! // . replaces: // g_clusterdb.getGigabitSimilarity() // m_tagVec->getLinkBrotherProbability() // g_clusterdb.getSampleSimilarity() float computeSimilarity ( int32_t *vec0 , int32_t *vec1 , int32_t *s0 , // corresponding scores vector int32_t *s1 , // corresponding scores vector Query *q , int32_t niceness , bool dedupVectors ) { static int32_t s_tmp = 0; if ( ! vec0 ) vec0 = &s_tmp; if ( ! vec1 ) vec1 = &s_tmp; // if both empty, assume not similar at all if ( *vec0 == 0 && *vec1 == 0 ) return 0; // if either is empty, return 0 to be on the safe side if ( *vec0 == 0 ) return 0; if ( *vec1 == 0 ) return 0; // flag if from query vector HashTableX qt; char qbuf[5000]; if ( q ) { // init hash table if ( ! qt.set ( 4,0,512,qbuf,5000,false,niceness,"xmlqvtbl") ) return -1; // . stock the query term hash table // . use the lower 32 bits of the termids to make compatible // with the other vectors we use int64_t *qtids = q->getTermIds (); int32_t nt = q->getNumTerms(); for ( int32_t i = 0 ; i < nt ; i++ ) { // get it uint32_t h = (uint32_t)(qtids[i] & 0xffffffff); // hash it if ( ! qt.addKey ( &h ) ) return -1; } } // if we ignore cardinality then it only matters if both vectors // have a particular value, and not how many times they each have it. // so we essentially dedup each vector if dedupVectors is true. // but we do total up the score and put it behind the one unique // occurence though. 
we do this only for // Sections::addDateBasedImpliedSections() right now bool allowDups = true; if ( dedupVectors ) allowDups = false; HashTableX ht; char hbuf[10000]; if ( ! ht.set ( 4,4,-1,hbuf,10000,allowDups,niceness,"xmlqvtbl2")) return -1; bool useScores = (bool)s0; int32_t matches = 0; int32_t total = 0; int32_t matchScore = 0; int32_t totalScore = 0; // hash first vector. accumulating score total and total count for ( int32_t *p = vec0; *p ; p++ , s0++ ) { // breathe QUICKPOLL(niceness); // skip if matches a query term if ( q && qt.getSlot ( p ) ) continue; // count it total++; // get it int32_t score = 1; // get the score if valid if ( useScores ) score = *s0; // total it up totalScore += score; // add it if ( dedupVectors ) { // accumulate all the scores into this one bucket // in the case of p being a dup if ( ! ht.addTerm32 ( p , score ) ) return -1; } else { // otherwise, add each into its own bucket since // ht.m_allowDups should be true if ( ! ht.addKey ( p , &score ) ) return -1; } } int32_t zero = 0; // see what components of this vector match for ( int32_t *p = vec1; *p ; p++ , s1++ ) { // breathe QUICKPOLL(niceness); // skip if matches a query term if ( q && qt.getSlot ( p ) ) continue; // count it total++; // get it int32_t score = 1; // get the score if valid if ( useScores ) score = *s1; // and total scores totalScore += score; // is it in there? int32_t slot = ht.getSlot ( p ); // skip if unmatched if ( slot < 0 ) continue; // otherwise, it is a match! matches++; // and scores matchScore += score; // and score of what we matched uint32_t *val = (uint32_t *)ht.getValueFromSlot ( slot ); // he is hit too matchScore += *val; // remove it as we match it to deal with dups if ( allowDups ) { // once we match it once, do not match again, score was // already accumulated ht.setValue ( slot , &zero ); } else { // otherwise, remove this dup and try to match any // remaining dups in the table ht.removeSlot ( slot ); } } // if after subtracting query terms we got no hits, return 0.framesets? if ( useScores && totalScore == 0 ) return 0; if ( total == 0 ) return 0; // . what is the max possible score we coulda had? // . subtract the vector components that matched a query term float percent = 100 * (float)matchScore / (float)totalScore; //if ( useScores)percent = 100 * (float)matchScore / (float)totalScore; //else percent = 100 * (float)matches / (float)total; // sanity //if ( percent > 100 ) percent = 100; if ( percent > 100 ) { char *xx=NULL;*xx=0; } return percent; } // this returns true if the two vecs are "percentSimilar" or more similar bool isSimilar_sorted ( int32_t *vec0 , int32_t *vec1 , int32_t nv0 , // how many int32_ts in vec? int32_t nv1 , // how many int32_ts in vec? // they must be this similar or more to return true int32_t percentSimilar, int32_t niceness ) { // if both empty, assume not similar at all if ( *vec0 == 0 && *vec1 == 0 ) return 0; // if either is empty, return 0 to be on the safe side if ( *vec0 == 0 ) return 0; if ( *vec1 == 0 ) return 0; // do not include last 0 nv0--; nv1--; int32_t total = nv0 + nv1; // so if the "noMatched" count ever EXCEEDS (not equals) this // "brink" we can bail early because there's no chance of getting // the similarity "percentSimilar" provided. should save some time. int32_t brink = ((100-percentSimilar) * total) / 100; // scan each like doing a merge int32_t *p0 = vec0; int32_t *p1 = vec1; int32_t yesMatched = 0; int32_t noMatched = 0; mergeLoop: // stop if both exhausted. 
we didn't bail on brink, so it's a match if ( *p0 == 0 && *p1 == 0 ) return true; if ( *p0 < *p1 || *p1 == 0 ) { p0++; if ( ++noMatched > brink ) return false; goto mergeLoop; } if ( *p1 < *p0 || *p0 == 0 ) { p1++; if ( ++noMatched > brink ) return false; goto mergeLoop; } yesMatched += 2; p1++; p0++; goto mergeLoop; } uint64_t *XmlDoc::getFuzzyDupHash ( ) { if ( m_dupHashValid ) return &m_dupHash; uint32_t *h1 = getTagPairHash32(); if ( ! h1 || h1 == (uint32_t *)-1 ) return (uint64_t *)h1; uint32_t *h2 = getGigabitVectorScorelessHash ( ) ; if ( ! h2 || h2 == (uint32_t *)-1 ) return (uint64_t *)h2; //uint64_t h2b = (uint64_t)*h2; m_dupHash = hash64 ( (uint64_t)*h1 , (uint64_t)*h2 ); m_dupHashValid = true; return &m_dupHash; } int64_t *XmlDoc::getExactContentHash64 ( ) { if ( m_exactContentHash64Valid ) return &m_exactContentHash64; char **u8 = getUtf8Content(); if ( ! u8 || u8 == (char **)-1) return (int64_t *)u8; // if (m_docId==88581116800LL) // log("got article1 diffbot"); // if (m_docId==201689682865LL) // log("got article11 diffbot"); CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // if we are diffbot, then do not quite do an exact content hash. // there is a "url:" field in the json that changes. so we have // to exclude that field. otherwise getDupList() spider time dedup // detection will fail the TestDuplicateContent.testDuplicate smoketest if ( cr->m_isCustomCrawl == 1 && m_isDiffbotJSONObject ) { int32_t *ch32 = getContentHashJson32(); if ( ! ch32 || ch32 == (void *)-1 ) return (int64_t *)ch32; m_exactContentHash64Valid = true; m_exactContentHash64 = (uint64_t)(uint32_t)*ch32; return &m_exactContentHash64; } unsigned char *p = (unsigned char *)*u8; int32_t plen = size_utf8Content; if ( plen > 0 ) plen--; // sanity //if ( ! p ) return 0LL; //if ( p[plen] != '\0' ) { char *xx=NULL;*xx=0; } unsigned char *pend = (unsigned char *)p + plen; uint64_t h64 = 0LL; unsigned char pos = 0; bool lastWasSpace = true; for ( ; p < pend ; p++ ) { // breathe QUICKPOLL ( m_niceness ); // treat sequences of white space as a single ' ' (space) if ( is_wspace_a(*p) ) { if ( lastWasSpace ) continue; lastWasSpace = true; // treat all white space as a space h64 ^= g_hashtab[pos][(unsigned char)' ']; pos++; continue; } lastWasSpace = false; // xor this in right h64 ^= g_hashtab[pos][p[0]]; pos++; } m_exactContentHash64Valid = true; m_exactContentHash64 = h64; return &m_exactContentHash64; } RdbList *XmlDoc::getDupList ( ) { if ( m_dupListValid ) return &m_dupList; // until we start using posdb and not indexdb, just return an // empty list. // TODO: MDW fix the deduping. //m_dupList.reset(); //m_dupListValid = true; //return &m_dupList; // // end temp hack // //uint64_t *dh = getDupHash ( ); //if ( ! dh || dh == (uint64_t *)-1 ) return (IndexList *)dh; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; int64_t *ph64 = getExactContentHash64(); //int64_t *ph64 = getLooseContentHash64(); if ( ! 
ph64 || ph64 == (void *)-1 ) return (RdbList *)ph64; // must match term in XmlDoc::hashVectors() char qbuf[256]; snprintf(qbuf, 256, "%"UINT64"",*ph64); int64_t pre = hash64b ( "gbcontenthash" , 0LL ); int64_t rawHash = hash64b ( qbuf , 0LL ); int64_t termId = hash64 ( rawHash , pre ); // get the startkey, endkey for termlist key144_t sk ; key144_t ek ; g_posdb.makeStartKey ( &sk,termId ,0); g_posdb.makeEndKey ( &ek,termId ,MAX_DOCID); // note it log(LOG_DEBUG,"build: check termid=%"UINT64" for docid %"UINT64"" ,(uint64_t)(termId&TERMID_MASK) ,m_docId); // assume valid now m_dupListValid = true; // this is a no-split lookup by default now if ( ! m_msg0.getList ( -1 , // hostId 0 , // ip 0 , // port 0 , // maxCacheAge false , // add to cache? RDB_POSDB, // INDEXDB , cr->m_collnum, &m_dupList , (char *)&sk , (char *)&ek , 606006 , // minRecSizes in bytes m_masterState , // state m_masterLoop , m_niceness , true , // error correction? true , // include tree? true , // domerge? -1 , // firsthosti 0 , // startfilenum -1, // # files 30 , // timeout -1 , // syncpoint -1 , // preferlocal reads NULL, // msg5 NULL, // msg5b false , // isRealMerge true , // allow page cache false , // forcelocalindexdb true ) ) // shardByTermId? THIS IS DIFFERENT!!! // return -1 if this blocks return (RdbList *)-1; // assume valid! m_dupListValid = true; return &m_dupList; } // moved DupDetector.cpp into here... char *XmlDoc::getIsDup ( ) { if ( m_isDupValid ) return &m_isDup; // assume we are not a dup m_isDup = false; // get it CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // skip if we should if ( ! cr->m_dedupingEnabled || // bulk jobs never dedup cr->m_isCustomCrawl == 2 ) { m_isDupValid = true; return &m_isDup; } // BUT if we are already indexed and a a crawlbot/bulk diffbot job // then do not kick us out just because another indexed doc is // a dup of us because it messes up the TestOnlyProcessIfNew smoketests // because in the 2nd round we end up deleting article1.html after // indexing it in the first round, then we add article11.html's // diffbot reply in the 2nd round because article1.html and its // diffbot reply was deleted. thereby giving it a new timestamp and // makeing the smoke fail. if ( cr->m_isCustomCrawl ) { char *isIndexed = getIsIndexed(); if ( ! isIndexed || isIndexed == (char *)-1) return (char *)isIndexed; if ( *isIndexed ) { m_isDupValid = true; return &m_isDup; } } //we need both vectors to be non-empty //uint64_t *tv = getTagPairHash(); //if ( ! tv || tv == (uint64_t *)-1) return (char *)tv; // get our docid int64_t *mydocid = getDocId(); if ( ! mydocid || mydocid == (int64_t *)-1) return (char *)mydocid; // get the duplist! RdbList *list = getDupList(); if ( ! list || list == (RdbList *)-1 ) return (char *)list; // sanity. must be posdb list. if ( ! list->isEmpty() && list->m_ks != 18 ) { char *xx=NULL;*xx=0;} setStatus ( "checking for dups" ); // . see if there are any pages that seem like they are dups of us // . they must also have a HIGHER score than us, for us to be // considered the dup //if ( ! m_didQuickDupCheck ) { // // do not repeat // m_didQuickDupCheck = true; int32_t myRank = getSiteRank ( ); // init //uint8_t maxScore = 0; //uint8_t myScore = 0; //char maxSiteRank = -1; //int64_t maxDocId = -1LL; // assume not a dup m_isDup = false; // get the docid that we are a dup of for ( ; ! 
list->isExhausted() ; list->skipCurrentRecord() ) { // breathe QUICKPOLL(m_niceness); //int64_t d = list->getCurrentDocId(); char *rec = list->getCurrentRec(); // get the docid int64_t d = g_posdb.getDocId ( rec ); // get the score //uint8_t score = list->getCurrentScore(); // just let the best site rank win i guess? // even though one page may have more inlinks??? char sr = (char )g_posdb.getSiteRank ( rec ); // skip if us! //if ( d == *getDocId() ) { // // record our score // //myScore = score; // mySiteRank = sr; // continue; //} // skip if us if ( d == m_docId ) continue; // for debug //if ( d != m_docId ) //log("build: doc %s is dup of docid %"INT64"", // m_firstUrl.m_url,d); // if his rank is <= ours then he was here first and we // are the dup i guess... if ( sr >= myRank ) { log("build: doc %s is dup of docid %"INT64"", m_firstUrl.m_url,d); m_isDup = true; m_isDupValid = true; m_docIdWeAreADupOf = d; return &m_isDup; } // get the winner //if ( score > maxScore ) maxScore = score; //if ( sr > maxSiteRank || maxSiteRank == -1 ) { // maxSiteRank = sr; // maxDocId = d; // continue; //} //if ( sr < maxSiteRank ) continue; // fallback to docid? // do it first come first server othereise i guess // this will prevent dups from existing in the index at least // if they have the same siterank... //if ( d < maxDocId ) { // maxDocId = d; // continue; //} } // are we the highest scoring doc with this template? // corollary: if all dups have equal scores they will be // removed until there is only one doc that matches the pattern //if ( myScore >= maxScore ) { //if ( maxDocId >= 0 && maxDocId != *mydocid && out) { // m_isDup = true; // m_isDupValid = true; // return &m_isDup; //} m_isDup = false; m_isDupValid = true; return &m_isDup; /* we now temporarily at least, do exact dup checking... later we will bring in the fuzzy code... // reset its ptr for stuff below list->resetListPtr(); loop: // . get a title rec for the current docid // . but if exhausted, we are not a dup! if ( list->isExhausted() ) { m_isDupValid = true; return &m_isDup; } // get the docid int64_t d = list->getCurrentDocId(); // continue if us! if ( d == *mydocid ) { list->skipCurrentRecord(); goto loop; } // is this a dup of us? char *dup = isDupOfUs ( d ); if ( ! dup || dup == (char *)dup ) return (char *)dup; // if dup of us, bail out if ( *dup ) { m_isDup = true; m_isDupValid = true; return &m_isDup; } // prepare for next list->skipCurrentRecord(); // loop up goto loop; */ } char *XmlDoc::isDupOfUs ( int64_t d ) { // sanity check if ( d <= 0 ) { char *xx=NULL;*xx=0; } // get our current title rec SafeBuf *tr = getTitleRecBuf(); if ( ! tr || tr == (void *)-1 ) return (char *)tr; // we should not be here if we know we are a dup of another doc if ( m_isDup ) { char *xx=NULL;*xx=0; } CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // get the title rec for this docid if we haven't yet done so if ( m_calledMsg22d != d ) { // .m_docId != d ) { bool s; // note it setStatus ( "getting possible dup title rec" ); // do not re-call m_calledMsg22d = d; // get the guy that might be a dup of us s = m_msg22d.getTitleRec ( &m_msg22Request , NULL , d , cr->m_coll , &m_dupTrPtr , &m_dupTrSize , false , // just check tfndb? false , // getAvailDocIdOnly m_masterState, // state m_masterLoop , // callback m_niceness , false , // add to cache 60*60*24 , // maxcacheage 60 );// timeout // we blocked if ( ! s ) return (char *)-1; // error? if ( g_errno ) return NULL; } // if not there do not count as an error if ( ! 
m_dupTrPtr ) { g_errno = 0; return &m_isDup; } // ignore any errors too i guess... if ( m_msg22d.m_errno ) { log(LOG_WARN, "build: Dup Detection error with " "titlerec fetch: %s",mstrerror(m_msg22d.m_errno)); g_errno = 0; return &m_isDup; } // we need to parse this potential dup doc XmlDoc dd; // . parse the possible dup title rec into another XmlDoc class // . it returns false and sets g_errno on error if ( ! dd.set2 ( m_dupTrPtr , m_dupTrSize , cr->m_coll , NULL , // m_pbuf , m_niceness ) ) return NULL; LinkInfo *info1a = dd.getLinkInfo1(); LinkInfo *info1b = getLinkInfo1(); float pageNumInlinksA = info1a->m_numGoodInlinks;//getNumInlinksExtrapolated(); float pageNumInlinksB = info1b->m_numGoodInlinks;//getNumInlinksExtrapolated(); // . if the old dup doc is of lower quality than the new doc that // we are checking, then that one should be removed, not us! // if they are equal, we keep the int16_ter url of the two // . dd was set from title rec so these numInlinks should be taken // from the TagRec in ptr_tagRecData, and therefore NOT BLOCK! if ( *dd.getSiteNumInlinks() < *getSiteNumInlinks() ) return &m_isDup; if ( *dd.getSiteNumInlinks() == *getSiteNumInlinks() && pageNumInlinksA < pageNumInlinksB ) return &m_isDup; if ( *dd.getSiteNumInlinks() == *getSiteNumInlinks() && pageNumInlinksA == pageNumInlinksB && dd.getFirstUrl()->getUrlLen() > getFirstUrl()->getUrlLen()) return &m_isDup; float *ts = getTagSimilarity ( &dd ); if ( ! ts || ts == (float *)-1 ) return (char *)ts; float *gs = getGigabitSimilarity ( &dd ); if ( ! gs || gs == (float *)-1 ) return (char *)gs; float *ps = getPageSimilarity ( &dd ); if ( ! ps || ps == (float *)-1 ) return (char *)ps; int32_t gigabitVecSimilarity = (int32_t)*gs; int32_t tagVecSimilarity = (int32_t)*ts; int32_t sampleVecSimilarity = (int32_t)*ps; int32_t notSimilarCount = 0; if ( gigabitVecSimilarity < 80 ) { notSimilarCount++; if ( gigabitVecSimilarity < 50 ) return &m_isDup; } if ( tagVecSimilarity < 80 ) { notSimilarCount++; if ( tagVecSimilarity < 50 ) return &m_isDup; } if ( sampleVecSimilarity < 80 ) { notSimilarCount++; if ( sampleVecSimilarity < 50 ) return &m_isDup; } // if it is similar enough, we got a dup! if ( notSimilarCount <= 0 ) { m_isDupValid = true; m_isDup = true; } return &m_isDup; } // hash a gigabit hash vector without its scores, also order independent uint32_t *XmlDoc::getGigabitVectorScorelessHash ( ) { if ( m_gigabitVectorHashValid ) return &m_gigabitVectorHash; int32_t **gbvec = getGigabitHashes(); if ( ! gbvec || gbvec == (int32_t **)-1 ) return (uint32_t *)gbvec; uint32_t h = 0; // this bad boy is NULL terminated uint32_t *gbv = (uint32_t *)*gbvec; // i guess zak likes the simple XOR'ing thing... for ( int32_t i = 0; gbv && gbv[i] ; i++) h ^= gbv[i]; m_gigabitVectorHashValid = true; m_gigabitVectorHash = h; return &m_gigabitVectorHash; } // . the original vector used for deduping similar search results is just from // random sample of indexed terms, but gigabit vector is // formed using the hashes of the top-scoring gigabits of the document, and // therefore uses the words class // . sets g_errno and returns NULL on error // . ptr_gigabitHashes can be NULL... int32_t **XmlDoc::getGigabitHashes ( ) { // if it was already set, treat this as an accessor if ( m_gigabitHashesValid ) return &ptr_gigabitHashes; // this also sets the vector char *gq = getGigabitQuery(); if ( ! gq || gq == (char *)-1) return (int32_t **)gq; // it should be valid now! if ( ! 
m_gigabitHashesValid ) { char *xx=NULL;*xx=0; } return &ptr_gigabitHashes; } // . the new function to get gigabits // . sets and validates m_gigabitQuery[] and m_gigabitHashes[] among others // . candidates = capitalized word, capitalized sequence of words, // uncapitalized 2+ word wikipedia phrase. // . candidates exclude uncapitalized query stop words. // . calls addGigabits() which is called by each doc in search results // when we use this at query time. // . separates gigabits with a comma (delimeter) in m_gigabitQuery[] // . quotes multiple word gigabits char *XmlDoc::getGigabitQuery ( ) { if ( m_gigabitQueryValid ) return m_gigabitQuery; setStatus ( "getting gigabit query" ); Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (char *)xml; Words *ww = getWords(); if ( ! ww || ww == (Words *)-1 ) return (char *)ww; int64_t *d = getDocId(); if ( ! d || d == (int64_t *)-1 ) return (char *)d; Sections *ss = getSections(); if ( ! ss || ss == (Sections *)-1 ) return (char *)ss; //Weights *we = getWeights(); //if ( ! we || we == (Weights *)-1 ) return (char *)we; LinkInfo *info1 = getLinkInfo1(); if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1; LinkInfo **pinfo2 = getLinkInfo2(); if ( ! pinfo2 || pinfo2 == (void *)-1 ) return (char *)pinfo2; uint8_t *langId = getLangId(); if ( ! langId || langId == (uint8_t *)-1 ) return (char *) langId; HashTableX ht; char buf [ 200000 ]; // pass in niceness in case it has to grow really big and re-hash all!! ht.set ( 8 , 4 , -1 , buf , 200000 , false, m_niceness,"xmlgbtbl"); // . add gigabits from our body words // . includes title and header tags so pts can work well! if ( ! addGigabits ( ww , *d , ss , *langId ) ) return NULL; // add gigabits from link info for ( Inlink *k=NULL ; info1 && (k=info1->getNextInlink(k)) ; ) { // sanity check char *txt = k->getLinkText(); int32_t tlen = k->size_linkText; if ( tlen > 0 ) tlen--; if ( ! verifyUtf8 ( txt , tlen ) ) { log("xmldoc: bad link text 0 from url=%s for %s", k->getUrl(),m_firstUrl.m_url); continue; } // add those in if (!addGigabits(txt, *d, *langId ) ) return NULL; // add in neighborhoods if(!addGigabits(k->getSurroundingText(),*d,*langId)) return NULL; } // add in gigabits for meta keywords int32_t mdlen; char *md = getMetaDescription( &mdlen ); if ( ! addGigabits2 ( md , mdlen, *d , *langId ) ) return NULL; // add in gigabits for meta description int32_t mklen; char *mk = getMetaKeywords( &mklen ); if ( ! addGigabits2 ( mk , mklen , *d , *langId ) ) return NULL; // set m_gigabitQuery and m_gigabitScores //GigabitInfo *top[100]; // fill in "top" in order of score m_numTop = getTopGigabits ( &ht , m_top , 100 , 0 ); // error? then g_errno should be set if ( m_numTop == -1 ) return NULL; char *p = m_gigabitQuery; char *pend = m_gigabitQuery + XD_GQ_MAX_SIZE - 1; // reset count of vector components for setting gigabit vector int32_t ng = 0; // total score //int32_t total = 0; // . now set the gigabit query! // . 
start with the highest scoring node first, the last node since // nodes are ranked by lowest to highest key for ( int32_t i = 0 ; i < m_numTop ; i++ ) { // get the info GigabitInfo *gi = m_top[i]; // stop if too big if ( p + gi->m_len + 10 >= pend ) continue; // get 32 bit hash uint32_t h = gi->m_hash & 0xffffffff; // never allow 0 if ( h == 0 ) h = 1; // add to vector if ( ng + 1 < XD_MAX_GIGABIT_HASHES ) { // the term hash m_gigabitHashes[ng] = (int32_t)h ; // and the score m_gigabitScores[ng] = gi->m_pts; // point into it, where we will copy it to m_gigabitPtrs [ng] = p + 1; // advance ng++; } // quote it *p++ = '\"'; // write into buffer gbmemcpy ( p , gi->m_ptr , gi->m_len ); // finish quote *p++ = '\"'; // separate terms just in case //gbmemcpy ( p , " , ", 4 ); //p += 4; *p++ = ','; } // done *p++ = '\0'; // NULL termiante the vector to make it a legit vector m_gigabitHashes [ ng ] = 0; m_gigabitScores [ ng ] = 0; // include the terminating 0 ng++; // validate both the query and vector m_gigabitQueryValid = true; m_gigabitHashesValid = true; // set this too ptr_gigabitHashes = m_gigabitHashes; ptr_gigabitScores = m_gigabitScores; size_gigabitHashes = ng * 4 ; // 4 bytes each component size_gigabitScores = ng * 4 ; // 4 bytes each score return m_gigabitQuery; } // . fill in "top" in order of score // . returns -1 and sets g_errno on error int32_t getTopGigabits ( HashTableX *ht , GigabitInfo **top , int32_t max , int32_t minDocCount ) { // store top 100 into this tree RdbTree tree; if ( ! tree.set ( 4 , // fixedDataSize max+2 , // maxNumNodes true , // balance? -1 , // maxMem true , // own data? "tree-topgbits" )) return -1; int32_t ns = ht->getNumSlots(); key_t minKey; bool minKeyValid = false; for ( int32_t i = 0 ; i < ns ; i++ ) { // skip if empty if ( ht->isEmpty(i) ) continue; // get his info GigabitInfo *gi = (GigabitInfo *)ht->getValueFromSlot(i); // must be valid if ( gi->m_count <= 0 ) { char *xx=NULL;*xx=0; } // must be in this many docs minimum if ( gi->m_numDocs < minDocCount ) continue; // make the key key_t key; key.n1 = gi->m_pts; key.n0 = gi->m_hash; // should we add it? if ( minKeyValid && key <= minKey ) continue; // we should add it. use points as the key. use PTR as data int32_t node = tree.addNode(0,key,(char *)&gi,4); // error? g_errno should be set if ( node < 0 ) return -1; // if not full continue if ( tree.getNumUsedNodes() < 100 ) continue; // get the smallest node int32_t tn = tree.getLowestNode ( ) ; // sanity check if ( tn < 0 ) { char *xx=NULL;*xx=0; } // kick out smallest tree.deleteNode ( tn , false ); // get new smallest tn = tree.getLowestNode(); // set the new minkey minKey = *(key_t *)tree.getKey ( tn ); // validate it minKeyValid = true; } int32_t count = 0; // . now set the array // . start with the highest scoring node first, the last node since // nodes are ranked by lowest to highest key for ( int32_t nn=tree.getLastNode() ; nn>=0 ; nn=tree.getPrevNode(nn) ){ // get the info GigabitInfo *gi = (GigabitInfo *)tree.getData(nn); // store it top[count++] = gi; // stop if we are full if ( count >= max ) break; } return count; } char *XmlDoc::getMetaDescription( int32_t *mdlen ) { if ( m_metaDescValid ) { *mdlen = m_metaDescLen; return m_metaDesc; } Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (char *)xml; //xml->getMetaContent ( m_metaDesc, 1024, "description", 11 ); // we need to point to it in the html source so our WordPosInfo // algo works right. 
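	// . (illustrative note, not in the original) getMetaContentPointer()
	//   hands back a pointer into the raw html buffer plus a length,
	//   instead of copying into a fixed-size buffer the way the
	//   commented-out getMetaContent() call above did. keeping the bytes
	//   at their original offsets is what lets the WordPosInfo positions
	//   computed later line up with the source document.
	// . rough caller sketch, matching how getGigabitQuery() above already
	//   uses this accessor:
	//     int32_t mdlen;
	//     char *md = getMetaDescription ( &mdlen );
	//     if ( md && mdlen > 0 ) { ... "md" points into the html, not a copy ... }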
m_metaDesc = xml->getMetaContentPointer("description", 11, "name", &m_metaDescLen); *mdlen = m_metaDescLen; m_metaDescValid = true; return m_metaDesc; } char *XmlDoc::getMetaSummary ( int32_t *mslen ) { if ( m_metaSummaryValid ) { *mslen = m_metaSummaryLen; return m_metaSummary; } Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (char *)xml; m_metaSummary = xml->getMetaContentPointer("summary", 7, "name", &m_metaSummaryLen); *mslen = m_metaSummaryLen; m_metaSummaryValid = true; return m_metaSummary; } char *XmlDoc::getMetaKeywords( int32_t *mklen ) { if ( m_metaKeywordsValid ) { *mklen = m_metaKeywordsLen; return m_metaKeywords; } Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (char *)xml; //xml->getMetaContent ( m_metaKeywords, 1024, "keywords", 8 ); // we need to point to it in the html source so our WordPosInfo // algo works right. m_metaKeywords=xml->getMetaContentPointer("keywords", 8, "name", &m_metaKeywordsLen); *mklen = m_metaKeywordsLen; m_metaKeywordsValid = true; return m_metaKeywords; } bool XmlDoc::addGigabits ( char *s , int64_t docId , uint8_t langId ) { Words tmp; // skip if none if ( ! s ) return true; // returns NULL with g_errno set on error if ( ! tmp.set9 ( s , m_niceness ) ) return false; // and weights! //Weights we; //if ( ! we.set ( &tmp , ) // and so does this return addGigabits ( &tmp , docId , NULL , langId ); } bool XmlDoc::addGigabits2 ( char *s , int32_t slen, int64_t docId , uint8_t langId ) { Words tmp; // skip if none if ( ! s ) return true; // returns NULL with g_errno set on error if ( ! tmp.setx ( s , slen , m_niceness ) ) return false; // and weights! //Weights we; //if ( ! we.set ( &tmp , ) // and so does this return addGigabits ( &tmp , docId , NULL , langId ); } bool XmlDoc::addGigabits(Words *ww,int64_t docId,Sections *sections, uint8_t langId ) { // skip sections marked as these: //int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE; // get this Section **sp = NULL; if ( sections ) sp = sections->m_sectionPtrs; // not if we don't have any identified sections if ( sections && sections->m_numSections <= 0 ) sp = NULL; // int16_tcuts int64_t *wids = ww->m_wordIds; char **wptrs = ww->m_words; int32_t *wlens = ww->m_wordLens; nodeid_t *tids = ww->m_tagIds; int32_t nw = ww->getNumWords(); //int32_t flags; // inital # of slots int32_t is = 0; if ( m_wordsValid ) is = ww->m_numAlnumWords; // put gigabits into this hash table HashTableX ht; if ( ! ht.set ( 8 , sizeof(GigabitInfo),is,NULL,0,false,m_niceness, "gigabits") ) return false; // scan through the words for ( int32_t i = 0 ; i < nw ; i++ ) { // breathe if being called by spider QUICKPOLL ( m_niceness ); // skip if not alnum word if ( ! wids[i] ) continue; // get section Section *sx = NULL; // get flags if ( sp ) sx = sp[i];//flags = sp[i]->m_flags; //else flags = 0; // skip if ignored. i.e. in the menu or not in the article text //if ( flags & badFlags ) continue; // are we capitalized? bool cap = ww->isCapitalized(i); // ignore lower case query stop words if (!cap&&isQueryStopWord(wptrs[i],wlens[i],wids[i]))continue; // hash of word then the phrase //uint32_t h = wids[i] & 0xffffffff; //uint64_t h = wids[i]; // add the word itself. return NULL with g_errno set on error if ( ! addGigabit (&ht,wptrs[i],wlens[i],docId, sx,true,langId,i)) return false; // save position int32_t j = i + 1 ; // check this far out int32_t maxj = i + 12; if ( maxj > nw ) maxj = nw; // do we got a cap phrase? 
bool capPhrase = false; // if capitalized look for sequence for ( ; cap && j < maxj ; j++ ) { // . stop on tags // . tids is NULL if being set from meta tag... if ( tids && tids[j] ) break; // skip if not alnum if ( ! wids[j] ) { // make sure it is like a single space or // something we can "phrase across" // TODO: can be like "capt. " if ( wlens[j] == 1 ) continue; // otherwise it stops the phrase break; } // if not capitalized stop if ( ! ww->isCapitalized(j) ) break; // got one! capPhrase = true; // . hash it into the ongoing hash // . Speller::getPopularity() should use this same // method so we can get popularities of the gigabits! //h = hash32Fast ( wids[j] & 0xffffffff , h ); //h = hash64Fast ( wids[j] , h ); } // if we added something... skip whole phrase, if any if ( capPhrase ) { // get length of it int32_t len = wptrs[j-1] + wlens[j-1] - wptrs[i]; // add that entire sequence, [i,j) if ( ! addGigabit ( &ht,wptrs[i],len,docId,sx, false,langId,i)) return false; // advance to end of phrase i = j - 1; continue; } // reset j = i + 1; // this must be true // . ok, look for a wiki phrase then! // . we can speed this up if too slow... using a crazy hash tbl int32_t wikij = -1; // init the hash for wiki lookup uint32_t h = 0; // loop over successive terms for ( ; j < maxj ; j++ ) { // . stop on tags // . tids is NULL if being set from meta tag if ( tids && tids[j] ) break; // skip if not alnum if ( ! wids[j] ) { // make sure it is like a single space or // something we can "phrase across" // TODO: can be like "capt. " if ( wlens[j] == 1 ) continue; // otherwise it stops the phrase break; } // init it if ( ! h ) h = hash32Fast ( wids[i] & 0xffffffff , 0 ); // hash it into the ongoing hash h = hash32Fast ( wids[j] & 0xffffffff , h ); // is this in the wiki? if ( ! g_wiki.isInWiki ( h ) ) continue; // it is, mark it wikij = j + 1; } // must be a 2+ word phrase in the wiki to be a gigabit if ( wikij == -1 ) continue; // bail if breach if ( wikij >= nw ) continue; // get len int32_t len = wptrs[wikij] + wlens[wikij] - wptrs[i]; // add what we got if ( ! addGigabit ( &ht,wptrs[i],len,docId,sx,false, langId,i) ) return false; // advance to end of phrase i = wikij - 1; } return true; } // . this is called by Msg40.cpp to intersect gigabits from multiple docs // . returns -1 and sets g_errno on error // . returns # of GigabitInfos stored into "top" /* int32_t intersectGigabits ( Msg20 **mp , // search results int32_t n , // # of em uint8_t langId , // searcher's langId int32_t maxTop , int32_t docsToScan , int32_t minDocCount , // must be in this # docs GigabitInfo *top , int32_t niceness ) { // put gigabits into this hash table HashTableX ht; ht.set ( 8 , sizeof(GigabitInfo),0,NULL,0,false,niceness,"ginttbl"); for ( int32_t i = 0 ; i < n && i < docsToScan ; i++ ) { // get the reply/searchResult Msg20Reply *mr = mp[i]->m_r; // sanity check if ( ! mr && ! mp[i]->m_errno ) { char *xx=NULL;*xx=0; } // this is NULL on error if ( ! mr ) continue; // count them int32_t count = 0; // add each gigabit for it for ( char *p = mr->ptr_gigabitQuery ; p && *p ; count++ ) { // skip the comma p++; // point to next char *end = strchr ( p , ',' ); // do not allow NULLs if ( ! end ) end = p + gbstrlen(p); // get the score. aka GigabitInfo::m_pts int32_t ptsArg = mr->ptr_gigabitScores[count]; // sanity check for bad scores if ( ptsArg <= 0 ) { char *xx=NULL;*xx=0; } // add it in if ( ! addGigabit ( &ht , p , end - p , // langth mr->m_docId , NULL ,// section ptr false , // singleWrd? 
unused langId , -1 , // word #i not used ptsArg ) ) return -1; // advance p p = end; // if not comma, all done if ( *p != ',' ) break; // skip comma p++; } } // . get up to the top 50 gigabits GigabitInfo *array [ 50 ]; int32_t numTop = getTopGigabits ( &ht , array , 50 , minDocCount ); // error? g_errno should be set if ( numTop == -1 ) return -1; // sanity check if ( numTop > maxTop ) { char *xx=NULL;*xx=0; } // now copy into our array for ( int32_t i = 0 ; i < numTop ; i++ ) { // get it GigabitInfo *gi = array[i]; // copy it gbmemcpy ( &top[i] , gi , sizeof(GigabitInfo) ); } // return how many we copied return numTop; } */ // . "docId" is the document Id that "h" came from // . if being called at query time we often get called on each search result! // . if being called at parse/index time we are being called on a single docId // . returns false and sets g_errno on error bool addGigabit ( HashTableX *ht , char *s , int32_t slen , int64_t docId , Section *sp , bool singleWord , uint8_t langId , // starts with word #i int32_t i , int32_t ptsArg ) { // get its hash uint64_t h = hash64d ( s , slen ); // get the slot where its at int32_t slot = ht->getSlot ( &h ); // info for this hash/gigabit in the doc GigabitInfo *gi ; // otherwise, init a new slot. set the key to h if ( slot < 0 ) { // . add key to a new slot, set "gi" to the value ptr // . use NULL for the GigabitInfo ptr temporarily so it should // not gbmemcpy into the slot if ( ! ht->addKey ( &h , NULL , &slot ) ) return false; // get data ptr to the bogus data gi = (GigabitInfo *)ht->getValueFromSlot ( slot ); // . set all the stuff now. this way avoids a gbmemcpy... // . every wiki title should have a popularity i guess... // . "pop" is # of docs out of 10,000 that have this phrase? int32_t pop = g_speller.getPhrasePopularity(s,h,true,langId); gi->m_pop = pop; gi->m_pts = 0; gi->m_count = 0; gi->m_numDocs = 0; gi->m_lastDocId = 0LL; gi->m_currentDocCount = 0; // a char gi->m_ptr = s; gi->m_len = slen; gi->m_hash = h; // sanity test GigabitInfo *tt = (GigabitInfo *)ht->getValue ( &h ); if ( tt->m_pop != pop ) { char *xx=NULL;*xx=0; } } else { gi = (GigabitInfo *)ht->getValueFromSlot ( slot ); // only allow up to 5 votes per document! if ( gi->m_currentDocCount >= 5 ) return true; } // inc the count, we got one more occurence gi->m_count++; // doc count. how many docs have this gigabit? count it. if ( docId != gi->m_lastDocId ) { gi->m_numDocs++; gi->m_lastDocId = docId; gi->m_currentDocCount = 1; } else gi->m_currentDocCount++; // given? if ( ptsArg != -1 ) { gi->m_pts += ptsArg; return true; } // base points on popularity float pts = 1.0; if ( gi->m_pop < 1 ) pts = 1000; else if ( gi->m_pop < 2 ) pts = 500; else if ( gi->m_pop < 3 ) pts = 250; else if ( gi->m_pop < 4 ) pts = 200; else if ( gi->m_pop < 5 ) pts = 150; else if ( gi->m_pop < 6 ) pts = 100; else if ( gi->m_pop < 7 ) pts = 20; else if ( gi->m_pop < 8 ) pts = 10; else if ( gi->m_pop < 10 ) pts = 5; else if ( gi->m_pop < 15 ) pts = 3; else if ( gi->m_pop < 20 ) pts = 2; // . special boost if in title, header or anchor tag // . the weights class ONLY boosts the first 20 or so words in // header tags... how can we fix that?????????????????? // . TODO: FIX THAT!!! 
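	// . worked example (added comment, not from the original author): a
	//   gigabit with popularity 3 gets a base of 200 pts from the ladder
	//   above; if its section has SEC_IN_TITLE set below, that becomes
	//   200 * 6.0 = 1200 pts, while a hit inside an anchor tag (TAG_A)
	//   would give 200 * 4.0 = 800 pts instead. a very rare phrase
	//   (pop < 1) starts at 1000 pts, so rare phrases in titles tend to
	//   dominate the m_pts totals that getTopGigabits() ranks by.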
//if ( flags & SEC_TITLE ) pts = pts * 6.0/(float)we->m_titleWeight; //if ( flags & SEC_HEADER) pts = pts * 4.0/(float)we->m_headerWeight; //if ( flags & SEC_A ) pts = pts * 4.0/(float)we->m_linkTextWeight; if ( sp ) { if ( sp->m_flags & SEC_IN_TITLE ) pts = pts * 6.0; if ( sp->m_flags & SEC_IN_HEADER ) pts = pts * 4.0; if ( sp->m_tagId == TAG_A ) pts = pts * 4.0; } // if for the query 'recreation' you get the phrase "park bench" // 100 times and the word "bench" 100 times. the word weight // for "bench" should be very low! Weights.cpp also demotes repreated // sentence fragments, etc. it is generally a really handy thing! // and i think it already boosts scores for being in the title, etc. // IF BEING called from meta tag, weights are NULL! // TODO: we need to use the diversity vector here then... //if ( we ) { // if ( singleWord ) pts *= we->m_ww[i]; // else pts *= we->m_pw[i]; //} // add them in gi->m_pts += (int32_t)pts; // good to go return true; } /* -- this will be a url filter var like "numindexed" int32_t *XmlDoc::getSiteSpiderQuota ( ) { if ( m_siteSpiderQuotaValid ) return &m_siteSpiderQuota; int32_t *siteNumInlinks = getSiteNumInlinks(); if ( ! siteNumInlinks ) return NULL; if ( siteNumInlinks == (int32_t *)-1 ) return (int32_t *)-1; // get this fresh each time int32_t *rn = getRegExpNum ( -1 ); if ( ! rn || rn == (int32_t *)-1 ) return (int32_t *)rn; // bail early? this happens if we match a banned/filtered rule in // the url filters table if ( m_indexCode ) return NULL; // valid at this point m_siteSpiderQuotaValid = true; // if no match, or filtered or banned, assume no quota if ( *rn == -1 ) m_siteSpiderQuota = -1; else m_siteSpiderQuota = cr->m_spiderQuotas[*rn]; // get the quota, -1 means no limit return &m_siteSpiderQuota; } */ Url *XmlDoc::getCurrentUrl ( ) { if ( m_currentUrlValid ) return &m_currentUrl; // otherwise, get first url Url *fu = getFirstUrl(); if ( ! fu || fu == (void *)-1 ) return (Url *)fu; // make that current url m_currentUrl.set ( &m_firstUrl , false ); m_currentUrlValid = true; return &m_currentUrl; /* // need a valid url Url *u = getFirstUrl(); if ( ! u ) return NULL; // but use redir if we got that Url *r = getRedirUrl(); if ( r && m_redirUrlValid ) return r; return u; */ } Url *XmlDoc::getFirstUrl() { if ( m_firstUrlValid ) return &m_firstUrl; // we might have a title rec if ( m_setFromTitleRec ) { setFirstUrl ( ptr_firstUrl , false ); m_firstUrlValid = true; return &m_firstUrl; } // must be this otherwise if ( ! m_setFromDocId ) { char *xx=NULL;*xx=0; } // this must be valid if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; } // get the old xml doc from the old title rec XmlDoc **pod = getOldXmlDoc ( ); if ( ! pod || pod == (void *)-1 ) return (Url *)pod; // int16_tcut XmlDoc *od = *pod; // now set it setFirstUrl ( od->ptr_firstUrl , false ); m_firstUrlValid = true; return &m_firstUrl; } int64_t XmlDoc::getFirstUrlHash48() { if ( m_firstUrlHash48Valid ) return m_firstUrlHash48; // this must work if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; } m_firstUrlHash48 = hash64b ( m_firstUrl.m_url ) & 0x0000ffffffffffffLL; m_firstUrlHash48Valid = true; return m_firstUrlHash48; } int64_t XmlDoc::getFirstUrlHash64() { if ( m_firstUrlHash64Valid ) return m_firstUrlHash64; // this must work if ( ! m_firstUrlValid ) { char *xx=NULL;*xx=0; } m_firstUrlHash64 = hash64b ( m_firstUrl.m_url ); m_firstUrlHash64Valid = true; return m_firstUrlHash64; } // . 
operates on the latest m_httpReply Url **XmlDoc::getRedirUrl() { if ( m_redirUrlValid ) return &m_redirUrlPtr; setStatus ( "getting redir url" ); // assume no redirect m_redirUrlPtr = NULL; //ptr_redirUrl = NULL; //size_redirUrl = 0; // bail on this //if ( ! m_checkForRedir ) { // m_redirError = 0; // m_redirErrorValid = true; // return &m_redirUrlPtr; //} // we might have a title rec if ( m_setFromTitleRec ) { char *xx=NULL;*xx=0; } // or recycling content from old title rec if ( m_recycleContent ) { m_redirError = 0; m_redirErrorValid = true; m_redirUrlValid = true; return &m_redirUrlPtr; } // get the current http reply, not the final http reply necessarily if ( ! m_httpReplyValid ) { char *xx=NULL;*xx=0; } // set a mime on the stack HttpMime mime; // int16_tcut int32_t LEN = m_httpReplySize - 1; // sanity check if ( LEN > 0 && ! m_httpReply ) { char *xx=NULL;*xx=0; } // empty reply, no redir if ( LEN == 0 ) { // bad mime, but i guess valid empty redir url m_redirUrlValid = true; // no error m_redirError = 0; m_redirErrorValid = true; // return a fake thing. content length is 0. return &m_redirUrlPtr; } // set it if ( LEN && ! mime.set ( m_httpReply, LEN, getCurrentUrl() ) ) { // set this on mime error //if ( ! m_indexCode ) m_indexCode = EBADMIME; // bad mime, but i guess valid empty redir url m_redirUrlValid = true; // return nothing, no redirect url was there m_redirUrlPtr = NULL; // no error m_redirError = 0; m_redirErrorValid = true; // return a fake thing. content length is 0. return &m_redirUrlPtr; } int32_t httpStatus = mime.getHttpStatus() ; Url *loc = NULL; // quickly see if we are a robots.txt url originally Url *fu = getFirstUrl(); bool isRobotsTxt = isRobotsTxtFile ( fu->getUrl() , fu->getUrlLen() ); // // check for // if httpStatus is not a redirect // if ( httpStatus < 300 || httpStatus > 399 ) { // ok, crap, i was getting the xml here to get the meta // http-equiv refresh tag, but that added an element of // recursion that is just too confusing to deal with. so // let's just parse out the meta tag by hand if ( ! isRobotsTxt ) { Url **mrup = getMetaRedirUrl(); if ( ! mrup || mrup == (void *)-1) return (Url **)mrup; // set it. might be NULL if not there. loc = *mrup; } } else // get Location: url (the redirect url) from the http mime loc = mime.getLocationUrl(); // get current url Url *cu = getCurrentUrl(); if ( ! cu || cu == (void *)-1 ) return (Url **)cu; // this call set size_catIds int32_t **pcids = getCatIds(); if ( ! pcids || pcids == (void *)-1) return (Url **)pcids; // get local link info LinkInfo *info1 = getLinkInfo1(); // error or blocked if ( ! info1 || info1 == (LinkInfo *)-1 ) return (Url **)info1; // get remote link info LinkInfo **pinfo2 = getLinkInfo2(); // error or blocked if ( ! pinfo2 || pinfo2 == (void *)-1 ) return (Url **)pinfo2; // convenience LinkInfo *info2 = *pinfo2; // breathe QUICKPOLL(m_niceness); // get cookie for redirect to fix nyt.com char *cookie = mime.getCookie(); // find end of cookie at the semicolon char *s = cookie; for ( ; s && *s && *s != ';' ; s++ ); if ( s && *s == ';' ) { // do not include ; int32_t clen = s - cookie; m_redirCookieBuf.reset(); m_redirCookieBuf.safeMemcpy ( cookie , clen ); m_redirCookieBuf.nullTerm(); m_redirCookieBufValid = true; } // mdw23 //log("http: reply=%s",m_httpReply); // a hack for removing session ids already in there. for // brilliantshopper's bs4 collection and gk0 cluster //bool forceRedirect = false; if ( size_catIds == 0 && // must not have an actual redirect url in there ! 
loc && // must be a valid http status httpStatus == 200 && (gb_strcasestr( cu->getUrl(), "sessionid") || gb_strcasestr( cu->getUrl(), "oscsid") ) ) { Url *tt = &m_redirUrl; tt->set ( cu->getUrl() , cu->getUrlLen() , true , // addwww? true ); // strip sessid? // if it no longer has the session id, force redirect it if ( ! gb_strcasestr( tt->getUrl(), "sessionid") && ! gb_strcasestr( tt->getUrl(), "oscsid") ) { m_redirUrlValid = true; m_redirUrlPtr = &m_redirUrl; // TODO: log redir url in spider log output //logf(LOG_INFO,"build: %s force redirected to %s", // cu->getUrl(),m_redirUrl.getUrl()); m_redirUrlValid = true; ptr_redirUrl = m_redirUrl.m_url; size_redirUrl = m_redirUrl.m_ulen+1; // no error m_redirError = 0; m_redirErrorValid = true; return &m_redirUrlPtr; } } // breathe QUICKPOLL(m_niceness); // if no location url, then no redirect a NULL redir url if ( ! loc || loc->m_url[0] == '\0' ) { // validate it m_redirUrlValid = true; // no error m_redirError = 0; m_redirErrorValid = true; // and return an empty one return &m_redirUrlPtr; } // breathe QUICKPOLL(m_niceness); // this is handy //Url tmp; // TODO: make sure we got this logic elsewhere // if robots.txt said no, and if we had no link text, then give up //if(! *isAllowed && !info1->hasLinkText() && !info2->hasLinkText() ) { // m_indexCode = EDOCDISALLOWED; // set our redir url from the mime's Location: field. addWWW=false //if ( loc != &tmp ) tmp.set ( loc , false ); bool keep = false; if ( size_catIds > 0 ) keep = true; if ( info1->hasLinkText() ) keep = true; if ( info2 && info2->hasLinkText() ) keep = true; // at this point we do not block anywhere m_redirUrlValid = true; // store the redir error m_redirError = 0; m_redirErrorValid = true; // i've seen a "Location: 2010..." bogus url as well, so make sure // we got a legit url if ( ! loc->getDomain() || loc->getDomainLen() <= 0 ) { if ( ! keep ) m_redirError = EDOCBADREDIRECTURL; return &m_redirUrlPtr; } //bool injected = false; // get from spider request if there //if ( m_sreqValid && m_sreq.m_isInjecting ) injected = true; // . if redirect url is nothing new, then bail (infinite loop) // . www.xbox.com/SiteRequirements.htm redirects to itself // until you send a cookie!! // . www.twomileborris.com does the cookie thing, too if ( strcmp ( cu->getUrl(), loc->getUrl() ) == 0 ) { if ( ! keep ) m_redirError = EDOCREDIRECTSTOSELF; return &m_redirUrlPtr; } CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // . don't allow redirects when injecting! // . otherwise, we would mfree(m_buf) which would free our // injected reply... yet m_injectedReplyLen would still be // positive! can you say 'seg fault'? // . hmmm... seems to have worked though if ( cr->m_recycleContent || m_recycleContent ) { // || injected if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS; return &m_redirUrlPtr; } // . if we followed too many then bail // . www.motorolamobility.com www.outlook.com ... failed when we // had >= 4 here if ( ++m_numRedirects >= 10 ) { if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS; return &m_redirUrlPtr; } // sometimes idiots don't supply us with a Location: mime if ( loc->getUrlLen() == 0 ) { if ( ! keep ) m_redirError = EDOCBADREDIRECTURL; return &m_redirUrlPtr; } // . protocol of url must be http or https // . 
we had one url redirect to an ihttp:// protocol and caused // spider to core dump when it saw that SpiderRequest record char *proto = loc->getScheme(); if ( strncmp(proto,"http://" ,7) && strncmp(proto,"https://",8) ) { m_redirError = EDOCBADREDIRECTURL; return &m_redirUrlPtr; } // do not allow redirects to evil-G or bing //if ( strstr(loc->getUrl(),".google.com/") || // strstr(loc->getUrl(),".bing.com/") ) { // m_redirError = EDOCEVILREDIRECT; // return &m_redirUrlPtr; //} // log a msg if ( g_conf.m_logSpideredUrls ) logf(LOG_INFO,"build: %s redirected to %s", cu->getUrl(),loc->getUrl()); // if not same Domain, it is not a simplified redirect bool sameDom = true; int32_t dlen = loc->getDomainLen(); if ( cu->getDomainLen() != dlen ) sameDom=false; else if ( strncmp(cu->getDomain(),loc->getDomain(),dlen))sameDom=false; if ( ! sameDom ) { m_redirectFlag = true; m_redirUrl.set ( loc , false ); // addWWW=false m_redirUrlPtr = &m_redirUrl; ptr_redirUrl = m_redirUrl.m_url; size_redirUrl = m_redirUrl.m_ulen+1; return &m_redirUrlPtr; } // if redirecting to the same domain, then do not add "www.". // this way we can take care of slashdot.org, etc. //bool addwww = false; // but never modify if in dmoz, keep it pure //if ( size_catIds > 0 ) addwww = false; // debug msg //if ( strcmp(m_redirUrl.getUrl(),url->getUrl())== 0 ) // log("Redirect error: same url"); //bool stripSessId = (size_catIds == 0); // . reset m_redirUrl now (do not addWWW for slashdot.org, etc) // . we now add "www." UNLESS it's a redirect from the same // domain or firstUrl is in catdb //tmp.set( loc->getUrl(),loc->getUrlLen(),addwww,stripSessId); /* // get this bool sameHostLinks = false; if ( *pi >= 0 ) sameHostLinks =cr->m_pq_spiderSameHostnameLinks[*pi]; // get first url ever Url *f = getFirstUrl(); // . for same host links, addwww for comparing // . so if we are doing google.com and it redirects to // www.google.com then we will allow that... and vice versa if ( sameHostLinks ) { Url u1; Url u2; u1.set ( loc->getUrl () , loc->getUrlLen(), true ); // addwww? u2.set ( f->getUrl() , f->getUrlLen () , true ); // addwww? // host must match if we are restricted to a particular host if ( u1.getHostLen() != u2.getHostLen() || strncmp ( u1.getHost() , u2.getHost() , u1.getHostLen () ) != 0 ) { m_redirError = EDOCBADREDIRECTURL; return &m_redirUrlPtr; } } */ // get first url ever Url *f = getFirstUrl(); // breathe QUICKPOLL(m_niceness); // set this to true if the redirected urls is much preferred bool simplifiedRedir = false; // . if it redirected to a simpler url then stop spidering now // and add the simpler url to the spider queue // . by simpler, i mean one w/ fewer path components // . or one with a www for hostname // . or could be same as firstUrl but with a / appended char *r = loc->getUrl(); char *u = f->getUrl(); int32_t rlen = loc->getUrlLen(); int32_t ulen = f->getUrlLen(); // simpler if new path depth is int16_ter if ( loc->getPathDepth (true) < f->getPathDepth (true) ) simplifiedRedir = true; // simpler if old has cgi and new does not if ( f->isCgi() && ! loc->isCgi() ) simplifiedRedir = true; // if we're a dmoz page, don't do this, unless just a / case,no if ( size_catIds > 0 ) simplifiedRedir = false; // simpler if new one is same as old but has a '/' at the end if ( rlen == ulen+1 && r[rlen-1]=='/' && strncmp(r,u,ulen)==0) simplifiedRedir = true; // . if new url does not have semicolon but old one does // . 
http://news.yahoo.com/i/738;_ylt=AoL4eFRYKEdXbfDh6W2cF // redirected to http://news.yahoo.com/i/738 if ( strchr (u,';') && ! strchr (r,';') ) simplifiedRedir = true; // simpler is new host is www and old is not if ( loc->isHostWWW() && ! f->isHostWWW() ) simplifiedRedir = true; // if redirect is to different domain, set simplified // this helps locks from bunching on one domain if ( loc->getDomainLen()!=f->getDomainLen() || strncasecmp ( loc->getDomain(), f->getDomain(), loc->getDomainLen() ) != 0 ) // crap, but www.hotmail.com redirects to live.msn.com // login page ... so add this check here if ( ! f->isRoot() ) simplifiedRedir = true; bool allowSimplifiedRedirs = m_allowSimplifiedRedirs; // follow redirects if injecting so we do not return // EDOCSIMPLIFIEDREDIR if ( getIsInjecting ( ) ) allowSimplifiedRedirs = true; // or if disabled then follow the redirect if ( ! cr->m_useSimplifiedRedirects ) allowSimplifiedRedirs = true; // . if the redir url is simpler, but has no hostname we // prepend a "www." to it // . this should avoids www.russ.ru and russ.ru from being // in the index at the same time and causing url: collisions /* if ( size_catIds == 0 && simplifiedRedir && loc->getDomainLen() == loc->getHostLen () ) loc->set (loc->getUrl(), loc->getUrlLen(), true, //false, addwww? stripSessId ); */ // if not allow, do not do them... except for the two below //if ( ! m_useSimplifiedRedirects || m_isDirColl ) // simplifiedRedir = false; // special hack for nytimes.com. do not consider simplified redirs // because it uses a cookie along with redirs to get to the final // page. char *dom2 = m_firstUrl.getDomain(); int32_t dlen2 = m_firstUrl.getDomainLen(); if ( dlen2 == 11 && strncmp(dom2,"nytimes.com",dlen2)==0 ) allowSimplifiedRedirs = true; // . don't bother indexing this url if the redir is better // . 301 means moved PERMANENTLY... // . many people use 301 on their root pages though, so treat // it like a temporary redirect, like exclusivelyequine.com if ( simplifiedRedir && ! allowSimplifiedRedirs && // for custom BULK clients don't like this i guess // AND for custom crawl it was messing up the processing // url format for a nytimes blog subsite which was redirecting // to the proper nytimes.com site... // ! cr->m_isCustomCrawl ) { // no, we need this for custom crawls because otherwise we // get too many dups in the index. so for nyt we need something // else cr->m_isCustomCrawl != 2 ) { // returns false if blocked, true otherwise //return addSimplifiedRedirect(); m_redirError = EDOCSIMPLIFIEDREDIR; // set this because getLinks() treats this redirUrl // as a link now, it will add a SpiderRequest for it: m_redirUrl.set ( loc , false ); // addWWW=false m_redirUrlPtr = &m_redirUrl; // mdw: let this path through so contactXmlDoc gets a proper // redirect that we can follow. for the base xml doc at // least the m_indexCode will be set return &m_redirUrlPtr; } // good to go m_redirectFlag = true; m_redirUrl.set ( loc , false ); // addWWW=false m_redirUrlPtr = &m_redirUrl; ptr_redirUrl = m_redirUrl.m_url; size_redirUrl = m_redirUrl.m_ulen+1; return &m_redirUrlPtr; } int32_t *XmlDoc::getFirstIndexedDate ( ) { if ( m_firstIndexedDateValid ) return (int32_t *)&m_firstIndexedDate; XmlDoc **od = getOldXmlDoc ( ); if ( ! od || od == (XmlDoc **)-1 ) return (int32_t *)od; // valid m_firstIndexedDateValid = true; // must be downloaded //if ( ! 
m_spideredTimeValid ) { char *xx=NULL;*xx=0; } // assume now is the first time m_firstIndexedDate = getSpideredTime();//m_spideredTime; // inherit from our old title rec if ( *od ) m_firstIndexedDate = (*od)->m_firstIndexedDate; // return it return (int32_t *)&m_firstIndexedDate; } int32_t *XmlDoc::getOutlinksAddedDate ( ) { if ( m_outlinksAddedDateValid ) return (int32_t *)&m_outlinksAddedDate; XmlDoc **od = getOldXmlDoc ( ); if ( ! od || od == (XmlDoc **)-1 ) return (int32_t *)od; // valid m_outlinksAddedDateValid = true; // must be downloaded //if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; } // assume we are doing it now m_outlinksAddedDate = getSpideredTime();//m_spideredTime; // get that if ( *od ) m_outlinksAddedDate = (*od)->m_outlinksAddedDate; // return it return (int32_t *)&m_outlinksAddedDate; } /* int32_t *XmlDoc::getNumBannedOutlinks ( ) { if ( m_numBannedOutlinksValid ) return &m_numBannedOutlinks; setStatus ( "getting num banned outlinks" ); // get the outlinks Links *links = getLinks(); if ( ! links || links == (Links *)-1 ) return (int32_t *)links; // count em int32_t n = links->getNumLinks(); // reset m_numBannedOutlinks = 0; // one vote per domain hash table char buf[20000]; HashTableX ht; ht.set ( 4 , 0 , -1 , buf , 20000 ,false,m_niceness); // loop through them for ( int32_t i = 0 ; i < n ; i++ ) { // breathe QUICKPOLL ( m_niceness ); // get the link char *u = links->getLinkPtr(i); // get domain of the link int32_t dlen; char *dom = getDomFast ( u , &dlen , false ); // skip if bad domain if ( ! dom || dlen <= 0 ) continue; // get domHash int32_t h = hash32 ( dom , dlen ); // one check per domain if ( ht.getSlot ( &h ) >= 0 ) continue; // add it, return NULL on error, g_errno should be set if ( ! ht.addKey ( &h ) ) return NULL; // . loop over all regular expression in the url filters table // . stop at first regular expression it matches int32_t *rn = getRegExpNum2 ( i ); // need to wait for a callback at this point if ( ! rn || rn == (int32_t *)-1 ) return (int32_t *)rn; // skip if no match in url filters table if ( *rn == -1 ) continue; // get spider priority int32_t pr = cr->m_spiderPriorities[*rn]; // skip if not banned if ( pr != -2 ) continue; // count it m_numBannedOutlinks++; } // all done m_numBannedOutlinksValid = true; // convert this too! //m_numBannedOutlinks8 = score32to8 ( m_numBannedOutlinks ); // sanity check on score32to8() //if(m_numBannedOutlinks8>0&&!m_numBannedOutlinks){char*xx=NULL;*xx=0;} return &m_numBannedOutlinks; } */ uint16_t *XmlDoc::getCountryId ( ) { if ( m_countryIdValid ) return &m_countryId; setStatus ( "getting country id" ); // get it CatRec *cat = getCatRec (); if ( ! cat || cat == (CatRec *)-1) return (uint16_t *)cat; // MDW: i limit this to 10 to save stack space! Url *u = getCurrentUrl(); if ( ! u || u == (void *)-1) return (uint16_t *)u; // use the url's tld to guess the country uint16_t country = g_langId.guessCountryTLD ( u->getUrl ( ) ); // . 0 means no country i guess. try dmoz next. // . limit to 10 of them int32_t nc = cat->m_numCatids; for ( int32_t i = 0; ! country && i < nc && i < 10 ; i++) { int32_t catid = cat->m_catids[i]; country = g_countryCode.getCountryFromDMOZ ( catid ); } m_countryIdValid = true; m_countryId = country; return &m_countryId; } /* XmlDoc *XmlDoc::getOldDoc ( ) { if ( m_oldDocValid ) return &m_oldDoc; // get current url Url *u = getCurrentUrl(); // set its url otherwise m_oldDoc.setFirstUrl ( u , false ); // get the old title rec char *ret = getOldTitleRec(); if ( ! 
ret || ret == (char *)-1 ) return (XmlDoc *)ret; // all done m_oldDocValid = true; // return it return m_oldDoc; } */ uint8_t *XmlDoc::getRootLangId ( ) { // return it if we got it if ( m_rootLangIdValid ) return &m_rootLangId; // note it setStatus ( "getting root lang id from tagdb"); // are we a root? char *isRoot = getIsSiteRoot(); if ( ! isRoot || isRoot == (char *)-1 ) return (uint8_t *)isRoot; // sanity check - should not be called on a root url if ( *isRoot ) { uint8_t *langId = getLangId(); if ( ! langId || langId == (uint8_t *)-1 ) return (uint8_t *) langId; m_rootLangId = *langId; m_rootLangIdValid = true; return &m_rootLangId; //char *xx=NULL;*xx=0; } } // get the tag rec TagRec *gr = getTagRec (); if ( ! gr || gr == (TagRec *)-1 ) return (uint8_t *)gr; // just use one. there may be multiple ones! Tag *tag = gr->getTag("rootlang"); // if there use that if ( ! tag ) { // . get the root doc // . allow for a one hour cache of the titleRec XmlDoc **prd = getRootXmlDoc( 3600 ); if ( ! prd || prd == (void *)-1 ) return (uint8_t *)prd; // int16_tcut XmlDoc *rd = *prd; // . if no root doc, then assume language unknown // . this happens if we are injecting because we do not want // to download the root page for speed purposes if ( ! rd ) { m_rootLangId = langUnknown; m_rootLangIdValid = true; return &m_rootLangId; } // . update tagdb rec // . on root download error use language "xx" (unknown) to // avoid hammering the root page //bool *status = rd->updateRootLangId (); //if (! status || status==(void *)-1) return (uint8_t *)status; // update our tag rec now //Tag *tt = rd->m_newTagRec.getTag("rootlang"); // must be there //if ( ! tt ) { char *xx=NULL;*xx=0; } // add it for us //if ( ! m_newTagRec.addTag ( tt ) ) return NULL; // get it uint8_t *rl = rd->getLangId(); if ( ! rl || rl == (void *)-1 ) return (uint8_t *)rl; // must be legit now! if ( ! rd->m_langIdValid ) { char *xx=NULL;*xx=0;} // now validate our stuff m_rootLangIdValid = true; //m_rootLangIdScore = rd->m_langIdScore; m_rootLangId = rd->m_langId; return &m_rootLangId; } // sanity check ( must be like "en,50\0" or could be // "en_US,50\0" or "zh_cn,50" if ( tag->getTagDataSize() > 6 ) { char *xx=NULL;*xx=0; } // point to 2 character language abbreviation char *abbr = tag->getTagData(); /* // find comma char *comma = strchr(abbr,',' ); // sanity check if ( ! comma ) { char *xx=NULL;*xx=0; } // tmp NULL *comma = '\0'; */ // map it to an id uint8_t langId = getLangIdFromAbbr( abbr ); /* // put it back *comma = ','; // get score int32_t score = atol(comma+1); // sanity check if ( score < 0 || score > 100 ) { char *xx=NULL;*xx=0; } */ // set that up m_rootLangId = langId; //m_rootLangIdScore = score; m_rootLangIdValid = true; return &m_rootLangId; } XmlDoc **XmlDoc::getOldXmlDoc ( ) { if ( m_oldDocValid ) return &m_oldDoc; // note it setStatus ( "getting old xml doc"); // if we are set from a title rec, we are the old doc if ( m_setFromTitleRec ) { m_oldDocValid = true; m_oldDoc = NULL;//this; return &m_oldDoc; } // . cache age is 0... super fresh // . returns NULL w/ g_errno if not found unless isIndexed is false // and valid, and it is not valid for pagereindexes. char **otr = getOldTitleRec ( ); if ( ! otr || otr == (char **)-1 ) return (XmlDoc **)otr; // if no title rec, return ptr to a null m_oldDoc = NULL; if ( ! *otr ) { m_oldDocValid = true; return &m_oldDoc; } CollectionRec *cr = getCollRec(); if ( ! 
cr ) return NULL; // if provided title rec matches our docid but not uh48 then there // was a docid collision and we should null out our title rec // and return with an error and no index this puppy! // crap, we can't call getFirstUrl() because it might not be // valid if we are a docid based doc and THIS function was called // from getFirstUrl() -- we end up in a recursive loop. if ( ! m_setFromDocId ) { int64_t uh48 = getFirstUrl()->getUrlHash48(); int64_t tuh48 = g_titledb.getUrlHash48 ( (key_t *)*otr ); if ( uh48 != tuh48 ) { log("xmldoc: docid collision uh48 mismatch. cannot " "index " "%s",getFirstUrl()->getUrl() ); g_errno = EDOCIDCOLLISION; return NULL; } } // . if *otr is NULL that means not found // . return a NULL old XmlDoc in that case as well? // . make a new one // . this will uncompress it and set ourselves! try { m_oldDoc = new ( XmlDoc ); } catch ( ... ) { g_errno = ENOMEM; return NULL; } mnew ( m_oldDoc , sizeof(XmlDoc),"xmldoc1"); // if title rec is corrupted data uncompress will fail and this // will return false! if ( ! m_oldDoc->set2 ( m_oldTitleRec , m_oldTitleRecSize , // maxSize cr->m_coll , NULL , // pbuf m_niceness ) ) { log("build: failed to set old doc for %s",m_firstUrl.m_url); if ( ! g_errno ) { char *xx=NULL;*xx=0; } return NULL; } m_oldDocValid = true; // share our masterloop and state! m_oldDoc->m_masterLoop = m_masterLoop; m_oldDoc->m_masterState = m_masterState; return &m_oldDoc; } void XmlDoc::nukeDoc ( XmlDoc *nd ) { // skip if empty if ( ! nd ) return; // do not nuke yerself! if ( nd == this ) return; // or root doc! //if ( nd == m_rootDoc ) return; // nuke it mdelete ( nd , sizeof(XmlDoc) , "xdnuke"); delete ( nd ); // invalidate if ( nd == m_extraDoc ) { m_extraDocValid = false; m_extraDoc = NULL; } if ( nd == m_rootDoc ) { m_rootDocValid = false; m_rootDoc = NULL; } if ( nd == m_oldDoc ) { m_oldDocValid = false; m_oldDoc = NULL; } if ( nd == m_ahrefsDoc ) { m_ahrefsDocValid = false; m_ahrefsDoc = NULL; } } bool XmlDoc::isRobotsTxtFile ( char *u , int32_t ulen ) { if ( ulen > 12 && ! strncmp ( u + ulen - 11 , "/robots.txt" , 11 ) ) return true; return false; } static LinkInfo s_dummy; XmlDoc **XmlDoc::getExtraDoc ( char *u , int32_t maxCacheAge ) { if ( m_extraDocValid ) return &m_extraDoc; // note that setStatus ( "getting new doc" ); // we need a valid first ip first! //int32_t *pfip = getFirstIp(); //if ( ! pfip || pfip == (void *)-1 ) return (XmlDoc **)pfip; // must be NULL if ( m_extraDoc ) { char *xx=NULL;*xx=0; } // sanity check if ( ! u || ! u[0] ) { char *xx=NULL;*xx=0; }//return &m_extraDoc; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // . if *otr is NULL that means not found // . return a NULL old XmlDoc in that case as well? // . make a new one // . this will uncompress it and set ourselves! try { m_extraDoc = new ( XmlDoc ); } catch ( ... ) { g_errno = ENOMEM; return NULL; } mnew ( m_extraDoc , sizeof(XmlDoc),"xmldoc2"); // . if we did not have it in titledb then download it! // . or if titleRec was too old! // a spider rec for the extra doc to use SpiderRequest sreq; // clear it sreq.reset(); // spider the url "u" strcpy ( sreq.m_url , u ); // inherit page parser sreq.m_isPageParser = getIsPageParser(); // set the data size right sreq.setDataSize(); // . prepare to download it, set it up // . returns false and sets g_errno on error if ( ! m_extraDoc->set4 ( &sreq , NULL , // doledbkey ptr cr->m_coll , NULL , // SafeBuf m_niceness )) return NULL; // share our masterloop and state! 
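	// (descriptive note: the child doc re-uses the parent's m_masterLoop
	// and m_masterState so that when one of its blocking calls completes,
	// the callback re-enters the parent's state machine directly and the
	// parent re-calls whichever getter returned -1 ("blocked");
	// m_rootDoc and m_oldDoc are wired up the same way below)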
	m_extraDoc->m_masterLoop = m_masterLoop;
	m_extraDoc->m_masterState = m_masterState;
	// carry this forward always!
	m_extraDoc->m_isSpiderProxy = m_isSpiderProxy;
	// disable spam check because that is not necessary for this doc!
	m_extraDoc->m_spamCheckDisabled = true;
	// tell msg13 to get this from its robots.txt cache if it can. it also
	// keeps a separate html page cache for the root pages, etc. in case
	m_extraDoc->m_maxCacheAge = maxCacheAge;
	// a dummy thing
	s_dummy.m_numStoredInlinks = 0;
	s_dummy.m_numGoodInlinks = 0;
	// we indirectly call m_extraDoc->getHttpReply() which calls
	// m_extraDoc->getRedirectUrl(), which checks the linkInfo and
	// dmoz catids of the original url to see if we should set m_indexCode
	// to something bad or not. to avoid these unnecessary lookups we
	// set these to NULL and validate them
	m_extraDoc->ptr_catIds = NULL;
	m_extraDoc->size_catIds = 0;
	m_extraDoc->m_catIdsValid = true;
	m_extraDoc->ptr_linkInfo1 = &s_dummy;
	m_extraDoc->size_linkInfo1 = 0;
	m_extraDoc->m_linkInfo1Valid = true;
	m_extraDoc->ptr_linkInfo2 = &s_dummy;
	m_extraDoc->size_linkInfo2 = 0;
	m_extraDoc->m_linkInfo2Valid = true;
	m_extraDoc->m_urlFilterNumValid = true;
	m_extraDoc->m_urlFilterNum = 0;
	// for redirects
	m_extraDoc->m_allowSimplifiedRedirs = true;
	// always forward the http download request so that Msg13.cpp's
	// handleRequest13() can avoid this same page
	// from being downloaded at the same time. also, if we are robots.txt
	// this allows us to use the same cache since we select the host we
	// forward to based on ip address.
	m_extraDoc->m_forwardDownloadRequest = true;
	// set this flag so msg13.cpp doesn't print the "hammering ip" msg
	m_extraDoc->m_isChildDoc = true;
	// debug it
	//g_doc = this;
	// and inherit test dir so getTestDir() doesn't core on us
	bool isPageParser = getIsPageParser();
	m_extraDoc->m_isPageParser = isPageParser;
	m_extraDoc->m_isPageParserValid = true;
	// without this we send all the msg13 requests to host #3! because
	// Msg13 uses it to determine what host to handle it
	if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; }
	m_extraDoc->m_firstIp = m_firstIp;
	m_extraDoc->m_firstIpValid = true;
	// i guess we are valid now
	m_extraDocValid = true;
	return &m_extraDoc;
}

bool XmlDoc::getIsPageParser ( ) {
	if ( m_isPageParserValid ) return m_isPageParser;
	// assume not
	m_isPageParser = false;
	// and set otherwise
	if ( m_sreqValid && m_sreq.m_isPageParser ) m_isPageParser = true;
	// and validate
	m_isPageParserValid = true;
	return m_isPageParser;
}

XmlDoc **XmlDoc::getRootXmlDoc ( int32_t maxCacheAge ) {
	if ( m_rootDocValid ) return &m_rootDoc;
	// help avoid mem leaks
	if ( m_rootDoc ) { char *xx=NULL;*xx=0; }
	// note it
	setStatus ( "getting root doc");
	// are we a root?
	char *isRoot = getIsSiteRoot();
	if ( ! isRoot || isRoot == (char *)-1 ) return (XmlDoc **)isRoot;
	// if we are root use us!!!!!
	if ( *isRoot ) {
		m_rootDoc = this;
		m_rootDocValid = true;
		return &m_rootDoc;
	}
	// get our site root
	char *mysite = getSite();
	if ( ! mysite || mysite == (void *)-1 ) return (XmlDoc **)mysite;
	// otherwise, we gotta get it!
	char **rtr = getRootTitleRec ( );
	if ( ! rtr || rtr == (char **)-1 ) return (XmlDoc **)rtr;
	// if no title rec, return ptr to a null
	//m_rootDoc = NULL;
	//if ( ! *rtr ) {
	//	// damn, not in titledb, i guess download it then
	//	m_rootDocValid = true; return &m_rootDoc; }
	// note it
	setStatus ( "getting root doc");
	// to keep injections fast, do not download the root page!
	if ( !
*rtr && m_contentInjected ) { // assume none m_rootDoc = NULL; m_rootDocValid = true; return &m_rootDoc; } // likewise, if doing a rebuild if ( ! *rtr && m_useSecondaryRdbs ) { // assume none m_rootDoc = NULL; m_rootDocValid = true; return &m_rootDoc; } // or recycling content like for query reindex. keep it fast. if ( ! *rtr && m_recycleContent ) { m_rootDoc = NULL; m_rootDocValid = true; return &m_rootDoc; } CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // . if *otr is NULL that means not found // . return a NULL root XmlDoc in that case as well? // . make a new one // . this will uncompress it and set ourselves! try { m_rootDoc = new ( XmlDoc ); } catch ( ... ) { g_errno = ENOMEM; return NULL; } mnew ( m_rootDoc , sizeof(XmlDoc),"xmldoc3"); // if we had the title rec, set from that if ( *rtr ) { m_rootDoc->set2 ( m_rootTitleRec , m_rootTitleRecSize , // maxSize , cr->m_coll , NULL , // pbuf m_niceness ); } // . otherwise, set the url and download it on demand // . this junk copied from the contactDoc->* stuff below else { // a spider rec for the contact doc SpiderRequest sreq; // clear it sreq.reset(); // spider the url "u" strcpy ( sreq.m_url , mysite ); // set this if ( m_sreqValid ) { // this will avoid it adding to tagdb! sreq.m_isPageParser = m_sreq.m_isPageParser; } // reset the data size sreq.setDataSize (); // . prepare to download it, set it up // . returns false and sets g_errno on error if ( ! m_rootDoc->set4 ( &sreq , NULL , // doledbkey ptr cr->m_coll , NULL , // SafeBuf m_niceness )) { mdelete ( m_rootDoc , sizeof(XmlDoc) , "xdnuke"); delete ( m_rootDoc ); m_rootDoc = NULL; return NULL; } // do not throttle it! //m_rootDoc->m_throttleDownload = false; // . do not do robots check for it // . no we must to avoid triggering a bot trap & getting banned //m_rootDoc->m_isAllowed = m_isAllowed; //m_rootDoc->m_isAllowedValid = true; } // share our masterloop and state! m_rootDoc->m_masterLoop = m_masterLoop; m_rootDoc->m_masterState = m_masterState; // msg13 caches the pages it downloads m_rootDoc->m_maxCacheAge = maxCacheAge; // like m_contactDoc we avoid unnecessary lookups in call to // getRedirUrl() by validating these empty members m_rootDoc->ptr_catIds = NULL; m_rootDoc->size_catIds = 0; m_rootDoc->m_catIdsValid = true; m_rootDoc->ptr_linkInfo1 = &s_dummy; m_rootDoc->size_linkInfo1 = 0; m_rootDoc->m_linkInfo1Valid = true; m_rootDoc->ptr_linkInfo2 = &s_dummy; m_rootDoc->size_linkInfo2 = 0; m_rootDoc->m_linkInfo2Valid = true; m_rootDoc->m_urlFilterNumValid = true; m_rootDoc->m_urlFilterNum = 0; // for redirects m_rootDoc->m_allowSimplifiedRedirs = true; // always forward the http download request so that Msg13.cpp's // handleRequest13() can avoid the same root page or contact page // from being downloaded at the same time. also, if we are robots.txt // this allows us to use the same cache since we select the host we // forward to based on ip address. m_rootDoc->m_forwardDownloadRequest = true; // set this flag so msg13.cpp doesn't print the "hammering ip" msg m_rootDoc->m_isChildDoc = true; // validate it m_rootDocValid = true; return &m_rootDoc; } /* // no longer access Revdb to get the old metalist, now re-compute RdbList *XmlDoc::getOldMetaList ( ) { // if valid return that if ( m_oldMetaListValid ) return &m_oldMetaList; // update status msg setStatus ( "getting old meta list"); // load the old title rec XmlDoc **odp = getOldXmlDoc( ); if ( ! odp || odp == (XmlDoc **)-1 ) return (RdbList *)odp; XmlDoc *od = *odp; // empty old doc? if ( ! 
od ) { m_oldMetaList.reset(); m_oldMetaListValid = true; return &m_oldMetaList; } // and use that. it has m_setFromTitleRec set to true. char *old = od->getMetaList(); if ( ! old || old == (void *)-1 ) return (RdbList *)old; // set it m_oldMetaList.m_list = od->m_metaList; // old; m_oldMetaList.m_listSize = od->m_metaListSize; m_oldMetaList.m_ownData = false; // assign it m_oldMetaListValid = true; return &m_oldMetaList; } */ // . look up TitleRec using Msg22 if we need to // . set our m_titleRec member from titledb // . the twin brother of XmlDoc::getTitleRecBuf() which makes the title rec // from scratch. this loads it from titledb. // . NULL is a valid value (EDOCNOTFOUND) so return a char ** char **XmlDoc::getOldTitleRec ( ) { // clear if we blocked //if ( g_errno == ENOTFOUND ) g_errno = 0; // if valid return that if ( m_oldTitleRecValid ) return &m_oldTitleRec; // update status msg setStatus ( "getting old title rec"); // if we are set from a title rec, we are the old doc if ( m_setFromTitleRec ) { m_oldTitleRecValid = true; m_oldTitleRec = NULL;//m_titleRec; return &m_oldTitleRec; } // sanity check if ( m_oldTitleRecValid && m_msg22a.m_outstanding ) { char *xx=NULL;*xx=0; } // point to url //char *u = getCurrentUrl()->getUrl(); //char *u = getFirstUrl()->getUrl(); // assume its valid m_oldTitleRecValid = true; // add it to the cache? bool addToCache = false; //if ( maxCacheAge > 0 ) addToCache = true; // not if new! no we need to do this so XmlDoc::getDocId() works! // this logic prevents us from setting g_errno to ENOTFOUND // when m_msg22a below calls indexDocWrapper(). however, for // doing a query delete on a not found docid will succumb to // the g_errno because m_isIndexed is not valid i think... if ( m_isIndexedValid && ! m_isIndexed && m_docIdValid ) { m_oldTitleRec = NULL; m_oldTitleRecValid = true; return &m_oldTitleRec; } // sanity check. if we have no url or docid ... if ( ! m_firstUrlValid && ! m_docIdValid ) { char *xx=NULL;*xx=0; } // use docid if first url not valid int64_t docId = 0; if ( ! m_firstUrlValid ) docId = m_docId; // if url not valid, use NULL char *u = NULL; if ( docId == 0LL && ptr_firstUrl ) u = getFirstUrl()->getUrl(); // if both are not given that is a problem if ( docId == 0LL && ! u ) { log("doc: no url or docid provided to get old doc"); g_errno = EBADENGINEER; return NULL; } CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // the title must be local since we're spidering it if ( ! m_msg22a.getTitleRec ( &m_msg22Request , u , docId , // probable docid cr->m_coll , // . msg22 will set this to point to it! // . if NULL that means NOT FOUND &m_oldTitleRec , &m_oldTitleRecSize , false , // just chk tfndb? false , // getAvailDocIdOnly m_masterState , m_masterLoop , m_niceness , // niceness addToCache , // add to cache? 0 , // max cache age 999999 , // timeout seconds false ))// load balancing? // return -1 if we blocked return (char **)-1; // not really an error if ( g_errno == ENOTFOUND ) g_errno = 0; // error? if ( g_errno ) return NULL; // got it return &m_oldTitleRec; } // . look up TitleRec using Msg22 if we need to // . set our m_titleRec member from titledb // . the twin brother of XmlDoc::getTitleRecBuf() which makes the title rec // from scratch. this loads it from titledb. // . NULL is a valid value (EDOCNOTFOUND) so return a char ** char **XmlDoc::getRootTitleRec ( ) { // if valid return that if ( m_rootTitleRecValid ) return &m_rootTitleRec; // are we a root? char *isRoot = getIsSiteRoot(); if ( ! 
isRoot || isRoot == (char *)-1 ) return (char **)isRoot; // if we are root use us!!!!! well, the old us... if ( *isRoot ) { char **otr = getOldTitleRec ( ); if ( ! otr || otr == (char **)-1 ) return (char **)otr; m_rootTitleRec = m_oldTitleRec; m_rootTitleRecSize = m_oldTitleRecSize; return &m_rootTitleRec; } // get our site root char *mysite = getSite(); if ( ! mysite || mysite == (char *)-1 ) return (char **)mysite; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // make it a url. keep it on stack since msg22 copies it into its // url request buffer anyway! (m_msg22Request.m_url[]) Url site; site.set ( mysite ); // assume its valid m_rootTitleRecValid = true; // add it to the cache? bool addToCache = false; //if ( maxCacheAge > 0 ) addToCache = true; // update status msg setStatus ( "getting root title rec"); // the title must be local since we're spidering it if ( ! m_msg22b.getTitleRec ( &m_msg22Request , site.getUrl() , 0 , // probable docid cr->m_coll , // . msg22 will set this to point to it! // . if NULL that means NOT FOUND &m_rootTitleRec , &m_rootTitleRecSize , false , // just chk tfndb? false , // getAvailDocIdOnly m_masterState , m_masterLoop , m_niceness , // niceness addToCache , // add to cache? 0 , // max cache age 999999 , // timeout seconds false ))// load balancing? // return -1 if we blocked return (char **)-1; // not really an error if ( g_errno == ENOTFOUND ) g_errno = 0; // error? if ( g_errno ) return NULL; // got it return &m_rootTitleRec; } /* // . look up TitleRec using Msg22 if we need to // . set our m_titleRec member from titledb // . the twin brother of XmlDoc::getTitleRecBuf() which makes the title rec // from scratch. this loads it from titledb. // . NULL is a valid value (EDOCNOTFOUND) so return a char ** char **XmlDoc::getContactTitleRec ( char *u ) { // clear if we blocked //if ( g_errno == ENOTFOUND ) g_errno = 0; // if valid return that if ( m_contactTitleRecValid ) return &m_contactTitleRec; // fake static char *s_fake = NULL; // if no url, we got no contact title rec in titledb then! if ( ! u || u[0] == '\0' ) return &s_fake; // update status msg setStatus ( "getting contact title rec"); // assume its valid m_contactTitleRecValid = true; // add it to the cache? bool addToCache = false; //if ( maxCacheAge > 0 ) addToCache = true; // the title must be local since we're spidering it if ( ! m_msg22c.getTitleRec ( &m_msg22Request , u , 0 , // probable docid m_coll , // . msg22 will set this to point to it! // . if NULL that means NOT FOUND &m_contactTitleRec , &m_contactTitleRecSize , false , // just chk tfndb? m_masterState , m_masterLoop , m_niceness , // niceness addToCache , // add to cache? 0 , // max cache age 999999 , // timeout seconds false ))// load balancing? // return -1 if we blocked return (char **)-1; // not really an error if ( g_errno == ENOTFOUND ) g_errno = 0; // error? if ( g_errno ) return NULL; // got it return &m_contactTitleRec; } */ // used for indexing spider replies. we need a unique docid because it // is treated as a different document even though its url will be the same. // and there is never an "older" version of it because each reply is treated // as a brand new document. int64_t *XmlDoc::getAvailDocIdOnly ( int64_t preferredDocId ) { if ( m_availDocIdValid && g_errno ) { log("xmldoc: error getting availdocid: %s", mstrerror(g_errno)); return NULL; } if ( m_availDocIdValid ) // this is 0 or -1 if no avail docid was found return &m_msg22c.m_availDocId; CollectionRec *cr = getCollRec(); if ( ! 
cr ) return NULL;
	// pre-validate it
	m_availDocIdValid = true;
	if ( ! m_msg22c.getAvailDocIdOnly ( &m_msg22Requestc ,
					    preferredDocId ,
					    cr->m_coll ,
					    m_masterState ,
					    m_masterLoop ,
					    m_niceness ) )
		return (int64_t *)-1;
	// error?
	log("xmldoc: error getting availdocid2: %s",mstrerror(g_errno));
	return NULL;
}

int64_t *XmlDoc::getDocId ( ) {
	if ( m_docIdValid ) return &m_docId;
	setStatus ("getting docid");
	XmlDoc **od = getOldXmlDoc( );
	if ( ! od || od == (XmlDoc **)-1 ) return (int64_t *)od;
	setStatus ("getting docid");
	// . set our docid
	// . *od is NULL if no title rec found with that docid in titledb
	if ( *od ) m_docId = *(*od)->getDocId();
	else       m_docId = m_msg22a.getAvailDocId();
	// if docid is zero, none is available!!!
	//if ( m_docId == 0LL ) m_indexCode = ENODOCID;
	m_docIdValid = true;
	return &m_docId;
}

// . is our docid on disk? i.e. do we exist in the index already?
// . TODO: just check tfndb?
char *XmlDoc::getIsIndexed ( ) {
	if ( m_isIndexedValid ) return &m_isIndexed;
	setStatus ( "getting is indexed" );
	// we must be old if this is true
	//if ( m_setFromTitleRec ) {
	//	m_isNew = false;
	//	m_isNewValid = true;
	//	return &m_isNew;
	//}
	// get the url
	//char *u = getFirstUrl()->getUrl();
	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;
	// sanity check. if we have no url or docid ...
	if ( ! m_firstUrlValid && ! m_docIdValid ) { char *xx=NULL;*xx=0; }
	// use docid if first url not valid
	int64_t docId = 0;
	char *url = NULL;
	// use docid if its valid, otherwise use url
	if ( m_docIdValid ) docId = m_docId;
	else                url = ptr_firstUrl;
	// note it
	if ( ! m_calledMsg22e )
		setStatus ( "checking titledb for old title rec");
	else
		setStatus ( "back from msg22e call");
	// . consult the title rec tree!
	// . "justCheckTfndb" is set to true here!
	if ( ! m_calledMsg22e &&
	     ! m_msg22e.getTitleRec ( &m_msg22Request ,
				      url ,
				      docId , // probable docid
				      cr->m_coll ,
				      // . msg22 will set this to point to it!
				      // . if NULL that means NOT FOUND
				      NULL , // tr ptr
				      NULL , // tr size ptr
				      true , // just chk tfndb?
				      false, // getavaildocidonly
				      m_masterState ,
				      m_masterLoop ,
				      m_niceness , // niceness
				      false , // add to cache?
				      0 , // max cache age
				      999999 , // timeout seconds
				      false )){//load balancing?
		// validate
		m_calledMsg22e = true;
		// return -1 if we blocked
		return (char *)-1;
	}
	// got it
	m_calledMsg22e = true;
	// error?
	if ( g_errno ) return NULL;
	// get it
	if ( m_msg22e.m_found ) m_isIndexed = true;
	else                    m_isIndexed = false;
	// validate
	m_isIndexedValid = true;
	return &m_isIndexed;
}

void gotTagRecWrapper ( void *state ) {
	XmlDoc *THIS = (XmlDoc *)state;
	// note it
	THIS->setStatus ( "in got tag rec wrapper" );
	// set these
	if ( ! g_errno ) {
		THIS->m_tagRec.serialize ( THIS->m_tagRecBuf );
		THIS->ptr_tagRecData = THIS->m_tagRecBuf.getBufStart();
		THIS->size_tagRecData = THIS->m_tagRecBuf.length();
		// validate
		THIS->m_tagRecValid = true;
	}
	// continue
	THIS->m_masterLoop ( THIS->m_masterState );
}

// if tagrec changed enough so that it would affect what we would index
// since last time we indexed this doc, we need to know that!
/*
int32_t *XmlDoc::getTagHash32 ( ) {
	// make it valid
	if ( m_tagHash32Valid ) return &m_tagHash32;
	// compute it
	TagRec *gr = getTagRec ();
	if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr;
	// init it
	m_tagHash32 = 0;
	// hash the values of all tags
	for ( Tag *tag = gr->getFirstTag(); tag ; tag = gr->getNextTag(tag) ) {
		// breathe
		QUICKPOLL(m_niceness);
		// get data
		uint32_t h = hash32(tag->getTagData(),tag->getTagDataSize(),0);
		// skip if 0
		if ( !
h ) continue; // xor it up m_tagHash32 = hash32h ( h , m_tagHash32 ); } // validate m_tagHash32Valid = true; return &m_tagHash32; } */ // . returns NULL and sets g_errno on error // . returns -1 if blocked, will re-call m_callback TagRec *XmlDoc::getTagRec ( ) { // if we got it give it if ( m_tagRecValid ) return &m_tagRec; // do we got a title rec? if ( m_setFromTitleRec && m_version >= 118 && // lookup up fresh from tagdb when doing a rebuild so we get // the latest sitenuminlinks! nah, we set m_tagRecValid and // m_tagRecDataValid to false in Repair.cpp iff rebuilding // titledb!! otherwise, we have to use what is in titlerec // to avoid parsing inconsistencies that would result in // undeletable posdb data. //! m_useSecondaryRdbs && // lookup the tagdb rec fresh if setting for a summary. that way // we can see if it is banned or not m_tagRecDataValid ) { // all done m_tagRecValid = true; // assume null if old version //if ( m_version <= 115 ) return &m_tagRec; // just return empty otherwise m_tagRec.setFromBuf ( ptr_tagRecData , size_tagRecData ); return &m_tagRec; } CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // get our site, usually the hostname, but can be like // "www.last.fm/user/breendaxx/" // we can't call this because it CALLS getTagRec()!!! //char *mysite = getSite(); //if ( ! mysite || mysite == (char *)-1 ) return (TagRec *)mysite; // update status msg setStatus ( "getting tagdb record" ); // get the final redirected url //Url *u = getCurrentUrl(); // nah, try this Url *u = getFirstUrl(); // if we are docid based url this might block! //if ( ! u || u == (void *)-1 ) return (TagRec *)u; // good to go //m_oldTagRecValid = true; // get it, user our collection for lookups, not m_tagdbColl[] yet! if ( ! m_msg8a.getTagRec ( u , // we have to guess the site because // we can't hit tagdb to get it at this // point!!! NULL, // guess it! // mysite , cr->m_collnum , false, // skip domain lookup? // true m_niceness , this , gotTagRecWrapper , &m_tagRec ) ) // we blocked, return -1 return (TagRec *)-1; // error? ENOCOLLREC? if ( g_errno ) return NULL; // assign it m_tagRec.serialize ( m_tagRecBuf ); ptr_tagRecData = m_tagRecBuf.getBufStart(); size_tagRecData = m_tagRecBuf.length(); // validate m_tagRecValid = true; // our tag rec should be all valid now return &m_tagRec; } // this is only for purposes of setting the site's TagRec char *XmlDoc::getHasContactInfo ( ) { if ( m_hasContactInfoValid ) return &m_hasContactInfo2; setStatus ( "getting has contact info" ); // get it from the tag rec if we can TagRec *gr = getTagRec (); if ( ! gr || gr == (TagRec *)-1 ) return (char *)gr; char *ic = getIsThisDocContacty ( ); if ( ! ic || ic == (void *)-1 ) return (char *)ic; // the current top ip address //int32_t *ip = getIp(); //if ( ! ip || ip == (int32_t *)-1) return (char *)ip; //int32_t top = *ip & 0x00ffffff; // and should have a contact page tag Tag *tag = gr->getTag ("hascontactinfo"); if ( tag ) m_hasContactInfo = true; else m_hasContactInfo = false; m_hasContactInfo2 = m_hasContactInfo; // are we a "contact" link? i.e. about us, etc. that would contain // the physical address of the entity responsible for this website //bool isContacty = getIsContacty( fu , // info1 , // hops , // *ct , // *isRoot , // m_niceness ); // bail early if not a candidate for contact info if ( ! *ic ) { // check ) { m_hasContactInfoValid = true; return &m_hasContactInfo2; } // // TODO: did IP change?? invalidate it??? // // set status. we can time status changes with this routine! 
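	// (descriptive note: the remaining checks run in order: first the
	// contact addresses parsed from this page via getNumContactAddresses(),
	// then "official" email addresses and contact forms via
	// getNumOfficialEmails(); the first one that finds anything sets
	// m_hasContactInfo and we return)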
setStatus ( "getting contact info on just this page" ); int32_t *nca = getNumContactAddresses(); if ( ! nca || nca == (void *)-1 ) return (char *)nca; // did we have a contact address? if ( *nca ) { m_hasContactInfo = true; m_hasContactInfo2 = true; m_hasContactInfoValid = true; return &m_hasContactInfo2; } // get the email addresses int32_t *numOfficial = getNumOfficialEmails ( ); if ( ! numOfficial || numOfficial == (void *)-1) return (char *)numOfficial; // did we get some? if ( *numOfficial > 0 ) { m_hasContactInfo = true; m_hasContactInfo2 = true; m_hasContactInfoValid = true; return &m_hasContactInfo2; } // this should set m_hasContactInfo as well as m_contact*[] arrays //TagRec *pcitr = getContactInfoTagRec (); //if ( ! pcitr || pcitr == (void *)-1 ) return (char *)pcitr; // do not re-peat the above now m_hasContactInfoValid = true; return &m_hasContactInfo2; } // returns "type" of contact link, > 0 int32_t getIsContacty ( Url *url , LinkInfo *info1 , int32_t hops , uint8_t ct , bool isRoot , int32_t niceness ) { static int64_t h_home ; static int64_t h_site ; static int64_t h_map ; static int64_t h_sitemap ; static int64_t h_contact ; static int64_t h_about ; static int64_t h_privacy ; static int64_t h_policy ; static int64_t h_statement ; static int64_t h_terms ; static int64_t h_of ; static int64_t h_and ; static int64_t h_service ; static int64_t h_conditions ; static int64_t h_use ; static int64_t h_us ; static int64_t h_help ; static int64_t h_location ; static int64_t h_faq ; static int64_t h_faqs ; static int64_t h_customer ; static int64_t h_support ; static int64_t h_advertise ; static int64_t h_inquiry ; static int64_t h_inquiries ; static int64_t h_feedback ; static int64_t h_company ; static int64_t h_corporate ; static bool s_inith = false; if ( ! s_inith ) { s_inith = true; h_home = hash64n ("home"); h_site = hash64n ("site"); h_map = hash64n ("map"); h_sitemap = hash64n ("sitemap"); h_contact = hash64n ("contact"); h_about = hash64n ("about"); h_privacy = hash64n ("privacy"); h_policy = hash64n ("policy"); h_statement = hash64n ("statement"); h_terms = hash64n ("terms"); h_of = hash64n ("of"); h_and = hash64n ("and"); h_service = hash64n ("service"); h_conditions = hash64n ("conditions"); h_use = hash64n ("use"); h_us = hash64n ("us"); h_help = hash64n ("help"); h_location = hash64n ("location"); h_faq = hash64n ("faq"); h_faqs = hash64n ("faqs"); h_customer = hash64n ("customer"); h_support = hash64n ("support"); h_advertise = hash64n ("advertise"); h_inquiry = hash64n ("inquiry"); h_inquiries = hash64n ("inquiries"); h_feedback = hash64n ("feedback"); h_company = hash64n ("company"); h_corporate = hash64n ("corporate"); } int32_t check = 0; // loop over the link texts we got for ( Inlink *k = NULL; (k = info1->getNextInlink(k)) ; ) { // never do anything if hop count >= 3 if ( hops >= 3 ) break; // javascript must be hopcount 1 only if ( ct == CT_JS && hops != 1 ) break; // is this inlinker internal? //bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff)); // skip if not local to site //if ( ! internal ) continue; // get the text char *txt = k->getLinkText(); // get length of link text int32_t tlen = k->size_linkText; if ( tlen > 0 ) tlen--; // assume utf-8. so do a utf-8 sanity check so it doesn't // break Words::countWords() by thinking a character is // 2+ bytes and breaching the buffer if ( ! verifyUtf8 ( txt , tlen ) ) { log("xmldoc: bad link text 1 from url=%s for %s", k->getUrl(),url->m_url); continue; } // convert into words i guess Words ww; // . 
TODO: use alt text if only an image in the link!!!!! // . return -1 if it fails with g_errno set if ( ! ww.setx ( txt , tlen , niceness) ) return (char)-1; // int16_tcut int32_t nw = ww.getNumWords(); // skip if too big if ( nw >= 30 ) continue; // int16_tcut int64_t *wids = ww.getWordIds(); // reset alnumcount int32_t count = 0; // loop over its words for ( int32_t j = 0 ; j < nw && ! check ; j++ ) { // skip if not alnum if ( ! wids[j] ) continue; // keep track of alnum word position count++; // "contact..." only good from root or root kid if ( wids[j] == h_contact && hops >= 1 && count == 1 ) check = 1; // "about..." only good from root or root kid if ( wids[j] == h_about && hops >= 1 && count == 1 ) check = 2; // "...privacy policy..." if ( wids[j ] == h_privacy && j+2getPath(); if ( gb_strcasestr(path,"contact" ) ) { check += 33; check *= 90; } if ( gb_strcasestr(path,"/about" ) ) { check += 34; check *= 91; } if ( gb_strcasestr(path,"/feedback") ) { check += 35; check *= 92; } if ( gb_strcasestr(path,"/help" ) ) { check += 36; check *= 93; } if ( gb_strcasestr(path,"/faq" ) ) { check += 37; check *= 94; } if ( gb_strcasestr(path,"advertise") ) { check += 38; check *= 95; } if ( gb_strcasestr(path,"inquir" ) ) { check += 39; check *= 96; } return check; } char *XmlDoc::getIsThisDocContacty() { if ( m_isContactyValid ) return &m_isContacty; setStatus ( "getting is contacty" ); // are we a root? char *isRoot = getIsSiteRoot(); if ( ! isRoot || isRoot == (char *)-1 ) return (char *)isRoot; int8_t *hc = getHopCount(); if ( ! hc || hc == (void *)-1 ) return (char *)hc; // get the content type uint8_t *ct = getContentType(); if ( ! ct ) return NULL; LinkInfo *info1 = getLinkInfo1 (); if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1; // get the first url Url *fu = getFirstUrl(); // int16_tcut int32_t hops = *hc; // check it m_isContacty = getIsContacty ( fu , info1 , hops , *ct , *isRoot , m_niceness ); m_isContactyValid = true; return &m_isContacty; } int32_t *XmlDoc::getNumContactAddresses ( ) { // process Address **ca = getContactAddresses(); if ( ! ca || ca == (void *)-1 ) return (int32_t *)ca; // now we are valid return &m_numContactAddresses; } Address **XmlDoc::getContactAddresses ( ) { // assume none if ( m_contactAddressesValid ) return m_contactAddresses; // need this of course Addresses *aa = getAddresses (); if ( ! aa || aa == (void *)-1 ) return (Address **)aa; // assume none m_contactAddressesValid = true; m_numContactAddresses = 0; // not if not contacty. we gotta be a url like ".../contact.asp" char *ic = getIsThisDocContacty ( ); if ( ! ic || ic == (void *)-1 ) return (Address **)ic; // if not a of contact url form, return none if ( ! *ic ) return m_contactAddresses; // are we a root? char *isRoot = getIsSiteRoot(); if ( ! isRoot || isRoot == (char *)-1 ) return (Address **)isRoot; // do not do this for root if multiple addresses. this // fixes http://obits.abqjournal.com/ if ( *isRoot && aa->m_uniqueStreetHashes > 1 ) return m_contactAddresses; // reset count int32_t nca = 0; // number of addresses in this doc int32_t na = aa->m_am.getNumPtrs(); // add all addresses then??? for ( int32_t i = 0 ; i < na ; i++ ) { // breathe QUICKPOLL(m_niceness); // get it Address *ai = (Address *)aa->m_am.getPtr(i); // do not add this to tagdb if not inlined! if ( ! 
( ai->m_flags & AF_INLINED ) ) continue; // store it m_contactAddresses[nca++] = ai; // stop before breach if ( nca >= MAX_CONTACT_ADDRESSES ) break; } // update count m_numContactAddresses = nca; return m_contactAddresses; } int32_t *XmlDoc::getNumOfficialEmails ( ) { char *eb = getEmailBuf(); if ( ! eb || eb == (void *)-1 ) return (int32_t *)eb; return &m_numOfficialEmails; } // . add email addresses to tag rec // . add up to 3 of same domain and different domain addresses // . return # of *official* contact infos added to tag rec // . this now includes submission forms! // . returns -1 and sets g_errno on error char *XmlDoc::getEmailBuf ( ) { if ( m_emailBufValid ) return m_emailBuf; Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (char *)xml; Words *ww = getWords(); if ( ! ww || ww == (Words *)-1 ) return (char *)ww; // count # of official contacts we got int32_t official = 0; // int16_tcuts int64_t *wids = ww->m_wordIds; char **wptrs = ww->m_words; int32_t *wlens = ww->m_wordLens; nodeid_t *tids = ww->m_tagIds; int32_t nw = ww->getNumWords(); // get our url Url *f = getFirstUrl(); // get its domain len char *myDom = f->getMidDomain(); int32_t myDomLen = f->getMidDomainLen(); // point here char *eptr = m_emailBuf; char *emax = m_emailBuf + EMAILBUFSIZE; m_emailBufValid = true; // reset *eptr = '\0'; // // ADD EMAIL ADDRESSES // // count how many we find int32_t ne = 0; // loop over all the words for ( int32_t i = 1 ; i < nw ; i++ ) { // breathe QUICKPOLL ( m_niceness ); // . email address? look for the '@' // . might also have (bot proof) if ( wptrs[i][0] != '@' && tids[i] != TAG_IMG ) continue; // . make sure any image has an "/at." in it! // . "mailpipl.com" if(tids[i]==TAG_IMG&&!gb_strncasestr(wptrs[i],wlens[i],"/at.")) continue; // must be a single char if ( ! tids[i] && wlens[i] != 1 ) continue; // if i was the last word, give up! if ( i + 1 >= nw ) break; // back up i until we hit a non-email char int32_t a ; for ( a = i ; a - 1 > 0 ; a-- ) { if (wids [a-1] ) continue; if (wptrs[a-1][0]=='.'&&wlens[a-1]==1)continue; if (wptrs[a-1][0]=='-'&&wlens[a-1]==1)continue; break; } // must not start with '.' if ( wptrs[a][0]=='.' ) a++; // now get the end of it int32_t b; int32_t periodCount = 0; for ( b = i ; b+1 < nw ; b++ ) { if (wids[b+1]) continue; // only punct we allow is a single period if ( wptrs[b+1][0]!='.' ) break; if ( wlens[b+1] != 1 ) break; periodCount++; } // must have at least one! if ( ! periodCount ) continue; // must not end on '.' if ( wptrs[b][0]=='.') b--; // hostname must have a valid tld char *host = wptrs[i+1]; char *hend = wptrs[b]+wlens[b]; // temp null term char c = *hend; *hend = '\0'; int32_t tldLen ; char *tld = getTLDFast ( host, &tldLen , false ); // ignore the rest of this line for addresses even // if tld is bogus //ignoreLine = true; // must have a legit tld! if ( ! tld ) { *hend = c; continue; } // if not from our same domain, use "emailaddressoffsite" int32_t dlen ; char *dom = getDomFast ( host , &dlen , false ); // use mid domain. subtract '.' //int32_t midlen = tld - dom - 1; // undo the temp NULL thing *hend = c; if ( ! dom ) continue; // include last word b++; // normal buffer char buf[100]; char *p = buf; char *pend = buf + 100; // normalize it for ( int32_t j = a ; j < b ; j++ ) { // include the at sign if ( j == i ) {*p++ = '@'; continue;} // skip tags if ( tids[j] ) continue; // skip punct if ( ! 
wids[j] ) {*p++ ='.'; continue;} // ensure minimal space if ( p + wlens[j] + 1 >= pend ) break; // write out wids gbmemcpy ( p , wptrs[j] , wlens[j] ); p += wlens[j]; } // NULL term it *p = '\0'; // do we match domains? //char *tn = "emailaddressoffsite"; // use this if we match domains //if ( midlen == myDomLen && ! strncmp (dom,myDom,midlen) ) { // tn = "emailaddressonsite"; // // this is an official contact method // //official++; //} // we now count even offsite email addresses as official // for addresses like @gmail.com etc. because we are now // only checking "contact us" and "about us" and root pages, // so they should never be email addresses of commenters. // and often bloggers have external email addresses. // http://www.christinesaari.com/html/about.php?psi=44 official++; // store it //if ( ! gr->addTag(tn,timestamp,"xmldoc",ip,buf) ) // return -1; int32_t blen = gbstrlen(buf); // ignore if breach if ( eptr + blen + 2 > emax ) continue; // comma? if ( eptr > m_emailBuf ) *eptr++ = ','; // store it gbmemcpy (eptr , buf , blen ); // advance eptr += blen; // limit it if ( ++ne >= 3 ) break; } // // ADD BOT-PROOF EMAIL ADDRESSES (bot proof) // // super dot john at xyz dot com // int64_t h_at = hash64Lower_utf8("at"); int64_t h_dot = hash64Lower_utf8("dot"); // loop over all the words for ( int32_t i = 1 ; i < nw ; i++ ) { // breathe QUICKPOLL ( m_niceness ); // email address? look for the " at " if ( wids[i] != h_at ) continue; // front name word count int32_t nameCount = 0; // back up i until we hit a non-email word int32_t a ; // do a loop for ( a = i - 1 ; a > 0 ; ) { // need a space/punt word if ( wids[a] ) break; if ( tids[a] ) break; // skip it a--; // then need the "john" part if ( ! wids[a] ) break; if ( tids[a] ) break; if ( wids[a] == h_dot ) break; // "dot" is bad // count account name part nameCount++; // go back if like "mike dot smith" if ( a - 4 >= 0 && ! tids[a-1] && wids [a-2] == h_dot && ! tids[a-3] && wids [a-4] != h_dot && wids [a-4] != h_at ) a -= 4; // that is good enough break; } // need a name at least one if ( nameCount <= 0 ) continue; // skip over that space/punct word //a--; // now must be regular word before that //if ( tids[a-1] ) continue; //if ( ! wids[a-1] ) continue; // we got it //a--; // now get the end of it int32_t b ; // count the dots int32_t dotCount = 0; // make sure last word is a legit tld int32_t tldLen = 0; char *tld = NULL; // do a loop for ( b = i + 1 ; b + 3 < nw ; b++ ) { // need a space/punt word if ( wids[b] ) break; if ( tids[b] ) break; // skip it b++; // then need the "xyz" part if ( ! wids[b] ) break; if ( tids[b] ) break; if ( wids[b] == h_dot ) break; // "dot" is bad // remember it for tld detection tld = wptrs[b]; tldLen = wlens[b]; // skip it b++; // need another space/punct word if ( wids[b] ) break; if ( tids[b] ) break; // skip it b++; // now we need a "dot" if ( wids[b] != h_dot ) break; // count the dots dotCount++; } // need at least one "dot" if ( dotCount < 1 ) continue; // not too many! if ( dotCount > 5 ) continue; // must have legit tld if ( tld && ! isTLD ( tld , tldLen ) ) continue; // normal buffer char buf[100]; char *p = buf; char *pend = buf + 100; // normalize it for ( int32_t j = a ; j < b ; j++ ) { // skip tags if ( tids[j] ) continue; // skip punct if ( ! 
wids[j] ) continue; // ensure minimal space if ( p + wlens[j] + 1 >= pend ) break; // write out wids if ( wids[j] == h_at ) {*p++ = '@'; continue;} if ( wids[j] == h_dot ) {*p++ = '.'; continue;} gbmemcpy ( p , wptrs[j] , wlens[j] ); p += wlens[j]; } // NULL term it *p = '\0'; // get the host char *host = buf ; // wptrs[i+1]; ?? is this right? // if not from our same domain, use "emailaddressoffsite" int32_t dlen ; char *dom = getDomFast ( host , &dlen , false ); if ( ! dom ) continue; // use mid domain int32_t tlen3; char *tld3 = getTLDFast ( dom, &tlen3 , false ); // limit domain by that. subtract '.' int32_t midlen = tld3 - dom - 1; // do we match domains? char *tn = "emailaddressoffsite"; // use this if we match domains if ( midlen == myDomLen && ! strncmp (dom,myDom,midlen) ) { tn = "emailaddressonsite"; // this is an official contact method //official++; } // we now count even offsite email addresses as official // for addresses like @gmail.com etc. because we are now // only checking "contact us" and "about us" and root pages, // so they should never be email addresses of commenters // and often bloggers have external email addresses. // http://www.christinesaari.com/html/about.php?psi=44 official++; // store that //if ( ! gr->addTag(tn,timestamp,"xmldoc",ip,buf) ) // return -1; int32_t blen = gbstrlen(buf); // ignore if breach if ( eptr + blen + 2 > emax ) continue; // comma? if ( eptr > m_emailBuf ) *eptr++ = ','; // store it gbmemcpy (eptr , buf , blen ); // advance eptr += blen; // limit it if ( ++ne >= 3 ) break; } // // ADD EMAIL ADDRESSES IN MAILTO TAGS // // // // now we check char by char since a website had it in the javascript: // http://www.botanique.com/bincgi/stateprov.CFM?state=NM // char *m = xml->m_xml; char *mend = m + xml->m_xmlLen - 4; // empty? if ( ! m ) mend = m; // scan for ( ; ; m++ ) { // breach? if ( m >= mend ) break; // breathe QUICKPOLL ( m_niceness ); // skip if not possible mailto: if ( *m != 'm' && *m !='M' ) continue; // skip m++; // skip? if ( *m != 'a' && *m !='A' ) continue; // skip m++; // skip? if ( *m != 'i' && *m !='I' ) continue; // skip m++; // skip? if ( *m != 'l' && *m !='L' ) continue; // skip m++; // skip? if ( *m != 't' && *m !='T' ) continue; // skip m++; // skip? if ( *m != 'o' && *m !='O' ) continue; // skip m++; // skip? if ( *m != ':' ) continue; // skip m++; // set end char *mend = m + 100; // skip over the mailto: //m += 7; // that is the start of the email address then char *start = m; // skip til '@' for ( ; *m && m < mend && *m != '@' ; m++ ) { // but give up if we hit a non-email name char if ( is_alnum_a(*m) ) continue; if ( *m == '.' ) continue; if ( *m == '-' ) continue; break; } // bad if no @ if ( *m != '@' ) continue; // skip the @ m++; // . skip until alnum // . fix parsing of "dsquires@ unimelb.edu.au" for // http://www.marcom1.unimelb.edu.au/public/contact.html for (;*m && is_wspace_utf8(m); m+=getUtf8CharSize(m) ); // get the host char *host = m; // skip till end of hostname for (;*m && maddTag(tn,timestamp,"xmldoc",ip,start,end-start) ) // return -1; // cast it char *buf = start; int32_t blen = end - start; // ignore if breach if ( eptr + blen + 2 > emax ) continue; // comma? 
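	// (descriptive note: m_emailBuf is built as a comma-separated list,
	// so a ',' is prepended unless this is the first entry, and the
	// "++ne >= 3" check below stops the scan once a handful of
	// addresses have been collected)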
if ( eptr > m_emailBuf ) *eptr++ = ','; // store it gbmemcpy (eptr , buf , blen ); // advance eptr += blen; // limit it if ( ++ne >= 3 ) break; } // // ADD CONTACT FORM // bool gotEmailBox = false; bool storedForm = false; int32_t emailPos = -1; int32_t alnumCount = 0; // quick compares int64_t he1 = hash64Lower_utf8 ( "email"); int64_t he2 = hash64Lower_utf8 ( "mail"); // loop over all words again for ( int32_t i = 1 ; i < nw ; i++ ) { // breathe QUICKPOLL ( m_niceness ); // get tag id if any int32_t tid = tids[i] & BACKBITCOMP; // . do we have a submit form? // . first, do we have a text box for the sender's email? if ( tid == TAG_INPUT ) { int32_t ttlen; // bad i is not a node # it is a word # int32_t nn = ww->m_nodes[i]; // must be valid char *tt = xml->getString(nn,"type",&ttlen); if ( ! tt || ttlen <= 0 ) continue; // must be of type text if ( strncasecmp(tt,"text",4) ) continue; // might have "email" or "e-mail" in the value int32_t vlen; char *val = xml->getString(nn,"value",&vlen); // check that if ( val ) { if ( gb_strncasestr(val,vlen,"email") || gb_strncasestr(val,vlen,"e-mail") ) // flag it good gotEmailBox = true; } // must have the word "email" or "e-mail" within // a few words right before it! if ( emailPos == -1 ) continue; //if ( i - emailPos >= 7 ) continue; if ( alnumCount > 7 ) continue; // flag it gotEmailBox = true; } // text area? must happen AFTER the email adress box if ( tid == TAG_TEXTAREA && gotEmailBox ) { // must have had the form before us // do not double store into tagdb rec if ( storedForm ) continue; // store this bad boy into the tagdb rec //if ( ! gr->addTag("hascontactform", // timestamp, // "xmldoc", // ip, // "1" , // 1 ) ) // return -1; // copy it char *buf = "hascontactform"; int32_t blen = gbstrlen(buf); // ignore if breach if ( eptr + blen + 2 > emax ) continue; // comma? if ( eptr > m_emailBuf ) *eptr++ = ','; // store it gbmemcpy (eptr , buf , blen ); // advance eptr += blen; // do not double store storedForm = true; // this is an official contact method official++; // another contact method ne++; // that's enough! break; } // alnum counter if ( wids[i] ) alnumCount++; // special counter if ( wids[i] == he1 || wids[i] == he2 ) { // mark it emailPos = i; // reset counter alnumCount = 0; } } // null term *eptr = '\0'; m_numOfficialEmails = official; // i guess that is it return m_emailBuf; } // returns vector 1-1 with Words.m_words[] array /* Spam *XmlDoc::getSpam ( ) { if ( m_spamValid ) return &m_spam; // set it Words *ww = getWords(); if ( ! ww || ww == (Words *)-1 ) return (Spam *)ww; Bits *bits = getBits (); if ( ! bits || bits == (Bits *)-1 ) return (Spam *)bits; int32_t *sni = getSiteNumInlinks(); if ( ! sni || sni == (int32_t *)-1 ) return (Spam *)sni; // if more than X% ("thresh") of words are spammed to some degree, // index all words with a minimum score int32_t thresh = 6; if ( *sni > 10 ) thresh = 8; if ( *sni > 30 ) thresh = 10; if ( *sni > 100 ) thresh = 20; if ( *sni > 500 ) thresh = 30; //int64_t x[] = {30,40,50,70,90}; //int64_t y[] = {6,8,10,20,30}; //int32_t spamThresh = getY ( m_docQuality , x , y , 5 ); if ( ! m_spam.set ( ww , bits , m_version , thresh , 20 , m_niceness )) return NULL; m_spamValid = true; return &m_spam; } */ // this means any tod now bool *XmlDoc::getHasTOD ( ) { if ( m_hasTODValid ) return &m_hasTOD2; // scan the dates Dates *dp = getDates() ; if ( ! 
dp || dp == (Dates *)-1 ) return (bool *)dp; // assume not m_hasTOD2 = false; m_hasTOD = false; // scan the dates for ( int32_t i = 0 ; i < dp->m_numDatePtrs ; i++ ) { // breathe QUICKPOLL ( m_niceness ); // get date Date *di = dp->m_datePtrs[i]; // skip if got nuked if ( ! di ) continue; // tod? if ( !(di->m_hasType & DT_TOD) ) continue; // got one m_hasTOD2 = true; m_hasTOD = true; } // it is now valid m_hasTODValid = true; return &m_hasTOD2; } /* bool *XmlDoc::getHasSiteVenue ( ) { if ( m_hasSiteVenueValid ) return &m_hasSiteVenue2; // get the tag rec TagRec *gr = getTagRec (); if ( ! gr || gr == (TagRec *)-1 ) return (bool *)gr; // get tag from it Tag *sv = gr->getTag("venueaddress") ; // from that m_hasSiteVenue2 = (bool)sv; m_hasSiteVenue = (bool)sv; m_hasSiteVenueValid = true; return &m_hasSiteVenue2; } */ // do not include addresses that are always in the header/footer of every page! bool *XmlDoc::getHasAddress ( ) { if ( m_hasAddressValid ) return &m_hasAddress2; // get the addresses Addresses *aa = getAddresses(); if ( ! aa || aa == (void *)-1 ) return (bool *)aa; // from that m_hasAddress2 = (aa->getNumNonDupAddresses() > 0); m_hasAddress = (aa->getNumNonDupAddresses() > 0); m_hasAddressValid = true; return &m_hasAddress2; } Addresses *XmlDoc::getAddresses ( ) { if ( m_addressesValid ) { // return error if buf was breached //if ( m_addresses.m_breached ) { // g_errno = EBUFOVERFLOW; // return NULL; //} // otherwise, return it return &m_addresses; } // skip for now m_addressesValid = true; return &m_addresses; // note it setStatus ( "getting addresses"); Words *ww = getWords(); if ( ! ww || ww == (Words *)-1 ) return (Addresses *)ww; // we make sure that D_IS_IN_DATE is set by doing this //Dates *dp = getDates(); //if ( ! dp || dp == (Dates *)-1) return (Addresses *)dp; // we set the D_IS_IN_DATE flag for these bits Bits *bits = getBits(); if ( ! bits ) return NULL; Sections *sections = getExplicitSections(); if ( !sections||sections==(Sections *)-1) return (Addresses *)sections; TagRec *gr = getTagRec(); if ( ! gr || gr == (TagRec *)-1 ) return (Addresses *)gr; // the site hash //int32_t *sh32 = getSiteHash32(); //if ( ! sh32 || sh32 == (int32_t *)-1 ) return (Addresses *)sh32; int32_t dh = getDomHash32(); // hash of all adjacent tag pairs //uint32_t *tph = getTagPairHash32 ( ) ; //if ( ! tph || tph == (void *)-1 ) return (Addresses *)tph; int64_t *d = getDocId(); if ( ! d || d == (int64_t *)-1 ) return (Addresses *)d; // get our ip int32_t *ip = getIp(); if ( ! ip || ip == (int32_t *)-1) return (Addresses *)ip; // get the content type uint8_t *ct = getContentType(); if ( ! ct ) return NULL; //char **stb = getSiteTitleBuf(); //if ( ! stb || stb == (void *)-1 ) return (Addresses *)stb; // sanity check //if ( ! m_siteTitleBufValid ) { char *xx=NULL;*xx=0; } char **fbuf = getFilteredRootTitleBuf(); if ( ! fbuf || fbuf == (void *)-1 ) return (Addresses *)fbuf; // this will set D_IS_IN_DATE in the Bits::m_bits[] array which // Addresses::set() uses to avoid having addresses that are really // just dates! Dates *dd = getSimpleDates(); // return NULL on error if ( ! dd ) return (Addresses *)NULL; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // if the serialized section is valid, use that //char *sd = NULL; //bool valid = false; //if ( od && od->m_sectionsReplyValid ) valid = true; //if ( valid ) sd = od->ptr_sectionsReply; // assume valid, really only when it returns in case it blocked... //m_addressesValid = true; // this should not be outstanding! 
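	// . every prerequisite fetched above (words, bits, sections, tag rec,
	//   docid, ip, content type, ...) follows the same re-entrant
	//   convention: a getter returns NULL with g_errno set on error,
	//   (T *)-1 if it blocked and will re-enter m_masterLoop later, or a
	//   valid pointer once its m_...Valid flag is set
	// . a minimal, hypothetical sketch of that convention (names made up):
	/*
	#include <stdint.h>

	struct GetterSketch {
		int32_t m_ip;    bool m_ipValid;
		int32_t m_thing; bool m_thingValid;

		int32_t *getIp ( ) {
			// already computed? return the cached value
			if ( m_ipValid ) return &m_ip;
			// pretend the lookup blocked; the real code returns -1
			// and resumes through m_masterLoop when the reply comes
			return (int32_t *)-1;
		}

		int32_t *getThing ( ) {
			if ( m_thingValid ) return &m_thing;
			int32_t *ip = getIp();
			// NULL (error) and -1 (blocked) propagate unchanged
			if ( ! ip || ip == (int32_t *)-1 )
				return (int32_t *)ip;
			// compute, validate, return
			m_thing      = *ip; // placeholder computation
			m_thingValid = true;
			return &m_thing;
		}
	};
	*/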
if ( m_addressSetCalled ) { char *xx=NULL;*xx=0; } // assume valid, really only when it returns in case it blocked... m_addressesValid = true; // set it m_addressSetCalled = true; // make a copy of the tag rec here in case it gets mangled later // because the m_addresses class may reference its buffer //m_savedTagRec1.copy ( gr ); // . this returns false if blocked // . it uses the "venueaddress" from the tagrec, "gr", BUT if this // page is the one that sets the venue address, it won't be able // to use it as a default city/state thingy until next time it is // spidered, since that info is in the tagrec // . PROBLEM: if the venue address is on this page, we can't take // advantage of it by usings its city/state as a default for the // other addresses on this page if ( ! m_addresses.set ( sections , ww , bits , &m_tagRec , // &m_savedTagRec1 , // gr &m_firstUrl , *d , cr->m_collnum , dh , // *sh32 *ip , //(int32_t)*tph , m_niceness , m_pbuf , m_masterState , m_masterLoop , *ct , //ptr_addressReply , //size_addressReply , //m_addressReplyValid , m_filteredRootTitleBuf , m_filteredRootTitleBufSize , this )) return (Addresses *)-1; // sanity check if ( m_addresses.m_msg2c && m_addresses.m_msg2c->m_requests != m_addresses.m_msg2c->m_replies) { char *xx=NULL;*xx=0; } // error? if ( g_errno ) return NULL; // return it if not breached //if ( ! m_addresses.m_breached ) return &m_addresses; // return that error otherwise //g_errno = EBUFOVERFLOW; //return NULL; return &m_addresses; } int32_t *XmlDoc::getSiteNumInlinksUniqueIp ( ) { if ( m_siteNumInlinksUniqueIpValid ) return &m_siteNumInlinksUniqueIp; // get our companion number int32_t *ni = getSiteNumInlinks(); if ( ! ni || ni == (int32_t *)-1 ) return (int32_t *)ni; // sanity check if ( ! m_siteNumInlinksUniqueIp ) { char *xx=NULL;*xx=0; } // ok we must be valid return &m_siteNumInlinksUniqueIp; } int32_t *XmlDoc::getSiteNumInlinksUniqueCBlock ( ) { if ( m_siteNumInlinksUniqueCBlockValid ) return &m_siteNumInlinksUniqueCBlock; // get our companion number int32_t *ni = getSiteNumInlinks(); if ( ! ni || ni == (int32_t *)-1 ) return (int32_t *)ni; // sanity check if ( ! m_siteNumInlinksUniqueCBlock ) { char *xx=NULL;*xx=0; } // ok we must be valid return &m_siteNumInlinksUniqueCBlock; } int32_t *XmlDoc::getSiteNumInlinksTotal ( ) { if ( m_siteNumInlinksTotalValid ) return &m_siteNumInlinksTotal; // get our companion number int32_t *ni = getSiteNumInlinks(); if ( ! ni || ni == (int32_t *)-1 ) return (int32_t *)ni; // sanity check if ( ! m_siteNumInlinksTotal ) { char *xx=NULL;*xx=0; } // ok we must be valid return &m_siteNumInlinksTotal; } // we need this for setting SpiderRequest::m_parentFirstIp of each outlink int32_t *XmlDoc::getFirstIp ( ) { // return it if we got it if ( m_firstIpValid ) return &m_firstIp; // note it setStatus ( "getting first ip"); // get tag rec TagRec *gr = getTagRec(); if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr; // got it Tag *tag = gr->getTag ( "firstip" ); // get from tag m_firstIp = 0; if ( tag ) m_firstIp = atoip(tag->getTagData()); // if no tag, or is bogus in tag... set from ip if ( m_firstIp == 0 || m_firstIp == -1 ) { // need ip then! int32_t *ip = getIp(); if ( ! 
ip || ip == (int32_t *)-1) return (int32_t *)ip; // set that m_firstIp = *ip; } m_firstIpValid = true; return &m_firstIp; // must be 4 bytes - no now its a string //if ( tag->getTagDataSize() != 4 ) { char *xx=NULL;*xx=0; } } uint8_t *XmlDoc::getSiteNumInlinks8 () { if ( m_siteNumInlinks8Valid ) return &m_siteNumInlinks8; // get the full count int32_t *si = getSiteNumInlinks(); if ( ! si || si == (int32_t *)-1 ) return (uint8_t *)si; // convert to 8 m_siteNumInlinks8 = score32to8 ( *si ); // validate m_siteNumInlinks8Valid = true; return &m_siteNumInlinks8; } int32_t *XmlDoc::getSiteNumInlinks ( ) { if ( m_siteNumInlinksValid ) return &m_siteNumInlinks; // sanity check if ( m_setFromTitleRec && ! m_useSecondaryRdbs) {char *xx=NULL;*xx=0;} setStatus ( "getting site num inlinks"); // get it from the tag rec if we can TagRec *gr = getTagRec (); if ( ! gr || gr == (void *)-1 ) return (int32_t *)gr; // the current top ip address int32_t *ip = getIp(); if ( ! ip || ip == (int32_t *)-1) return (int32_t *)ip; //int32_t top = *ip & 0x00ffffff; // this happens when its NXDOMAIN reply from dns so assume // no site inlinks if ( *ip == 0 ) { m_siteNumInlinks = 0; m_siteNumInlinksUniqueIp = 0; m_siteNumInlinksUniqueCBlock = 0; m_siteNumInlinksTotal = 0; m_siteNumInlinksValid = true; m_siteNumInlinksUniqueIpValid = true; m_siteNumInlinksUniqueCBlockValid = true; m_siteNumInlinksTotalValid = true; return &m_siteNumInlinks; } if ( *ip == -1 ) { log("xmldoc: ip is %"INT32", can not get site inlinks",*ip); g_errno = EBADIP; return NULL; } // wait for clock to sync before calling getTimeGlobal int32_t wfts = waitForTimeSync(); // 0 means error, i guess g_errno should be set, -1 means blocked if ( ! wfts ) return NULL; if ( wfts == -1 ) return (int32_t *)-1; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; setStatus ( "getting site num inlinks"); // check the tag first Tag *tag = gr->getTag ("sitenuminlinks"); // is it valid? bool valid = true; // current time int32_t now = getTimeGlobal(); // use the spidered time for the test collection for consistency if ( !strcmp(cr->m_coll,"qatest123") ) { //if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; } now = getSpideredTime();//m_spideredTime; } // get tag age in days int32_t age = 0; if ( tag ) age = (now - tag->m_timestamp) ; // add in some flutter to avoid having all hsots in the network // calling msg25 for this site at the same time. // a 10,000 second jitter. 3 hours. int32_t flutter = rand() % 10000; // add it in age += flutter; // . if site changes ip then toss the contact info out the window, // but give it a two week grace period // . well now we use the "ownershipchanged" tag to indicate that //if (tag && age>14*3600*24) valid=false; // . we also expire it periodically to keep the info uptodate // . the higher quality the site, the longer the expiration date int32_t ns = 0; int32_t maxAge = 0; int32_t sni = -1; if ( tag ) { // how many site inlinks? ns = atol(tag->getTagData()); // for less popular sites use smaller maxAges maxAge = 90; if ( ns < 10 ) maxAge = 10; else if ( ns < 30 ) maxAge = 15; else if ( ns < 50 ) maxAge = 30; else if ( ns < 100 ) maxAge = 60; // if index size is tiny then maybe we are just starting to // build something massive, so reduce the cached max age if ( g_titledb.m_rdb.getNumGlobalRecs() < 100000000 ) // 100M maxAge = 3; if ( g_titledb.m_rdb.getNumGlobalRecs() < 10000000 ) // 10M maxAge = 1; // for every 100 urls you already got, add a day! 
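	// . hypothetical helpers restating the expiration rule used in this
	//   block: the cached "sitenuminlinks" tag stays fresh for a number of
	//   days that grows with the site's inlink count (and shrinks while
	//   the index is still tiny), plus one extra day per 100 inlinks, and
	//   the tag's age gets up to 10,000 seconds of random flutter so all
	//   hosts do not recompute the same site's link info at once
	// . the helper names are made up; the constants mirror the
	//   surrounding code
	/*
	#include <stdint.h>
	#include <stdlib.h>

	static int32_t getSiteTagMaxAgeSecsSketch ( int32_t ns ,
						    int64_t numGlobalRecs ) {
		int32_t maxAgeDays = 90;
		if      ( ns <  10 ) maxAgeDays = 10;
		else if ( ns <  30 ) maxAgeDays = 15;
		else if ( ns <  50 ) maxAgeDays = 30;
		else if ( ns < 100 ) maxAgeDays = 60;
		// a tiny index is probably still being built, refresh sooner
		if ( numGlobalRecs < 100000000LL ) maxAgeDays = 3; // 100M
		if ( numGlobalRecs <  10000000LL ) maxAgeDays = 1; // 10M
		// convert to seconds and add a day per 100 inlinks
		return maxAgeDays * 86400 + ( ns / 100 ) * 86400;
	}

	static int32_t flutteredTagAgeSketch ( int32_t now , int32_t tagTime ) {
		return ( now - tagTime ) + ( rand() % 10000 );
	}
	*/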
sni = atol(tag->getTagData()); // double if repairing //if ( m_useSecondaryRdbs ) maxAge = (maxAge+1) * 2; // fix bug for rebuild. rebuild any tag before now because // the MAX_LINKERS_IN_TERMLIST was too small in Linkdb.cpp // and i raised from 1M to 3M. it was hurting mahalo.com. if ( m_useSecondaryRdbs && tag->m_timestamp < 1345819704 ) valid = false; // force another rebuild of siterank because i fixed // the 'beds' query a little to use firstip, so recompute // siterank for those spammers. if ( m_useSecondaryRdbs && tag->m_timestamp < 1348257346 && // leave really big guys in tact sni < 300 ) valid = false; // convert into seconds maxAge *= 3600*24; // so youtube which has 2997 links will add an extra 29 days maxAge += (sni / 100) * 86400; // hack for global index. never affect siteinlinks i imported if ( strcmp(cr->m_coll,"GLOBAL-INDEX") == 0 ) age = 0; // invalidate for that as wel if ( age > maxAge ) valid = false; } // our companion tags, sitePop and fresh inlinks Tag *tag2 = gr->getTag ( "sitenuminlinksuniqueip" ); Tag *tag3 = gr->getTag ( "sitenuminlinksuniquecblock"); Tag *tag4 = gr->getTag ( "sitenuminlinkstotal"); // if we are missing either of those, invalidate as well if ( ! tag2 ) valid = false; if ( ! tag3 ) valid = false; if ( ! tag4 ) valid = false; // if we have already been through this if ( m_updatingSiteLinkInfoTags ) valid = false; // if rebuilding linkdb assume we have no links to sample from! if ( tag && m_useSecondaryRdbs && g_repair.m_rebuildLinkdb ) valid = true; // debug log if ( g_conf.m_logDebugLinkInfo ) log("xmldoc: valid=%"INT32" " "age=%"INT32" ns=%"INT32" sni=%"INT32" " "maxage=%"INT32" " "tag=%"PTRFMT" " "tag2=%"PTRFMT" " "tag3=%"PTRFMT" " "url=%s", (int32_t)valid,age,ns,sni, maxAge, (PTRTYPE)tag, (PTRTYPE)tag2, (PTRTYPE)tag3, m_firstUrl.m_url); // if we are good return it if ( tag && valid ) { // set it m_siteNumInlinks = atol(tag->getTagData()); // companion tags if ( tag2 ) { m_siteNumInlinksUniqueIp = atol(tag2->getTagData()); m_siteNumInlinksUniqueIpValid = true; } if ( tag3 ) { m_siteNumInlinksUniqueCBlock =atol(tag3->getTagData()); m_siteNumInlinksUniqueCBlockValid = true; } if ( tag4 ) { m_siteNumInlinksTotal =atol(tag4->getTagData()); m_siteNumInlinksTotalValid = true; } // it is good to go now m_siteNumInlinksValid = true; return &m_siteNumInlinks; } // set status. we can time status changes with this routine! //setStatus ( "getting site link info"); // if ip is bad we can't do this. we need to have a legit ip // so we know if a linker is internal or not /* if ( *ip == 0 || *ip == -1 ) { log("gb: bad ip so we can't get site num inlinks right"); m_siteNumInlinks = 0; m_sitePop = 0; m_siteNumInlinksFresh = 0; m_siteNumInlinksValid = true; m_siteNumInlinksFreshValid = true; m_sitePopValid = true; return &m_siteNumInlinks; } */ // set this flag so when we are re-called, "valid" will be set to false // so we can come down here and continue this. "flutter" might // otherwise cause us to not make it down here. m_updatingSiteLinkInfoTags = true; // we need to re-get both if either is NULL LinkInfo *sinfo = getSiteLinkInfo(); // block or error? if ( ! sinfo || sinfo == (LinkInfo *)-1) return (int32_t *)sinfo; // // now update tagdb! // // ok, get the sites of the external outlinks and they must // also be NEW outlinks, added to the page since the last time // we spidered it... //Links *links = getLinks (); //if ( ! links || links == (Links *)-1 ) return (int32_t *)links; char *mysite = getSite(); if ( ! 
mysite || mysite == (void *)-1 ) return (int32_t *)mysite; setStatus ( "adding site info tags to tagdb 1"); // why are we adding tag again! should already be in tagdb!!! if ( m_doingConsistencyCheck ) {char*xx=NULL;*xx=0;} // do not re-call at this point //m_siteNumInlinks = sinfo->m_numInlinksExtrapolated; m_siteNumInlinks = (int32_t)sinfo->m_numGoodInlinks; //m_siteNumInlinksFresh = sinfo->m_numInlinksFresh; //m_sitePop = sinfo->m_pagePop; m_siteNumInlinksUniqueIp = sinfo->m_numUniqueIps; m_siteNumInlinksUniqueCBlock = sinfo->m_numUniqueCBlocks; m_siteNumInlinksTotal = sinfo->m_totalInlinkingDocIds; m_siteNumInlinksValid = true; m_siteNumInlinksUniqueIpValid = true; m_siteNumInlinksUniqueCBlockValid = true; m_siteNumInlinksTotalValid = true; // deal with it return &m_siteNumInlinks; } // . do a 'site:xyz.com | gbnuminlinks' query to get the top docs // from a site and get the gigabits from that query! // . then store the resulting gigabits into tagdb for efficiency // . recompute once per month or so ... or if ip changes i guess // . we need the root title as a source for city and adm1's for // Addresses::set() function //char **XmlDoc::getSiteGigabits ( ) { //} // TODO: can we have a NULL LinkInfo without having had an error? LinkInfo *XmlDoc::getSiteLinkInfo() { // lookup problem? if ( g_errno ) { log("build: error getting link info: %s", mstrerror(g_errno)); return NULL; } setStatus ( "getting site link info" ); if ( m_siteLinkInfoValid ) //return msg25.m_linkInfo; return (LinkInfo *)m_mySiteLinkInfoBuf.getBufStart(); char *mysite = getSite(); if ( ! mysite || mysite == (void *)-1 ) return (LinkInfo *)mysite; int32_t *fip = getFirstIp(); if ( ! fip || fip == (int32_t *)-1) return (LinkInfo *)fip; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // can we be cancelled? bool canBeCancelled = true; // not if pageparser though if ( m_pbuf ) canBeCancelled = false; // not if injecting if ( ! m_sreqValid ) canBeCancelled = false; // assume valid when it returns m_siteLinkInfoValid = true; // use this buffer so XmlDoc::print() can display it where it wants SafeBuf *sb = NULL; if ( m_pbuf ) sb = &m_siteLinkBuf; // only do this for showing them!!! if ( m_useSiteLinkBuf ) sb = &m_siteLinkBuf; //bool onlyGetGoodInlinks = true; //if ( m_useSiteLinkBuf ) onlyGetGoodInlinks = false; // get this int32_t lastUpdateTime = getTimeGlobal(); // get from spider request if there //bool injected = false; //if ( m_sreqValid && m_sreq.m_isInjecting ) injected = true; // but be consistent if doing the "qatest123" collection if ( ! strcmp(cr->m_coll,"qatest123") ) { //if ( ! m_spideredTimeValid ) {char *xx=NULL;*xx=0;} lastUpdateTime = getSpideredTime();//m_spideredTime; } bool onlyNeedGoodInlinks = true; // so if steve wants to display all links then set this // to false so we get titles of bad inlinks // seems like pageparser.cpp just sets m_pbuf and not // m_usePageLinkBuf any more if ( sb ) onlyNeedGoodInlinks = false; // int16_tcut //Msg25 *m = &m_msg25; if ( ! getLinkInfo ( &m_tmpBuf11, &m_mcast11, mysite , // site mysite , // url true , // isSiteLinkInfo? *fip , 0 , // docId cr->m_collnum , //linkInfoColl NULL , // qbuf 0 , // qbufSize m_masterState , m_masterLoop , m_contentInjected ,// isInjecting? sb , m_printInXml , 0 , // sitenuminlinks -- dunno! 
//0 , // sitePop NULL , // oldLinkInfo1 , m_niceness , cr->m_doLinkSpamCheck , cr->m_oneVotePerIpDom , canBeCancelled , lastUpdateTime , onlyNeedGoodInlinks , false, 0, 0, // it will store the linkinfo into this safebuf &m_mySiteLinkInfoBuf) ) // return -1 if it blocked return (LinkInfo *)-1; // sanity check //if ( ! m_msg25.m_linkInfo ) { // log("build: error making link info: %s",mstrerror(g_errno)); // return NULL; //} // we got it //return m_msg25.m_linkInfo; // getLinkInfo() now calls multicast so it returns true on errors only log("build: error making link info: %s",mstrerror(g_errno)); return NULL; } static void gotIpWrapper ( void *state , int32_t ip ) ; static void delayWrapper ( int fd , void *state ) { XmlDoc *THIS = (XmlDoc *)state; THIS->m_masterLoop ( THIS->m_masterState ); } // . returns NULL and sets g_errno on error // . returns -1 if blocked, will re-call m_callback int32_t *XmlDoc::getIp ( ) { // return if we got it if ( m_ipValid ) return &m_ip; // update status msg setStatus ( "getting ip" ); // if set from docid and recycling if ( m_recycleContent ) { // get the old xml doc from the old title rec XmlDoc **pod = getOldXmlDoc ( ); if ( ! pod || pod == (void *)-1 ) return (int32_t *)pod; // int16_tcut XmlDoc *od = *pod; // set it if ( od ) { m_ip = od->m_ip; m_ipValid = true; return &m_ip; } } // fakeit for now //log("FAKING IT!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"); //m_ip = atoip("74.201.80.152",13); //m_ipValid = true; //return &m_ip; // get the best url Url *u = getCurrentUrl(); if ( ! u || u == (void *)-1 ) return (int32_t *)u; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; bool useTestCache = false; if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true; // unless its the pagesubmit.cpp event submission tool //if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false; // when building the "qatest123" collection try to get the ip from // "./test/ips.txt" so our injections are consistent every time // Test.cpp runs its injection loop into the "qatest123" collection if ( useTestCache ) { // && m_useIpsTxtFile ) { // stolen from msgc.cpp: // if url is already in a.b.c.d format return that int32_t ip2 = 0; char *host = u->getHost(); if ( host ) ip2 = atoip ( host,u->getHostLen() ); if ( ip2 != 0 ) { m_ip = ip2; m_ipValid = true; return &m_ip; } // assume not found in our file bool found = false; // get test dir char *testDir = getTestDir(); // get it from "./test/ips.txt" getTestIp ( u->getUrl() , &m_ip , &found , m_niceness,testDir); // if we found a match... if ( found ) { // m_ip != 0 ) { // we are valid now return gotIp ( false ); //m_ipValid = true; // return it //return &m_ip; } } // we need the ip before we download the page, but before we get // the IP and download the page, wait for this many milliseconds. // this basically slows the spider down. int32_t delay = cr->m_spiderDelayInMilliseconds; // ignore for testing if ( ! strcmp(cr->m_coll,"qatest123") ) delay = 0; // injected? if ( m_sreqValid && m_sreq.m_isInjecting ) delay = 0; if ( m_sreqValid && m_sreq.m_isPageParser ) delay = 0; if ( m_sreqValid && m_sreq.m_isScraping ) delay = 0; if ( m_sreqValid && m_sreq.m_fakeFirstIp ) delay = 0; // . don't do the delay when downloading extra doc, robots.txt etc. // . this also reports a status msg of "getting new doc" when it // really means "delaying spider" if ( m_isChildDoc ) delay = 0; if ( delay > 0 && ! 
m_didDelay ) { // we did it m_didDelay = true; m_statusMsg = "delaying spider"; // random fuzz so we don't get everyone being unleashed at once int32_t radius = (int32_t)(.20 * (double)delay); int32_t fuzz = (rand() % (radius * 2)) - radius; delay += fuzz; // make a callback wrapper. // this returns false and sets g_errno on error if ( g_loop.registerSleepCallback ( delay , m_masterState , delayWrapper,//m_masterLoop m_niceness )) // wait for it, return -1 since we blocked return (int32_t *)-1; // if was not able to register, ignore delay } if ( m_didDelay && ! m_didDelayUnregister ) { g_loop.unregisterSleepCallback(m_masterState,delayWrapper); m_didDelayUnregister = true; } // update status msg setStatus ( "getting ip" ); // assume valid! if reply handler gets g_errno set then m_masterLoop // should see that and call the final callback //m_ipValid = true; // get it if ( ! m_msgc.getIp ( u->getHost () , u->getHostLen() , &m_ip , this , gotIpWrapper )) // we blocked return (int32_t *)-1; // wrap it up return gotIp ( true ); } void gotIpWrapper ( void *state , int32_t ip ) { // point to us XmlDoc *THIS = (XmlDoc *)state; // wrap it up THIS->gotIp ( true ); // . call the master callback // . m_masterState usually equals THIS, unless THIS is the // Xml::m_contactDoc or something... THIS->m_masterLoop ( THIS->m_masterState ); } int32_t *XmlDoc::gotIp ( bool save ) { // return NULL on error if ( g_errno ) return NULL; // this is bad too //if ( m_ip == 0 || m_ip == -1 ) m_indexCode = EBADIP; //log("db: got ip %s for %s",iptoa(m_ip),getCurrentUrl()->getUrl()); setStatus ("got ip"); CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // note it for crawlbot if ( cr->m_isCustomCrawl && ( m_ip == 0 || m_ip == -1 ) ) log("db: got ip %"INT32" for %s", m_ip,getCurrentUrl()->getUrl()); bool useTestCache = false; if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true; // unless its the pagesubmit.cpp event submission tool //if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false; // when building the "qatest123" collection try to get the ip from // "./test/ips.txt" so our injections are consistent every time // Test.cpp runs its injection loop into the "qatest123" collection if ( save && useTestCache ) { // ip of 0 means NXDOMAIN i think (-1 means error) //if ( m_ip == 0 ) { // log("waiting for debug break"); // sleep(3600); //} // get the best url Url *u = getCurrentUrl(); if ( !u || u == (void *)-1 ) { char *xx=NULL;*xx=0; } // . add it to "./test/ips.txt" // . 
this function is in Msge1.cpp addTestIp ( u->getHost() , u->getHostLen() , m_ip ); // get test dir char *testDir = getTestDir(); // save it saveTestBuf ( testDir ); } // we got it m_ipValid = true; // give it to them return &m_ip; } #include "Mime.h" // taken from Robotdb.cpp bool isAllowed2 ( Url *url , char *userAgent , char *file , int32_t fileLen , bool *userAgentFound , bool substringMatch , int32_t *crawlDelay , char **cacheStart , int32_t *cacheLen , bool *hadAllowOrDisallow ) { // assume nothing to cache yet *cacheLen = 0; *cacheStart = file; // assume user agent is not in the file *userAgentFound = false; *hadAllowOrDisallow = false; // assume no crawl delay (-1) // *crawlDelay = -1; // if fileLen is 0 it is allowed if ( fileLen <= 0 ) return true; // get path from url, include cgi stuff char *path = url->getPath(); int32_t pathLen = url->getPathLenWithCgi(); // set the Mime class to this Mime file Mime mime; mime.set ( file , fileLen ); // get a line of Mime char *f , *v; int32_t flen, vlen; // user agent length int32_t uaLen = gbstrlen (userAgent); // ptr into "file" char *p = file; char flag; bool allowed = true; loop: // if p is NULL now we're done if ( ! p ) return allowed; // get the next Mime line p = mime.getLine ( p , &f , &flen , &v , &vlen ); // if this field is NOT "user-agent" skip it if ( flen != 10 ) goto loop; if ( strncasecmp ( f , "user-agent" , 10 ) != 0 ) goto loop; gotAgent: //some webmasters put comments at the end of their lines, //because they think this is a shell script or something. char* vv = v; while(vv - v < vlen && *vv != '#') vv++; vlen = vv - v; // decrement vlen to hack off spaces after the user-agent so that vlen // is really the length of the user agent while ( vlen > 0 && is_wspace_a(v[vlen-1]) ) vlen--; // now match the user agent if ( ! substringMatch && vlen != uaLen ) goto loop; // otherwise take the min of the lengths if ( uaLen < vlen ) vlen = uaLen; // is it the right user-agent? if ( strncasecmp ( v , userAgent , vlen ) != 0 ) goto loop; // we got it, if first instance start our cache here if ( !*userAgentFound ) *cacheStart = f; *userAgentFound = true; flag = 0; urlLoop: // if p is NULL now there is no more lines if ( ! p ) { // set our cache stop to the end of the file *cacheLen = (file + fileLen) - *cacheStart; return allowed; } // now loop over lines until we hit another user-agent line p = mime.getLine ( p , &f , &flen , &v , &vlen ); // if it's another user-agent line ... ignore it unless we already // have seen a disallow line, in which case we got another set of if ( flag && flen==10 && strncasecmp(f,"user-agent",10)==0) { // set our cache stop here *cacheLen = f - *cacheStart; goto gotAgent; } // if a crawl delay, get the delay if ( flen == 11 && strncasecmp ( f , "crawl-delay", 11 ) == 0 ) { // set flag flag = 1; // skip if invalid. it could be ".5" seconds if ( ! is_digit ( *v ) && *v != '.' ) goto urlLoop; // get this. multiply crawl delay by x1000 to be in // milliseconds/ms int64_t vv = (int64_t)(atof(v) * 1000LL); // truncate to 0x7fffffff if ( vv > 0x7fffffff ) *crawlDelay = 0x7fffffff; else if ( vv < 0 ) *crawlDelay = -1; else *crawlDelay = (int32_t)vv; // get the delay //*crawlDelay = atol(v) * 1000; goto urlLoop; } // if already disallowed, just goto the next line if ( !allowed ) goto urlLoop; // if we have an allow line or sitemap: line, then set flag to 1 // so we can go to another user-agent line. 
// fixes romwebermarketplace.com/robots.txt // (doc.156447320458030317.txt) if ( flen==5 && strncasecmp(f,"allow" ,5)==0 ) { *hadAllowOrDisallow = true; flag = 1; } if ( flen==7 && strncasecmp(f,"sitemap",7)==0 ) { flag = 1; } // if not disallow go to loop at top if ( flen != 8 ) goto urlLoop; if ( strncasecmp ( f , "disallow" , 8 ) != 0 ) { goto urlLoop; } // we had a disallow *hadAllowOrDisallow = true; // set flag flag = 1; // . take off trailing chars from the banned path name // . this is now done below //while ( vlen > 0 && is_space(v[vlen-1]) ) vlen--; // . skip leading spaces // . this should be done in mime class // while ( vlen > 0 && is_space(v[0]) ) { v++; vlen--; } // now stop at first space after url or end of line char *s = v; char *send = v + vlen; // skip all non-space chars while ( s < send && ! is_wspace_a(*s) ) s++; // stop there vlen = s - v; // check for match char *tmpPath = path; int32_t tmpPathLen = pathLen; // assume path begins with / if ( vlen > 0 && v[0] != '/'){tmpPath++;tmpPathLen--;} if ( vlen > tmpPathLen ) goto urlLoop; if ( strncasecmp(tmpPath,v,vlen) != 0 ) goto urlLoop; // an exact match if ( vlen == tmpPathLen ) { //return false; allowed = false; goto urlLoop; } // must be something if ( vlen <= 0 ) goto urlLoop; // "v" may or may not end in a /, it really should end in a / though if ( v[vlen-1] == '/' && tmpPath[vlen-1] == '/' ) { //return false; allowed = false; goto urlLoop; } if ( v[vlen-1] != '/' && tmpPath[vlen ] == '/' ) { //return false; allowed = false; goto urlLoop; } // let's be stronger. just do the substring match. if the webmaster // does not want us splitting path or file names then they should end // all of their robots.txt entries in a '/'. this also fixes the // problem of the "Disallow: index.htm?" line. //return false; allowed = false; // get another url path goto urlLoop; } // when doing a custom crawl we have to decide between the provided crawl // delay, and the one in the robots.txt... int32_t *XmlDoc::getFinalCrawlDelay() { if ( m_finalCrawlDelayValid ) return &m_finalCrawlDelay; bool *isAllowed = getIsAllowed(); if ( ! isAllowed || isAllowed == (void *)-1 ) return (int32_t *)isAllowed; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; m_finalCrawlDelayValid = true; // getIsAllowed already sets m_crawlDelayValid to true if ( ! cr->m_isCustomCrawl ) { m_finalCrawlDelay = m_crawlDelay; // default to 250ms i guess if none specified in robots // just to be somewhat nice by default if ( m_crawlDelay < 0 ) m_finalCrawlDelay = 250; return &m_finalCrawlDelay; } // get manually specified crawl delay in seconds. convert to ms. int32_t manual = (int32_t)(cr->m_collectiveCrawlDelay * 1000.0); // negative means -1 means unknown or not specified if ( manual < 0 ) manual = -1; // if both are unknown... if ( m_crawlDelay == -1 && manual == -1 ) { m_finalCrawlDelay = -1; return &m_finalCrawlDelay; } // if not in robots.txt use manual if ( m_crawlDelay == -1 ) { m_finalCrawlDelay = manual; return &m_finalCrawlDelay; } // if manually provided crawldelay is -1, use robots.txt then if ( manual == -1 ) { m_finalCrawlDelay = m_crawlDelay; return &m_finalCrawlDelay; } // let robots.txt dictate if both are >= 0 if ( m_useRobotsTxt ) { m_finalCrawlDelay = m_crawlDelay; return &m_finalCrawlDelay; } // if not using robots.txt, pick the smallest if ( m_crawlDelay < manual ) m_finalCrawlDelay = m_crawlDelay; else m_finalCrawlDelay = manual; return &m_finalCrawlDelay; } // . get the Robots.txt and see if we are allowed // . 
returns NULL and sets g_errno on error // . returns -1 if blocked, will re-call m_callback // . getting a robots.txt is not trivial since we need to follow redirects, // so we make use of the powerful XmlDoc class for this bool *XmlDoc::getIsAllowed ( ) { // return if we got it if ( m_isAllowedValid ) return &m_isAllowed; // could be turned off for everyone if ( ! m_useRobotsTxt ) { m_isAllowed = true; m_isAllowedValid = true; m_crawlDelayValid = true; m_crawlDelay = -1; //log("xmldoc: skipping robots.txt lookup for %s", // m_firstUrl.m_url); return &m_isAllowed; } // . if setting from a title rec, assume allowed // . this avoids doConsistencyCheck() from blocking and coring if ( m_setFromTitleRec ) { m_isAllowed = true; m_isAllowedValid = true; return &m_isAllowed; } if ( m_recycleContent ) { m_isAllowed = true; m_isAllowedValid = true; return &m_isAllowed; } // double get? if ( m_crawlDelayValid ) { char *xx=NULL;*xx=0; } // . if WE are robots.txt that is always allowed!!! // . check the *first* url since these often redirect to wierd things Url *fu = getFirstUrl(); bool isRobotsTxt = isRobotsTxtFile ( fu->getUrl() , fu->getUrlLen() ); if ( isRobotsTxt ) { m_isAllowed = true; m_isAllowedValid = true; m_crawlDelayValid = true; // make it super fast... m_crawlDelay = 0; return &m_isAllowed; } // or if using the "qatest123" collection, assume yes! //if ( ! strcmp ( m_coll , "qatest123" ) ) { // m_isAllowed = true; // m_isAllowedValid = true; // return &m_isAllowed; //} // update status msg setStatus ( "getting robots.txt" ); // sanity int32_t *ip = getIp (); // error? or blocked? if ( ! ip || ip == (void *)-1 ) return (bool *)ip; // if ip does not exist on the dns, do not try to download robots.txt // it is pointless... this can happen in the dir coll and we basically // have "m_siteInCatdb" set to true if ( *ip == 1 || *ip == 0 || *ip == -1 ) { // note this log("build: robots.txt ip is %s for url=%s. allowing for now.", fu->getUrl(),iptoa(*ip)); // just core for now //char *xx=NULL;*xx=0; m_isAllowed = true; m_isAllowedValid = true; // since ENOMIME is no longer causing the indexCode // to be set, we are getting a core because crawlDelay // is invalid in getNewSpiderReply() m_crawlDelayValid = true; m_crawlDelay = -1; return &m_isAllowed; } // we need this so getExtraDoc does not core int32_t *pfip = getFirstIp(); if ( ! pfip || pfip == (void *)-1 ) return (bool *)pfip; // set m_extraUrl to the robots.txt url char buf[MAX_URL_LEN+1]; char *p = buf; p += sprintf ( p , "http://" ); // get the current url after redirects Url *cu = getCurrentUrl(); if ( ! cu || cu == (void *)-1 ) return (bool *)cu; // sanity if ( ! cu->getHost() ) { char *xx=NULL;*xx=0; } gbmemcpy ( p , cu->getHost() , cu->getHostLen() ); p += cu->getHostLen(); int32_t port = cu->getPort(); // 80 is the default port int32_t defPort = 80; // is it https://? if ( cu->m_url[4] == 's' ) defPort = 443; if ( port != defPort ) p += sprintf ( p , ":%"INT32"",port ); p += sprintf ( p , "/robots.txt" ); m_extraUrl.set ( buf ); // . maxCacheAge = 3600 seconds = 1 hour for robots.txt // . if this is non-zero then msg13 should store it as well! // . for robots.txt it should only cache the portion of the doc // relevant to our user agent! // . getHttpReply() should use msg13 to get cached reply! XmlDoc **ped = getExtraDoc ( m_extraUrl.getUrl() , 3600 ); if ( ! ped || ped == (void *)-1 ) return (bool *)ped; // assign it XmlDoc *ed = *ped; // return NULL on error with g_errno set if ( ! ed ) { // sanity check, g_errno must be set if ( ! 
g_errno ) { char *xx=NULL;*xx=0; } // log it -- should be rare? log("doc: had error getting robots.txt: %s", mstrerror(g_errno)); return NULL; } // inherit this //if ( ! m_useIpsTxtFile ) ed->m_useIpsTxtFile = false; // . steal m_firstIp from us to avoid tag rec lookup // . why was this commented out? // . maybe because if we redirect, this is not the same!!! //ed->m_firstIp = m_firstIp; //ed->m_firstIpValid = m_firstIpValid;//true; // also, steal our ip! neither is this! //ed->m_ip = m_ip; //ed->m_ipValid = m_ipValid; // . now try the content // . should call getHttpReply char **pcontent = ed->getContent(); if ( ! pcontent || pcontent == (void *)-1 ) return (bool *)pcontent; // get the mime HttpMime *mime = ed->getMime(); if ( ! mime || mime == (HttpMime *)-1 ) return (bool *)mime; // get this int32_t contentLen = ed->m_contentLen; // save this m_robotsTxtLen = contentLen; m_robotsTxtLenValid = true; // get content char *content = *pcontent; // sanity check if ( content && contentLen>0 && content[contentLen] != '\0'){ char*xx=NULL;*xx=0;} // reset this. -1 means unknown or none found. m_crawlDelay = -1; m_crawlDelayValid = true; // assume valid and ok to spider m_isAllowed = true; m_isAllowedValid = true; // put in a crawldelay test for diffbot /* SafeBuf tmp; if ( strstr(m_firstUrl.getUrl(),"diffbot.com") ) { tmp.safePrintf("User-Agent: *\n" "Crawl-Delay: 10.1\n" ); content = tmp.getBufStart(); contentLen = tmp.getLength(); } // if not success, assume no robots.txt else*/ if ( mime->getHttpStatus() != 200 ) { // nuke it to save mem nukeDoc ( ed ); return &m_isAllowed; } // get the url we lookup //Url *cu = getCurrentUrl(); // this is set to true if our userAgent was found explicitly bool uaFound; bool allowed; char *cacheStart; int32_t cacheLen; bool hadAllowOrDisallow; int32_t savedCrawlDelay = -1; // now use left-anchored substring match so we can match Gigabot/1.0 allowed = isAllowed2 ( cu , g_conf.m_spiderUserAgent , content , contentLen , &uaFound , true , // substrmatch? &m_crawlDelay , &cacheStart , &cacheLen , &hadAllowOrDisallow ); // save it savedCrawlDelay = m_crawlDelay; // . if didn't find our user agent so check for * as a user-agent // . www.wikihow.com/robots.txt just has "Gigabot: crawl-delay:10\n" // and then a "User-Agent: *" after that with the disallows, so // i added the hadAllowDisallow parm if ( ! uaFound || ! hadAllowOrDisallow ) allowed = isAllowed2 ( cu , "*" , content , contentLen , &uaFound , false , // substrmatch? &m_crawlDelay , &cacheStart , &cacheLen , &hadAllowOrDisallow ); // bring back? if ( savedCrawlDelay != -1 ) m_crawlDelay = savedCrawlDelay; // nuke it to save mem nukeDoc ( ed ); // we are legit m_isAllowed = allowed; m_isAllowedValid = true; return &m_isAllowed; } // . lookup the title rec with the "www." if we do not have that in the url // . returns NULL and sets g_errno on error // . returns -1 if blocked, will re-call m_callback char *XmlDoc::getIsWWWDup ( ) { // this is not a real error really //if ( g_errno == ENOTFOUND ) g_errno = 0; // return if we got it if ( m_isWWWDupValid ) return &m_isWWWDup; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // could be turned off for everyone if ( ! cr->m_dupCheckWWW ) { m_isWWWDup = false; m_isWWWDupValid = true; return &m_isWWWDup; } // get the FIRST URL... 
(no longer current url after redirects) Url *u = getFirstUrl(); // CurrentUrl(); // if we are NOT a DOMAIN-ONLY url, then no need to do this dup check if ( u->getDomainLen() != u->getHostLen() ) { m_isWWWDup = false; m_isWWWDupValid = true; return &m_isWWWDup; } // must NOT have a www if ( ! u->isHostWWW() ) { m_isWWWDup = false; m_isWWWDupValid = true; return &m_isWWWDup; } // make it without the www char withoutWWW[MAX_URL_LEN+1]; char *proto = "http"; if ( u->isHttps() ) proto = "https"; sprintf(withoutWWW,"%s://%s",proto,u->getDomain()); // assume yes m_isWWWDup = true; if ( ! m_calledMsg22f ) setStatus ( "getting possible www dup title rec" ); // . does this title rec exist in titledb? // . "justCheckTfndb" is set to true here! if ( ! m_calledMsg22f && ! m_msg22f.getTitleRec ( &m_msg22Request , withoutWWW , 0 , // probable docid cr->m_coll , // . msg22 will set this to point to it! // . if NULL that means NOT FOUND NULL , // tr ptr NULL , // tr size ptr true , // just chk tfndb? false, // getavaildocidonly m_masterState , m_masterLoop , m_niceness , // niceness false , // add to cache? 0 , // max cache age 999999 , // timeout seconds false )){//load balancing? // validate m_calledMsg22f = true; // return -1 if we blocked return (char *)-1; } // got it m_calledMsg22f = true; // valid now m_isWWWDupValid = true; // found? if ( ! g_errno && m_msg22f.m_found ) { // crap we are a dup m_isWWWDup = true; // set the index code //m_indexCode = EDOCDUPWWW; } // return us return &m_isWWWDup; } LinkInfo s_dummy2; // . returns NULL and sets g_errno on error // . returns -1 if blocked, will re-call m_callback LinkInfo *XmlDoc::getLinkInfo1 ( ) { if ( m_linkInfo1Valid && ptr_linkInfo1 ) return ptr_linkInfo1; // at least get our firstip so if cr->m_getLinkInfo is false // then getRevisedSpiderReq() will not core because it is invalid int32_t *ip = getFirstIp(); if ( ! ip || ip == (int32_t *)-1 ) return (LinkInfo *)ip; // just return nothing if not doing link voting CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // to keep things fast we avoid getting link info for some collections if ( ! m_linkInfo1Valid && ! cr->m_getLinkInfo ) { ptr_linkInfo1 = NULL; m_linkInfo1Valid = true; } // sometimes it is NULL in title rec when setting from title rec if ( m_linkInfo1Valid && ! ptr_linkInfo1 ) { memset ( &s_dummy2 , 0 , sizeof(LinkInfo) ); s_dummy2.m_lisize = sizeof(LinkInfo); ptr_linkInfo1 = &s_dummy2; size_linkInfo1 = sizeof(LinkInfo); return ptr_linkInfo1; } // return if we got it if ( m_linkInfo1Valid ) return ptr_linkInfo1; // change status setStatus ( "getting local inlinkers" ); XmlDoc **od = getOldXmlDoc ( ); if ( ! od || od == (XmlDoc **)-1 ) return (LinkInfo *)od; int32_t *sni = getSiteNumInlinks(); if ( ! sni || sni == (int32_t *)-1 ) return (LinkInfo *)sni; //int32_t *fip = getFirstIp(); //if ( ! fip || fip == (int32_t *)-1 ) return (LinkInfo *)fip; int64_t *d = getDocId(); if ( ! d || d == (int64_t *)-1 ) return (LinkInfo *)d; // sanity check. error? if ( *d == 0LL ) { log("xmldoc: crap no g_errno"); g_errno = EBADENGINEER; return NULL; if ( ! g_errno ) { char *xx=NULL;*xx=0; } return NULL; } char *mysite = getSite(); if ( ! 
mysite || mysite == (void *)-1 ) return (LinkInfo *)mysite; // no linkinfo for diffbot custom crawls to speed up if ( cr->m_isCustomCrawl ) { m_linkInfo1Valid = true; memset ( &s_dummy2 , 0 , sizeof(LinkInfo) ); s_dummy2.m_lisize = sizeof(LinkInfo); ptr_linkInfo1 = &s_dummy2; size_linkInfo1 = sizeof(LinkInfo); return ptr_linkInfo1; } // grab a ptr to the LinkInfo contained in our Doc class LinkInfo *oldLinkInfo1 = NULL; if ( *od ) oldLinkInfo1 = (*od)->getLinkInfo1(); // if ip does not exist, make it 0 if ( *ip == 0 || *ip == -1 ) { m_linkInfo1Valid = true; memset ( &s_dummy2 , 0 , sizeof(LinkInfo) ); s_dummy2.m_lisize = sizeof(LinkInfo); ptr_linkInfo1 = &s_dummy2; size_linkInfo1 = sizeof(LinkInfo); return ptr_linkInfo1; } //link info generation requires an IP for internal/external computation // UNLESS we are from getSpiderStatusDocMetaList2() ... so handle // -1 above! //if ( *ip == -1 || *ip == 0 ) { char *xx=NULL;*xx=0; } // . error getting linkers? // . on udp timeout we were coring below because msg25.m_linkInfo // was NULL if ( g_errno && m_calledMsg25 ) return NULL; // prevent core as well //if ( m_calledMsg25 && ! size_linkInfo1 ) { // m_msg25.m_linkInfo ) { // log("xmldoc: msg25 had null link info"); // g_errno = EBADENGINEER; // return NULL; //} // . now search for some link info for this url/doc // . this queries the search engine to get linking docIds along // with their termIds/scores from anchor text and then compiles // it all into one IndexList // . if we have no linkers to this url then we set siteHash, etc. // for this linkInfo class // . this is my google algorithm // . let's use the first url (before redirects) for this // . m_newDocId is used for classifying doc under predefined news topic // . catSiteRec is used for classifying pages under a predefined // newstopic. this is currently for news search only. // . use the rootTitleRecPtr if there and we are doing our link info // stuff in this collection, but if doing it in another collection // the msg25 will look up the root in that collection... if ( ! m_calledMsg25 ) { // get this int32_t lastUpdateTime = getTimeGlobal(); // but be consistent if doing the "qatest123" collection if ( ! strcmp(cr->m_coll,"qatest123") ) { //if ( ! m_spideredTimeValid ) {char *xx=NULL;*xx=0;} lastUpdateTime = getSpideredTime();//m_spideredTime; } // do not redo it m_calledMsg25 = true; // int16_tcut //Msg25 *m = &m_msg25; // can we be cancelled? bool canBeCancelled = true; // not if pageparser though if ( m_pbuf ) canBeCancelled = false; // not if injecting if ( ! m_sreqValid ) canBeCancelled = false; // use this buffer so XmlDoc::print() can display wherever SafeBuf *sb = NULL; if ( m_pbuf ) sb = &m_pageLinkBuf; // only do this for showing them!!! if ( m_usePageLinkBuf ) sb = &m_pageLinkBuf; // get from spider request if there //bool injected = false; //if ( m_sreqValid && m_sreq.m_isInjecting ) injected = true; // we do not want to waste time computing the page title // of bad inlinks if we only want the good inlinks, because // as of oct 25, 2012 we only store the "good" inlinks // in the titlerec bool onlyNeedGoodInlinks = true; // so if steve wants to display all links then set this // to false so we get titles of bad inlinks if ( m_usePageLinkBuf ) onlyNeedGoodInlinks = false; // seems like pageparser.cpp just sets m_pbuf and not // m_usePageLinkBuf any more if ( m_pbuf ) onlyNeedGoodInlinks = false; // status update setStatus ( "calling msg25 for url" ); CollectionRec *cr = getCollRec(); if ( ! 
cr ) return NULL; // we want to get all inlinks if doing a custom crawlbot crawl // because we need the anchor text to pass in to diffbot bool doLinkSpamCheck = cr->m_doLinkSpamCheck; bool oneVotePerIpDom = cr->m_oneVotePerIpDom; // this seems to overdo it when we have a ton of linktext // perhaps, so take this out... //if ( cr->m_isCustomCrawl && cr->m_restrictDomain ) { // doLinkSpamCheck = false; // oneVotePerIpDom = false; // onlyNeedGoodInlinks = false; //} // call it. this is defined in Linkdb.cpp char *url = getFirstUrl()->getUrl(); if ( ! getLinkInfo ( &m_tmpBuf12, &m_mcast12, mysite , url , false , // isSiteLinkInfo? *ip , *d , cr->m_collnum , //linkInfoColl NULL , // qbuf 0 , // qbufSize m_masterState , m_masterLoop , m_contentInjected ,//m_injectedReply , sb , m_printInXml , *sni , //m_sitePop , oldLinkInfo1 , m_niceness , doLinkSpamCheck , oneVotePerIpDom , canBeCancelled , lastUpdateTime , onlyNeedGoodInlinks , false, // getlinkertitles 0, // ourhosthash32 (special) 0, // ourdomhash32 (special) &m_myPageLinkInfoBuf ) ) // blocked return (LinkInfo *)-1; // error? if ( g_errno ) return NULL; // panic! what the fuck? why did it return true and then // call our callback??? //if ( g_conf.m_logDebugBuild ) { log("build: xmldoc call to msg25 did not block"); // must now block since it uses multicast now to // send the request onto the network char *xx=NULL;*xx=0; //} } // at this point assume its valid m_linkInfo1Valid = true; // . get the link info we got set // . this ptr references into m_myPageLinkInfoBuf safebuf //ptr_linkInfo1 = m_msg25.m_linkInfo; //size_linkInfo1 = m_msg25.m_linkInfo->getSize(); ptr_linkInfo1 = (LinkInfo *)m_myPageLinkInfoBuf.getBufStart(); size_linkInfo1 = m_myPageLinkInfoBuf.length(); // we should free it m_freeLinkInfo1 = true; // this can not be NULL! if ( ! ptr_linkInfo1 || size_linkInfo1 <= 0 ) { log("build: error getting linkinfo1: %s",mstrerror(g_errno)); char *xx=NULL;*xx=0; return NULL; } // take it from msg25 permanently //m_msg25.m_linkInfo = NULL; // set flag m_linkInfo1Valid = true; // . validate the hop count thing too // . i took hopcount out of linkdb to put in lower ip byte for steve m_minInlinkerHopCount = -1;//m_msg25.getMinInlinkerHopCount(); // return it return ptr_linkInfo1; } static void *s_null = NULL; // . returns NULL and sets g_errno on error // . returns -1 if blocked, will re-call m_callback LinkInfo **XmlDoc::getLinkInfo2 ( ) { // this can now be title hashes for XmlDoc::m_diffbotTitleHashes // but otherwise, we don't use it for link info from another cluster // any more. m_linkInfo2Valid = true; return (LinkInfo **)&s_null; // return if we got it if ( m_linkInfo2Valid ) return &ptr_linkInfo2; m_linkInfo2Valid = true; ptr_linkInfo2 = NULL; return &ptr_linkInfo2; /* if ( ! cr->m_importFromHosts2Conf ) { m_linkInfo2Valid = true; ptr_linkInfo2 = NULL; return &ptr_linkInfo2; } // change status setStatus ( "getting remote hosts2.conf inlinkers" ); XmlDoc **od = getOldXmlDoc ( ); if ( ! od || od == (XmlDoc **)-1 ) return (LinkInfo **)od; int32_t *sni = getSiteNumInlinks(); if ( ! sni || sni == (int32_t *)-1 ) return (LinkInfo **)sni; int32_t *ip = getIp(); if ( ! ip || ip == (int32_t *)-1 ) return (LinkInfo **)ip; int64_t *d = getDocId(); if ( ! d || d == (int64_t *)-1 ) return (LinkInfo **)d; // grab a ptr to the LinkInfo contained in our Doc class LinkInfo *oldLinkInfo2 = NULL; if ( *od ) oldLinkInfo2 = *(*od)->getLinkInfo2(); // . now search for some link info for this url/doc // . 
this queries the search engine to get linking docIds along // with their termIds/scores from anchor text and then compiles // it all into one IndexList // . if we have no linkers to this url then we set siteHash, etc. // for this linkInfo class // . this is my google algorithm // . let's use the first url (before redirects) for this // . m_newDocId is used for classifying doc under predefined news topic // . catSiteRec is used for classifying pages under a predefined // newstopic. this is currently for news search only. // . use the rootTitleRecPtr if there and we are doing our link info // stuff in this collection, but if doing it in another collection // the msg25 will look up the root in that collection... if ( ! m_calledMsg25b ) { // do not redo it m_calledMsg25b = true; // int16_tcut Msg25 *m = &m_msg25; // can we be cancelled? bool canBeCancelled = true; // not if pageparser though if ( m_pbuf ) canBeCancelled = false; // not if injecting if ( ! m_sreqValid ) canBeCancelled = false; // use this buffer so XmlDoc::print() can display wherever //SafeBuf *sb = NULL; //if ( m_pbuf ) sb = &m_pageLinkBuf2; // call it if ( ! m->getPageLinkInfo2 ( getFirstUrl() , m_coll , cr->m_externalColl , m_masterState , m_masterLoop , cr->m_doLinkSpamCheck , cr->m_oneVotePerIpDom , canBeCancelled ) ) // blocked return (LinkInfo **)-1; // error? if ( g_errno ) return NULL; } // at this point assume its valid m_linkInfo2Valid = true; // get the link info we got set ptr_linkInfo2 = m_msg25.m_linkInfo; // we should free it m_freeLinkInfo2 = true; // take it from msg25 permanently m_msg25.m_linkInfo = NULL; // set flag m_linkInfo2Valid = true; // validate the hop count thing too //m_minInlinkerHopCount = m_msg25.getMinInlinkerHopCount(); // return it return &ptr_linkInfo2; */ } static void gotSiteWrapper ( void *state ) ; // . we should store the site in the title rec because site getter might // change what it thinks the site is! char *XmlDoc::getSite ( ) { // was there a problem getting site? if ( m_siteValid && m_siteGetter.m_errno ) { g_errno = m_siteGetter.m_errno; return NULL; } // ok, return it if ( m_siteValid ) return ptr_site;//m_siteGetter.m_site; // note it setStatus ( "getting site"); // need this TagRec *gr = getTagRec(); // sanity check if ( ! gr && ! g_errno ) { char *xx=NULL;*xx=0; } // blocked or error? if ( ! gr || gr == (TagRec *)-1 ) return (char *)gr; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // get url Url *f = getFirstUrl(); // this must be valid //if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; } int32_t timestamp = getSpideredTime();//m_spideredTime; // add tags to tagdb? //bool addTags = true; //if ( m_sreqValid && m_sreq.m_isPageParser ) addTags = false; //if ( getIsPageParser() ) addTags = false; // do it if ( ! m_siteGetter.getSite ( f->getUrl() , gr , timestamp , cr->m_collnum , m_niceness , //addTags , this , // state gotSiteWrapper )) // return -1 if we blocked return (char *)-1; // error? if ( g_errno ) return NULL; // set these then gotSite(); return ptr_site;//m_siteGetter.m_site; } // set it void gotSiteWrapper ( void *state ) { // point to us XmlDoc *THIS = (XmlDoc *)state; THIS->gotSite (); // resume. this checks g_errno for being set. THIS->m_masterLoop ( THIS->m_masterState ); } void XmlDoc::gotSite ( ) { // sanity check if ( ! m_siteGetter.m_allDone && ! g_errno ) { char *xx=NULL;*xx=0; } // this sets g_errno on error ptr_site = m_siteGetter.m_site; size_site = m_siteGetter.m_siteLen+1; // include \0 // sanity check -- must have a site if ( ! 
g_errno && size_site <= 1 ) { char *xx=NULL;*xx=0; } // sitegetter.m_errno might be set! m_siteValid = true; // must be valid if ( ! m_tagRecValid ) { char *xx=NULL;*xx=0; } // add the sitepathdepth tag to our tagrec //Tag *a = m_siteGetter.m_addedTag.getFirstTag(); //if ( a ) m_newTagRec.addTag ( a ); } int64_t *XmlDoc::getSiteHash64 ( ) { if ( m_siteHash64Valid ) return &m_siteHash64; char *site = getSite(); // sanity check if ( ! site && ! g_errno ) { char *xx=NULL;*xx=0; } if ( ! site || site == (void *)-1) return (int64_t *)site; m_siteHash64 = hash64 ( site , gbstrlen(site) ); m_siteHash64Valid = true; return &m_siteHash64; } int32_t *XmlDoc::getSiteHash32 ( ) { if ( m_siteHash32Valid ) return &m_siteHash32; char *site = getSite(); if ( ! site || site == (void *)-1) return (int32_t *)site; m_siteHash32 = hash32 ( site , gbstrlen(site) ); m_siteHash32Valid = true; return &m_siteHash32; } void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) { XmlDoc *THIS = (XmlDoc *)state; bool hadError = false; THIS->setStatus("got diffbot reply"); // wha? if ( g_errno ) { log("diffbot: http error2 %s",mstrerror(g_errno)); THIS->m_diffbotReplyError = g_errno; hadError = true; } //char *buf = s->m_readBuf; // do not allow TcpServer.cpp to free it since m_diffbotReply // is now responsible for that //s->m_readBuf = NULL; // set the mime HttpMime mime; if ( s->m_readOffset>0 && // set location url to "null" ! mime.set ( s->m_readBuf , s->m_readOffset , NULL ) ) { // g_errno should be set if ( ! g_errno ) { char *xx=NULL;*xx=0; } // note it log("xmldoc: error setting diffbot mime"); THIS->m_diffbotReplyError = EDIFFBOTMIMEERROR; hadError = true; } // check the status if ( ! hadError && mime.getHttpStatus() != 200 ) { THIS->m_diffbotReplyError = EDIFFBOTBADHTTPSTATUS; log("xmldoc: diffbot reply mime was %"INT32"", mime.getHttpStatus()); hadError = true; } // get page content char *page = NULL; int32_t pageLen = 0; if ( ! hadError && mime.getMimeLen() >= 0 ) { page = s->m_readBuf + mime.getMimeLen(); char *end = s->m_readBuf + s->m_readOffset; pageLen = end - page; } // "-1" means diffbot had an error if ( page && page[0] == '-' && page[1] == '1' ) { log("xmldoc: diffbot reply was -1"); THIS->m_diffbotReplyError = EDIFFBOTINTERNALERROR; } CollectionRec *cr = THIS->getCollRec(); // . verify that it contains legit json and has the last field // b/c we saw a case where the diffbot reply was truncated // somehow // . check to make sure it has the "url": field as all diffbot // json replies must if ( ! THIS->m_diffbotReplyError ) { char *ttt = strstr ( page , "\"url\":\""); if ( ! ttt ) ttt = strstr ( page , "\"pageUrl\":\""); if ( ! ttt ) { log("xmldoc: diffbot reply for %s using %s is missing " "the url: field in the json reply. reply=%s", THIS->m_firstUrl.m_url, THIS->m_diffbotUrl.getBufStart(), page ); // try to get the right error code char *err = strstr(page,"\"error\":\""); if ( err ) err += 9; int32_t code = EDIFFBOTUNKNOWNERROR; if ( err && !strncmp(err,"Unable to apply rules",21)) code = EDIFFBOTUNABLETOAPPLYRULES; // like .pdf pages get this error if ( err && !strncmp(err,"Could not parse page",20)) code = EDIFFBOTCOULDNOTPARSE; // if it is 404... 502, etc. 
any http status code if ( err && !strncmp(err,"Could not download page",23)) code = EDIFFBOTCOULDNOTDOWNLOAD; // custom api does not apply to the url if ( err && !strncmp(err,"Invalid API",11)) code = EDIFFBOTINVALIDAPI; if ( err && !strncmp(err,"Version required",16)) code = EDIFFBOTVERSIONREQ; if ( err && !strncmp(err,"Empty content",13)) code = EDIFFBOTEMPTYCONTENT; if ( err && !strncmp(err,"No content received",19)) code = EDIFFBOTEMPTYCONTENT; if ( err && !strncmp(err,"Request timed",13)) code = EDIFFBOTREQUESTTIMEDOUT; // error processing url if ( err && !strncmp(err,"Error processing",16)) code = EDIFFBOTURLPROCESSERROR; if ( err && !strncmp(err,"Your token has exp",18)) code = EDIFFBOTTOKENEXPIRED; THIS->m_diffbotReplyError = code; } // a hack for detecting if token is expired if ( ! ttt && cr && strstr ( page , ":429}" ) ) { // note it log("xmldoc: pausing crawl %s (%"INT32") because " "token is expired",cr->m_coll,(int32_t)cr->m_collnum); // pause the crawl SafeBuf parmList; // spidering enabled is the "cse" cgi parm in Parms.cpp g_parms.addNewParmToList1 ( &parmList , cr->m_collnum, "0", // val -1 , "cse"); // this uses msg4 so parm ordering is guaranteed g_parms.broadcastParmList ( &parmList , NULL , NULL ); } } // reply is now valid but might be empty THIS->m_diffbotReplyValid = true; //if ( ! cr ) return; bool countIt = true; if ( ! cr ) countIt = false; if ( THIS->m_diffbotReplyError ) countIt = false; /* // if doing /vxxx/analzye?mode=xxxx then ensure matches bool isAnalyze = false; if ( countIt && m_diffbotApiUrlValid && strstr ( m_diffbotApiUrl.getBufStart(), "/analyze?") ) isAnalyze = true; char *mode = NULL; if ( isAnalyze ) { mode = strstr (m_diffbotApiUrl.getBufStart(), "mode="); if ( mode ) mode += 5; // find end of it */ // increment this counter on a successful reply from diffbot if ( countIt ) { // ! THIS->m_diffbotReplyError && cr ) { // mark this flag THIS->m_gotDiffbotSuccessfulReply = 1; // count it for stats cr->m_localCrawlInfo.m_pageProcessSuccesses++; cr->m_globalCrawlInfo.m_pageProcessSuccesses++; // per round as well cr->m_localCrawlInfo.m_pageProcessSuccessesThisRound++; cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound++; // log it log(LOG_INFO, "build: processed page %s (pageLen=%"INT32")", THIS->m_firstUrl.m_url, pageLen); // changing status, resend local crawl info to all cr->localCrawlInfoUpdate(); // sanity! // crap, this can happen if we try to get the metalist // of an old page for purposes of incremental indexing or // deletion. we do not re-download it, but it seems we try // to re-process it... //if ( cr->m_localCrawlInfo.m_pageProcessAttempts > // cr->m_localCrawlInfo.m_pageDownloadAttempts ) { // char *xx=NULL;*xx=0; } // need to save collection rec now during auto save cr->m_needsSave = true; // the diffbot api url we used //SafeBuf *au = THIS->getDiffbotApiUrl(); //if ( ! au || au == (void *)-1 ) {char *xx=NULL;*xx=0;} // set the reply properly int32_t need = pageLen + 1;// + au->length() + 1; if ( ! THIS->m_diffbotReply.reserve ( need ) ) goto skip; // first store the url we used on first line //THIS->m_diffbotReply.safeMemcpy ( au->getBufStart(), // au->length() ); //THIS->m_diffbotReply.pushChar('\n'); // convert the \u1f23 to utf8 (\n and \r as well) // crap, this decodes \\\\\" to \\" which is causing // the json parser to believe it is an encoded \ then // a REAL quote... but quote is contained... 
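	// . a minimal sketch of the scanning rule the tokenizer below relies
	//   on to sidestep the problem described above: inside a JSON string
	//   a backslash consumes the following byte, so "\\" must be handled
	//   before "\"" or an escaped backslash followed by a quote would be
	//   misread as an escaped quote
	// . the helper name is an assumption for illustration only
	/*
	#include <stddef.h>

	// "p" points at the byte right after the opening quote; returns the
	// closing quote, or NULL if the string is unterminated
	static const char *findJsonStringEndSketch ( const char *p ) {
		for ( ; *p ; p++ ) {
			// any escape (including \\ and \") eats two bytes
			if ( *p == '\\' && p[1] ) { p++; continue; }
			if ( *p == '\"' ) return p;
		}
		return NULL;
	}
	*/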
//THIS->m_diffbotReply.safeDecodeJSONToUtf8 ( page , pageLen , // THIS->m_niceness ); // do not do that any more then, jsonparse can call it // on a per string basis THIS->m_diffbotReply.safeMemcpy ( page , pageLen ); // convert embedded \0 to space //char *p = THIS->m_diffbotReply.getBufStart(); //char *pend = p + THIS->m_diffbotReply.getLength(); // tack on a \0 but don't increment m_length THIS->m_diffbotReply.nullTerm(); // any embedded \0's in the utf8? int32_t testLen1 = THIS->m_diffbotReply.length(); int32_t testLen2 = gbstrlen(THIS->m_diffbotReply.getBufStart()); if ( testLen1 != testLen2 ) { char *xx=NULL;*xx=0; } // convert the \u1f23 to utf8 (\n and \r as well) //THIS->m_diffbotReply.decodeJSONToUtf8 ( THIS->m_niceness ); //THIS->m_diffbotReply.nullTerm(); } skip: // resume. this checks g_errno for being set. THIS->m_masterLoop ( THIS->m_masterState ); } SafeBuf *XmlDoc::getDiffbotApiUrl ( ) { if ( m_diffbotApiUrlValid ) return &m_diffbotApiUrl; // if we are a diffbot json object, do not re-send to diffbot! if ( m_isDiffbotJSONObject ) { //m_diffbotApiNum = DBA_NONE; m_diffbotApiUrlValid = true; return &m_diffbotApiUrl; } CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; m_diffbotApiUrl.safeMemcpy ( &cr->m_diffbotApiUrl ); m_diffbotApiUrl.nullTerm(); m_diffbotApiUrlValid = true; // this now automatically sets m_diffbotApiUrl and m_diffbotApiUrlValid // in case the url filters table changes while spidering this!!! // gotta be careful of that. //int32_t *ufn = getUrlFilterNum(); //if ( ! ufn || ufn == (void *)-1 ) return (SafeBuf *)ufn; // ensure it does set it! //if ( ! m_diffbotApiUrlValid ) { char *xx=NULL;*xx=0; } //m_diffbotApiNum = cr->m_spiderDiffbotApiNum[*ufn]; // sanity check //if ( m_diffbotApiNum < 0 ) { char *xx=NULL;*xx=0; } //m_diffbotApiNumValid = true; return &m_diffbotApiUrl; } // if only processing NEW URLs is enabled, then do not get diffbot reply // if we already got one before bool *XmlDoc::getRecycleDiffbotReply ( ) { if ( m_recycleDiffbotReplyValid ) return &m_recycleDiffbotReply; // if from pageparser.cpp re-call diffbot for debugging if ( getIsPageParser() ) { m_recycleDiffbotReply = false; m_recycleDiffbotReplyValid = true; return &m_recycleDiffbotReply; } XmlDoc **odp = getOldXmlDoc( ); if ( ! odp || odp == (XmlDoc **)-1 ) return (bool *)odp; XmlDoc *od = *odp; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // if doc has been successfully processed in the past then // ***RECYCLE*** the diffbot reply! m_recycleDiffbotReply = false; if ( cr->m_diffbotOnlyProcessIfNewUrl && od && od->m_gotDiffbotSuccessfulReply ) m_recycleDiffbotReply = true; // don't recycle if specfically asked to reindex though if ( m_sreqValid && m_sreq.m_isPageReindex ) m_recycleDiffbotReply = false; // unless the 'recycle content' checkbox was checked when doing // the query (page) reindex... if ( m_sreqValid && m_sreq.m_recycleContent ) m_recycleDiffbotReply = true; m_recycleDiffbotReplyValid = true; return &m_recycleDiffbotReply; } // get hashes of the json objects in the diffbotreply int32_t *XmlDoc::getDiffbotTitleHashes ( int32_t *numHashes ) { *numHashes = size_linkInfo2 / 4; if ( ! ptr_linkInfo2 ) *numHashes = 0; // hack: use linkdbdata2 field if ( m_diffbotTitleHashBufValid ) { // do not return NULL without g_errno set if ( ptr_linkInfo2 == NULL ) return (int32_t *)0x01; return (int32_t *)ptr_linkInfo2; } SafeBuf *tdbr = getTokenizedDiffbotReply(); if ( ! tdbr || tdbr == (void *)-1 ) return (int32_t *)tdbr; HashTableX dedup; if ( ! 
dedup.set ( 4,0,1024,NULL,0,false,m_niceness,"ddthbuf") ) return NULL; // parse out the json items in the reply char *p = tdbr->getBufStart(); char *pend = p + tdbr->length(); int32_t plen; for ( ; p < pend ; p += plen + 1 ) { // breathe some in case diffbot reply is 250MB QUICKPOLL(m_niceness); // set this plen = gbstrlen(p); // get title from it int32_t valLen; char *val = getJSONFieldValue ( p , "title", &valLen ); int32_t th32 = 0; // hash the title if ( val && valLen ) { th32 = hash32 ( val , valLen ); // avoid 0 if ( th32 == 0 ) th32 = 1; } // if no title, use hash of body if ( th32 == 0 ) { th32 = hash32 ( p , plen ); // avoid 0 if ( th32 == 0 ) th32 = 2; } // if our hash is duplicated then increment until unique while ( dedup.isInTable ( &th32 ) ) th32++; // store it for deduping dedup.addKey ( &th32 ); // store it m_diffbotTitleHashBuf.pushLong(th32); } ptr_linkInfo2 = (LinkInfo *)m_diffbotTitleHashBuf.getBufStart(); size_linkInfo2 = m_diffbotTitleHashBuf.length(); *numHashes = size_linkInfo2 / 4; m_diffbotTitleHashBufValid = true; // if no hashes return 0x01 because NULL means g_errno if ( ptr_linkInfo2 == NULL ) return (int32_t *)0x01; return (int32_t *)ptr_linkInfo2; } // . we now get the TOKENIZED diffbot reply. // . that converts a single diffbot reply into multiple \0 separated // json objects. // . for instance, the diffbot product api returns an array like // "products":[{...},{...}],"url":... that consists of multiple // json product items, but the json elements that are not in // this array are description of the page itself, like url and title. // so we need to carry over these outter json objects to each // inner json object we tokenize. // . in this fashion we'll have separate objects that can each be indexed // as a single page, which is what we want for searching. SafeBuf *XmlDoc::getTokenizedDiffbotReply ( ) { if ( m_tokenizedDiffbotReplyValid ) return m_tokenizedDiffbotReplyPtr; SafeBuf *dbr = getDiffbotReply(); if ( ! dbr || dbr == (void *)-1 ) return dbr; // empty? that's easy. might be just "{}\n" i guess if ( dbr->length() <= 3 ) return dbr; char *text = dbr->getBufStart(); Json jp; if ( ! jp.parseJsonStringIntoJsonItems ( text , m_niceness ) ) { g_errno = EBADJSONPARSER; return NULL; } JsonItem *jsonItem = jp.getItem("objects"); char *array = NULL; int32_t arrayLen = 0; if ( jsonItem ) { array = jsonItem->getArrayStart(); arrayLen = jsonItem->getArrayLen(); } if ( array && arrayLen > 0 ) { m_v3buf.safeMemcpy( array , arrayLen ); m_v3buf.nullTerm(); // trim off the enclosing []'s char *p = m_v3buf.getBufStart(); for ( ; *p && is_wspace_a(*p) ; p++ ); if ( *p == '[') *p = ' '; char *e = m_v3buf.getBuf()-1; for ( ; e>p && is_wspace_a(*e) ;e--); if ( *e ==']') *e=' '; // replace top level commas with \0's int32_t curlies = 0; char *x = p; bool inQuotes = false; // scan now for ( ; *x ; x++ ) { // escaping a backslash? if ( *x == '\\' && x[1] == '\\' ) { // skip two bytes then.. x++; continue; } // escaping a quote? ignore quote then. if ( *x == '\\' && x[1] == '\"' ) { // skip two bytes then.. x++; continue; } if ( *x == '\"' ) { inQuotes = ! 
inQuotes; continue; } // if in a quote, ignore {} in there if ( inQuotes ) continue; if ( *x== '{' ) { curlies++; continue; } if ( *x == '}' ) { curlies--; continue; } if ( curlies != 0 ) continue; if ( *x == ',' ) *x = '\0'; } m_tokenizedDiffbotReplyPtr = &m_v3buf; m_tokenizedDiffbotReplyValid = true; return m_tokenizedDiffbotReplyPtr; } // it must have \"type\":\"product or \"type\":\"image // in order for us to do the array separation logic below. // we don't want to do this logic for articles because they // contain an image array!!! // this must be on the FIRST level of the json object, otherwise // we get errors because we got type:article and it // contains an images array! int32_t valLen; char *val = getJSONFieldValue ( text , "type", &valLen ); bool isProduct = false; bool isImage = false; if ( val && valLen == 7 && strncmp ( val , "product", 7) == 0 ) isProduct = true; if ( val && valLen == 5 && strncmp ( val , "image", 5) == 0 ) isImage = true; if ( ! isProduct && ! isImage ) { m_tokenizedDiffbotReplyValid = true; m_tokenizedDiffbotReplyPtr = &m_diffbotReply; return m_tokenizedDiffbotReplyPtr; } char *needle; char *newTerm; if ( isProduct ) { needle = ",\"products\":["; newTerm = "product"; } else { needle = ",\"images\":["; newTerm = "image"; } char *parray = strstr ( text , needle ); // if not found, no need to do anything... if ( ! parray ) { m_tokenizedDiffbotReplyValid = true; m_tokenizedDiffbotReplyPtr = &m_diffbotReply; return m_tokenizedDiffbotReplyPtr; } // point to [ char *pstart = parray + gbstrlen(needle) - 1; // // ok, now we have to do so json ju jitsu to fix it // // point to array. starting at the '[' char *p = pstart; int32_t brackets = 0; bool inQuotes = false; for ( ; *p ; p++ ) { // escaping a quote? ignore quote then. if ( *p == '\\' && p[1] == '\"' ) { // skip two bytes then.. p++; continue; } if ( *p == '\"' ) { inQuotes = ! inQuotes; continue; } // if in a quote, ignore {} in there if ( inQuotes ) continue; if ( *p == '[' ) brackets++; if ( *p != ']' ) continue; brackets--; // stop if array is done. p points to ']' if ( brackets == 0 ) break; } // now point to outter items to the left of the ",\"products\":[... char *left1 = dbr->getBufStart(); char *left2 = parray; // then to the right. skip over the ending ']' char *right1 = p + 1; char *right2 = dbr->getBuf(); // end of the buffer SafeBuf *tbuf = &m_tokenizedDiffbotReply; // now scan the json products or images in the array char *x = pstart; // skip over [ x++; // each product item in array is enclosed in {}'s if ( *x != '{' ) { log("build: something is wrong with diffbot reply"); g_errno = EBADENGINEER; return NULL; } // reset CURLY bracket count int32_t curlies = 0; char *xstart = NULL; inQuotes = false; // scan now for ( ; x < right1 ; x++ ) { // escaping a quote? ignore quote then. if ( *x == '\\' && x[1] == '\"' ) { // skip two bytes then.. x++; continue; } if ( *x == '\"' ) { inQuotes = ! inQuotes; continue; } // if in a quote, ignore {} in there if ( inQuotes ) continue; if ( *x== '{' ) { if ( curlies == 0 ) xstart = x; curlies++; continue; } if ( *x == '}' ) { curlies--; if ( curlies != 0 ) continue; // unreciprocated '{'? wtf??? if ( ! xstart ) continue; // skip empty curlies if ( x[-1] == '{' ) continue; // // ok, we got an item! // // left top items if ( ! tbuf->safeMemcpy ( left1 , left2-left1 ) ) return NULL; // use "product": if ( ! tbuf->safePrintf(",\"%s\":" , newTerm ) ) return NULL; // the item itself, include it's curlies. if ( ! 
tbuf->safeMemcpy ( xstart , x - xstart+1 ) ) return NULL; // right top items if ( ! tbuf->safeMemcpy ( right1 , right2-right1 ) ) return NULL; // then a \0 if ( ! tbuf->pushChar('\0') ) return NULL; // reset this! xstart = NULL; } } // now show the items. debug! //p = tbuf->getBufStart(); //for ( ; p < tbuf->getBuf() ; p += gbstrlen(p) + 1 ) // fprintf(stderr,"ITEM\n%s\n\n",p); m_tokenizedDiffbotReplyPtr = tbuf; m_tokenizedDiffbotReplyValid = true; return m_tokenizedDiffbotReplyPtr; } // . convert document into json representing multiple documents // if it makes sense. sometimes a single url contains multiple // subdocuments that each should have their own url, but do not, // so we fix that here. // . the diffbot reply will be a list of json objects we want to index SafeBuf *XmlDoc::getDiffbotReply ( ) { if ( m_diffbotReplyValid ) return &m_diffbotReply; // . check the url filters table to see if diffbot api is specified // . just return "\0" if none, but NULL means error i guess SafeBuf *au = getDiffbotApiUrl(); if ( ! au || au == (void *)-1 ) return (SafeBuf *)au; // if no url, assume do not access diffbot if ( au->length() <= 0 ) { m_diffbotReplyValid = true; return &m_diffbotReply; } // if we are json do not send that to diffbot, like an injected // json diffbot object. should fix json injections into gobal index uint8_t *ct = getContentType(); if ( ! ct || ct == (void *)-1 ) return (SafeBuf *)ct; if ( *ct == CT_JSON ) { m_diffbotReplyValid = true; return &m_diffbotReply; } // we make a "fake" url for the diffbot reply when indexing it // by appending -diffbotxyz%"UINT32". see "fakeUrl" below. if ( m_firstUrl.getUrlLen() + 24 >= MAX_URL_LEN ) { if ( m_firstUrlValid ) log("build: diffbot url would be too long for " "%s", m_firstUrl.getUrl() ); else log("build: diffbot url would be too long for " "%"INT64"", m_docId ); m_diffbotReplyValid = true; return &m_diffbotReply; } // getIndexCode() calls getDiffbotReply(), so avoid a loop! //if ( *getIndexCode() ) // return &m_diffbotReply; if ( m_indexCodeValid && m_indexCode ) return &m_diffbotReply; if ( m_isDiffbotJSONObject ) { m_diffbotReplyValid = true; return &m_diffbotReply; } // if this is a robots.txt or a root page we are downloading // separately to get the title for to compare to this page's title, // or whatever, do not pass to diffbot if ( m_isChildDoc ) { m_diffbotReplyValid = true; return &m_diffbotReply; } CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // get list of substring patterns char *ucp = cr->m_diffbotUrlCrawlPattern.getBufStart(); char *upp = cr->m_diffbotUrlProcessPattern.getBufStart(); if ( upp && ! upp[0] ) upp = NULL; if ( ucp && ! ucp[0] ) ucp = NULL; // do we match the url process pattern or regex? // get the compiled regular expressions //regex_t *ucr = &cr->m_ucr; regex_t *upr = &cr->m_upr; //if ( ! cr->m_hasucr ) ucr = NULL; if ( ! cr->m_hasupr ) upr = NULL; // get the url Url *f = getFirstUrl(); char *url = f->getUrl(); // . "upp" is a ||-separated list of substrings // . "upr" is a regex // . 
regexec returns 0 for a match if ( upr && regexec(upr,url,0,NULL,0) ) { // return empty reply m_diffbotReplyValid = true; return &m_diffbotReply; } if ( upp && !upr &&!doesStringContainPattern(url,upp)) { // return empty reply m_diffbotReplyValid = true; return &m_diffbotReply; } // if already processed and onlyprocessifnewurl is enabled then // we recycle and do not bother with this, we also do not nuke // the diffbot json objects we have already indexed by calling // nukeJSONObjects() bool *recycle = getRecycleDiffbotReply(); if ( ! recycle || recycle == (void *)-1) return (SafeBuf *)recycle; if ( *recycle ) { m_diffbotReplyValid = true; return &m_diffbotReply; } // if set from title rec, do not do it. we are possibly an "old doc" // and we should only call diffbot.com with new docs if ( m_setFromTitleRec ) { m_diffbotReplyValid = true; return &m_diffbotReply; } // "none" means none too! Parms.cpp doesn't like &dapi1=& because // it does not call setParm() on such things even though it probably // should, it doesn't like no values, so i put "none" in there. if ( strncasecmp(au->getBufStart(),"none",4) == 0 ) { m_diffbotReplyValid = true; return &m_diffbotReply; } if ( strncasecmp(au->getBufStart(),"donotprocess",12) == 0 ) { m_diffbotReplyValid = true; return &m_diffbotReply; } // invalid url? Url apiUrl; apiUrl.set ( au->getBufStart() ); if ( apiUrl.getUrlLen() <= 0 || apiUrl.getHostLen() <= 0 || apiUrl.getDomainLen() <= 0 ) { log("build: invalid diffbot api url of \"%s\".", au->getBufStart() ); m_diffbotReplyValid = true; return &m_diffbotReply; } // when respidering an "old" doc, never call this. we already // have the diffbot replies xyz.com/-diffbot-0 and xyz.com/-diffbot-1 // etc. //if ( m_setFromTitleRec ) { char *xx = NULL; *xx = 0; } // sanity check. no! barfs on legit url with -diffbot- in it //if ( strstr(m_firstUrl.m_url,"-diffbot-") ) { // char *xx=NULL; *xx = 0; } // we should not "process" (i.e. send to diffbot) urls that do // not match the supplied CollectionRec::m_diffbotUrlProcessPattern // let's just put a checkbox in the url filters box for this! // i.e. Send to Diffbot? [X] //if ( m_useDiffbot && ! doesUrlMatchDiffbotProcessPattern() ) { // m_diffbotReplyValid = true; // return &m_diffbotReply; //} // empty content, do not send to diffbot then char **u8 = getUtf8Content(); if ( ! u8 || u8 == (char **)-1 ) return (SafeBuf *)u8; if ( ! *u8 ) { m_diffbotReplyValid = true; return &m_diffbotReply; } // do not send to diffbot if its binary! char *ib = getIsBinary(); if ( ! ib || ib == (void *)-1 ) return (SafeBuf *)ib; if ( *ib ) { m_diffbotReplyValid = true; log("diffbot: skipping binary page %s",m_firstUrl.m_url); return &m_diffbotReply; } // or if original page content matches the page regex dont hit diffbot if ( ! doesPageContentMatchDiffbotProcessPattern() ) { m_diffbotReplyValid = true; return &m_diffbotReply; } // now include referring link anchor text, etc. LinkInfo *info1 = getLinkInfo1 (); if ( ! info1 || info1 == (LinkInfo *)-1 ) return (SafeBuf *)info1; setStatus("getting diffbot reply"); // set up dedup table for deduping on link text HashTableX dedup; char tmp[512]; if ( ! dedup.set ( 4,0,32,tmp,512,false,m_niceness,"difdedup") ) return NULL; SafeBuf headers; bool first = true; // . make additional headers // . add two headers for every "good" (non-dup) link // . do NOT end headers in \r\n since HttpServer adds that! 
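	// . the loop below builds those headers, roughly like this
	//   (illustrative values, not taken from a real request):
	//     X-referring-url: http://example.com/some/page.html
	//     X-anchor-text: the link text with any \r or \n stripped
	//     X-surrounding-text: only included if under 2000 chars
	// . header pairs for different inlinks are separated by \r\n and we
	//   dedup on a 32-bit hash of the anchor text so the same link text
	//   is only sent once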
for ( Inlink *k=NULL ; info1 && (k=info1->getNextInlink(k)) ; ) { // breathe QUICKPOLL(m_niceness); // sanity if ( k->size_urlBuf <= 1 ) continue; // skip if too long if ( k->size_linkText > 1024 ) continue; // or not enough! (size includes \0) if ( k->size_linkText <= 1 ) continue; // sanity check char *txt = k->getLinkText(); int32_t tlen = k->size_linkText; if ( tlen > 0 ) tlen--; // this seems to happen sometimes.. if ( ! verifyUtf8 ( txt , tlen ) ) continue; // if anchor text has \0 skip it if ( gbstrlen(txt) != tlen ) continue; // or if surrounding text has \0 skip as well char *surStr = k->getSurroundingText(); int32_t surLen = k->size_surroundingText; if ( surLen > 0 ) surLen--; if ( surStr && gbstrlen(surStr) != surLen ) continue; // dedup on that int32_t h32 = hash32 ( txt , tlen ); if ( dedup.isInTable ( &h32 ) ) continue; if ( ! dedup.addKey ( &h32 ) ) return NULL; // separate with \r\n if ( ! first && ! headers.safePrintf("\r\n" ) ) return NULL; first = false; // add to http header if ( ! headers.safePrintf("X-referring-url: ") ) return NULL; // do not include the terminating \0, so -1 if ( ! headers.safeMemcpy(k->getUrl() , k->size_urlBuf-1 )) return NULL; // and link text if ( ! headers.safePrintf("\r\nX-anchor-text: ") ) return NULL; // store the anchor text without any \r or \n chars if ( ! headers.reserve ( tlen ) ) return NULL; char *p = txt; char *pend = txt + tlen; for ( ; p < pend ; p++ ) { if ( *p == '\r' ) continue; if ( *p == '\n' ) continue; headers.pushChar(*p); } // do not include it if more than 2000 chars big if ( surLen > 0 && surLen < 2000 ) { if ( ! headers.safePrintf("\r\nX-surrounding-text: ") ) return NULL; // make room for copying the surrounding text if ( ! headers.reserve ( surLen ) ) return NULL; // copy minus any \r or \n so its mime header safe p = surStr; pend = surStr + surLen; for ( ; p < pend ; p++ ) { if ( *p == '\r' ) continue; if ( *p == '\n' ) continue; headers.pushChar(*p); } } } // make sure to null term the headers if ( headers.length() && ! headers.nullTerm() ) return NULL; //char *path = "api"; //if ( strcmp(cr->m_diffbotApi.getBufStart(),"product") == 0 ) // path = "v2"; // // DIFFBOT injection interface TODO // // if we are intercepting a direct injection diffbot request // then we will probably take the exact same parms provided and // just relay them to diffbot here. maybe Diffbot.cpp can set // the original diffbot.com request url in this xmldoc class that // is being inject using the url encoded in that request. // // url can be on the stack since httpserver.cpp makes an http mime // from this url //SafeBuf diffbotUrl; // TODO: make sure "api" works as hostname for not just product... //diffbotUrl.safePrintf("http://www.diffbot.com/"); // skip extra '/'? //char *api = au->getBufStart(); //int32_t apiLen = au->length(); //if ( api && api[0] == '/' ) { api++; apiLen--; } // append the custom url. i.e. /api/analyze?mode=auto&u= //if ( api ) diffbotUrl.safeMemcpy ( api , apiLen ); // store the api url into here m_diffbotUrl.safeMemcpy ( apiUrl.getUrl() , apiUrl.getUrlLen() ); // . m_diffbotApi Is like "article" or "product" etc. // . if classify is true we always return the classification // of the page in the JSON. like "type":"frontpage" regardless // of the "api" specified. // . otherwise, if classify is false empty json will be returned // if there is no json objects of the specified page type, "api" // . BUT if api is "all" return all types of json objects // . SHOULD we return "type" in the json output? 
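	// . below we finish building m_diffbotUrl by appending, in order:
	//   token=<cr->m_diffbotToken> (only if one is configured),
	//   &proxy=<host>:<port> and &proxyAuth=... (only if floaters/proxies
	//   are in use), &timeout=... (only when using proxies), and finally
	//   &url=<url-encoded first url>
	// . so a finished request looks roughly like this (illustrative api
	//   url and token, the real base comes from the collection's
	//   diffbot api url parm):
	//   http://api.diffbot.com/v3/article?token=abc123&url=http%3A%2F%2F...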
/* if ( *an == DBA_ALL ) diffbotUrl.safePrintf("analyze?mode=auto&" ); else if ( *an == DBA_ARTICLE_FORCE ) diffbotUrl.safePrintf("article?"); else if ( *an == DBA_ARTICLE_AUTO ) diffbotUrl.safePrintf("analyze?mode=article&"); else if ( *an == DBA_PRODUCT_FORCE ) diffbotUrl.safePrintf("product?"); else if ( *an == DBA_PRODUCT_AUTO ) diffbotUrl.safePrintf("analyze?mode=product&"); else if ( *an == DBA_IMAGE_FORCE ) diffbotUrl.safePrintf("image?"); else if ( *an == DBA_IMAGE_AUTO ) diffbotUrl.safePrintf("analyze?mode=image&"); else if ( *an == DBA_FRONTPAGE_FORCE ) diffbotUrl.safePrintf("frontpage?"); else if ( *an == DBA_FRONTPAGE_AUTO ) diffbotUrl.safePrintf("analyze?mode=frontpage&"); else { log("build: unknown diffbot api num = %"INT32". assuming all",*an ); diffbotUrl.safePrintf("analyze?mode=auto&" ); } */ //CollectionRec *cr = getCollRec(); //if ( ! cr ) return NULL; // add a '?' if none if ( ! strchr ( apiUrl.getUrl() , '?' ) ) m_diffbotUrl.pushChar('?'); else m_diffbotUrl.pushChar('&'); //diffbotUrl.safePrintf("http://54.212.86.74/api/%s?token=%s&u=" // only print token if we have one, because if user provides their // own diffbot url (apiUrl in Parms.cpp) then they might include // the token in that for their non-custom crawl. m_customCrawl=0. if ( cr->m_diffbotToken.length()) m_diffbotUrl.safePrintf("token=%s", cr->m_diffbotToken.getBufStart()); bool useProxies = true; // user can turn off proxy use with this switch if ( ! g_conf.m_useProxyIps ) useProxies = false; // did collection override? if ( cr->m_forceUseFloaters ) useProxies = true; // we gotta have some proxy ips that we can use if ( ! g_conf.m_proxyIps.hasDigits() ) useProxies = false; // if we used a proxy to download the doc, then diffbot should too // BUT tell diffbot to go through host #0 so we can send it to the // correct proxy using our load balancing & backoff algos. if ( useProxies ) { //Host *h0 = g_hostdb.getHost(0); // use a random host now to avoid host #0 running // out of sockets from diffbot trying to connect // for downloading hundreds of urls from the same // high crawl delay site. // round robin over the hosts just to be more evenly // distributed. it will likely get several http requests // from diffbot. static int32_t s_lastHostId = -1; if ( s_lastHostId == -1 ) s_lastHostId = g_hostdb.m_myHost->m_hostId; int32_t r = s_lastHostId;//rand() % g_hostdb.m_numHosts; if ( ++s_lastHostId >= g_hostdb.m_numHosts ) s_lastHostId = 0; Host *h0 = g_hostdb.getHost(r); m_diffbotUrl.safePrintf("&proxy=%s:%"INT32"", iptoa(h0->m_ip), (int32_t)h0->m_httpPort); } char *p = g_conf.m_proxyAuth.getBufStart(); if ( useProxies && p ) { char *p1 = p; for ( ; *p1 && is_wspace_a(*p1) ; p1++ ); char *p2 = p1; for ( ; *p2 && ! is_wspace_a(*p2) ; p2++ ); char c = *p2; *p2 = '\0'; m_diffbotUrl.safePrintf("&proxyAuth="); m_diffbotUrl.urlEncode(p1); *p2 = c; } // if we use proxies then increase the timeout since proxies // increase the crawl delay in hopes of backing off to discover // the website's policy so we don't hit it too hard and get banned. // so to avoid diffbot timing out tell it to wait up to a minute // because the crawl delay can be as high as that, even higher if ( useProxies ) m_diffbotUrl.safePrintf("&timeout=%"INT32"", (int32_t)MAX_PROXYCRAWLDELAYMS+10000); m_diffbotUrl.safePrintf("&url="); // give diffbot the url to process m_diffbotUrl.urlEncode ( m_firstUrl.getUrl() ); // append this just in case the next thing doesn't have it. 
//if ( cr->m_diffbotApiQueryString.length() && // cr->m_diffbotApiQueryString.getBufStart()[0] != '&' ) // diffbotUrl.pushChar('&'); // then user provided parms that are dependent on if it is an // article, product, etc. like "&dontstripads=1" or whatever //diffbotUrl.safeStrcpy ( cr->m_diffbotApiQueryString.getBufStart()); // for analyze requests without mode=, make sure that diffbot expands all objects // "expand" is not used for all crawls as of Defect #2292: User crawls should only index embedded objects if crawling with analyze // null term it so that we can use strstr (shouldn't be necessary since safePrintf appears to do this already and is called above) if (m_diffbotUrl.nullTerm()) { char *u = m_diffbotUrl.getBufStart(); if (strstr(u, "/analyze") && !strstr(u, "mode=")) { m_diffbotUrl.safePrintf("&expand"); } } // null term it m_diffbotUrl.nullTerm(); // mark as tried if ( m_srepValid ) { char *xx=NULL;*xx=0; } m_sentToDiffbot = 1; // count it for stats cr->m_localCrawlInfo.m_pageProcessAttempts++; cr->m_globalCrawlInfo.m_pageProcessAttempts++; // changing status, resend local crawl info to all cr->localCrawlInfoUpdate(); cr->m_needsSave = true; char *additionalHeaders = NULL; if ( headers.length() > 0 ) additionalHeaders = headers.getBufStart(); // if did not get the web page first and we are crawling, not // doing a bulk, then core. we need the webpage to harvest links // and sometimes to check the pageprocesspattern to see if we should // process. if ( cr->m_isCustomCrawl ==1 && ! m_downloadStatusValid ) { char *xx=NULL;*xx=0; } log(LOG_INFO, "diffbot: getting %s headers=%s",m_diffbotUrl.getBufStart(), additionalHeaders); if ( ! g_httpServer.getDoc ( m_diffbotUrl.getBufStart() , 0 , // ip 0 , // offset -1 , // size 0 , // ifmodifiedsince this , // state gotDiffbotReplyWrapper , 180*1000, // 180 sec timeout 0,//proxyip 0,//proxyport // unlimited replies i guess -1,//maxtextdoclen unlimited -1,//maxotherdoclen unlimited g_conf.m_spiderUserAgent , "HTTP/1.0", false, // do post? NULL, // cookie additionalHeaders ) ) // return -1 if blocked return (SafeBuf *)-1; // error? if ( ! g_errno ) { char *xx=NULL;*xx=0; } // wha? log("diffbot: http error %s",mstrerror(g_errno)); // had an error! return NULL; } char **XmlDoc::getHttpReply ( ) { // both must be valid now if ( m_redirUrlValid && m_httpReplyValid ) { // might have been a download error of ECORRUPTDATA if ( m_downloadStatus == ECORRUPTDATA ) { // set g_errno so caller knows g_errno = m_downloadStatus; // null means error return NULL; } // otherwise, assume reply is valid return &m_httpReply; } setStatus("getting http reply"); // come back up here if a redirect invalidates it loop: // sanity test -- only if not the test collection (NO, might be EBADIP) //if ( m_indexCode && strcmp(m_coll,"qatest123")){char*xx=NULL;*xx=0;} // get the http reply char **replyPtr = getHttpReply2(); if ( ! replyPtr || replyPtr == (void *)-1 ) return (char **)replyPtr; // . now if the reply was a redirect we should set m_redirUrl to it // and re-do all this code // . this often sets m_indexCode to stuff like ESIMPLIFIEDREDIR, etc. Url **redirp = getRedirUrl(); // we often lookup the assocaited linkInfo on the original url to // see if it is worth keeping and indexing just to take advantage of // the incoming link text it has, so we may block on that! // but in the case of a contactDoc, getContactDoc() sets these things // to NULL to avoid unnecessary lookups. if ( ! 
redirp || redirp == (void *)-1 ) return (char **)redirp; // sanity check if ( *redirp && ! m_redirUrlValid ) { char *xx=NULL;*xx=0; } // if NULL, we are done if ( ! *redirp ) return &m_httpReply; // . also, hang it up if we got a simplified redir url now // . we set m_redirUrl so that getLinks() can add a spiderRequest // for it, but we do not want to actually redirect to it to get // the content for THIS document if ( m_redirError ) return &m_httpReply; // and invalidate the redir url because we do not know if the // current url will redirect or not (mdwmdw) m_redirUrlValid = false; m_metaRedirUrlValid = false; // free it mfree ( m_httpReply , m_httpReplyAllocSize, "freehr" ); // always nullify if we free so we do not re-use freed mem m_httpReply = NULL; // otherwise, we had a redirect, so invalidate what we had set m_httpReplyValid = false; // do not invalidate this any more, now it is when we STARTED spidering // the document //m_spideredTimeValid = false; m_isContentTruncatedValid = false; // do not redo robots.txt lookup if the redir url just changed from // http to https or vice versa Url *ru = *redirp; Url *cu = getCurrentUrl(); if ( ! cu || cu == (void *)-1) return (char **)cu; if ( strcmp ( ru->getUrl() + ru->getSchemeLen() , cu->getUrl() + cu->getSchemeLen() ) ) { // redo robots.txt lookup. might be cached. m_isAllowedValid = false; m_crawlDelayValid = false; } // keep the same ip if hostname is unchanged if ( ru->getHostLen() != cu->getHostLen() || strncmp ( ru->getHost() , cu->getHost(), cu->getHostLen() ) ) // ip is supposed to be that of the current url, which changed m_ipValid = false; // we set our m_xml to the http reply to check for meta redirects // in the html sometimes in getRedirUrl() so since we are redirecting, // invalidate that xml m_xmlValid = false; m_wordsValid = false; m_rawUtf8ContentValid = false; m_expandedUtf8ContentValid= false; m_utf8ContentValid = false; m_filteredContentValid = false; m_contentValid = false; m_mimeValid = false; // update our current url now to be the redirected url m_currentUrl.set ( *redirp , false ); m_currentUrlValid = true; // loop it goto loop; } void gotHttpReplyWrapper ( void *state ) { // point to us XmlDoc *THIS = (XmlDoc *)state; // this sets g_errno on error THIS->gotHttpReply ( ); // resume. this checks g_errno for being set. THIS->m_masterLoop ( THIS->m_masterState ); } // "NULL" can be a valid http reply (empty page) so we need to use "char **" char **XmlDoc::getHttpReply2 ( ) { if ( m_httpReplyValid ) return &m_httpReply; setStatus("getting http reply2"); // if recycle is set then NEVER download if doing query reindex // but if doing an injection then i guess we can download. // do not even do ip lookup if no old titlerec, which is how we // ended up here... if ( m_recycleContent && m_sreqValid && m_sreq.m_isPageReindex ) { g_errno = ENOTITLEREC; return NULL; } // doing a query reindex on diffbot objects does not have a // valid spider request, only sets m_recycleContent to true // in reindexJSONObjects()/redoJSONObjects() if ( m_recycleContent && m_isDiffbotJSONObject ) { g_errno = ENOTITLEREC; return NULL; } // get ip int32_t *ip = getIp(); if ( ! ip || ip == (int32_t *)-1 ) return (char **)ip; // reset m_httpReplySize = 0; m_httpReply = NULL; // if ip is bogus, we are done if ( *ip == 0 || *ip == -1 ) { log("xmldoc: ip is bogus 0 or -1 for %s. skipping download", m_firstUrl.getUrl()); m_httpReplyValid = true; m_isContentTruncated = false; m_isContentTruncatedValid = true; // need this now too. 
but don't hurt a nonzero val if we have if ( ! m_downloadEndTimeValid ) { m_downloadEndTime = 0; m_downloadEndTimeValid = true; } return &m_httpReply; //return gotHttpReply ( ); } // get this. should operate on current url (i.e. redir url if there) bool *isAllowed = getIsAllowed(); // error or blocked if ( ! isAllowed || isAllowed == (void *)-1) return (char **)isAllowed; // this must be valid, since we share m_msg13 with it if ( ! m_isAllowedValid ) { char *xx=NULL;*xx=0; } int32_t *cd = getFinalCrawlDelay(); if ( ! cd || cd == (void *)-1 ) return (char **)cd; // we might bail if ( ! *isAllowed ) { m_httpReplyValid = true; m_isContentTruncated = false; m_isContentTruncatedValid = true; // need this now too. but don't hurt a nonzero val if we have if ( ! m_downloadEndTimeValid ) { m_downloadEndTime = 0; m_downloadEndTimeValid = true; } m_downloadStatusValid = true; // forbidden? assume we downloaded it and it was empty m_downloadStatus = 0; // EDOCDISALLOWED;//403; return &m_httpReply; //return gotHttpReply ( ); } // are we site root page? char *isRoot = getIsSiteRoot(); if ( ! isRoot || isRoot == (char *)-1 ) return (char **)isRoot; //int8_t *hc = getHopCount(); //if ( ! hc || hc == (void *)-1 ) return (char **)hc; XmlDoc *od = NULL; if ( ! m_isSpiderProxy ) { XmlDoc **pod = getOldXmlDoc ( ); if ( ! pod || pod == (XmlDoc **)-1 ) return (char **)pod; // get ptr to old xml doc, could be NULL if non exists od = *pod; } // sanity check if ( od && m_recycleContent ) {char *xx=NULL;*xx=0; } // validate m_firstIpValid int32_t *pfip = getFirstIp(); if ( ! pfip || pfip == (void *)-1 ) return (char **)pfip; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // robots.txt and css files etc should have m_isChildDoc as true //if ( ! m_downloadAttempted && ! m_isChildDoc ) // // keep track of spider stats // cr->m_localCrawlInfo.m_pageDownloadAttempts++; // we made an attempt to download, so mark it //m_downloadAttempted = true; // if we didn't block getting the lock, keep going setStatus ( "getting web page" ); // sanity check if ( ! m_masterLoop ) { char *xx=NULL;*xx=0; } // int16_tcut. this will return the redirUrl if it is non-empty. Url *cu = getCurrentUrl(); if ( ! cu || cu == (void *)-1 ) return (char **)cu; /* // if on google, make it empty so we do not hit them if ( strstr(cu->getUrl(),".google.com/") ) { log("spider: encountered google.com url. emptying."); m_httpReplyValid = true; m_isContentTruncated = false; m_isContentTruncatedValid = true; // need this now too. but don't hurt a nonzero val if we have if ( ! m_downloadEndTimeValid ) { m_downloadEndTime = 0; m_downloadEndTimeValid = true; } return &m_httpReply; } */ // no ip found means empty page i guess //if ( *ip == 0 || *ip == -1 ) // return gotHttpReply ( ); bool useTestCache = false; if ( ! strcmp(cr->m_coll,"qatest123") ) useTestCache = true; // unless its the pagesubmit.cpp event submission tool //if ( m_sreqValid && m_sreq.m_isPageSubmit ) useTestCache = false; // sanity check //if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; } // set parms Msg13Request *r = &m_msg13Request; // clear it first r->reset(); // and set the url //strcpy ( r->m_url , cu->getUrl() ); r->ptr_url = cu->getUrl(); r->size_url = cu->getUrlLen()+1; // sanity check if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; } // max to download in bytes. currently 1MB. int32_t maxDownload = (int32_t)MAXDOCLEN; // but if url is http://127.0.0.1.... or local then if ( m_ipValid ) { // make into a string char *ipStr = iptoa(m_ip); // is it local? 
bool isLocal = false; if ( strncmp(ipStr,"192.168.",8) == 0) isLocal = true; if ( strncmp(ipStr,"10." ,3) == 0) isLocal = true; if ( m_ip == 16777343 ) isLocal = true; // 127.0.0.1 ? // . if local then make web page download max size unlimited // . this is for adding the gbdmoz.urls.txt.* files to // populate dmoz. those files are about 25MB each. if ( isLocal ) maxDownload = -1; } // m_maxCacheAge is set for getting contact or root docs in // getContactDoc() and getRootDoc() and it only applies to // titleRecs in titledb i guess... but still... for Msg13 it applies // to its cache ... for robots.txt files too r->m_maxCacheAge = m_maxCacheAge; r->m_urlIp = *ip; r->m_firstIp = m_firstIp; r->m_urlHash48 = getFirstUrlHash48(); r->m_maxTextDocLen = maxDownload; r->m_maxOtherDocLen = maxDownload; r->m_forwardDownloadRequest = (bool)m_forwardDownloadRequest; r->m_useTestCache = (bool)useTestCache; r->m_spideredTime = getSpideredTime();//m_spideredTime; r->m_ifModifiedSince = 0; r->m_skipHammerCheck = 0; //if ( g_conf.m_qaBuildMode ) r->m_addToTestCache = true; //else r->m_addToTestCache = false; r->m_addToTestCache = (bool)useTestCache; if ( m_redirCookieBufValid && m_redirCookieBuf.length() ) { r->ptr_cookie = m_redirCookieBuf.getBufStart(); r->size_cookie = m_redirCookieBuf.length() + 1; // . only do once per redirect // . do not invalidate because we might have to carry it // through to the next redir... unless we change domain // . this fixes the nyt.com bug some more //m_redirCookieBufValid = false; } // . this is -1 if unknown. none found in robots.txt or provided // in the custom crawl parms. // . it should also be 0 for the robots.txt file itself r->m_crawlDelayMS = *cd; // let's time our crawl delay from the initiation of the download // not from the end of the download. this will make things a little // faster but could slam servers more. r->m_crawlDelayFromEnd = false; // need this in order to get all languages, etc. and avoid having // to set words class at the spider compression proxy level r->m_forEvents = 0; // new stuff r->m_contentHash32 = 0; // if valid in SpiderRequest, use it. if spider compression proxy // sees the content is unchanged it will not send it back! it will // send back g_errno = EDOCUNCHANGED or something if ( m_sreqValid ) r->m_contentHash32 = m_sreq.m_contentHash32; // if we have the old doc already set use that if ( od ) r->m_contentHash32 = od->m_contentHash32; // force floater usage on even if "use spider proxies" parms is off // if we're a diffbot crawl and use robots is off. //if ( cr && ! cr->m_useRobotsTxt && cr->m_isCustomCrawl ) // r->m_forceUseFloaters = true; // for beta testing, make it a collection specific parm for diffbot // so we can turn on manually if ( cr->m_forceUseFloaters ) r->m_forceUseFloaters = true; // eventgurubot is the max //char *userAgent = g_conf.m_spiderUserAgent; // hardcode it //char *userAgent = "EventGuruBot"; //int32_t uaLen = gbstrlen(userAgent); //if ( uaLen > 12 ) { // log("spider: user agent string too long"); // uaLen = 12; //} //strncpy(r->m_userAgent,userAgent,uaLen); //r->m_userAgent[uaLen] = '\0'; // turn this off too r->m_attemptedIframeExpansion = false; // turn off r->m_useCompressionProxy = false; r->m_compressReply = false; r->m_isCustomCrawl = cr->m_isCustomCrawl; // set it for this too if ( g_conf.m_useCompressionProxy && // do not use for the test collection ever, that is qa'ing strcmp(cr->m_coll,"qatest123") ) { r->m_useCompressionProxy = true; r->m_compressReply = true; } // are we a robots.txt file? 
//bool isRobotsTxt = isRobotsTxtFile ( cu->getUrl() , cu->getUrlLen()); char *td = getTestDir(); if ( td ) strncpy ( r->m_testDir, td, 31); //r->m_isPageParser = getIsPageParser(); //r->m_isPageInject = ( m_sreqValid && m_sreq.m_isInjecting ); // if current url IS NOT EQUAL to first url then set redir flag if ( strcmp(cu->m_url,m_firstUrl.m_url) ) r->m_skipHammerCheck = 1; // or if this an m_extraDoc or m_rootDoc for another url then // do not bother printing the hammer ip msg in msg13.cpp either if ( m_isChildDoc ) r->m_skipHammerCheck = 1; if ( m_contentInjected ) // oldsrValid && m_sreq.m_isInjecting ) r->m_skipHammerCheck = 1; // or if ahrefs if ( strncmp(cu->m_url,"http://api.ahrefs.com/",22) == 0 ) r->m_skipHammerCheck = 1; if ( r->m_skipHammerCheck ) log(LOG_DEBUG,"build: skipping hammer check"); // if we had already spidered it... try to save bandwidth and time if ( od ) { // sanity check if ( ! od->m_spideredTimeValid ) { char *xx=NULL;*xx=0; } // only get it if modified since last spider time r->m_ifModifiedSince = od->m_spideredTime; } // tell msg13 he is scraping... if ( m_sreqValid && m_sreq.m_isScraping ) r->m_isScraping = 1; // if doing frame expansion on a doc we just downloaded as the // spider proxy, we are asking ourselves now to download the url // from an ", 9 ); // use our own special tag so Sections.cpp can set // Section::m_gbFrameNum which it uses internally m_esbuf.safePrintf(""); // gbiframe // identify javascript bool javascript = false; if ( *ed->getContentType() == CT_JS ) javascript = true; // so we do not mine javascript for cities and states etc. // in Address.cpp if ( javascript ) m_esbuf.safePrintf(""); m_esbuf.safePrintf(""); // free up ed nukeDoc ( ed ); // end of frame tag, skip over whole thing m_oldp = fend ; // sanity check if ( m_oldp > pend ) { char *xx=NULL;*xx=0; } // another flag m_didExpansion = true; // count how many we did if ( ++m_numExpansions >= 5 ) break; } // default m_expandedUtf8Content = m_rawUtf8Content; m_expandedUtf8ContentSize = m_rawUtf8ContentSize; // point to expansion buffer if we did any expanding if ( m_didExpansion ) { // copy over the rest m_esbuf.safeMemcpy ( m_oldp , pend - m_oldp ); // null term it m_esbuf.pushChar('\0'); // and point to that buffer m_expandedUtf8Content = m_esbuf.getBufStart();//m_buf; // include the \0 as part of the size m_expandedUtf8ContentSize = m_esbuf.m_length; // + 1; } // sanity -- must be \0 terminated if ( m_expandedUtf8Content[m_expandedUtf8ContentSize-1] ) { char *xx=NULL;*xx=0; } m_expandedUtf8ContentValid = true; return &m_expandedUtf8Content; } // . get the final utf8 content of the document // . all html entities are replaced with utf8 chars // . all iframes are expanded // . if we are using diffbot then getting the utf8 content should return // the json which is the output from the diffbot api. UNLESS we are getting // the webpage itself for harvesting outlinks to spider later. char **XmlDoc::getUtf8Content ( ) { // if we already computed it, return that if ( m_utf8ContentValid ) return &ptr_utf8Content; if ( m_setFromTitleRec ) { m_utf8ContentValid = true; return &ptr_utf8Content; } CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // recycle? if ( cr->m_recycleContent || m_recycleContent || // if trying to delete from index, load from old titlerec m_deleteFromIndex ) { // get the old xml doc from the old title rec XmlDoc **pod = getOldXmlDoc ( ); if ( ! 
pod || pod == (void *)-1 ) return (char **)pod; // int16_tcut XmlDoc *od = *pod; // this is non-NULL if it existed if ( od ) { ptr_utf8Content = od-> ptr_utf8Content; size_utf8Content = od->size_utf8Content; m_utf8ContentValid = true; m_contentType = od->m_contentType; m_contentTypeValid = true; // sanity check if ( ptr_utf8Content && ptr_utf8Content[size_utf8Content-1] ) { char *xx=NULL;*xx=0; } return &ptr_utf8Content; } // if could not find title rec and we are docid-based then // we can't go any further!! if ( m_setFromDocId || // it should be there if trying to delete as well! m_deleteFromIndex ) { log("xmldoc: null utf8 content for docid-based " "titlerec (d=%"INT64") lookup which was not found", m_docId); ptr_utf8Content = NULL; size_utf8Content = 0; m_utf8ContentValid = true; m_contentType = CT_HTML; m_contentTypeValid = true; return &ptr_utf8Content; } } char **ep = getExpandedUtf8Content(); if ( ! ep || ep == (void *)-1 ) return ep; // NULL out if no content if ( ! *ep ) { ptr_utf8Content = NULL; size_utf8Content = 0; m_utf8ContentValid = true; return &ptr_utf8Content; } uint8_t *ct = getContentType(); if ( ! ct || ct == (void *)-1 ) return (char **)ct; // if we have a json reply, leave it alone... expanding a " // into a double quote will mess up the JSON! if ( *ct == CT_JSON ) { ptr_utf8Content = (char *)m_expandedUtf8Content; size_utf8Content = m_expandedUtf8ContentSize; m_utf8ContentValid = true; return &ptr_utf8Content; } // why would the spider proxy, who use msg13.cpp to call // XmlDoc::getExpandedUtf8Content() want to call this??? it seems // to destroy expandedutf8content with a call to htmldecode if ( m_isSpiderProxy ) { char *xx=NULL;*xx=0; } // not if rss file extension //bool isRSSExt = false; //char *ext = m_firstUrl.getExtension(); //if ( ext && strcasecmp(ext,"rss") == 0 ) isRSSExt = true; //if ( ext && strcasecmp(ext,"xml") == 0 ) isRSSExt = true; //if ( ext && strcasecmp(ext,"atom") == 0 ) isRSSExt = true; //if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; } //if ( m_contentTypeValid && m_contentType == CT_XML ) isRSSExt = true; // convert < to ???? and & to utf32 char // for a double wide ampersand? //bool doSpecial = true; // convert to what it should be if we are an .rss file extension //if ( isRSSExt ) doSpecial = false; // sabnity check if ( m_xmlValid ) { char *xx=NULL;*xx=0; } if ( m_wordsValid ) { char *xx=NULL;*xx=0; } QUICKPOLL(m_niceness); // // convert illegal utf8 characters into spaces // // fixes santaclarachorale.vbotickets.com/tickets/g.f._handels_israel_in_egypt/1062 // which has a 228,0x80,& sequence (3 chars, last is ascii) uint8_t *x = (uint8_t *)m_expandedUtf8Content; char size; for ( ; *x ; x += size ) { QUICKPOLL(m_niceness); size = getUtf8CharSize(x); // ok, make it a space i guess if it is a bad utf8 char if ( ! isSaneUtf8Char(x) ) { *x = ' '; size = 1; continue; } // skip if only one byte if ( size == 1 ) continue; // now each byte in the sequence must have 0x80 set... if ( ! (x[1] & 0x80) ) { x[0] = ' '; size = 1; continue; } if ( size == 2 ) continue; if ( ! (x[2] & 0x80) ) { x[0] = ' '; size = 1; continue; } if ( size == 3 ) continue; if ( ! (x[3] & 0x80) ) { x[0] = ' '; size = 1; continue; } } // sanity if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; } // if we are an xml doc, then before we call htmlDecode translate // all tags like or <link> to <gbtitle> or <gblink> so we // know they are xml tags. because stuff like <br> will // become <br> and will be within its xml tag like <gbdescription> // or <gbtitle>. 
// MDW: 9/28/2014. no longer do this since i added hashXmlFields(). /* if ( m_contentType == CT_XML ) { // count the xml tags char *p = m_expandedUtf8Content; char *pend = p + m_expandedUtf8ContentSize - 1; int32_t need = m_expandedUtf8ContentSize; for ( ; p < pend ; p++ ) { QUICKPOLL(m_niceness); if ( *p == '<' ) need += 5; // for adding "gbxml" } if ( ! m_xbuf.reserve(need) ) return NULL; // reset ptr p = m_expandedUtf8Content; // ponit to dst char *dst = m_xbuf.getBufStart(); // do the copy for ( ; p < pend ; p++ ) { // breathe QUICKPOLL(m_niceness); // copy it over *dst++ = *p; if ( *p != '<' ) continue; // if <?xml> copy over as is, do not insert 'gb' if ( p[1] == '?' ) continue; // same for comments <!--...--> if ( p[1] == '!' ) continue; // point to tagname char *tagName = p+1; if ( p[1] == '/' ) tagName++; // also get the full node now NodeType *nt; getTagId ( tagName , &nt ); // if it is not an html tag, do not fuss with it! if ( ! nt ) continue; // if its in the list but is xml, let it go too if ( nt->m_isXmlTag ) continue; // . otherwise, its an html tag being used as an xml // tag and we need to encode (append gbxml to it) // . insert / first if there if ( p[1] == '/' ) {p++;*dst++ = *p;} // then "gb" *dst++ = 'g'; *dst++ = 'b'; *dst++ = 'x'; *dst++ = 'm'; *dst++ = 'l'; } // update m_xbuf.m_length = dst - m_xbuf.getBufStart(); // final \0 *dst = '\0'; // re-assign these m_expandedUtf8Content = m_xbuf.getBufStart();//m_buf; m_expandedUtf8ContentSize = m_xbuf.m_length + 1; // free esbuf if we were referencing that to save mem m_esbuf.purge(); } */ // richmondspca.org has " in some tags and we do not like // expanding that to " because it messes up XmlNode::getTagLen() // and creates big problems. same for www.first-avenue.com. so // by setting doSpecial to try we change < > and " to // [ ] and ' which have no meaning in html per se. bool doSpecial = true; if ( m_contentType == CT_XML ) doSpecial = false; // . now decode those html entites into utf8 so that we never have to // check for html entities anywhere else in the code. a big win!! // . doSpecial = true, so that <, >, & and " are // encoded into high value // utf8 chars so that Xml::set(), etc. still work properly and don't // add any more html tags than it should // . this will decode in place // . MDW: 9/28/2014. no longer do for xml docs since i added // hashXmlFields() int32_t n = m_expandedUtf8ContentSize - 1; if ( m_contentType != CT_XML ) n = htmlDecode(m_expandedUtf8Content,//ptr_utf8Content, m_expandedUtf8Content,//ptr_utf8Content, m_expandedUtf8ContentSize-1,//size_utf8Con doSpecial, m_niceness); // can't exceed this! n does not include the final \0 even though // we do right it out. if ( n > m_expandedUtf8ContentSize-1 ) {char *xx=NULL;*xx=0; } // sanity if ( m_expandedUtf8Content[n] != '\0' ) { char *xx=NULL;*xx=0; } // now rss has crap in it like "&nbsp;" so we have to do another // decoding pass // . MDW: 9/28/2014. 
no longer do for xml docs since i added // hashXmlFields() // if ( m_contentType == CT_XML ) // isRSSExt ) // n = htmlDecode(m_expandedUtf8Content,//ptr_utf8Content, // m_expandedUtf8Content,//ptr_utf8Content, // n, // false,//doSpecial, // m_niceness); // sanity if ( n > m_expandedUtf8ContentSize-1 ) {char *xx=NULL;*xx=0; } // sanity if ( m_expandedUtf8Content[n] != '\0' ) { char *xx=NULL;*xx=0; } // finally transform utf8 apostrophe's into regular apostrophes // to make parsing easier uint8_t *p = (uint8_t *)m_expandedUtf8Content; uint8_t *dst = (uint8_t *)m_expandedUtf8Content; uint8_t *pend = p + n; for ( ; *p ; p += size ) { QUICKPOLL(m_niceness); size = getUtf8CharSize(p); // quick copy if ( size == 1 && p[0] != '<' ) { *dst++ = *p; continue; } // make "1<super>st</super>" into "1st" so Dates.cpp can // have an easier time if ( p[0] == '<' && to_lower_a(p[1]) == 's' && to_lower_a(p[2]) == 'u' && to_lower_a(p[3]) == 'p' ) { // assume no go! *dst++ = '<'; // use this char *s = (char *)p; // did number preceed? char *pn = s - 1; for (;pn>=m_expandedUtf8Content&&is_wspace_a(*pn);pn--) QUICKPOLL(m_niceness); // must be like "1st" or "32nd" if ( ! is_digit(*pn) ) continue; // skip the "<sup" s += 4; // skip until > for (; *s && *s != '>' ; s++ ) QUICKPOLL(m_niceness); // crazy? if ( ! *s ) continue; // skip the '>' s++; // skip spaces after the "<sup>" tag for (; *s && is_wspace_a(*s) ; s++ ) QUICKPOLL(m_niceness); // crazy? if ( ! *s ) continue; // check for "st" etc bool gotIt = false; char *suffix = s; if ( (to_lower_a(s[0])=='s'&&to_lower_a(s[1]) == 't')|| (to_lower_a(s[0])=='n'&&to_lower_a(s[1]) == 'd')|| (to_lower_a(s[0])=='r'&&to_lower_a(s[1]) == 'd')|| (to_lower_a(s[0])=='t'&&to_lower_a(s[1]) == 'h')) gotIt = true; if ( ! gotIt ) continue; // skip that s += 2; // skip more spaces for (; *s && is_wspace_a(*s) ; s++ ) QUICKPOLL(m_niceness); // crazy? if ( ! *s ) continue; // find </super> tag if ( s[0] != '<' ) continue; if ( s[1] != '/' ) continue; if ( to_lower_a(s[2]) != 's' ) continue; if ( to_lower_a(s[3]) != 'u' ) continue; if ( to_lower_a(s[4]) != 'p' ) continue; if ( s[5] != '>' ) continue; // skip it, point to > s += 5; // assign p to that p = (unsigned char *)s; // back up ove rthe no-go dst--; // rewrite it *dst++ = to_lower_a(suffix[0]); *dst++ = to_lower_a(suffix[1]); // do next round continue; } // check for crazy apostrophes if ( p[0]==0xe2 && p[1]==0x80 && (p[2]==0x99 || p[2]==0x98 || p[2]==0x9b ) ) { *dst++ = '\''; continue; } // utf8 control character? if ( p[0] == 0xc2 && p[1] >= 0x80 && p[1] <= 0x9f ) { *dst++ = ' '; continue; } // double quotes in utf8 // DO NOT do this if type JSON!! json uses quotes as // control characters if ( p[0] == 0xe2 && p[1] == 0x80 && m_contentType != CT_JSON ) { if (p[2] == 0x9c ) { *dst++ = '\"'; continue; } if (p[2] == 0x9d ) { *dst++ = '\"'; continue; } } // and crazy hyphens (8 - 10pm) if ( p[0]==0xc2 && p[1]==0xad ) { *dst++ = '-'; continue; } if ( p[0]==0xe2 && p[1]==0x80 && p[2]==0x93 ) { *dst++ = '-'; continue; } if ( p[0]==0xe2 && p[1]==0x80 && p[2]==0x94 ) { *dst++ = '-'; continue; } // . convert all utf8 white space to ascii white space // . should benefit the string matching algo in // XmlDoc::getEventSummary() which needs to skip spaces if ( ! 
g_map_is_ascii[(unsigned char)*p] && is_wspace_utf8(p) ) { *dst++ = ' '; continue; } // otherwise, just copy it gbmemcpy(dst,p,size); dst += size; } // null term *dst++ = '\0'; // now set it up ptr_utf8Content = (char *)m_expandedUtf8Content; //size_utf8Content = n+1;//m_expandedUtf8ContentSize; size_utf8Content = (char *)dst - m_expandedUtf8Content; // sanity -- skipped over the \0??? if ( p > pend ) { char *xx=NULL;*xx=0; } // sanity check if ( ptr_utf8Content && ptr_utf8Content[size_utf8Content-1] ) { char *xx=NULL;*xx=0; } m_utf8ContentValid = true; return &ptr_utf8Content; } // *pend should be \0 int32_t getContentHash32Fast ( unsigned char *p , int32_t plen , int32_t niceness ) { // sanity if ( ! p ) return 0; if ( plen <= 0 ) return 0; if ( p[plen] != '\0' ) { char *xx=NULL;*xx=0; } unsigned char *pend = p + plen; static bool s_init = false; static char s_qtab0[256]; static char s_qtab1[256]; static char s_qtab2[256]; static char *s_skips[] = { "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec", "sun", "mon", "tue", "wed", "thu", "fri", "sat" }; if ( ! s_init ) { // only call this crap once s_init = true; // clear up memset(s_qtab0,0,256); memset(s_qtab1,0,256); memset(s_qtab2,0,256); for ( int32_t i = 0 ; i < 19 ; i++ ) { unsigned char *s = (unsigned char *)s_skips[i]; s_qtab0[(unsigned char)to_lower_a(s[0])] = 1; s_qtab0[(unsigned char)to_upper_a(s[0])] = 1; // do the quick hash unsigned char qh = to_lower_a(s[0]); qh ^= to_lower_a(s[1]); qh <<= 1; qh ^= to_lower_a(s[2]); s_qtab1[qh] = 1; // try another hash, the swift hash unsigned char sh = to_lower_a(s[0]); sh <<= 1; sh ^= to_lower_a(s[1]); sh <<= 1; sh ^= to_lower_a(s[2]); s_qtab2[sh] = 1; } } bool lastWasDigit = false; bool lastWasPunct = true; uint32_t h = 0LL; //char size = 0; unsigned char pos = 0; for ( ; p < pend ; p++ ) { // += size ) { // breathe QUICKPOLL ( niceness ); // get size // this might not be utf8!!! //size = getUtf8CharSize(p); // skip if not alnum // this might not be utf8!!! //if ( ! is_alnum_utf8 ( (char *)p ) ) { if ( ! is_alnum_a ( *p ) ) { lastWasDigit = false; lastWasPunct = true; continue; } // if its a digit, call it 1 if ( is_digit(*p) ) { // skip consecutive digits if ( lastWasDigit ) continue; // xor in a '1' h ^= g_hashtab[pos][(unsigned char)'1']; pos++; lastWasDigit = true; continue; } // reset lastWasDigit = false; // exclude days of the month or week so clocks do // not affect this hash if ( s_qtab0[p[0]] && lastWasPunct && p[1] && p[2] ) { // quick hash unsigned char qh = to_lower_a(p[0]); qh ^= to_lower_a(p[1]); qh <<= 1; qh ^= to_lower_a(p[2]); // look that up if ( ! s_qtab1[qh] ) goto skip; // try another hash, the swift hash unsigned char sh = to_lower_a(p[0]); sh <<= 1; sh ^= to_lower_a(p[1]); sh <<= 1; sh ^= to_lower_a(p[2]); if ( ! s_qtab2[sh] ) goto skip; // ok, probably a match.. unsigned char *s = p + 3; // skip to end of word //char size2; //for ( ; s < pend ; s += size2 ) { for ( ; s < pend ; s++ ) { //size2 = getUtf8CharSize(s); //if ( ! is_alnum_utf8 ((char *)s) ) if ( ! is_alnum_a ( *s ) ) break; } // it is already point to the next char, so clr this //size = 0; // advance p now p = s; // hash as one type of thing... h ^= g_hashtab[pos][(unsigned char)'X']; pos++; continue; } skip: // reset this lastWasPunct = false; // xor this in right h ^= g_hashtab[pos][p[0]]; pos++; // assume ascii or latin1 continue; /* // one more? if ( size == 1 ) continue; // do that h ^= g_hashtab[pos][p[1]]; pos++; // one more? 
if ( size == 2 ) continue; // do that h ^= g_hashtab[pos][p[2]]; pos++; // one more? if ( size == 3 ) continue; // do that h ^= g_hashtab[pos][p[3]]; pos++; // that should do it! continue; */ } return h; } int32_t *XmlDoc::getContentHash32 ( ) { // return it if we got it if ( m_contentHash32Valid ) return &m_contentHash32; setStatus ( "getting contenthash32" ); uint8_t *ct = getContentType(); if ( ! ct || ct == (void *)-1 ) return (int32_t *)ct; // we do not hash the url/resolved_url/html fields in diffbot json // because the url field is a mirror of the url and the html field // is redundant and would slow us down if ( *ct == CT_JSON ) return getContentHashJson32(); // if we are a diffbot json object, fake this for now, it will // be set for real in hashJSON() // no, because we call this before hashJSON() for to set // EDOCUNCHANGED above... so just hash the json normally for now //if ( m_isDiffbotJSONObject ) { // m_contentHash32 = 0; // return &m_contentHash32; //} // . get the content. get the pure untouched content!!! // . gotta be pure since that is what Msg13.cpp computes right // after it downloads the doc... // . if iframes are present, msg13 gives up char **pure = getContent(); if ( ! pure || pure == (char **)-1 ) return (int32_t *)pure; // size //int32_t n = size_utf8Content - 1; // hash up to first 10,000 chars //if ( n > 10000 ) n = 10000; // do it //m_contentHash32 = hash32 ( ptr_utf8Content , n ); unsigned char *p = (unsigned char *)(*pure); int32_t plen = m_contentLen;//size_utf8Content - 1; // no content means no hash32 if ( plen <= 0 ) {//ptr_utf8Content ) { m_contentHash32 = 0; m_contentHash32Valid = true; return &m_contentHash32; } // we set m_contentHash32 in ::hashJSON() below because it is special // for diffbot since it ignores certain json fields like url: and the // fields are independent, and numbers matter, like prices //if ( m_isDiffbotJSONObject ) { char *xx=NULL; *xx=0; } // *pend should be \0 m_contentHash32 = getContentHash32Fast ( p , plen , m_niceness ); // validate m_contentHash32Valid = true; return &m_contentHash32; } // we do not hash the url/resolved_url/html fields in diffbot json // because the url field is a mirror of the url and the html field // is redundant and would slow us down int32_t *XmlDoc::getContentHashJson32 ( ) { if ( m_contentHash32Valid ) return &m_contentHash32; // use new json parser Json *jp = getParsedJson(); if ( ! jp || jp == (void *)-1 ) return (int32_t *)jp; JsonItem *ji = jp->getFirstItem(); int32_t totalHash32 = 0; //logf(LOG_DEBUG,"ch32: url=%s",m_firstUrl.m_url); for ( ; ji ; ji = ji->m_next ) { QUICKPOLL(m_niceness); // skip if not number or string if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING ) continue; char *topName = NULL; // what name level are we? int32_t numNames = 1; JsonItem *pi = ji->m_parent; for ( ; pi ; pi = pi->m_parent ) { // empty name? if ( ! pi->m_name ) continue; if ( ! pi->m_name[0] ) continue; topName = pi->m_name; numNames++; } // if we are the diffbot reply "html" field do not hash this // because it is redundant and it hashes html tags etc.! // plus it slows us down a lot and bloats the index. 
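		// (we also skip the url, pageUrl and resolved_url fields and
		// the stats/queryString/nextPages/textAnalysis/links subtrees
		// below. each remaining name+value pair is hashed and XOR'd
		// into totalHash32, so the hash is independent of field order.)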
if ( ji->m_name && numNames==1 && strcmp(ji->m_name,"html") == 0 ) continue; if ( ji->m_name && numNames==1 && strcmp(ji->m_name,"url") == 0 ) continue; if ( ji->m_name && numNames==1 && strcmp(ji->m_name,"pageUrl") == 0 ) continue; // mike will track down how the hash works in article|3|123456 //if ( ji->m_name && numNames==1 && // strcmp(ji->m_name,"diffbotUri") == 0 ) // continue; if ( ji->m_name && numNames==1 && strcmp(ji->m_name,"resolved_url") == 0 ) continue; if ( topName && strcmp(topName,"stats") == 0 ) continue; if ( topName && strcmp(topName,"queryString") == 0 ) continue; if ( topName && strcmp(topName,"nextPages") == 0 ) continue; if ( topName && strcmp(topName,"textAnalysis") == 0 ) continue; if ( topName && strcmp(topName,"links") == 0 ) continue; // hash the fully compound name int32_t nameHash32 = 0; JsonItem *p = ji; char *lastName = NULL; for ( ; p ; p = p->m_parent ) { // empty name? if ( ! p->m_name ) continue; if ( ! p->m_name[0] ) continue; // dup? can happen with arrays. parent of string // in object, has same name as his parent, the // name of the array. "dupname":[{"a":"b"},{"c":"d"}] if ( p->m_name == lastName ) continue; // update lastName = p->m_name; // hash it up nameHash32 = hash32(p->m_name,p->m_nameLen,nameHash32); } // // now Json.cpp decodes and stores the value into // a buffer, so ji->getValue() should be decoded completely // // . get the value of the json field // . if it's a number or bool it converts into a string int32_t vlen; char *val = ji->getValueAsString( &vlen ); // // for deduping search results we set m_contentHash32 here for // diffbot json objects. // // we use this hash for setting EDOCUNCHANGED when reindexing // a diffbot reply. we also use to see if the diffbot reply // is a dup with another page in the index. thirdly, we use // to dedup search results, which could be redundant because // of our spider-time deduping. // // make the content hash so we can set m_contentHash32 // for deduping. do an exact hash for now... int32_t vh32 = hash32 ( val , vlen , m_niceness ); // combine int32_t combined32 = hash32h ( nameHash32 , vh32 ); // accumulate field/val pairs order independently totalHash32 ^= combined32; // debug note //logf(LOG_DEBUG,"ch32: field=%s nh32=%"UINT32" vallen=%"INT32"", // ji->m_name, // nameHash32, // vlen); } m_contentHash32 = totalHash32; m_contentHash32Valid = true; return &m_contentHash32; } // do not consider tags except frame and iframe... make all months // and days of weeks and digits basically the same int64_t *XmlDoc::getLooseContentHash64 ( ) { if ( m_looseContentHash64Valid ) return &m_looseContentHash64; Xml *xml = getXml(); if ( ! 
xml || xml == (Xml *)-1 ) return (int64_t *)xml; int64_t h64 = 0LL; int32_t n = xml->getNumNodes(); XmlNode *nodes = xml->getNodes (); for ( int32_t i = 0 ; i < n ; i++ ) { // breathe QUICKPOLL(m_niceness); // skip if not the right kinda tag if ( nodes[i].isTag() && nodes[i].getNodeId() != TAG_FRAME && nodes[i].getNodeId() != TAG_IFRAME && nodes[i].getNodeId() != TAG_IMG ) continue; // hash that node up int64_t ch64; // this is really a 32-bit hash ch64=getContentHash32Fast((unsigned char *)nodes[i].getNode() , nodes[i].getNodeLen() , m_niceness ); // incorporate hash from that node h64 = hash64h ( ch64 , h64 ); } m_looseContentHash64Valid = true; m_looseContentHash64 = h64; return &m_looseContentHash64; } int32_t XmlDoc::getHostHash32a ( ) { if ( m_hostHash32aValid ) return m_hostHash32a; m_hostHash32aValid = true; Url *f = getFirstUrl(); m_hostHash32a = f->getHostHash32(); return m_hostHash32a; } int32_t XmlDoc::getHostHash32b ( ) { if ( m_hostHash32bValid ) return m_hostHash32b; m_hostHash32bValid = true; Url *c = getCurrentUrl(); m_hostHash32b = c->getHostHash32(); return m_hostHash32b; } int32_t XmlDoc::getDomHash32( ) { if ( m_domHash32Valid ) return m_domHash32; m_domHash32Valid = true; Url *f = getFirstUrl(); m_domHash32 = hash32 ( f->getDomain(), f->getDomainLen() ); return m_domHash32; } // . this will be the actual pnm data of the image thumbnail // . you can inline it in an image tag like // <img src="data:image/png;base64,iVBORw0...."/> // background-image:url(data:image/png;base64,iVBORw0...); // . FORMAT of ptr_imageData: // <origimageUrl>\0<4bytethumbwidth><4bytethumbheight><thumbnaildatajpg> char **XmlDoc::getThumbnailData ( ) { if ( m_imageDataValid ) return &ptr_imageData; Images *images = getImages(); if ( ! images || images == (Images *)-1 ) return (char **)images; ptr_imageData = NULL; size_imageData = 0; m_imageDataValid = true; if ( ! images || ! images->m_imageBufValid ) return &ptr_imageData; if ( images->m_imageBuf.length() <= 0 ) return &ptr_imageData; // this buffer is a ThumbnailArray ptr_imageData = images->m_imageBuf.getBufStart(); size_imageData = images->m_imageBuf.length(); return &ptr_imageData; } Images *XmlDoc::getImages ( ) { if ( m_imagesValid ) return &m_images; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; if ( ! cr->m_makeImageThumbnails ) { m_images.reset(); m_imagesValid = true; return &m_images; } if ( cr->m_isCustomCrawl ) { m_images.reset(); m_imagesValid = true; return &m_images; } setStatus ( "getting thumbnail" ); Words *words = getWords(); if ( ! words || words == (Words *)-1 ) return (Images *)words; Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (Images *)xml; Sections *sections = getSections(); if ( ! sections || sections==(Sections *)-1) return (Images *)sections; char *site = getSite (); if ( ! site || site == (char *)-1 ) return (Images *)site; int64_t *d = getDocId(); if ( ! d || d == (int64_t *)-1 ) return (Images *)d; int8_t *hc = getHopCount(); if ( ! hc || hc == (void *)-1 ) return (Images *)hc; Url *cu = getCurrentUrl(); if ( ! cu || cu == (void *)-1 ) return (Images *)cu; // . this does not block or anything // . if we are a diffbot json reply it should just use the primary // image, if any, as the only candidate m_images.setCandidates ( cu , words , xml , sections , this ); setStatus ("getting thumbnail"); // assume valid m_imagesValid = true; // now get the thumbnail if ( ! 
m_images.getThumbnail ( site , gbstrlen(site) , *d , this , cr->m_collnum , //NULL , // statusPtr ptr *hc , m_masterState, m_masterLoop ) ) return (Images *)-1; return &m_images; } // . get different attributes of the Links as vectors // . these are 1-1 with the Links::m_linkPtrs[] array TagRec ***XmlDoc::getOutlinkTagRecVector () { // if page has a <meta name=usefakeips content=1> tag // then use the hash of the links host as the firstip. // this will speed things up when adding a gbdmoz.urls.txt.* // file to index every url in dmoz. char *useFakeIps = hasFakeIpsMetaTag(); if ( ! useFakeIps || useFakeIps == (void *)-1 ) return (TagRec ***)useFakeIps; // no error and valid, return quick if ( m_outlinkTagRecVectorValid && *useFakeIps ) return &m_outlinkTagRecVector; // error? if ( m_outlinkTagRecVectorValid && m_msge0.m_errno ) { g_errno = m_msge0.m_errno; return NULL; } // if not using fake ips, give them the real tag rec vector if ( m_outlinkTagRecVectorValid ) return &m_msge0.m_tagRecPtrs; Links *links = getLinks(); if ( ! links || links == (void *) -1 ) return (TagRec ***)links; if ( *useFakeIps ) { // set to those m_fakeTagRec.reset(); // just make a bunch ptr to empty tag rec int32_t need = links->m_numLinks * sizeof(TagRec *); if ( ! m_fakeTagRecPtrBuf.reserve ( need ) ) return NULL; // make them all point to the fake empty tag rec TagRec **grv = (TagRec **)m_fakeTagRecPtrBuf.getBufStart(); for ( int32_t i = 0 ; i < links->m_numLinks ; i++ ) grv[i] = &m_fakeTagRec; // set it m_outlinkTagRecVector = grv; m_outlinkTagRecVectorValid = true; return &m_outlinkTagRecVector; } CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // update status msg setStatus ( "getting outlink tag rec vector" ); TagRec *gr = getTagRec(); if ( ! gr || gr == (TagRec *)-1 ) return (TagRec ***)gr; // assume valid m_outlinkTagRecVectorValid = true; // go get it if ( ! m_msge0.getTagRecs ( links->m_linkPtrs , links->m_linkFlags , links->m_numLinks , false , // skip old? // make it point to this basetagrec if // the LF_SAMEHOST flag is set for the link gr , cr->m_collnum , m_niceness , m_masterState , m_masterLoop )) { // sanity check if ( m_doingConsistencyCheck ) { char *xx=NULL;*xx=0; } // we blocked return (TagRec ***)-1; } // error? if ( g_errno ) return NULL; // or this? if ( m_msge0.m_errno ) { g_errno = m_msge0.m_errno; return NULL; } // set it //m_outlinkTagRecVector = m_msge0.m_tagRecPtrs; // ptr to a list of ptrs to tag recs return &m_msge0.m_tagRecPtrs; } char *XmlDoc::hasNoIndexMetaTag() { if ( m_hasNoIndexMetaTagValid ) return &m_hasNoIndexMetaTag; // assume none m_hasNoIndexMetaTag = false; // store value/content of meta tag in here char mbuf[16]; mbuf[0] = '\0'; char *tag = "noindex"; int32_t tlen = gbstrlen(tag); // check the xml for a meta tag Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (char *)xml; xml->getMetaContent ( mbuf, 16 , tag , tlen ); if ( mbuf[0] == '1' ) m_hasNoIndexMetaTag = true; m_hasNoIndexMetaTagValid = true; return &m_hasNoIndexMetaTag; } char *XmlDoc::hasFakeIpsMetaTag ( ) { if ( m_hasUseFakeIpsMetaTagValid ) return &m_hasUseFakeIpsMetaTag; char mbuf[16]; mbuf[0] = '\0'; char *tag = "usefakeips"; int32_t tlen = gbstrlen(tag); // check the xml for a meta tag Xml *xml = getXml(); if ( ! 
xml || xml == (Xml *)-1 ) return (char *)xml; xml->getMetaContent ( mbuf, 16 , tag , tlen ); m_hasUseFakeIpsMetaTag = false; if ( mbuf[0] == '1' ) m_hasUseFakeIpsMetaTag = true; m_hasUseFakeIpsMetaTagValid = true; return &m_hasUseFakeIpsMetaTag; } int32_t **XmlDoc::getOutlinkFirstIpVector () { Links *links = getLinks(); if ( ! links ) return NULL; // if page has a <meta name=usefakeips content=1> tag // then use the hash of the links host as the firstip. // this will speed things up when adding a gbdmoz.urls.txt.* // file to index every url in dmoz. char *useFakeIps = hasFakeIpsMetaTag(); if ( ! useFakeIps || useFakeIps == (void *)-1 ) return (int32_t **)useFakeIps; if ( *useFakeIps && m_outlinkIpVectorValid ) return &m_outlinkIpVector; if ( *useFakeIps ) { int32_t need = links->m_numLinks * 4; m_fakeIpBuf.reserve ( need ); for ( int32_t i = 0 ; i < links->m_numLinks ; i++ ) { uint64_t h64 = links->getHostHash64(i); int32_t ip = h64 & 0xffffffff; m_fakeIpBuf.pushLong(ip); } int32_t *ipBuf = (int32_t *)m_fakeIpBuf.getBufStart(); m_outlinkIpVector = ipBuf; m_outlinkIpVectorValid = true; return &m_outlinkIpVector; } // return msge1's buf otherwise if ( m_outlinkIpVectorValid ) return &m_msge1.m_ipBuf; // should we have some kinda error for msge1? //if ( m_outlinkIpVectorValid && m_msge1.m_errno ) { // g_errno = m_msge1.m_errno; // return NULL; //} // . we now scrounge them from TagRec's "firstip" tag if there! // . that way even if a domain changes its ip we still use the // original ip, because the only reason we need this ip is for // deciding which group of hosts will store this SpiderRequest and // we use that for throttling, so we have to be consistent!!! // . we never add -1 or 0 ips to tagdb though.... (NXDOMAIN,error...) // . uses m_msgeForTagRecs for this one TagRec ***grv = getOutlinkTagRecVector(); if ( ! grv || grv == (void *)-1 ) return (int32_t **)grv; // note it setStatus ( "getting outlink first ip vector" ); // assume valid m_outlinkIpVectorValid = true; // sanity check //if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; } // use this int32_t nowGlobal = getSpideredTime();//m_spideredTime; // add tags to tagdb? bool addTags = true; //if ( m_sreqValid && m_sreq.m_isPageParser ) addTags = false; if ( getIsPageParser() ) addTags = false; // get this char *testDir = getTestDir(); CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // . go get it // . if coll is "qatest123" then try to use the file ./test/ips.txt to // see if the ip is in there for the given url hostname // . this will now update Tagdb with the "firstip" tags if it should!! // . this just dns looks up the DOMAINS of each outlink because these // are *first* ips and ONLY used by Spider.cpp for throttling!!! if ( ! m_msge1.getFirstIps ( *grv , links->m_linkPtrs , links->m_linkFlags , links->m_numLinks , false , // skip old? cr->m_coll , m_niceness , m_masterState , m_masterLoop , nowGlobal , addTags , testDir )) { // sanity check if ( m_doingConsistencyCheck ) { char *xx=NULL;*xx=0; } // we blocked return (int32_t **)-1; } // error? if ( g_errno ) return NULL; // . ptr to a list of ptrs to tag recs // . ip will be -1 on error return &m_msge1.m_ipBuf; } /* // really this could just check titledb in memory tree and tfndb and should // be really fast!! char **XmlDoc::getOutlinkIsIndexedVector () { if ( m_outlinkIsIndexedVectorValid ) return &m_msge2.m_isIndexedBuf; setStatus ( "getting outlink is indexed vector" ); Links *links = getLinks(); if ( ! 
links ) return NULL; // assume valid m_outlinkIsIndexedVectorValid = true; // go get it bool status = m_msge2.getIsIndexed ( links->m_linkPtrs , links->m_linkFlags , links->m_numLinks , false , // skip old? m_coll , m_niceness , m_masterState , m_masterLoop ); // set it //m_outlinkIsIndexedVector = m_msge2.m_isIndexedBuf; // we blocked if ( ! status ) return (char **)-1; // error? if ( g_errno ) return NULL; // ptr to a list of ptrs to tag recs return &m_msge2.m_isIndexedBuf; } */ /* char *XmlDoc::getIsVisible ( ) { if ( m_isVisibleValid ) return &m_isVisible; setStatus ( "getting is visible" ); // to get a live reading, invalidate tag rec from title rec m_oldTagRecValid = false; // . loop over all regular expression in the url filters table // . stop at first regular expression it matches int32_t *rn = getRegExpNum2 ( -1 ); // need to wait for a callback at this point (or we had critical error) if ( ! rn || rn == (int32_t *)-1 ) return (char *)rn; // assume yes m_isVisible = true; // and valid m_isVisibleValid = true; // no match if ( *rn == -1 ) return &m_isVisible; // get spider priority int32_t pr = m_cr->m_spiderPriorities[*rn]; // test it if ( pr == -2 ) m_isVisible = false; if ( pr == -3 ) m_isVisible = false; return &m_isVisible; } */ int32_t *XmlDoc::getUrlFilterNum ( ) { // return it if already set if ( m_urlFilterNumValid ) return &m_urlFilterNum; // note that setStatus ( "getting url filter row num"); // . make the partial new spider rec // . we need this for matching filters like lang==zh_cn // . crap, but then it matches "hasReply" when it should not // . PROBLEM! this is the new reply not the OLD reply, so it may // end up matching a DIFFERENT url filter num then what it did // before we started spidering it... //SpiderReply *newsr = getNewSpiderReply ( ); // note it //if ( ! newsr ) // log("doc: getNewSpiderReply: %s",mstrerror(g_errno)); //if ( ! newsr || newsr == (void *)-1 ) return (int32_t *)newsr; // need language i guess uint8_t *langId = getLangId(); if ( ! langId || langId == (uint8_t *)-1 ) return (int32_t *)langId; // make a fake one for now SpiderReply fakeReply; // just language for now, so we can FILTER by language if ( m_langIdValid ) fakeReply.m_langId = m_langId; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // this must be valid //if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; } int32_t spideredTime = getSpideredTime(); // get the spider request SpiderRequest *oldsr = &m_sreq; // null it out if invalid... if ( ! m_sreqValid ) oldsr = NULL; // do not set the spideredTime in the spiderReply to 0 // so we do not trigger the lastSpiderTime //int32_t saved = newsr->m_spideredTime; //newsr->m_spideredTime = 0; // // PROBLEM: we end up matching "isIndexed" in the url filters // even if this is a NEW document because we pass it in the spider // reply that we generate now even though another spider reply // may not exist. // // SOLUTION: just do not supply a spider reply, we only seem to // use the urlfilternum to get a diffbot api url OR to see if the // document is banned/filtered so we should delete it. otherwise // we were supplying "newsr" above... // . look it up // . use the old spidered date for "nowGlobal" so we can be consistent // for injecting into the "qatest123" coll int32_t ufn = ::getUrlFilterNum ( oldsr,&fakeReply,spideredTime,false, m_niceness,cr, false, // isOutlink? NULL); // put it back //newsr->m_spideredTime = saved; // bad news? 
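	// . a hedged reference sketch (kept out of the build with #if 0) of
	//   how the row number returned here is typically consumed: it is
	//   just an index into the parallel per-row arrays of the
	//   collection's url filter table (see getSpiderPriority() and
	//   getSpiderLinks() below). the ufn < 0 failure case is handled
	//   right below.
#if 0
	int32_t *ufnp = getUrlFilterNum();
	if ( ufnp && ufnp != (int32_t *)-1 && *ufnp >= 0 ) {
		// per-row settings from the url filters table
		int32_t priority = cr->m_spiderPriorities [ *ufnp ];
		bool    harvest  = cr->m_harvestLinks     [ *ufnp ];
		// the special FILTERED/BANNED priorities mean the doc
		// should not be indexed (see getIsFiltered() below)
	}
#endif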
	if ( ufn < 0 ) {
		log("build: failed to get url filter for xmldoc %s",
		    m_firstUrl.m_url);
		//g_errno = EBADENGINEER;
		//return NULL;
	}
	// store it
	m_urlFilterNum      = ufn;
	m_urlFilterNumValid = true;
	// set this too in case the url filters table changes while
	// we are spidering this and a row is inserted or deleted or something
	//SafeBuf *yy = &cr->m_spiderDiffbotApiUrl[ufn];
	// copy to ours
	//m_diffbotApiUrl.safeMemcpy ( yy );
	// ensure null term
	//m_diffbotApiUrl.nullTerm();
	//m_diffbotApiUrlValid = true;
	return &m_urlFilterNum;
}

// . both "u" and "site" must not start with http:// or https:// or protocol
bool isSiteRootFunc ( char *u , char *site ) {
	// get length of each
	int32_t slen = gbstrlen(site);//m_siteLen;
	int32_t ulen = gbstrlen(u);
	// "site" may or may not end in /, so remove that
	if ( site[slen-1] == '/' ) slen--;
	// same for url
	if ( u[ulen-1] == '/' ) ulen--;
	// skip http:// or https://
	if ( strncmp(u,"http://" ,7)==0 ) { u += 7; ulen -= 7; }
	if ( strncmp(u,"https://",8)==0 ) { u += 8; ulen -= 8; }
	if ( strncmp(site,"http://" ,7)==0 ) { site += 7; slen -= 7; }
	if ( strncmp(site,"https://",8)==0 ) { site += 8; slen -= 8; }
	// subtract default.asp etc. from "u"
	//if ( ulen > 15 && strncasecmp(u+ulen-11,"default.asp",11)==0 )
	//	ulen -= 11;
	//if ( ulen > 15 && strncasecmp(u+ulen-11,"default.html",12)==0 )
	//	ulen -= 12;
	//if ( ulen > 15 && strncasecmp(u+ulen-11,"index.html",10)==0 )
	//	ulen -= 10;
	// now they must match exactly
	if ( slen == ulen && ! strncmp ( site, u, ulen ) ) return true;
	// all done
	return false;
}

bool isSiteRootFunc3 ( char *u , int32_t siteRootHash32 ) {
	// get length of each
	int32_t ulen = gbstrlen(u);
	// remove trailing /
	if ( u[ulen-1] == '/' ) ulen--;
	// skip http:// or https://
	if ( strncmp(u,"http://" ,7)==0 ) { u += 7; ulen -= 7; }
	if ( strncmp(u,"https://",8)==0 ) { u += 8; ulen -= 8; }
	// now they must match exactly
	int32_t sh32 = hash32(u,ulen);
	return ( sh32 == siteRootHash32 );
}

char *XmlDoc::getIsSiteRoot ( ) {
	if ( m_isSiteRootValid ) return &m_isSiteRoot2;
	// get our site
	char *site = getSite ();
	if ( ! site || site == (char *)-1 ) return (char *)site;
	// get our url without the http:// or https://
	char *u = getFirstUrl()->getHost();
	// assume valid now
	m_isSiteRootValid = true;
	// get it
	m_isSiteRoot2 = m_isSiteRoot = isSiteRootFunc ( u , site );
	return &m_isSiteRoot2;
}

/*
bool XmlDoc::getIsOutlinkSiteRoot ( char *u , TagRec *gr ) {
	// get our site
	Tag *tag = gr->getTag("site");
	// make "host" point to u's hostname
	int32_t hostLen;
	char *host = getHostFast ( u , &hostLen );
	// use hostname?
	char *site;
	int32_t slen;
	if ( tag ) {
		site = tag->getTagData();
		slen = tag->getTagDataSize() - 1;
	}
	// otherwise, use hostname as site
	else {
		// must be end, or could be '/'
		if ( ! host[hostLen] || ! host[hostLen+1] ) return true;
		// i guess we were more than just a hostname, so not site root
		return false;
	}
	// get length of each
	int32_t ulen = gbstrlen(u);
	// "site" may or may not end in /, so remove that
	if ( site[slen-1] == '/' ) slen--;
	// same for url
	if ( u[ulen-1] == '/' ) ulen--;
	// now they must match exactly
	if ( slen == ulen && !
strncmp ( site, u, ulen ) ) return true; // all done return false; } */ int8_t *XmlDoc::getHopCount ( ) { // return now if valid if ( m_hopCountValid ) return &m_hopCount; setStatus ( "getting hop count" ); CollectionRec *cr = this->getCollRec(); if(cr && cr->m_isCustomCrawl ) { // for diffbot collections, compute hopcount without casting // site/rss to 0 hopcount -- copied from below LinkInfo *info1 = getLinkInfo1(); if (!info1 || info1 == (LinkInfo *)-1 ) return (int8_t *)info1; int32_t origHopCount = -1; if ( m_sreqValid ) { origHopCount = m_sreq.m_hopCount; } int32_t hc = -1; if(m_minInlinkerHopCount+1 < hc && m_minInlinkerHopCount >= 0 ) hc = m_minInlinkerHopCount + 1; if ( hc == -1 && m_minInlinkerHopCount >= 0 ) hc = m_minInlinkerHopCount + 1; if ( origHopCount < hc && origHopCount >= 0 ) hc = origHopCount; if ( hc == -1 && origHopCount >= 0 ) hc = origHopCount; if ( hc == -1 ) hc = 1; if ( hc > 0x7f ) hc = 0x7f; m_hopCountValid = true; m_hopCount = hc; //printf("Custom hopcount: %d for url: %s", //m_hopCount, this->ptr_firstUrl); return &m_hopCount; } // the unredirected url Url *f = getFirstUrl(); // get url as string, skip "http://" or "https://" //char *u = f->getHost(); // if we match site, we are a site root, so hop count is 0 //char *isr = getIsSiteRoot(); //if ( ! isr || isr == (char *)-1 ) return (int8_t *)isr; //if ( *isr ) { // m_hopCount = 0; // m_hopCountValid = true; // return &m_hopCount; //} // ping servers have 0 hop counts if ( f->isPingServer() ) { m_hopCount = 0; m_hopCountValid = true; return &m_hopCount; } char *isRSS = getIsRSS(); if ( ! isRSS || isRSS == (char *)-1) return (int8_t *)isRSS; // and now so do rss urls if ( *isRSS ) { // force it to one, not zero, otherwise it gets pounded // too hard on the aggregator sites. spider priority // is too high m_hopCount = 1; m_hopCountValid = true; return &m_hopCount; } // check for site root TagRec *gr = getTagRec(); if ( ! gr || gr == (TagRec *)-1 ) return (int8_t *)gr; // and site roots char *isSiteRoot = getIsSiteRoot(); if (!isSiteRoot ||isSiteRoot==(char *)-1) return (int8_t *)isSiteRoot; if ( *isSiteRoot ) { m_hopCount = 0; m_hopCountValid = true; return &m_hopCount; } // make sure m_minInlinkerHopCount is valid LinkInfo *info1 = getLinkInfo1(); if ( ! info1 || info1 == (LinkInfo *)-1 ) return (int8_t *)info1; // . fix bad original hop counts // . assign this hop count from the spider rec int32_t origHopCount = -1; if ( m_sreqValid ) origHopCount = m_sreq.m_hopCount; // derive our hop count from our parent hop count int32_t hc = -1; // . BUT use inlinker if better // . if m_linkInfo1Valid is true, then m_minInlinkerHopCount is valid if ( m_minInlinkerHopCount + 1 < hc && m_minInlinkerHopCount >= 0 ) hc = m_minInlinkerHopCount + 1; // or if parent is unknown, but we have a known inlinker with a // valid hop count, use the inlinker hop count then if ( hc == -1 && m_minInlinkerHopCount >= 0 ) hc = m_minInlinkerHopCount + 1; // or use our hop count from the spider rec if better if ( origHopCount < hc && origHopCount >= 0 ) hc = origHopCount; // or if neither parent or inlinker was valid hop count if ( hc == -1 && origHopCount >= 0 ) hc = origHopCount; // if we have no hop count at this point, i guess just pick 1! 
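	// . the fallback chain above, plus the default and one-byte clamp
	//   applied just below, boils down to the little pure function
	//   sketched here (hedged illustration only, kept out of the build
	//   with #if 0; pickHopCount is a made-up name). the ping server,
	//   rss and site-root special cases earlier in this function force
	//   a hop count of 0 or 1 before this logic is ever reached.
#if 0
	static int32_t pickHopCount ( int32_t minInlinkerHopCount, // -1 = unknown
				      int32_t origHopCount ) {      // -1 = unknown
		int32_t hc = -1;
		// prefer the best (smallest) known inlinker, plus one hop
		if ( minInlinkerHopCount >= 0 )
			hc = minInlinkerHopCount + 1;
		// the spider request's own hop count wins if it is smaller,
		// or if it is the only one known
		if ( origHopCount >= 0 && ( hc == -1 || origHopCount < hc ) )
			hc = origHopCount;
		// nothing known? just pick 1, then clamp to one byte for
		// TitleRec.h::m_hopCount
		if ( hc == -1 ) hc = 1;
		if ( hc > 0x7f ) hc = 0x7f;
		return hc;
	}
#endif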
if ( hc == -1 ) hc = 1; // truncate, hop count is only one byte in the TitleRec.h::m_hopCount if ( hc > 0x7f ) hc = 0x7f; // unknown hop counts (-1) are propogated, except for root urls m_hopCountValid = true; m_hopCount = hc; return &m_hopCount; } /* int8_t *XmlDoc::getOutlinkHopCountVector ( ) { if ( m_outlinkHopCountVectorValid ) return m_outlinkHopCountVector; // need these of course Links *links = getLinks(); if ( ! links || links == (Links *)-1 ) return (int8_t *)links; // and these for seeing if outlink is a site root TagRec ***grv = getOutlinkTagRecVector(); if ( ! grv || grv == (void *)-1 ) return (int8_t *)grv; // hop count of parent int8_t *ph = getHopCount(); if ( ! ph || ph == (void *)-1 ) return (int8_t *)ph; // int16_tcut int32_t n = links->getNumLinks(); // sanity check if ( m_outlinkHopCountVector ) { char *xx=NULL;*xx=0; } // make some space m_outlinkHopCountVector = (int8_t *)mmalloc ( n * 4 ,"xdhc"); // return NULL on error with g_errno set if ( ! m_outlinkHopCountVector ) return NULL; // save size m_outlinkHopCountVectorSize = n * 4; // stock it for ( int32_t i = 0 ; i < n ; i++ ) { // get it char *u = links->getLinkPtr(i); // and this TagRec *gr = (*grv)[i]; // flags linkflags_t flags = links->m_linkFlags[i]; // hop count. default to 1. int32_t hc = 1; if ( getIsOutlinkSiteRoot ( u , gr ) ) hc = 0; else if ( isPingServer ( u ) ) hc = 0; else if ( flags & LF_RSS ) hc = 0; else hc = *ph + 1; // assign it m_outlinkHopCountVector[i] = hc; } m_outlinkHopCountVectorValid = true; return m_outlinkHopCountVector; } */ //set to false fo rinjecting and validate it... if &spiderlinks=0 // should we spider links? char *XmlDoc::getSpiderLinks ( ) { // set it to false on issues //if ( m_indexCode ) { // m_spiderLinks = false; // m_spiderLinks2 = false; // m_spiderLinksValid = true ; } // this slows importing down because we end up doing ip lookups // for every outlink if "firstip" not in tagdb. // shoot. set2() already sets m_spiderLinksValid to true so we // have to override if importing. if ( m_isImporting && m_isImportingValid ) { m_spiderLinks = false; m_spiderLinks2 = false; m_spiderLinksValid = true; return &m_spiderLinks2; } // return the valid value if ( m_spiderLinksValid ) return &m_spiderLinks2; setStatus ( "getting spider links flag"); // do not add links now if doing the parser test if ( g_conf.m_testParserEnabled || m_isDiffbotJSONObject ) { m_spiderLinks = false; m_spiderLinks2 = false; m_spiderLinksValid = true; return &m_spiderLinks2; } CollectionRec *cr = getCollRec(); if ( ! cr ) return (char *)cr; int32_t *ufn = getUrlFilterNum(); if ( ! ufn || ufn == (void *)-1 ) return (char *)ufn; // if url filters forbids it if ( ! cr->m_harvestLinks[*ufn] ) { m_spiderLinksValid = true; m_spiderLinks2 = false; m_spiderLinks = false; return &m_spiderLinks2; } // hack for bulk job detection. never spider links //if ( cr->m_isCustomCrawl == 2 ) { // m_spiderLinks = false; // m_spiderLinks2 = false; // m_spiderLinksValid = true; // return &m_spiderLinks2; //} // check the xml for a meta robots tag Xml *xml = getXml(); if ( ! 
xml || xml == (Xml *)-1 ) return (char *)xml; // assume true m_spiderLinks = true; // or if meta tag says not to char buf1 [256]; char buf2 [256]; buf1[0] = '\0'; buf2[0] = '\0'; xml->getMetaContent ( buf1, 255 , "robots" , 6 ); xml->getMetaContent ( buf2, 255 , "gigabot", 7 ); if ( strstr ( buf1 , "nofollow" ) || strstr ( buf2 , "nofollow" ) || strstr ( buf1 , "none" ) || strstr ( buf2 , "none" ) ) m_spiderLinks = false; // spider links if doing custom crawl or not using robots.txt if ( ! m_useRobotsTxt || cr->m_isCustomCrawl ) m_spiderLinks = true; // spider request forbade it? diffbot.cpp crawlbot api when // specifying urldata (list of urls to add to spiderdb) usually // they do not want the links crawled i'd imagine. if ( m_sreqValid && m_sreq.m_avoidSpiderLinks ) m_spiderLinks = false; // also check in url filters now too // set shadow member m_spiderLinks2 = m_spiderLinks; // validate m_spiderLinksValid = true; return &m_spiderLinks2; } // // . DELETE ALL SPAM FROM THE INDEX!!! // // . for a page to be spam these must ALL be true, with the current ip: // . site is not in google // . site has no "stars" in google's dir // . site has no authorityinlink tag // . site has less than 10 fresh inlinks // . site has less than 500 total inlinks // . ip is not from ultra dns // . TODO: site is not linked to by wikipedia.com // . TODO: site is not linked to by about.com // . TODO: site is not linked to by a .gov site // . the page IP address changed significantly since the same since last // time we indexed it when it was not spam (if applicable) // // . if the page was indexed at one time and then we decided it was spam, // and its ip changed significantly since last time, we just // reschedule the spider rec for 15 days later and do not touch anything // else. that way we keep the index somewhat stable. // /* char *XmlDoc::getIsSpam() { // return it if valid if ( m_isSpamValid ) return &m_isSpam; setStatus ("getting is spam"); // assume it is not spam m_isSpam = false; // debug //logf(LOG_DEBUG,"doc: NOT SPAM!!"); //m_isSpamValid = true; return &m_isSpam; // we disable this check for the contact doc if ( m_spamCheckDisabled ) { m_isSpamValid = true; return &m_isSpam; } // . i put this here for debugging purposes // . some big sites have no easy to find contact info // . get our domain Url *fu = getFirstUrl(); char *dom = fu->getDomain (); int32_t dlen = fu->getDomainLen(); if ( dlen == 12 && !strncmp(dom,"facebook.com",dlen) ) { m_isSpamValid = true; return &m_isSpam; } if ( dlen == 9 && !strncmp(dom,"yahoo.com",dlen) ) { m_isSpamValid = true; return &m_isSpam; } // get our site's tag rec TagRec *gr = getTagRec(); if ( ! gr || gr == (TagRec *)-1 ) return (char *)gr; // are we already in the index? //char *isIndexed = getIsIndexed(); //if (!isIndexed || isIndexed == (char *)-1 ) return (char *)isIndexed; // this will update m_oldTagRec with the latest info if its stale int32_t *sni = getSiteNumInlinks(); if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni; char *hci = getHasContactInfo(); if ( ! hci || hci == (char *)-1 ) return (char *)hci; //int32_t *ip = getIp(); //if ( ! ip || ip == (int32_t *)-1 ) return (char *)ip; //XmlDoc **od = getOldXmlDoc ( ); //if ( ! od || od == (void *)-1 ) return (char *)od; //int32_t oldIp = 0 ; //if ( *od ) { // int32_t *ip2 = (*od)->getIp(); // if ( ! ip2 || ip2 == (int32_t *)-1 ) return (char *)ip2; // oldIp = *ip2; //} // i am guessing that most sites that use ultra dns will have a lot // of site inlinks! 
so comment this our for now //char *ultra = getIpIsUltraDns(); //if ( ultra || ultra==(char *)-1 ) return (char *)ultra; // spammers do not use ultradns //if ( *ultra ) return false; Url *f = getFirstUrl(); char *u = f->getUrl(); int32_t now = getTimeGlobal(); // this will be valid m_isSpamValid = true; // use this routine m_isSpam = isSpam ( u, gr, now, // *isIndexed, //oldIp , // *ip , *hci ); // we are doomed! delete in its entirety if ( m_isSpam ) m_indexCode = EDOCSPAM; return &m_isSpam; } // . "u" must be NORMALIZED. i.e. start with http:// or https:// etc. // . we call this on outlinks as well // . we no longer look at the old and newip to determine ownership change, // because that is not reliable enough // . we now maybe rely on a major change to the site root page... bool XmlDoc::isSpam ( char *u , TagRec *gr , int32_t now , char isIndexed , int32_t oldIp , int32_t newIp , bool hasContactInfo ) { // we need to mine that same database that firefox does... Tag *tag = gr->getTag ( "malware" ); if ( tag && tag->getTagData()[0] != '0' ) return true; // if they have contact info, that is a really good sign if ( hasContactInfo ) return false; // .edu and .gov sites are always fine int32_t tlen; char *tld = getTLDFast(u,&tlen); if ( tlen == 3 && ! strncmp(tld,"edu",3) ) return false; if ( tlen == 3 && ! strncmp(tld,"gov",3) ) return false; // the current top ip address //int32_t top = newIp & 0x00ffffff; // TODO: in the case of multiple ips on one domain, ensure we select // the same IP every time we do a lookup in MsgC. // ok if in google if ( gr->getTag ( "ingoogle" ) ) return false; //if ( tag && ((tag->m_ip & 0x00ffffff) == top) ) return false; // can also be in google's dmoz dir. must have a decent page rank. if ( gr->getTag ( "pagerank" ) ) return false; //if ( tag && ((tag->m_ip & 0x00ffffff) == top) ) return false; // . if was linked to by a high quality root as a new external outlink // . TODO: include about.com and wikipedia.com i guess (TODO) if ( gr->getTag ( "authorityinlink" ) ) return false; //if ( tag && ((tag->m_ip & 0x00ffffff) == top) ) return false; tag = gr->getTag("sitenuminlinks"); // i guess if it has no entry for this, assume the best if ( ! tag ) return false; // or just a massive amount of any-age inlinks if ( atol(tag->getTagData()) >= 500 ) return false; tag = gr->getTag("sitenuminlinksfresh"); // i guess if it has no entry for this, assume the best if ( ! tag ) return false; // if site has enough good FRESH inlinks from the last 3 mos, no spam if( atol(tag->getTagData()) >= 10 ) return false; // if we are old and the top 3 bytes of the ip is the same as the last // time we were indexed and thereby not identified as spam... // then assume we are still not spam! because it was unlikely that // the domain ownership changed... //if ( isIndexed (oldIp & 0x00ffffff) == top ) return false; // if they have contact info, that is a really good sign //if ( hasContactInfo && (oldIp & 0x00ffffff) == top ) return false; // if first time... accept them if they got contact info //if ( ! oldIp && hasContactInfo ) return false; // . if it has had the same ip for the last 365 days, let it in // . getTagRec() updates this tag immediately if the ip changes // . so we can't really use this tag for outlinks, because they might // never get thrown into spiderdb to where we can add this tag to // their tag rec... UNLESS msgc/msge were to update their tag rec... // . i've seen quite a few old spam sites/pages. they just kinda stay // there. so let's not do this... 
//tag = gr->get("iptimestamp"); //int32_t now; //if ( tag ) now = getTimeGlobal(); //if(tag&&now-atol(tag->getTagData())>365*24*3600&& // ((tag->m_ip&0x00ffffff)==top)) // return false; return true; } */ // should we index the doc? if already indexed, and is filtered, we delete it char *XmlDoc::getIsFiltered ( ) { if ( m_isFilteredValid ) return &m_isFiltered; if ( m_isDiffbotJSONObject ) { m_isFiltered = false; m_isFilteredValid = true; return &m_isFiltered; } int32_t *priority = getSpiderPriority(); if ( ! priority || priority == (void *)-1 ) return (char *)priority; m_isFiltered = false; if ( *priority == SPIDER_PRIORITY_FILTERED ) m_isFiltered = true; if ( *priority == SPIDER_PRIORITY_BANNED ) m_isFiltered = true; m_isFilteredValid = true; return &m_isFiltered; } int32_t *XmlDoc::getSpiderPriority ( ) { if ( m_priorityValid ) return &m_priority; setStatus ("getting spider priority"); // need tagrec to see if banned TagRec *gr = getTagRec(); if ( ! gr || gr == (TagRec *)-1 ) return (int32_t *)gr; // this is an automatic ban! if ( gr->getLong("manualban",0) ) { m_priority = SPIDER_PRIORITY_BANNED; m_priorityValid = true; return &m_priority; } int32_t *ufn = getUrlFilterNum(); if ( ! ufn || ufn == (void *)-1 ) return (int32_t *)ufn; // sanity check if ( *ufn < 0 ) { char *xx=NULL;*xx=0; } CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; m_priority = cr->m_spiderPriorities[*ufn]; m_priorityValid = true; return &m_priority; } bool XmlDoc::logIt ( SafeBuf *bb ) { // set errCode int32_t errCode = m_indexCode; if ( ! errCode && g_errno ) errCode = g_errno; // were we new? //char isIndexed = -1; //if ( m_isIndexedValid ) isIndexed = m_isIndexed; bool isNew = true; if ( m_sreqValid && m_sreq.m_hadReply ) isNew = false; // keep track of stats g_stats.addSpiderPoint ( errCode, isNew ); // !isIndexed ); // do not log if we should not, saves some time //if ( ! g_conf.m_logSpideredUrls && ! m_forceDelete ) return true; if ( ! g_conf.m_logSpideredUrls ) return true; // patch the ip int32_t ip = m_ip; // invalid? if ( ! m_ipValid ) ip = 0; char *coll = "nuked"; CollectionRec *cr = getCollRec(); if ( cr ) coll = cr->m_coll; SafeBuf tmpsb; // print into this now SafeBuf *sb = &tmpsb; // log into provided safebuf if not null if ( bb ) sb = bb; // // coll // sb->safePrintf("coll=%s ",coll); sb->safePrintf("collnum=%"INT32" ",(int32_t)m_collnum); // // print ip // if ( m_ipValid ) sb->safePrintf("ip=%s ",iptoa(m_ip) ); if ( m_firstIpValid ) sb->safePrintf("firstip=%s ",iptoa(m_firstIp) ); // . first ip from spider req if it is fake // . we end up spidering the same url twice because it will have // different "firstips" in the SpiderRequest key. maybe just // use domain hash instead of firstip, and then let msg13 // make queues in the case of hammering an ip, which i think // it already does... 
if ( m_sreqValid && m_sreq.m_firstIp != m_firstIp ) sb->safePrintf("fakesreqfirstip=%s ",iptoa(m_sreq.m_firstIp) ); // // print when this spider request was added // //if ( m_sreqValid && m_sreq.m_addedTime ) { // struct tm *timeStruct = gmtime ( &m_sreq.m_addedTime ); // char tmp[64]; // strftime(tmp,64,"requestadded=%b-%d-%Y(%H:%M:%S)", timeStruct); // sb->safePrintf("%s(%"UINT32") ",tmp,m_sreq.m_addedTime); //} // // print spidered time // //if ( m_spideredTimeValid ) { time_t spideredTime = (time_t)getSpideredTime(); struct tm *timeStruct = gmtime ( &spideredTime ); char tmp[64]; strftime(tmp,64,"spidered=%b-%d-%Y(%H:%M:%S)", timeStruct ); sb->safePrintf("%s(%"UINT32") ",tmp,(uint32_t)spideredTime); // when it was scheduled to be spidered if ( m_sreqValid && m_sreq.m_addedTime ) { time_t ts = m_sreq.m_addedTime; struct tm *timeStruct = gmtime ( &ts ); char tmp[64]; strftime ( tmp , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct ); sb->safePrintf("scheduledtime=%s(%"UINT32") ", tmp,(uint32_t)m_sreq.m_addedTime); } // print first indexed time if ( m_firstIndexedDateValid ) { time_t ts = m_firstIndexedDate; timeStruct = gmtime ( &ts );//m_firstIndexedDate ); strftime(tmp,64,"firstindexed=%b-%d-%Y(%H:%M:%S)", timeStruct); sb->safePrintf("%s(%"UINT32") ",tmp, (uint32_t)m_firstIndexedDate); } //if ( ! m_isIndexedValid ) { char *xx=NULL;*xx=0; } // just use the oldurlfilternum for grepping i guess //if ( m_oldDocValid && m_oldDoc ) // when injecting a request we have no idea if it had a reply or not if ( m_sreqValid && m_sreq.m_isInjecting ) sb->safePrintf("firsttime=? "); else if ( m_sreqValid && m_sreq.m_hadReply ) sb->safePrintf("firsttime=0 "); else if ( m_sreqValid ) sb->safePrintf("firsttime=1 "); else sb->safePrintf("firsttime=? "); // // print # of link texts // if ( m_linkInfo1Valid && ptr_linkInfo1 ) { LinkInfo *info = ptr_linkInfo1; int32_t nt = info->getNumLinkTexts(); sb->safePrintf("goodinlinks=%"INT32" ",nt ); // new stuff. includes ourselves i think. 
sb->safePrintf("ipinlinks=%"INT32" ",info->m_numUniqueIps); sb->safePrintf("cblockinlinks=%"INT32" ",info->m_numUniqueCBlocks); } // // print # of link texts from 2nd coll // if ( m_linkInfo2Valid ) { LinkInfo *info = ptr_linkInfo2; int32_t nt = 0; if ( info ) nt = info->getNumLinkTexts(); if ( nt ) sb->safePrintf("goodinlinks2=%"INT32" ",nt ); } if ( m_docIdValid ) sb->safePrintf("docid=%"UINT64" ",m_docId); if ( m_siteNumInlinksValid ) { sb->safePrintf("siteinlinks=%04"INT32" ",m_siteNumInlinks ); sb->safePrintf("siteipinlinks=%"INT32" ", m_siteNumInlinksUniqueIp); sb->safePrintf("sitecblockinlinks=%"INT32" ", m_siteNumInlinksUniqueCBlock); int32_t sr = ::getSiteRank ( m_siteNumInlinks ); sb->safePrintf("siterank=%"INT32" ", sr ); } // int16_tcut int64_t uh48 = hash64b ( m_firstUrl.m_url ); // mask it uh48 &= 0x0000ffffffffffffLL; sb->safePrintf ("uh48=%"UINT64" ",uh48 ); if ( m_charsetValid ) sb->safePrintf("charset=%s ",get_charset_str(m_charset)); if ( m_contentTypeValid ) sb->safePrintf("ctype=%s ", g_contentTypeStrings [m_contentType]); if ( m_sreqValid ) sb->safePrintf("parentlang=%02"INT32"(%s) ", (int32_t)m_sreq.m_parentLangId, getLanguageAbbr(m_sreq.m_parentLangId)); if ( m_langIdValid ) sb->safePrintf("lang=%02"INT32"(%s) ",(int32_t)m_langId, getLanguageAbbr(m_langId)); if ( m_countryIdValid ) sb->safePrintf("country=%02"INT32"(%s) ",(int32_t)m_countryId, g_countryCode.getAbbr(m_countryId)); if ( m_hopCountValid ) sb->safePrintf("hopcount=%02"INT32" ",(int32_t)m_hopCount); if ( m_contentValid ) sb->safePrintf("contentlen=%06"INT32" ",m_contentLen); if ( m_robotsTxtLenValid ) sb->safePrintf("robotstxtlen=%04"INT32" ",m_robotsTxtLen ); if ( m_contentHash32Valid ) sb->safePrintf("ch32=%010"UINT32" ",m_contentHash32); if ( m_domHash32Valid ) sb->safePrintf("dh32=%010"UINT32" ",m_domHash32); if ( m_siteHash32Valid ) sb->safePrintf("sh32=%010"UINT32" ",m_siteHash32); if ( m_isPermalinkValid ) sb->safePrintf("ispermalink=%"INT32" ",(int32_t)m_isPermalink); if ( m_isRSSValid ) sb->safePrintf("isrss=%"INT32" ",(int32_t)m_isRSS); if ( m_linksValid ) sb->safePrintf("hasrssoutlink=%"INT32" ", (int32_t)m_links.hasRSSOutlink() ); if ( m_numOutlinksAddedValid ) sb->safePrintf("outlinksadded=%04"INT32" ",(int32_t)m_numOutlinksAdded); if ( m_metaListValid ) sb->safePrintf("addlistsize=%05"INT32" ",(int32_t)m_metaListSize); else sb->safePrintf("addlistsize=%05"INT32" ",(int32_t)0); if ( m_addedSpiderRequestSizeValid ) sb->safePrintf("addspiderreqsize=%05"INT32" ", m_addedSpiderRequestSize); else sb->safePrintf("addspiderreqsize=%05"INT32" ",0); if ( m_addedSpiderReplySizeValid ) sb->safePrintf("addspiderrepsize=%05"INT32" ", m_addedSpiderReplySize); else sb->safePrintf("addspiderrepsize=%05"INT32" ",0); if ( m_addedStatusDocSizeValid ) sb->safePrintf("addstatusdocsize=%05"INT32" ", m_addedStatusDocSize); else sb->safePrintf("addstatusdocsize=%05"INT32" ",0); if ( size_imageData && m_imageDataValid ) { // url is in data now ThumbnailArray *ta = (ThumbnailArray *)ptr_imageData; int32_t nt = ta->getNumThumbnails(); ThumbnailInfo *ti = ta->getThumbnailInfo(0); sb->safePrintf("thumbnail=%s,%"INT32"bytes,%"INT32"x%"INT32",(%"INT32") ", ti->getUrl(), ti->m_dataSize, ti->m_dx, ti->m_dy, nt); } else sb->safePrintf("thumbnail=none "); /* if ( m_hasAddressValid && m_addressesValid ) sb->safePrintf("numaddr=%"INT32" ",(int32_t)m_addresses.m_numValid); //if ( m_skipIndexingValid ) // sb->safePrintf("skipindexing=%"INT32" ",(int32_t)m_skipIndexing); if ( m_hasTODValid ) sb->safePrintf("hastod=%"INT32" 
",(int32_t)m_hasTOD); */ // get the content type uint8_t ct = CT_UNKNOWN; if ( m_contentTypeValid ) ct = m_contentType; bool isRoot = false; if ( m_isSiteRootValid ) isRoot = m_isSiteRoot; // make sure m_minInlinkerHopCount is valid LinkInfo *info1 = NULL; if ( m_linkInfo1Valid ) info1 = ptr_linkInfo1; //bool isContacty = getIsContacty(&m_firstUrl, // info1, // m_hopCount , // ct , // contentType // isRoot , // m_niceness ); /* // just use this now if ( m_hasContactInfoValid ) sb->safePrintf("iscontacty=%"INT32" ",(int32_t)m_hasContactInfo); if ( m_hasSiteVenueValid ) sb->safePrintf("hassitevenue=%"INT32" ",(int32_t)m_hasSiteVenue); */ // hack this kinda // . in PageInject.cpp we do not have a valid priority without // blocking because we did a direct injection! // so ignore this!! // . a diffbot json object, an xmldoc we set from a json object // in a diffbot reply, is a childDoc (m_isChildDoc) is true // and does not have a spider priority. only the parent doc // that we used to get the diffbot reply (array of json objects) // will have the spider priority if ( ! getIsInjecting() && ! m_isDiffbotJSONObject ) { //int32_t *priority = getSpiderPriority(); //if ( ! priority ||priority==(void *)-1){char *xx=NULL;*xx=0;} if ( m_priorityValid ) sb->safePrintf("priority=%"INT32" ", (int32_t)m_priority); } // should be valid since we call getSpiderPriority() if ( m_urlFilterNumValid ) sb->safePrintf("urlfilternum=%"INT32" ",(int32_t)m_urlFilterNum); if ( m_diffbotApiUrlValid && m_diffbotApiUrl.getBufStart() && m_diffbotApiUrl.getBufStart()[0] ) sb->safePrintf("diffbotjsonobjects=%"INT32" ", (int32_t)m_diffbotJSONCount); if ( m_siteValid ) sb->safePrintf("site=%s ",ptr_site); // // . sometimes we print these sometimes we do not // . put this at the end so we can awk out the above fields reliably // // print when it was last spidered if ( m_oldDocValid && m_oldDoc ) { time_t spideredTime = m_oldDoc->getSpideredTime(); struct tm *timeStruct = gmtime ( &spideredTime ); char tmp[64]; strftime(tmp,64,"lastspidered=%b-%d-%Y(%H:%M:%S)",timeStruct); sb->safePrintf("%s(%"UINT32") ", tmp,(uint32_t)spideredTime); } // print new pubdate if ( m_pubDateValid && m_pubDate!=(uint32_t)-1 && m_pubDate!=0 ) { char tmp[64]; time_t ts = (time_t)m_pubDate; struct tm *timeStruct = gmtime ( &ts ); strftime ( tmp, 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct ); sb->safePrintf("pubdate=%s ", tmp ); } if ( m_linkInfo1Valid && ptr_linkInfo1 && ptr_linkInfo1->hasRSSItem()) sb->safePrintf("hasrssitem=1 "); // was the content itself injected? if ( m_wasContentInjected ) sb->safePrintf("contentinjected=1 "); else sb->safePrintf("contentinjected=0 "); // might have just injected the url and downloaded the content? if ( (m_sreqValid && m_sreq.m_isInjecting) || (m_isInjecting && m_isInjectingValid) ) sb->safePrintf("urlinjected=1 "); else sb->safePrintf("urlinjected=0 "); if ( m_sreqValid && m_sreq.m_isAddUrl ) sb->safePrintf("isaddurl=1 "); else sb->safePrintf("isaddurl=0 "); if ( m_sreqValid && m_sreq.m_isPageReindex ) sb->safePrintf("pagereindex=1 "); if ( m_spiderLinksValid && m_spiderLinks ) sb->safePrintf("spiderlinks=1 "); if ( m_spiderLinksValid && ! m_spiderLinks ) sb->safePrintf("spiderlinks=0 "); if ( m_crawlDelayValid && m_crawlDelay != -1 ) sb->safePrintf("crawldelayms=%"INT32" ",(int32_t)m_crawlDelay); if ( m_recycleContent ) sb->safePrintf("recycleContent=1 "); if ( m_exactContentHash64Valid ) sb->safePrintf("exactcontenthash=%"UINT64" ", m_exactContentHash64 ); // . print percent changed // . only print if non-zero! 
if ( m_percentChangedValid && m_oldDocValid && m_oldDoc && m_percentChanged ) sb->safePrintf("changed=%.00f%% ",m_percentChanged); // only print if different now! good for grepping changes if ( m_oldDocValid && m_oldDoc && m_oldDoc->m_docId != m_docId ) sb->safePrintf("olddocid=%"UINT64" ",m_oldDoc->m_docId); // only print if different now! good for grepping changes if ( m_sreqValid && m_sreq.m_ufn >= 0 && m_sreq.m_ufn != m_urlFilterNum ) sb->safePrintf("oldurlfilternum=%"INT32" ", (int32_t)m_sreq.m_ufn); if ( m_sreqValid && m_sreq.m_priority >= 0 && m_sreq.m_priority != m_priority ) sb->safePrintf("oldpriority=%"INT32" ", (int32_t)m_sreq.m_priority); if ( m_oldDoc && m_oldDoc->m_langIdValid && m_oldDoc->m_langId != m_langId ) sb->safePrintf("oldlang=%02"INT32"(%s) ",(int32_t)m_oldDoc->m_langId, getLanguageAbbr(m_oldDoc->m_langId)); if ( m_useSecondaryRdbs && m_useTitledb && m_logLangId != m_langId ) sb->safePrintf("oldlang=%02"INT32"(%s) ",(int32_t)m_logLangId, getLanguageAbbr(m_logLangId)); if ( m_useSecondaryRdbs && m_useTitledb && m_logSiteNumInlinks != m_siteNumInlinks ) sb->safePrintf("oldsiteinlinks=%04"INT32" ",m_logSiteNumInlinks); if ( m_useSecondaryRdbs && m_useTitledb && m_oldDocValid && m_oldDoc && strcmp(ptr_site,m_oldDoc->ptr_site) ) sb->safePrintf("oldsite=%s ",m_oldDoc->ptr_site); // . print old pubdate // . -1 means unsupported, 0 means could not find one // . only print if different now! good for grepping changes if ( m_oldDocValid && m_oldDoc && m_oldDoc->m_pubDate!= (uint32_t)-1 && m_oldDoc->m_pubDate !=0 && m_oldDoc->m_pubDate != m_pubDate ) { char tmp[64]; time_t ts = m_oldDoc->m_pubDate; struct tm *timeStruct = gmtime ( &ts ); strftime ( tmp, 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct ); sb->safePrintf("oldpubdate=%s ",tmp ); } if ( m_isAdultValid ) sb->safePrintf("isadult=%"INT32" ",(int32_t)m_isAdult); // only print if different now! good for grepping changes if ( m_oldDocValid && m_oldDoc && m_oldDoc->m_siteNumInlinks >= 0 && m_oldDoc->m_siteNumInlinks != m_siteNumInlinks ) { int32_t sni = -1; if ( m_oldDoc ) sni = m_oldDoc->m_siteNumInlinks; sb->safePrintf("oldsiteinlinks=%04"INT32" ",sni); } // Spider.cpp sets m_sreq.m_errCount before adding it to doledb if ( m_sreqValid ) // && m_sreq.m_errCount ) sb->safePrintf("errcnt=%"INT32" ",(int32_t)m_sreq.m_errCount ); else sb->safePrintf("errcnt=? "); if ( ptr_redirUrl ) { // m_redirUrlValid && m_redirUrlPtr ) { sb->safePrintf("redir=%s ",ptr_redirUrl);//m_redirUrl.getUrl()); if ( m_numRedirects > 2 ) sb->safePrintf("numredirs=%"INT32" ",m_numRedirects); } if ( m_isDupValid && m_isDup ) sb->safePrintf("dupofdocid=%"INT64" ",m_docIdWeAreADupOf); if ( m_firstUrlValid ) sb->safePrintf("url=%s ",m_firstUrl.m_url); else sb->safePrintf("urldocid=%"INT64" ",m_docId); // // print error/status // sb->safePrintf(": %s",mstrerror(m_indexCode)); // breathe QUICKPOLL ( m_niceness ); // if safebuf provided, do not log to log if ( bb ) return true; // log it out logf ( LOG_INFO , "build: %s", //getFirstUrl()->getUrl(), sb->getBufStart() ); return true; } // . returns false and sets g_errno on error // . make sure that the title rec we generated creates the exact same // meta list as what we got bool XmlDoc::doConsistencyTest ( bool forceTest ) { if ( ! m_doConsistencyTesting ) return true; // if we had an old doc then our meta list will have removed // stuff already in the database from indexing the old doc. // so it will fail the parsing consistency check... because of // the 'incremental indexing' algo above // disable for now... 
just a secondfor testing cheatcc.com if ( m_oldDoc && m_oldDocValid && g_conf.m_doIncrementalUpdating ) return true; // if not test coll skip this //if ( strcmp(cr->m_coll,"qatest123") ) return true; // title rec is null if we are reindexing an old doc // and "unchanged" was true. if ( m_unchangedValid && m_unchanged ) { if ( ! m_titleRecBufValid ) return true; if ( m_titleRecBuf.length()==0 ) return true; } CollectionRec *cr = getCollRec(); if ( ! cr ) return true; // leave this uncommented so we can see if we are doing it setStatus ( "doing consistency check" ); // log debug log("spider: doing consistency check for %s",ptr_firstUrl); // . set another doc from that title rec // . do not keep on stack since so huge! XmlDoc *doc ; try { doc = new ( XmlDoc ); } catch ( ... ) { g_errno = ENOMEM; return false; } mnew ( doc , sizeof(XmlDoc),"xmldcs"); if ( ! doc->set2 ( m_titleRecBuf.getBufStart() , -1 , cr->m_coll , NULL , m_niceness , // no we provide the same SpiderRequest so that // it can add the same SpiderReply to the metaList &m_sreq ) ) { mdelete ( doc , sizeof(XmlDoc) , "xdnuke"); delete ( doc ); return false; } // . some hacks // . do not look up title rec in titledb, assume it is new doc->m_isIndexed = false; doc->m_isIndexedValid = true; // inherit this doc's tag rec since it has not called updateTagdb() yet //doc->ptr_tagRecData = ptr_tagRecData; //doc->size_tagRecData = size_tagRecData; // getNewSpiderReply() calls getDownloadEndTime() which is not valid // and causes the page to be re-downloaded, so stop that..! doc->m_downloadEndTime = m_downloadEndTime; doc->m_downloadEndTimeValid = true; // inherit doledb key as well to avoid a core there doc->m_doledbKey = m_doledbKey; // skip the robots.txt lookup! that was causing this too block! //doc->m_isAllowed = true; //doc->m_isAllowedValid = true; // do not get outlink info for this, that stuff is for adding outlinks // to spiderdb, and tagdb may have changed. so we can't really compare // spider recs! if this is false then the call to doc->getMetaList() // blocks to lookup the tagdb and titledb recs for each outlink! // therefore, set it to true! //doc->m_isInjecting = true; // mdw: shouldn't this have the same effect? //doc->m_spiderLinks2 = false; //doc->m_spiderLinksValid = true; // flag it doc->m_doingConsistencyCheck = true; // get get its metalist. rv = return value char *rv = doc->getMetaList ( ); // sanity check - compare urls if ( doc->m_firstUrl.m_ulen != m_firstUrl.m_ulen){char *xx=NULL;*xx=0;} // error setting it? if ( ! rv ) { // sanity check if ( ! g_errno ) { char *xx=NULL;*xx=0; } // free it mdelete ( doc , sizeof(XmlDoc) , "xdnuke"); delete ( doc ); // error return false; } // blocked? that is not allowed if ( rv == (void *)-1 ) { char *xx=NULL; *xx=0; } // compare with the old list char *list1 = m_metaList; int32_t listSize1 = m_metaListSize; char *list2 = doc->m_metaList; int32_t listSize2 = doc->m_metaListSize; // show it for now //log("build: printing meta list 1"); //printMetaList(list1,list1+listSize1,NULL); //log("build: printing meta list 2"); //printMetaList(list2,list2+listSize2,NULL); // do a compare HashTableX ht1; HashTableX ht2; ht1.set ( sizeof(key224_t),4,262144,NULL,0,false,m_niceness,"xmlht1"); ht2.set ( sizeof(key224_t),4,262144,NULL,0,false,m_niceness,"xmlht2"); // format of a metalist... 
see XmlDoc::addTable() where it adds keys // from a table into the metalist // <nosplitflag|rdbId><key><dataSize><data> // where nosplitflag is 0x80 char *p1 = list1; char *p2 = list2; char *pend1 = list1 + listSize1; char *pend2 = list2 + listSize2; // see if each key in list1 is in list2 if ( ! hashMetaList ( &ht1 , p1 , pend1 , false ) ) { char *xx=NULL;*xx=0; mdelete ( doc , sizeof(XmlDoc) , "xdnuke"); delete ( doc ); return log("doc: failed consistency test for %s",ptr_firstUrl); } if ( ! hashMetaList ( &ht2 , p2 , pend2 , false ) ) { char *xx=NULL;*xx=0; mdelete ( doc , sizeof(XmlDoc) , "xdnuke"); delete ( doc ); return log("doc: failed consistency test for %s",ptr_firstUrl); } // . now make sure each list matches the other // . first scan the guys in "p1" and make sure in "ht2" hashMetaList ( &ht2 , p1 , pend1 , true ); // . second scan the guys in "p2" and make sure in "ht1" hashMetaList ( &ht1 , p2 , pend2 , true ); mdelete ( doc , sizeof(XmlDoc) , "xdnuke"); delete ( doc ); log ("spider: passed consistency test for %s",ptr_firstUrl ); // no serious error, although there might be an inconsistency return true; } int32_t XmlDoc::printMetaList ( ) { SafeBuf sb; printMetaList ( m_metaList , m_metaList + m_metaListSize , &sb ); fprintf(stderr,"%s\n",sb.getBufStart()); return 0; } #define TABLE_ROWS 25 // print this also for page parser output! void XmlDoc::printMetaList ( char *p , char *pend , SafeBuf *sb ) { verifyMetaList ( p , pend , false ); SafeBuf tmp; if ( ! sb ) sb = &tmp; char *hdr = "<table border=1>\n" "<tr>" "<td><b>rdb</b></td>" "<td><b>del?</b></td>" "<td><b>shardByTermId?</b></td>" // illustrates key size "<td><b>key</b></td>" // break it down. based on rdb, of course. "<td><b>desc</b></td>" "</tr>\n" ; sb->safePrintf("%s",hdr); int32_t recSize = 0; int32_t rcount = 0; for ( ; p < pend ; p += recSize ) { // get rdbid uint8_t rdbId = *p & 0x7f; // skip p++; // get key size int32_t ks = getKeySizeFromRdbId ( rdbId ); // point to it char *rec = p; // init this int32_t recSize = ks; // convert into a key128_t, the biggest possible key //key224_t k ; char k[MAX_KEY_BYTES]; if ( ks > MAX_KEY_BYTES ) { char *xx=NULL;*xx=0; } //k.setMin(); gbmemcpy ( &k , p , ks ); // is it a negative key? char neg = false; if ( ! ( p[0] & 0x01 ) ) neg = true; // this is now a bit in the posdb key so we can rebalance char shardByTermId = false; if ( rdbId==RDB_POSDB && g_posdb.isShardedByTermId(k)) shardByTermId = true; // skip it p += ks; // get datasize int32_t dataSize = getDataSizeFromRdbId ( rdbId ); // . always zero if key is negative // . this is not the case unfortunately... 
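		// . before the negative-key / variable-size handling just
		//   below, here is the record framing this loop decodes
		//   gathered in one place as a hedged stand-alone sketch
		//   (kept out of the build with #if 0; walkMetaList is a
		//   made-up name, the two size helpers are the real ones
		//   used in this file)
#if 0
		// metalist framing: <flags|rdbId><key><dataSize?><data>
		//   - one rdbId byte, high bit 0x80 is the nosplit/shard flag
		//   - key size depends on the rdb (getKeySizeFromRdbId)
		//   - a 4-byte dataSize only for variable-width rdbs (-1)
		//   - negative keys (low bit of key byte 0 clear) carry no data
		static void walkMetaList ( char *p , char *pend ) {
			for ( ; p < pend ; ) {
				uint8_t rdbId = *p++ & 0x7f;
				int32_t ks    = getKeySizeFromRdbId ( rdbId );
				char   *key   = p;
				bool    neg   = ! ( key[0] & 0x01 );
				p += ks;
				int32_t dataSize = getDataSizeFromRdbId (rdbId);
				if ( neg ) dataSize = 0;
				else if ( dataSize == -1 ) {
					dataSize = *(int32_t *)p;
					p += 4;
				}
				// skip the payload to reach the next record
				p += dataSize;
			}
		}
#endif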
if ( neg ) dataSize = 0; // if -1, read it in if ( dataSize == -1 ) { dataSize = *(int32_t *)p; // inc this recSize += 4; // sanity check if ( dataSize < 0 ) { char *xx=NULL;*xx=0; } p += 4; } // point to it char *data = p; // skip the data p += dataSize; // inc it recSize += dataSize; // NULL it for negative keys if ( dataSize == 0 ) data = NULL; // see if one big table causes a browser slowdown if ( (++rcount % TABLE_ROWS) == 0 ) sb->safePrintf("<!--ignore--></table>%s",hdr); //if ( rdbId != RDB_LINKDB ) continue; // print dbname sb->safePrintf("<tr>"); char *dn = getDbnameFromId ( rdbId ); sb->safePrintf("<td>%s</td>",dn); if ( neg ) sb->safePrintf("<td>D</td>"); else sb->safePrintf("<td> </td>"); if ( shardByTermId ) sb->safePrintf("<td>shardByTermId</td>"); else sb->safePrintf("<td> </td>"); sb->safePrintf("<td><nobr>%s</nobr></td>", KEYSTR(k,ks)); if ( rdbId == RDB_POSDB ) { // get termid et al key144_t *k2 = (key144_t *)k; int64_t tid = g_posdb.getTermId(k2); //uint8_t score8 = g_posdb.getScore ( *k2 ); //uint32_t score32 = score8to32 ( score8 ); // sanity check if(dataSize!=0){char*xx=NULL;*xx=0;} sb->safePrintf("<td>" "termId=%020"UINT64" " //"score8=%03"UINT32" " //"score32=%010"UINT32"" "</td>" ,(uint64_t)tid //(int32_t)score8, //(int32_t)score32 ); } else if ( rdbId == RDB_DATEDB ) { // get termid et al key128_t *k2 = (key128_t *)k; int64_t tid = g_datedb.getTermId(k2); // use indexdb's function for this. should be the same uint8_t score8 = g_indexdb.getScore ( (char *)k ); int32_t date = g_datedb.getDate ( k2 ); uint32_t score32 = score8to32 ( score8 ); // sanity check if(dataSize!=0){char*xx=NULL;*xx=0;} sb->safePrintf("<td>" "termId=%020"UINT64" " "date=%010"UINT32" " "score8=%03"UINT32" " "score32=%010"UINT32"" "</td>", tid, date, (int32_t)score8, (int32_t)score32); } // key parsing logic from Sections.cpp::gotSectiondbList() else if ( rdbId == RDB_SECTIONDB ) { key128_t *k2 = (key128_t *)k; int32_t secType = g_indexdb.getScore ( (char *)k2); int32_t tagHash = g_datedb.getDate ( k2 ); int64_t tid = g_datedb.getTermId(k2); int64_t siteHash = tid; // not quite 64 bits SectionVote *sv = (SectionVote *)data; char *dd = "tagHash32"; if ( secType == SV_TAGCONTENTHASH ) dd ="tagcontentHash32"; if ( secType == SV_TAGPAIRHASH ) dd = "tagPairHash32"; // sanity check int32_t ds = sizeof(SectionVote); if (!neg&&dataSize!=ds){char*xx=NULL;*xx=0;} if ( neg&&dataSize!=0 ){char*xx=NULL;*xx=0;} float score = 0.0; float numSampled = 0.0; if ( data ) { score = sv->m_score; numSampled = sv->m_numSampled; } sb->safePrintf("<td>" "<nobr>" "siteHash48=0x%016"XINT64" " "%s=0x%08"XINT32" " "secType=%s " "score=%.02f " "numSampled=%.02f" "</nobr>" "</td>", siteHash, dd,tagHash, getSectionTypeAsStr(secType), score, numSampled); } else if ( rdbId == RDB_LINKDB ) { key224_t *k2 = (key224_t *)k; int64_t linkHash=g_linkdb.getLinkeeUrlHash64_uk(k2); int32_t linkeeSiteHash = g_linkdb.getLinkeeSiteHash32_uk(k2); int32_t linkerSiteHash = g_linkdb.getLinkerSiteHash32_uk(k2); char linkSpam = g_linkdb.isLinkSpam_uk (k2); int32_t siteRank = g_linkdb.getLinkerSiteRank_uk (k2); //int32_t hopCount = g_linkdb.getLinkerHopCount_uk (k2); //int32_t ip24 = g_linkdb.getLinkerIp24_uk (k2); int32_t ip32 = g_linkdb.getLinkerIp_uk (k2); int64_t docId = g_linkdb.getLinkerDocId_uk (k2); // sanity check if(dataSize!=0){char*xx=NULL;*xx=0;} sb->safePrintf("<td>" "<nobr>" "linkeeSiteHash32=0x%08"XINT32" " "linkeeUrlHash=0x%016"XINT64" " "linkSpam=%"INT32" " "siteRank=%"INT32" " //"hopCount=%03"INT32" " "sitehash32=0x%"XINT32" " 
"IP32=%s " "docId=%"UINT64"" "</nobr>" "</td>", linkeeSiteHash, linkHash, (int32_t)linkSpam, siteRank, //hopCount, linkerSiteHash, iptoa(ip32), docId); } else if ( rdbId == RDB_CLUSTERDB ) { key128_t *k2 = (key128_t *)k; char *r = (char *)k2; int32_t siteHash26 = g_clusterdb.getSiteHash26 ( r ); char lang = g_clusterdb.getLanguage ( r ); int64_t docId = g_clusterdb.getDocId ( r ); char ff = g_clusterdb.getFamilyFilter ( r ); // sanity check if(dataSize!=0){char*xx=NULL;*xx=0;} sb->safePrintf("<td>" // 26 bit site hash "siteHash26=0x%08"XINT32" " "family=%"INT32" " "lang=%03"INT32" " "docId=%"UINT64"" "</td>", siteHash26 , (int32_t)ff, (int32_t)lang, docId ); } // key parsing logic taken from Address::makePlacedbKey else if ( rdbId == RDB_PLACEDB ) { key128_t *k2 = (key128_t *)k; int64_t bigHash = g_placedb.getBigHash ( k2 ); int64_t docId = g_placedb.getDocId ( k2 ); int32_t snh = g_placedb.getStreetNumHash ( k2 ); //int32_t smallHash = g_placedb.getSmallHash ( k2 ); // sanity check if(!neg &&dataSize<=0){char*xx=NULL;*xx=0;} if( neg &&dataSize!=0){char*xx=NULL;*xx=0;} sb->safePrintf("<td><nobr>" "bigHash64=0x%016"XINT64" " "docId=%"UINT64" " "streetNumHash25=0x%08"XINT32" " "dataSize=%010"INT32" " "address=%s" "</nobr>" "</td>", bigHash, docId, snh, dataSize , data ); } // key parsing logic taken from Address::makePlacedbKey else if ( rdbId == RDB_SPIDERDB ) { sb->safePrintf("<td><nobr>"); key128_t *k2 = (key128_t *)k; if ( g_spiderdb.isSpiderRequest(k2) ) { SpiderRequest *sreq = (SpiderRequest *)rec; sreq->print ( sb ); } else { SpiderReply *srep = (SpiderReply *)rec; srep->print ( sb ); } sb->safePrintf("</nobr></td>"); } else if ( rdbId == RDB_DOLEDB ) { key_t *k2 = (key_t *)k; sb->safePrintf("<td><nobr>"); sb->safePrintf("priority=%"INT32" " "spidertime=%"UINT32" " "uh48=%"XINT64" " "isdel=%"INT32"", g_doledb.getPriority(k2), (uint32_t)g_doledb.getSpiderTime(k2), g_doledb.getUrlHash48(k2), g_doledb.getIsDel(k2)); sb->safePrintf("</nobr></td>"); } else if ( rdbId == RDB_TITLEDB ) { //XmlDoc tr; //SafeBuf tmp; //tr.set2 ( rec,recSize ,"qatest123",&tmp,m_niceness); // print each offset and size for the variable crap sb->safePrintf("<td><nobr>titlerec datasize=%"INT32" " //"sizeofxmldoc=%"INT32" " //"hdrSize=%"INT32" " //"version=%"INT32" " //"%s" "</nobr></td>", dataSize //(int32_t)sizeof(XmlDoc), //(int32_t)tr.m_headerSize, //(int32_t)tr.m_version, //tmp.getBufStart()); ); } //else if ( rdbId == RDB_REVDB ) { // sb->safePrintf("<td><nobr>revdb datasize=%"INT32" ", // dataSize); //} else if ( rdbId == RDB_TAGDB ) { Tag *tag = (Tag *)rec; sb->safePrintf("<td><nobr>"); if ( rec[0] & 0x01 ) tag->printToBuf(sb); else sb->safePrintf("negativeTagKey"); sb->safePrintf("</nobr></td>"); } else { char *xx=NULL;*xx=0; } // close it up sb->safePrintf("</tr>\n"); /* // hash the data into a int32_t for hash table char *ns = "no"; if ( noSplit ) ns = "yes"; char *del = ""; if ( neg ) del = " (delete)"; if ( ks==12 ) { key_t *k2 = (key_t *)k; int64_t tid = g_indexdb.getTermId(k2); uint8_t score8 = g_indexdb.getScore ( *k2 ); uint32_t score32 = score8to32 ( score8 ); log("build: key #%"INT32" rdb=%s ks=%"INT32" ds=%"INT32" " "tid=%"UINT64" score8=%"UINT32" score32=%"UINT32" nosplit=%s%s", count,getDbnameFromId(rdbId),(int32_t)ks, (int32_t)dataSize,tid ,(int32_t)score8,(int32_t)score32, ns,del); } else { log("build: key #%"INT32" rdb=%s ks=%"INT32" ds=%"INT32" " "nosplit=%s%s", count,getDbnameFromId(rdbId),(int32_t)ks, (int32_t)dataSize,ns,del); } */ } sb->safePrintf("</table>\n"); if ( sb == &tmp ) 
sb->print(); } bool XmlDoc::verifyMetaList ( char *p , char *pend , bool forDelete ) { CollectionRec *cr = getCollRec(); if ( ! cr ) return true; // do not do this if not test collection for now if ( strcmp(cr->m_coll,"qatest123") ) return true; log("xmldoc: VERIFYING METALIST"); // store each record in the list into the send buffers for ( ; p < pend ; ) { // breathe QUICKPOLL(m_niceness); // first is rdbId //char rdbId = -1; // m_rdbId; //if ( rdbId < 0 ) rdbId = *p++; uint8_t rdbId = *p++; // mask off rdbId rdbId &= 0x7f; // get the key of the current record //char *key = p; // negative key? bool del ; if ( *p & 0x01 ) del = false; else del = true; // must always be negative if deleteing // spiderdb is exempt because we add a spiderreply that is // positive and a spiderdoc // no, this is no longer the case because we add spider // replies to the index when deleting or rejecting a doc. //if ( m_deleteFromIndex && ! del && rdbId != RDB_SPIDERDB) { // char *xx=NULL;*xx=0; } // get the key size. a table lookup in Rdb.cpp. int32_t ks ; if ( rdbId == RDB_POSDB || rdbId == RDB2_POSDB2 ) { ks = 18; // no compress bits set! if ( p[0] & 0x06 ) { char*xx=NULL;*xx=0; } // alignment bit set or cleared if ( ! ( p[1] & 0x02 ) ) { char *xx=NULL;*xx=0; } if ( ( p[7] & 0x02 ) ) { char *xx=NULL;*xx=0; } int64_t docId = g_posdb.getDocId(p); if ( docId != m_docId && !cr->m_indexSpiderReplies) { log("xmldoc: %"INT64" != %"INT64"" , docId , m_docId ); char *xx=NULL;*xx=0; } // else // log("xmldoc: %"INT64" == %"INT64"" // , docId // , m_docId ); // uint64_t termId = g_posdb.getTermId(p); // if ( termId == 59194288760543LL ) { // log("xmldoc: debug"); // //char *xx=NULL;*xx=0; // } } else if ( rdbId == RDB_DATEDB ) ks = 16; else ks = getKeySizeFromRdbId ( rdbId ); // sanity if ( ks < 12 ) { char *xx=NULL;*xx=0; } if ( ks > MAX_KEY_BYTES ) { char *xx=NULL;*xx=0; } // another check Rdb *rdb = getRdbFromId(rdbId); if ( ! rdb ) { char *xx=NULL;*xx=0; } if ( rdb->m_ks < 12 || rdb->m_ks > MAX_KEY_BYTES ) { char *xx=NULL;*xx=0;} // special linkdb check //if ( rdbId == RDB_LINKDB ) { // // parse it up // key192_t *k = (key192_t *)p; // unsigned char hc = g_linkdb.getLinkerHopCount_uk(k); // if ( hc != 0 ){ char *xx=NULL;*xx=0; } //} char *rec = p; // set this //bool split = true; //if(rdbId == RDB_POSDB && g_posdb.isShardedByTermId(p) ) // split =false; // skip key p += ks; // . if key belongs to same group as firstKey then continue // . titledb now uses last bits of docId to determine groupId // . but uses the top 32 bits of key still // . spiderdb uses last 64 bits to determine groupId // . tfndb now is like titledb(top 32 bits are top 32 of docId) //uint32_t gid = getGroupId ( rdbId , key , split ); // get the record, is -1 if variable. a table lookup. int32_t dataSize; if ( rdbId == RDB_POSDB || rdbId==RDB2_POSDB2)dataSize=0; else if ( rdbId == RDB_DATEDB ) dataSize = 0; //else if ( rdbId == RDB_REVDB ) dataSize = -1; else if ( rdbId == RDB2_POSDB2 ) dataSize = 0; else if ( rdbId == RDB2_DATEDB2 ) dataSize = 0; //else if ( rdbId == RDB2_REVDB2 ) dataSize = -1; else dataSize = getDataSizeFromRdbId ( rdbId ); // . for delete never stores the data // . you can have positive keys without any dataSize member // when they normally should have one, like titledb if ( forDelete ) dataSize = 0; // . negative keys have no data // . 
this is not the case unfortunately if ( del ) dataSize = 0; // ensure spiderdb request recs have data/url in them if ( (rdbId == RDB_SPIDERDB || rdbId == RDB2_SPIDERDB2) && g_spiderdb.isSpiderRequest ( (SPIDERDBKEY *)rec ) && ! forDelete && ! del && dataSize == 0 ) { char *xx=NULL;*xx=0; } // if variable read that in if ( dataSize == -1 ) { // -1 means to read it in dataSize = *(int32_t *)p; // sanity check if ( dataSize < 0 ) { char *xx=NULL;*xx=0; } // skip dataSize p += 4; } // skip over the data, if any p += dataSize; // breach us? if ( p > pend ) { char *xx=NULL;*xx=0; } } // must be exactly equal to end if ( p != pend ) return false; return true; /* int32_t recSize = 0; int32_t count = 0; for ( ; p < pend ; p += recSize , count++ ) { // get rdbid char rdbId = *p & 0x7f; // get nosplit flag char noSplit = *p & 0x80; // skip p++; // get key size int32_t ks = getKeySizeFromRdbId ( rdbId ); // sanity if ( ks > 16 ) { char *xx=NULL;*xx=0;} // negative key? bool del; if ( *p & 0x01 ) del = false; else del = true; // convert into a key128_t, the biggest possible key char k[16]; gbmemcpy ( &k , p , ks ); // skip it p += ks; // flip this char split = ! noSplit; // test it g_hostdb.getGroupId(rdbId,k,split); // if negative, no data size allowed if ( ( k[0] & 0x01 ) == 0x00 ) continue; // get datasize int32_t dataSize = getDataSizeFromRdbId ( rdbId ); // no negative key has data if ( del ) dataSize = 0; // if -1, read it in if ( dataSize == -1 ) { dataSize = *(int32_t *)p; // sanity check if ( dataSize < 0 ) { char *xx=NULL;*xx=0; } p += 4; } // skip the data p += dataSize; } */ } bool XmlDoc::hashMetaList ( HashTableX *ht , char *p , char *pend , bool checkList ) { int32_t recSize = 0; int32_t count = 0; for ( ; p < pend ; p += recSize , count++ ) { // breathe QUICKPOLL(m_niceness); // get rdbid char rdbId = *p & 0x7f; // skip rdb id p++; // save that char *rec = p; // get key size int32_t ks = getKeySizeFromRdbId ( rdbId ); // sanity check if ( ks > 28 ) { char *xx=NULL;*xx=0; } // is it a delete key? char del ; if ( ( p[0] & 0x01 ) == 0x00 ) del = true; else del = false; // convert into a key128_t, the biggest possible key char k[MAX_KEY_BYTES];//key128_t k ; // zero out KEYMIN(k,MAX_KEY_BYTES); //k.setMin(); gbmemcpy ( k , p , ks ); // skip it p += ks; // if negative, no data size allowed -- no if ( del ) continue; // get datasize int32_t dataSize = getDataSizeFromRdbId ( rdbId ); // if -1, read it in if ( dataSize == -1 ) { dataSize = *(int32_t *)p; // sanity check if ( dataSize < 0 ) { char *xx=NULL;*xx=0; } p += 4; } // hash the data into a int32_t for hash table //int32_t h32 = 0; //h32 = hash32 ( p , dataSize ); // do not allow 0 //if ( h32 == 0 ) h32 = 1; // skip the data p += dataSize; // ignore spiderdb recs for parsing consistency check if ( rdbId == RDB_SPIDERDB ) continue; if ( rdbId == RDB2_SPIDERDB2 ) continue; // ignore tagdb as well! if ( rdbId == RDB_TAGDB || rdbId == RDB2_TAGDB2 ) continue; // skip revdb for now too //if ( rdbId == RDB_REVDB ) continue; // set our rec size, includes key/dataSize/data int32_t recSize = p - rec; // debug point //if ( *(uint64_t *)k == 4828936067112479745LL ) // log("hey"); // if just adding, do it if ( ! checkList ) { // we now store ptr to the rec, not hash! if ( ! 
ht->addKey ( k , &rec ) ) return false; continue; } // check to see if this rec is in the provided hash table int32_t slot = ht->getSlot ( k ); // bitch if not found if ( slot < 0 && ks==12 ) { key144_t *k2 = (key144_t *)k; int64_t tid = g_posdb.getTermId(k2); char shardByTermId = g_posdb.isShardedByTermId(k2); //uint8_t score8 = g_indexdb.getScore ( *k2 ); //uint32_t score32 = score8to32 ( score8 ); log("build: missing key #%"INT32" rdb=%s ks=%"INT32" ds=%"INT32" " "tid=%"UINT64" " "key=%s " //"score8=%"UINT32" score32=%"UINT32" " "shardByTermId=%"INT32"", count,getDbnameFromId(rdbId),(int32_t)ks, (int32_t)dataSize,tid , //(int32_t)score8,(int32_t)score32, KEYSTR(k2,ks), (int32_t)shardByTermId); // look it up // int16_tcut HashTableX *wt = m_wts; // point to keys, termids? //TermInfo **tp = (TermInfo **)wt->m_keys; // now print the table we stored all we hashed into for ( int32_t i = 0 ; i < wt->m_numSlots ; i++ ) { // skip if empty if ( wt->m_flags[i] == 0 ) continue; // get the TermInfo TermDebugInfo *ti; ti = (TermDebugInfo *)wt->getValueFromSlot(i); // skip if not us if((ti->m_termId & TERMID_MASK)!=tid)continue; // got us char *start = m_wbuf.getBufStart(); char *term = start + ti->m_termOff; char *prefix = ""; if ( ti->m_prefixOff >= 0 ) { prefix = start + ti->m_prefixOff; //prefix[ti->m_prefixLen] = '\0'; } // NULL term it term[ti->m_termLen] = '\0'; // print it log("parser: term=%s prefix=%s",//score32=%"INT32"", term,prefix);//,(int32_t)ti->m_score32); } char *xx=NULL; *xx=0; // count it for PageStats.cpp g_stats.m_parsingInconsistencies++; continue; } if ( slot < 0 && ks != 12 ) { // if it is sectiondb and the orig doc did not // add sectiondb recs because m_totalSiteVoters >= // MAX_SITE_VOTERS, then that is ok! if ( (rdbId == RDB_SECTIONDB || rdbId == RDB2_SECTIONDB2 ) && m_sectionsValid && m_sections.m_totalSiteVoters >= MAX_SITE_VOTERS ) continue; log("build: missing key #%"INT32" rdb=%s ks=%"INT32" ds=%"INT32" " "ks=%s " ,count,getDbnameFromId(rdbId),(int32_t)ks, (int32_t)dataSize,KEYSTR(k,ks)); char *xx=NULL; *xx=0; // count it for PageStats.cpp g_stats.m_parsingInconsistencies++; continue; } // if in there, check the hashes //int32_t h2 = *(int32_t *)ht->getValueFromSlot ( slot ); char *rec2 = *(char **)ht->getValueFromSlot ( slot ); // get his dataSize int32_t dataSize2 = getDataSizeFromRdbId(rdbId); // his keysize int32_t ks2 = getKeySizeFromRdbId(rdbId); // get his recsize int32_t recSize2 = ks2 ; // if -1 that is variable if ( dataSize2 == -1 ) { dataSize2 = *(int32_t *)(rec2+ks2); recSize2 += 4; } // add it up recSize2 += dataSize2; // keep on chugging if they match if ( recSize2==recSize && !memcmp(rec,rec2,recSize) ) continue; // otherwise, bitch char shardByTermId = false; if ( rdbId == RDB_POSDB ) shardByTermId = g_posdb.isShardedByTermId(rec2); log("build: data not equal for key=%s " "rdb=%s splitbytermid=%"INT32" dataSize=%"INT32"", KEYSTR(k,ks2), getDbnameFromId(rdbId),(int32_t)shardByTermId,dataSize); // print into here SafeBuf sb1; SafeBuf sb2; // print it out if ( rdbId == RDB_SPIDERDB ) { // get rec if ( g_spiderdb.isSpiderRequest((key128_t *)rec) ) { SpiderRequest *sreq1 = (SpiderRequest *)rec; SpiderRequest *sreq2 = (SpiderRequest *)rec2; sreq1->print(&sb1); sreq2->print(&sb2); } else { SpiderReply *srep1 = (SpiderReply *)rec; SpiderReply *srep2 = (SpiderReply *)rec2; srep1->print(&sb1); srep2->print(&sb2); } log("build: rec1=%s",sb1.getBufStart()); log("build: rec2=%s",sb2.getBufStart()); } char *xx=NULL; *xx=0; // count it for PageStats.cpp 
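			// (when a posdb key goes missing, the block above dumps
			//  every term in m_wts whose termId matches so the
			//  divergent hashing pass can be identified before the
			//  deliberate NULL-write abort; the counter below is
			//  what PageStats.cpp reports.)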
g_stats.m_parsingInconsistencies++; } return true; } /* bool checkRegex ( SafeBuf *regex , char *target , bool *boolVal , bool *boolValValid , int32_t *compileError , CollectionRec *cr ) { if ( compileError ) *compileError = false; if ( *boolValValid ) return *boolVal; // if not using diffbot or there is no regex, it is ok if ( regex->length() <= 0 ) { *boolVal = true; *boolValValid = true; return boolVal; } // whip out the regex shit i guess... regex_t buf; // this will store the compiled regular expression into "buf" int32_t ret = regcomp ( &buf , // the actual regular expression to compile regex->getBufStart() , // some flags REG_EXTENDED|REG_ICASE|REG_NEWLINE|REG_NOSUB); if ( ret ) { //g_errno = ret; if ( compileError ) *compileError = errno; log("xmldoc: diffbot regcomp failed: %s. This should have " "been tested before crawl was started. Ignoring.", mstrerror(errno)); return true; } // now see if it is a match if ( regexec(&buf,target,0,NULL,0) ) *boolVal = true; else *boolVal = false; *boolValValid = true; return boolVal; } */ // . should we send this url off to diffbot or processing? // . if the url's downloaded content does not match the provided regex // in m_diffbotPageProcessPattern, then we do not send the url to diffbot // for processing // . make sure this regex is pre-tested before starting the crawl // so we know it compiles bool XmlDoc::doesUrlMatchDiffbotCrawlPattern() { if ( m_matchesCrawlPatternValid ) return m_matchesCrawlPattern; CollectionRec *cr = getCollRec(); if ( ! cr ) return true; // get the compiled regular expressions regex_t *ucr = &cr->m_ucr; if ( ! cr->m_hasucr ) ucr = NULL; if ( ! m_firstUrlValid ) return false; m_matchesCrawlPatternValid = true; m_matchesCrawlPattern = false; Url *furl = getFirstUrl(); char *url = furl->getUrl(); // if we had a url crawl regex then regexec will return non-zero // if our url does NOT match i guess if ( ucr && regexec(ucr,url,0,NULL,0) ) return false; // int16_tcut char *ucp = cr->m_diffbotUrlCrawlPattern.getBufStart(); if ( ucp && ! ucp[0] ) ucp = NULL; // do not require a match on ucp if ucr is given if ( ucp && ! ucr && ! doesStringContainPattern(url,ucp) ) return false; m_matchesCrawlPattern = true; return true; } /* bool XmlDoc::doesUrlMatchDiffbotProcessPattern() { return checkRegex ( &cr->m_diffbotUrlProcessPattern , m_firstUrl.m_url , &m_diffbotUrlProcessPatternMatch, &m_diffbotUrlProcessPatternMatchValid, NULL, cr); } bool XmlDoc::doesPageContentMatchDiffbotProcessPattern() { if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; } return checkRegex ( &cr->m_diffbotPageProcessPattern , ptr_utf8Content, &m_diffbotPageProcessPatternMatch, &m_diffbotPageProcessPatternMatchValid, NULL, cr); } */ bool XmlDoc::doesPageContentMatchDiffbotProcessPattern() { if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; } CollectionRec *cr = getCollRec(); if ( ! cr ) return false; char *p = cr->m_diffbotPageProcessPattern.getBufStart(); // empty? no pattern matches everything. if ( ! p ) return true; // how many did we have? return doesStringContainPattern ( m_content , p ); } int32_t *XmlDoc::reindexJSONObjects ( int32_t *newTitleHashes, int32_t numNewHashes ) { return redoJSONObjects (newTitleHashes,numNewHashes,false ); } int32_t *XmlDoc::nukeJSONObjects ( int32_t *newTitleHashes , int32_t numNewHashes ) { return redoJSONObjects (newTitleHashes,numNewHashes,true ); } // . returns ptr to status // . diffbot uses this to remove the indexed json pages associated with // a url. each json object is basically its own url. 
a json object // url is the parent page's url with a -diffbotxyz-%"UINT32" appended to it // where %"INT32" is the object # starting at 0 and incrementing from there. // . XmlDoc::m_diffbotJSONCount is how many json objects the parent url had. int32_t *XmlDoc::redoJSONObjects ( int32_t *newTitleHashes , int32_t numNewHashes , bool deleteFromIndex ) { // use this static int32_t s_return = 1; // if none, we are done if ( m_diffbotJSONCount <= 0 ) return &s_return; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // i was trying to re-index some diffbot json docs in the global // index but it wasn't set as custom crawl //if ( ! cr->m_isCustomCrawl ) return &s_return; // already did it? if ( m_joc >= m_diffbotJSONCount ) return &s_return; // new guy here if ( ! m_dx ) { try { m_dx = new ( XmlDoc ); } catch ( ... ) { g_errno = ENOMEM; log("xmldoc: failed to alloc m_dx"); return NULL; } mnew ( m_dx , sizeof(XmlDoc),"xmldocdx"); } // // index the hashes of the latest diffbot json items for this parent // HashTableX dedup; if ( ! dedup.set(4,0,numNewHashes*4,NULL,0,false,m_niceness,"njodt") ) return NULL; for ( int32_t i = 0 ; i < numNewHashes ; i++ ) dedup.addKey ( &newTitleHashes[i] ); // get this old doc's current title hashes int32_t numOldHashes; int32_t *oldTitleHashes = getDiffbotTitleHashes ( &numOldHashes ); // sanity. should return right away without having to block if ( oldTitleHashes == (void *)-1 ) { char *xx=NULL;*xx=0; } // sanity again if ( numOldHashes != m_diffbotJSONCount ) { char *xx=NULL;*xx=0; } // scan down each for ( ; m_joc < m_diffbotJSONCount ; ) { // only NUKE the json items for which title hashes we lost int32_t th32 = oldTitleHashes[m_joc]; // . if still in the new diffbot reply, do not DELETE!!! // . if there was no title, it uses hash of entire object if ( deleteFromIndex && dedup.isInTable(&th32) ) { m_joc++; continue; } // if m_dx has no url set, call set4 i guess if ( ! m_dx->m_firstUrlValid ) { // make the fake url for this json object for indexing SafeBuf fakeUrl; fakeUrl.set ( m_firstUrl.getUrl() ); // get his title hash32 //int32_t jsonTitleHash32 = titleHashes[m_joc]; // append -diffbotxyz%"UINT32" for fake url fakeUrl.safePrintf("-diffbotxyz%"UINT32"", (uint32_t)th32); // set url of new xmldoc if ( ! m_dx->set1 ( fakeUrl.getBufStart(), cr->m_coll , NULL , // pbuf m_niceness ) ) // g_errno should be set! return NULL; // we are indexing json objects, don't use all these m_dx->m_useClusterdb = false; m_dx->m_useSpiderdb = false; m_dx->m_useTagdb = false; m_dx->m_usePlacedb = false; m_dx->m_useLinkdb = false; m_dx->m_isChildDoc = true; // are we doing a query reindex or a nuke? m_dx->m_deleteFromIndex = deleteFromIndex;//true; // do not try to download this url if ( ! deleteFromIndex ) m_dx->m_recycleContent = true; // we need this because only m_dx->m_oldDoc will // load from titledb and have it set m_dx->m_isDiffbotJSONObject = true; // for debug char *str = "reindexing"; if ( deleteFromIndex ) str = "nuking"; log("xmldoc: %s %s",str,fakeUrl.getBufStart()); } // when the indexdoc completes, or if it blocks, call us! // we should just pass through here m_dx->setCallback ( m_masterState , m_masterLoop ); // . this should ultimately load from titledb and not // try to download the page since m_deleteFromIndex is // set to true // . if m_dx got its msg4 reply it ends up here, in which // case do NOT re-call indexDoc() so check for // m_listAdded. if ( ! m_dx->m_listAdded && ! 
		     m_dx->indexDoc ( ) ) return (int32_t *)-1;
		// critical error on our part trying to index it?
		// does not include timeouts or 404s, etc. mostly just
		// OOM errors.
		if ( g_errno ) return NULL;
		// count as deleted
		cr->m_localCrawlInfo.m_objectsDeleted++;
		cr->m_globalCrawlInfo.m_objectsDeleted++;
		cr->m_needsSave = true;
		// but gotta set this crap back
		//log("diffbot: resetting %s",m_dx->m_firstUrl.m_url);
		// clear for next guy if there is one. clears
		// m_dx->m_contentValid so the set4() can be called again above
		m_dx->reset();
		// try to do more json objects indexed from this parent doc
		m_joc++;
	}

	// nuke it
	mdelete ( m_dx , sizeof(XmlDoc), "xddx" );
	delete ( m_dx );
	m_dx = NULL;

	return &s_return;
}

void getMetaListWrapper ( void *state ) {
	XmlDoc *THIS = (XmlDoc *)state;
	// make sure has not been freed from under us!
	if ( THIS->m_freed ) { char *xx=NULL;*xx=0;}
	// note it
	THIS->setStatus ( "in get meta list wrapper" );
	// get it
	char *ml = THIS->getMetaList ( );
	// sanity check
	if ( ! ml && ! g_errno ) {
		log("doc: getMetaList() returned NULL without g_errno");
		sleep(5);
		char *xx=NULL;*xx=0;
	}
	// return if it blocked
	if ( ml == (void *)-1 ) return;
	// sanity check
	if ( THIS->m_callback1 == getMetaListWrapper ) { char *xx=NULL;*xx=0;}
	// otherwise, all done, call the caller callback
	if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state );
	else THIS->m_callback2 ( THIS->m_state );
}

XmlDoc *g_od = NULL;

// . returns NULL and sets g_errno on error
// . make a meta list to call Msg4::addMetaList() with
// . called by Msg14.cpp
// . a meta list is just a buffer of Rdb records of the following format:
//   rdbid | rdbRecord
// . meta list does not include title rec since Msg14 adds that using Msg1
// . returns false and sets g_errno on error
// . sets m_metaList ptr and m_metaListSize
// . if "deleteIt" is true, we are a delete op on "old"
// . returns (char *)-1 if it blocks and will call your callback when done
// . generally only Repair.cpp changes these use* args to false
char *XmlDoc::getMetaList ( bool forDelete ) {

	if ( m_metaListValid ) return m_metaList;

	setStatus ( "getting meta list" );

	// force it true?
	// "forDelete" means we want the metalist to consist of "negative"
	// keys that will annihilate with the positive keys in the index,
	// posdb and the other rdbs, in order to delete them. "deleteFromIndex"
	// means to just call getMetaList(true) on the m_oldDoc (old XmlDoc)
	// which is built from the titlerec in Titledb. so don't confuse
	// these two things. otherwise when i add this we were not adding
	// the spiderreply of "Doc Force Deleted" from doing a query reindex
	// and it kept repeating every time we started gb up.
	//if ( m_deleteFromIndex ) forDelete = true;

	// assume valid
	m_metaList = "";
	m_metaListSize = 0;

	// . internal callback
	// . so if any of the functions we end up calling directly or
	//   indirectly block, this callback will be called
	if ( ! m_masterLoop ) {
		m_masterLoop  = getMetaListWrapper;
		m_masterState = this;
	}

	// returning from a handler that had an error?
	if ( g_errno ) return NULL;

	// any other indexing issue? hey! g_errno might not be set here
	//if ( m_indexCode ) { g_errno = m_indexCode; return NULL; }

	// a hacky thing
	//XmlDoc *od = (XmlDoc *)1;

	//bool diffbotEmptyReply = false;

	/*
	// fake this for diffbot?
	if ( m_useDiffbot && ! m_isDiffbotJSONObject && !
doesUrlMatchDiffbotCrawlPattern() ) { // flag it so we only add the SpiderReply to spiderdb and bail //diffbotEmptyReply = true; // we should not delete the json objects for this url // from the index just because the user decided to remove // it from her crawl m_isIndexedValid = true; m_isIndexed = false; m_oldDocValid = true; m_oldDoc = NULL; } */ // if "rejecting" from index fake all this stuff if ( m_deleteFromIndex ) { // if we are using diffbot api and diffbot found no json objects // or we never even processed the url, we really just want to // add the SpiderReply for this url to spiderdb and nothing more. // NO! we still want to store the page content in titledb // so we can see if it has changed i guess //diffbotEmptyReply ) { // set these things to bogus values since we don't need them m_contentHash32Valid = true; m_contentHash32 = 0; m_httpStatusValid = true; m_httpStatus = 200; m_siteValid = true; ptr_site = "www.poopoo.com"; size_site = gbstrlen(ptr_site)+1; m_isSiteRootValid = true; m_isSiteRoot2 = 1; //m_tagHash32Valid = true; //m_tagHash32 = 0; m_tagPairHash32Valid = true; m_tagPairHash32 = 0; m_siteHash64Valid = true; m_siteHash64 = 0LL; m_spiderLinksValid = true; m_spiderLinks2 = 1; m_langIdValid = true; m_langId = 1; m_siteNumInlinksValid = true; m_siteNumInlinks = 0; m_isIndexed = true; m_isIndexedValid = true; m_ipValid = true; m_ip = 123456; } CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // // BEGIN MULTI DOC QUERY REINDEX HACK // // this fixes it so we can do a query reindex on fake child urls // of their original parent multidoc url. the child urls are // subsections of the original parent url that were indexed as // separate documents with their own docid. if we try to do a // query reindex on such things, detect it, and add the request // for the original parent multidoc url. // if ( m_sreqValid && m_sreq.m_isPageReindex && // if it is a force delete, then allow the user to delete // such diffbot reply json children documents, however. ! m_sreq.m_forceDelete ) { // see if its diffbot json object XmlDoc **pod = getOldXmlDoc ( ); if ( ! pod || pod == (XmlDoc **)-1 ) return (char *)pod; XmlDoc *od = *pod; // if no old doc then we might have just been a diffbot // json url that was directly injected into GLOBAL-INDEX // like xyz.com/-diffbotxyz12345 (my format) or if ( ! od ) goto skip9; // if we are indexing a subdoc piece of a multidoc url // then parentUrl should return non-NULL char *parentUrl = getDiffbotParentUrl(od->m_firstUrl.m_url); if ( ! parentUrl && od->m_contentType != CT_STATUS ) goto skip9; // in that case we need to reindex the parent url not the // subdoc url, so make the spider reply gen quick //SpiderReply *newsr = od->getFakeSpiderReply(); //if ( ! newsr || newsr == (void *)-1 ) return (char *)newsr; // use our ip though //newsr->m_firstIp = od->m_firstIp; // however we have to use our docid-based spider request SpiderReply srep; srep.reset(); // it MUST match up with original spider request so the // lock key in Spider.cpp can unlock it. that lock key // uses the "uh48" (48bit hash of the url) and "srep.m_firstIp" // in this case the SpiderRequest, sreq, is docid-based because // it was added through PageReindex.cpp (query reindex) so // it will be the 48 bit hash64b() of the docid // (see PageReindex.cpp)'s call to SpiderRequest::setKey() srep.m_firstIp = m_sreq.m_firstIp; // assume no error srep.m_errCount = 0; // do not inherit this one, it MIGHT HAVE CHANGE! 
srep.m_siteHash32 = m_sreq.m_siteHash32; srep.m_domHash32 = m_sreq.m_domHash32; srep.m_spideredTime = getTimeGlobal(); int64_t uh48 = m_sreq.getUrlHash48(); int64_t parentDocId = 0LL; srep.m_contentHash32 = 0; // were we already in titledb before we started spidering? // yes otherwise we would have called "goto skip9" above srep.m_wasIndexed = 1; srep.m_wasIndexedValid = 1; srep.m_isIndexed = 1; srep.m_isIndexedINValid = false; srep.m_errCode = EREINDEXREDIR; // indexCode srep.m_downloadEndTime = 0; srep.setKey ( srep.m_firstIp, parentDocId , uh48 , false ); // lock of request needs to match that of reply so the // reply, when recevied by Rdb.cpp which calls addSpiderReply() // can unlock this url so it can be spidered again. int64_t lock1 = makeLockTableKey(&m_sreq); int64_t lock2 = makeLockTableKey(&srep); if ( lock1 != lock2 ) { char *xx=NULL;*xx=0; } // make a fake spider reply so this docid-based spider // request is not used again //SpiderReply srep; // store the rdbid char rd = RDB_SPIDERDB; if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2; if ( ! m_zbuf.pushChar(rd) ) return NULL; // store that reply to indicate this spider request has // been fulfilled! if( ! m_zbuf.safeMemcpy (&srep, srep.getRecSize())) return NULL; // but also store a new spider request for the parent url SpiderRequest ksr; int64_t pd; // skip if doc is a spider status "document". their docids // often get added during a query reindex but we should ignore // them completely. if ( od->m_contentType == CT_STATUS ) goto returnList; //goto returnList; // complain if ( cr->m_diffbotApiUrl.length()<1 && !cr->m_isCustomCrawl ) log("build: doing query reindex but diffbot api " "url is not set in spider controls"); // just copy original request gbmemcpy ( &ksr , &m_sreq , m_sreq.getRecSize() ); // do not spider links, it's a page reindex of a multidoc url ksr.m_avoidSpiderLinks = 1; // avoid EDOCUNCHANGED ksr.m_ignoreDocUnchangedError = 1; // no longer docid based we set it to parentUrl ksr.m_urlIsDocId = 0; // but consider it a manual add. this should already be set. ksr.m_isPageReindex = 1; // but it is not docid based, so overwrite the docid // in ksr.m_url with the parent multidoc url. it \0 terms it. strcpy(ksr.m_url , parentUrl );//, MAX_URL_LEN-1); // this must be valid //if ( ! od->m_firstIpValid ) { char *xx=NULL;*xx=0; } // set the key, ksr.m_key. isDel = false // fake docid pd = g_titledb.getProbableDocId(parentUrl); ksr.setKey ( m_sreq.m_firstIp, pd , false ); // store this if ( ! m_zbuf.pushChar(rd) ) return NULL; // then the request if ( ! m_zbuf.safeMemcpy(&ksr,ksr.getRecSize() ) ) return NULL; returnList: // prevent cores in indexDoc() m_indexCode = EREINDEXREDIR; m_indexCodeValid = true; // for now we set this crap m_metaList = m_zbuf.getBufStart(); m_metaListSize = m_zbuf.length(); m_metaListValid = true; return m_metaList; } // // END DIFFBOT OBJECT QUERY REINDEX HACK // skip9: // get our checksum int32_t *plainch32 = getContentHash32(); if ( ! plainch32 || plainch32 == (void *)-1 ) return (char *)plainch32; // get this too int16_t *hs = getHttpStatus (); if ( ! hs || hs == (void *)-1 ) return (char *)hs; // make sure site is valid char *site = getSite(); if ( ! site || site == (void *)-1 ) return (char *)site; // this seems to be an issue as well for "unchanged" block below char *isr = getIsSiteRoot(); if ( ! isr || isr == (void *)-1 ) return (char *)isr; // get hash of all tags from tagdb that affect what we index //int32_t *tagHash = getTagHash32 ( ); //if ( ! 
tagHash || tagHash == (void *)-1 ) return (char *)tagHash; int64_t *sh64 = getSiteHash64(); if ( ! sh64 || sh64 == (void *)-1 ) return (char *)sh64; // make sure docid valid int64_t *mydocid = getDocId(); if ( ! mydocid || mydocid == (int64_t *)-1) return (char *)mydocid; // . get the old version of our XmlDoc from the previous spider time // . set using the old title rec in titledb // . should really not do any more than set m_titleRec... // . should not even uncompress it! // . getNewSpiderReply() will use this to set the reply if // m_indexCode == EDOCUNCHANGED... XmlDoc **pod = getOldXmlDoc ( ); if ( ! pod || pod == (XmlDoc **)-1 ) return (char *)pod; // point to the old xml doc if no error, etc. XmlDoc *od = *pod; // check if we are already indexed char *isIndexed = getIsIndexed (); if ( ! isIndexed || isIndexed == (char *)-1 ) return (char *)isIndexed; // do not delete anything in these cases, but do remove us from // spiderdb, and from tfndb (except for EDOCNOTNEW) //if ( m_indexCode == EDOCNOTNEW || m_indexCode == EDOCNOTOLD ) // od = NULL; // why call this way down here? it ends up downloading the doc! int32_t *indexCode = getIndexCode(); if ( ! indexCode || indexCode ==(void *)-1) return (char *)indexCode; // sanity check if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; } // this means to abandon the injection if ( *indexCode == EABANDONED || *indexCode == EHITCRAWLLIMIT || *indexCode == EHITPROCESSLIMIT ) { m_metaList = (char *)0x123456; m_metaListSize = 0; m_metaListValid = true; return m_metaList; } // if diffbot reply is empty, don't bother adding anything except // for the spider reply... reply might be "-1" too! //if ( m_useDiffbot && // ! m_isDiffbotJSONObject && // m_diffbotReplyValid && // m_diffbotReply.length() <= 3 ) // diffbotEmptyReply = true; // . some index code warrant retries, like EDNSTIMEDOUT, ETCPTIMEDOUT, // etc. these are deemed temporary errors. other errors basically // indicate a document that will never be indexable and should, // if currently indexed, be deleted. // . just add the spider reply and we're done if ( *indexCode == EDNSTIMEDOUT || *indexCode == ETCPTIMEDOUT || *indexCode == EUDPTIMEDOUT || *indexCode == EDNSDEAD || *indexCode == ENETUNREACH || *indexCode == EHOSTUNREACH // . rejected from a diffbot regex url crawl filter? // . or no json objects returned from diffbot? // . or rejected from the processign regex filter? // . then just add the SpiderReply to avoid respidering // . NO! still need to add outlinks //|| diffbotEmptyReply // . treat this as a temporary error i guess // . getNewSpiderReply() below will clear the error in it and // copy stuff over from m_sreq and m_oldDoc for this case || *indexCode == EDOCUNCHANGED ) { // sanity - in repair mode? if ( m_useSecondaryRdbs ) { char *xx=NULL;*xx=0; } // . this seems to be an issue for blocking // . if we do not have a valid ip, we can't compute this, // in which case it will not be valid in the spider reply // . why do we need this for timeouts etc? if the doc is // unchanged // we should probably update its siteinlinks in tagdb // periodically and reindex the whole thing... // . i think we were getting the sitenuminlinks for // getNewSpiderReply() if ( m_ipValid && m_ip != 0 && m_ip != -1 ) { int32_t *sni = getSiteNumInlinks(); if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni; } // all done! bool addReply = true; // Scraper.cpp uses this if ( m_sreqValid && m_sreq.m_isScraping ) addReply = false; // page parser calls set4 and sometimes gets a dns time out! 
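		// (note on the metalist built below for this early-exit path:
		//  it is just the optional spider-status doc sub-list followed
		//  by a one-byte rdbId and the SpiderReply record itself, i.e.
		//      [spiderStatusDocMetaList][rdbId byte][SpiderReply rec]
		//  which is why needx is sizeof(SpiderReply)+1 plus the
		//  sub-list length.)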
if ( m_sreqValid && m_sreq.m_isPageParser ) addReply = false; // return nothing if done if ( ! addReply ) { m_metaListSize = 0; m_metaList = (char *)0x1; return m_metaList; } // save this int32_t savedCode = *indexCode; // before getting our spider reply, assign crap from the old // doc to us since we are unchanged! this will allow us to // call getNewSpiderReply() without doing any processing, like // setting the Xml or Words classes, etc. copyFromOldDoc ( od ); // need this though! i don't want to print out "Success" // in the log in the logIt() function m_indexCode = savedCode; m_indexCodeValid = true; // but set our m_contentHash32 from the spider request // which got it from the spiderreply in the case of // EDOCUNCHANGED. this way ch32=xxx will log correctly. // I think this is only when EDOCUNCHANGED is set in the // Msg13.cpp code, when we have a spider compression proxy. if ( *indexCode == EDOCUNCHANGED && m_sreqValid && ! m_contentHash32Valid ) { m_contentHash32 = m_sreq.m_contentHash32; m_contentHash32Valid = true; } // we need these got getNewSpiderReply() m_wasInIndex = false; if ( od ) m_wasInIndex = true; m_isInIndex = m_wasInIndex; m_wasInIndexValid = true; m_isInIndexValid = true; // unset our ptr_linkInfo1 so we do not free it and core // since we might have set it in copyFromOldDoc() above ptr_linkInfo1 = NULL; size_linkInfo1 = 0; m_linkInfo1Valid = false; // . if not using spiderdb we are done at this point // . this happens for diffbot json replies (m_dx) if ( ! m_useSpiderdb ) { m_metaList = NULL; m_metaListSize = 0; return (char *)0x01; } // get our spider reply SpiderReply *newsr = getNewSpiderReply(); // return on error if ( ! newsr ) return (char *)newsr; // . panic on blocking! this is supposed to be fast! // . it might still have to lookup the tagdb rec????? if ( newsr == (void *)-1 ) { char *xx=NULL;*xx=0; } // how much we need int32_t needx = sizeof(SpiderReply) + 1; // . INDEX SPIDER REPLY (1a) // . index ALL spider replies as separate doc. error or not. // . then print out error histograms. // . we should also hash this stuff when indexing the // doc as a whole // i guess it is safe to do this after getting the spiderreply SafeBuf *spiderStatusDocMetaList = NULL; if ( cr->m_indexSpiderReplies && m_useSpiderdb && // doing it for diffbot throws off smoketests. // yeah, but we need it, so we'll just have to update // the smoketests ! cr->m_isCustomCrawl ) { // get the spiderreply ready to be added spiderStatusDocMetaList = getSpiderStatusDocMetaList ( newsr ); // error? if ( ! spiderStatusDocMetaList ) return NULL; // blocked? if (spiderStatusDocMetaList==(void *)-1) return (char *)-1; // need to alloc space for it too int32_t len = spiderStatusDocMetaList->length(); needx += len; // this too m_addedStatusDocSize = len; m_addedStatusDocSizeValid = true; } // doledb key? //if ( m_doledbKey.n0 || m_doledbKey.n1 ) // needx += 1 + sizeof(key_t); // + 4; // the FAKEDB unlock key for msg12 in spider.cpp //needx += 1 + sizeof(key_t); // FAKEDB // make the buffer m_metaList = (char *)mmalloc ( needx , "metalist"); if ( ! 
m_metaList ) return NULL; // save size for freeing later m_metaListAllocSize = needx; // ptr and boundary m_p = m_metaList; m_pend = m_metaList + needx; // save it char *saved = m_p; // first store spider reply "document" if ( spiderStatusDocMetaList ) { gbmemcpy ( m_p, spiderStatusDocMetaList->getBufStart(), spiderStatusDocMetaList->length() ); m_p += spiderStatusDocMetaList->length(); } /* Not any more, now we remove from doledb as soon as we get all the lock grants in our group (shard) using Msg4 in Spider.cpp. That way we can add a "0" entry into the waiting tree (or a time X ms into the future from now) to try to enforce a sameIpWait constraint and also allow up to maxSpidersPerIP. // remove from doledb if we had a valid key // (BEFORE adding SpiderReply) if ( m_doledbKey.n0 || m_doledbKey.n1 ) { // note it setStatus ( "removing key from doledb" ); // . now remove the original spider rec from "doledb" // . rdbid first *m_p = RDB_DOLEDB; m_p++; // then the key *(key_t *)m_p = m_doledbKey; // nukey, clear del bit to delete it *m_p = *m_p & 0xfe; // skip key m_p += sizeof(key_t); // then zero for data size // *(int32_t *)m_p = 0; //m_p += 4; // sanity check verifyMetaList( m_metaList , m_p , forDelete ); } */ // sanity check if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; } // . make a fake titledb key // . remove the spider lock (Msg12 in Spider.cpp) // . now SPider.cpp uses SpiderReply reception to remove lock // - mdw 9/28/13 //*m_p++ = RDB_FAKEDB; //*(key_t *)m_p = g_titledb.makeKey ( m_docId , 0LL , true ); //key_t fakeKey; //fakeKey.n1 = 0; //fakeKey.n0 = m_docId; //gbmemcpy ( m_p , &fakeKey , sizeof(key_t) ); //m_p += sizeof(key_t); // now add the new rescheduled time setStatus ( "adding SpiderReply to spiderdb" ); // rdbid first char rd = RDB_SPIDERDB; if ( m_useSecondaryRdbs ) rd = RDB2_SPIDERDB2; *m_p++ = rd; // get this if ( ! m_srepValid ) { char *xx=NULL;*xx=0; } // store the spider rec int32_t newsrSize = newsr->getRecSize(); gbmemcpy ( m_p , newsr , newsrSize ); m_p += newsrSize; m_addedSpiderReplySize = newsrSize; m_addedSpiderReplySizeValid = true; // sanity check if ( m_p - saved != needx ) { char *xx=NULL;*xx=0; } // sanity check verifyMetaList( m_metaList , m_p , forDelete ); // verify it m_metaListValid = true; // set size m_metaListSize = m_p - m_metaList; // all done return m_metaList; } // if using diffbot do not index the content of the web page we // got the json objects from, although, do keep it cached in titledb // because that can be useful // Not any more, now index the pages as well! then restrict search // to type:json to just search json objects. //if ( m_useDiffbot && ! m_isDiffbotJSONObject ) { // m_usePosdb = false; // m_useClusterdb = false; //} // get the old meta list if we had an old doc char *oldList = NULL; int32_t oldListSize = 0; if ( od ) { od->m_useSpiderdb = false; od->m_useTagdb = false; // do not use diffbot for old doc since we call // od->nukeJSONObjects below() od->m_diffbotApiUrlValid = true; // api url should be empty by default //od->m_diffbotApiNum = DBA_NONE; //log("break it here. shit this is not getting the list!!!"); // if we are doing diffbot stuff, we are still indexing this // page, so we need to get the old doc meta list oldList = od->getMetaList ( true ); oldListSize = od->m_metaListSize; if ( ! oldList || oldList ==(void *)-1) return (char *)oldList; } // . set whether we should add recs to titledb, posdb, linkdb, etc. // . if this doc is set by titlerec we won't change these // . we only turn off m_usePosdb, etc. 
	// if there is a <meta name=noindex content=1>
	// . we will still add to spiderdb, but not posdb, linkdb, titledb
	//   and clusterdb.
	// . so we'll add the spiderreply for this doc and the spiderrequests
	//   for all outlinks and "firstIp" tagrecs to tagdb for those outlinks
	// . we use this for adding the url seed file gbdmoz.urls.txt
	//   which contains a list of all the dmoz urls we want to spider.
	//   gbdmoz.urls.txt is generated by dmozparse.cpp. we spider all
	//   these dmoz urls so we can search the CONTENT of the pages in dmoz,
	//   something dmoz won't let you do.
	char *mt = hasNoIndexMetaTag();
	if ( ! mt || mt == (void *)-1 ) return (char *)mt;
	if ( *mt ) {
		m_usePosdb     = false;
		m_useLinkdb    = false;
		m_useTitledb   = false;
		m_useClusterdb = false;
		// do not add the "firstIp" tagrecs of the outlinks any more
		// because it might hurt us?
		m_useTagdb = false;
	}

	if ( cr->m_isCustomCrawl ) m_useLinkdb = false;

	// . should we recycle the diffbot reply for this url?
	// . if m_diffbotOnlyProcessIfNewUrl is true then we want to keep
	//   our existing diffbot reply, i.e. recycle it, even though we
	//   respidered this page.
	bool *recycle = getRecycleDiffbotReply();
	if ( ! recycle || recycle == (void *)-1) return (char *)recycle;

	// in that case inherit this from the old doc...
	if ( od && *recycle && od->m_diffbotJSONCount &&
	     // somehow i am seeing that this is empty!
	     // this is how many title hashes of diffbot replies we've
	     // stored in the old doc's titlerec. if these are not equal
	     // and we call reindexJSONObjects() below then it cores
	     // in redoJSONObjects().
	     od->size_linkInfo2/4 == od->m_diffbotJSONCount &&
	     // only call this once otherwise we double stock
	     // m_diffbotTitleHashBuf
	     m_diffbotJSONCount == 0 ) {//cr->m_isCustomCrawl){
		m_diffbotJSONCount = od->m_diffbotJSONCount;
		m_sentToDiffbot = od->m_sentToDiffbot;
		m_gotDiffbotSuccessfulReply = od->m_gotDiffbotSuccessfulReply;
		// copy title hashes info. it goes hand in hand with the
		// NUMBER of diffbot items we have.
		int nh = 0;
		int32_t *ohbuf = od->getDiffbotTitleHashes ( &nh );
		if ( ! m_diffbotTitleHashBuf.safeMemcpy ( ohbuf , nh*4 ) )
			return NULL;
		ptr_linkInfo2 =(LinkInfo *)m_diffbotTitleHashBuf.getBufStart();
		size_linkInfo2=m_diffbotTitleHashBuf.length();
	}

	// but we might have to call reindexJSONObjects() multiple times if
	// it would block
	if ( od && *recycle &&
	     // only reindex if it is a query reindex i guess otherwise
	     // just leave it alone
	     m_sreqValid && m_sreq.m_isPageReindex &&
	     od->m_diffbotJSONCount &&
	     size_linkInfo2 ) {
		// similar to od->nukeJSONObjects
		int32_t *ohbuf =(int32_t *)m_diffbotTitleHashBuf.getBufStart();
		int32_t nh =m_diffbotTitleHashBuf.length() / 4;
		int32_t *status = reindexJSONObjects( ohbuf , nh );
		if ( ! status || status == (void *)-1) return (char *)status;
	}

	// just delete the json items whose "title hashes" are present
	// in the "old doc" but NOT in the "new doc".
	// we use the title hash to construct a unique url for each json item.
	// if the title hash is present in both the old and new docs then
	// do not delete it here, but we will reindex it later in
	// getMetaList() below when we call indexDoc() on each one after
	// setting m_dx to each one.
	bool nukeJson = true;
	if ( ! od ) nukeJson = false;
	if ( od && od->m_diffbotJSONCount <= 0 ) nukeJson = false;

	// if recycling json objects, leave them there!
	if ( *recycle ) nukeJson = false;

	// you have to be a diffbot crawl to do this
	// no, not if you have the diffbot api url set... so take this out
	//if ( !
cr->m_isCustomCrawl ) nukeJson = false; // do not remove old diffbot json objects if pageparser.cpp test // because that can not change the index, etc. if ( getIsPageParser() ) nukeJson = false; if ( nukeJson ) { // it should only nuke/delete the json items that we LOST, // so if we still have the title hash in our latest // diffbot reply, then do not nuke that json item, which // will have a url ending in -diffboyxyz%"UINT32" // (where %"UINT32" is the json item title hash). // This will download the diffbot reply if not already there. int32_t numHashes; int32_t *th = getDiffbotTitleHashes(&numHashes); if ( ! th && ! g_errno ) { char *xx=NULL;*xx=0; } if ( ! th || th == (void *)-1 ) return (char *)th; // this returns false if it blocks int32_t *status = od->nukeJSONObjects( th , numHashes ); if ( ! status || status == (void *)-1) return (char *)status; } // . need this if useTitledb is true // . otherwise XmlDoc::getTitleRecBuf() cores because its invalid if ( m_useTitledb ) { LinkInfo *info1 = getLinkInfo1(); if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1; } // global debug g_od = od; /* // is the document content unchanged? bool unchanged = false; if ( od && od->m_contentHash32 == *ch32 ) unchanged = true; // http status of 304 means "not modified since" if ( od && *hs == 304 ) unchanged = true; // compare to last time if ( od && *tagHash != od->m_tagHash32 ) unchanged = false; // do not do this if from pageparser.cpp //if ( m_sreqValid && m_sreq.m_isPageParser ) unchanged = false; if ( getIsPageParser() ) unchanged = false; // force reindex if it was from query reindex (docid based spider req) if ( m_sreqValid && m_sreq.m_urlIsDocId ) unchanged = false; // if we were turked... how to tell???? if ( m_sreqValid && m_sreq.m_isInjecting ) unchanged = false; // just turn it all off for now because our parsing logic might // have changed unchanged = false; // set this i guess for doConsistencyTest() m_unchanged = unchanged; m_unchangedValid = true; // . if doc content was unchanged just add the SpiderReply to the // meta list so that spiderdb knows we attempted it at this time. // . copy over member vars of the old titleRec/XmlDoc into us so // we can save time and cpu if ( unchanged ) { // this seems to be an issue for blocking int32_t *sni = getSiteNumInlinks(); if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni; // all done! bool addReply = true; // Scraper.cpp uses this if ( m_sreqValid && m_sreq.m_isScraping ) addReply = false; // return nothing if done if ( ! addReply ) { m_metaListSize = 0; m_metaList = (char *)0x1; return m_metaList; } // before getting our spider reply, assign crap from the old // doc to us since we are unchanged! this will allow us to // call getNewSpiderReply() without doing any processing, like // setting the Xml or Words classes, etc. copyFromOldDoc ( od ); // and don't forget to validate this int32_t *ic = getIndexCode(); // should never block since we copied from old doc if ( ic == (void *)-1 ) { char *xx=NULL;*xx=0; } // get our spider reply SpiderReply *newsr = getNewSpiderReply(); // return on error if ( ! newsr ) return (char *)newsr; // . panic on blocking! this is supposed to be fast! // . it might still have to lookup the tagdb rec????? if ( newsr == (void *)-1 ) { char *xx=NULL;*xx=0; } // unset our ptr_linkInfo1 so we do not free it and core // since we might have set it in copyFromOldDoc() above ptr_linkInfo1 = NULL; size_linkInfo1 = 0; // how much we need int32_t needx = sizeof(SpiderReply) + 1; // doledb key? 
if ( m_doledbKey.n0 || m_doledbKey.n1 ) needx += 1 + sizeof(key_t); // + 4; // the titledb unlock key for msg12 in spider.cpp needx += 1 + sizeof(key_t); // make the buffer m_metaList = (char *)mmalloc ( needx , "metalist"); if ( ! m_metaList ) return NULL; // save size for freeing later m_metaListAllocSize = needx; // ptr and boundary m_p = m_metaList; m_pend = m_metaList + needx; // save it char *saved = m_p; // remove from doledb if we had a valid key (BEFORE adding SpiderReply) if ( m_doledbKey.n0 || m_doledbKey.n1 ) { // note it setStatus ( "removing key from doledb" ); // . now remove the original spider rec from "doledb" // . rdbid first *m_p = RDB_DOLEDB; m_p++; // then the key *(key_t *)m_p = m_doledbKey; // nukey, clear del bit to delete it *m_p = *m_p & 0xfe; // skip key m_p += sizeof(key_t); // then zero for data size // *(int32_t *)m_p = 0; //m_p += 4; // sanity check verifyMetaList( m_metaList , m_p ); } // sanity check if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; } // . make a fake titledb key // . remove the spider lock (Msg12 in Spider.cpp) *m_p++ = RDB_FAKEDB; *(key_t *)m_p = g_titledb.makeKey ( m_docId , 0LL , true ); m_p += sizeof(key_t); // now add the new rescheduled time // note it setStatus ( "adding SpiderReply to spiderdb" ); // rdbid first *m_p = RDB_SPIDERDB; // use secondary? if ( m_useSecondaryRdbs ) *m_p = RDB2_SPIDERDB2; m_p++; // get this if ( ! m_srepValid ) { char *xx=NULL;*xx=0; } // store the spider rec int32_t newsrSize = newsr->getRecSize(); gbmemcpy ( m_p , newsr , newsrSize ); m_p += newsrSize; // sanity check if ( m_p - saved != needx ) { char *xx=NULL;*xx=0; } // sanity check verifyMetaList( m_metaList , m_p ); // verify it m_metaListValid = true; // set size m_metaListSize = m_p - m_metaList; // all done return m_metaList; } */ // so getSiteRank() works int32_t *sni = getSiteNumInlinks(); if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni; // so addTable144 works uint8_t *langId = getLangId(); if ( ! langId || langId == (uint8_t *)-1 ) return (char *) langId; // // . before making the title rec we need to set all the ptrs! // . so at least now set all the data members we will need to // seriazlize into the title rec because we can't be blocking further // down below after we set all the hashtables and XmlDoc::ptr_ stuff if ( ! m_setFromTitleRec || m_useSecondaryRdbs ) { // all member vars should already be valid if set from titlerec char *ptg = prepareToMakeTitleRec (); // return NULL with g_errno set on error if ( ! ptg || ptg == (void *)-1 ) return (char *)ptg; } // sanity check - if the valid title rec is null, m_indexCode is set! //if ( ! *tr && ! m_indexCode ) { char *xx=NULL;*xx=0; } // . bail. return an empty meta list, m_metaListSize should be 0! // . NO! we need to add a SpiderReply to spiderdb... //if ( ! *tr ) // log("HEY"); /* if ( ! *tr ) { m_metaList = ""; m_metaListSize = 0; m_metaListValid = true; return m_metaList; } */ // get this for hashing stuff //Spam *sp = getSpam(); //if ( ! sp || sp == (void *)-1 ) return (char *)sp; // our next slated spider priority char *spiderLinks3 = getSpiderLinks(); if ( ! spiderLinks3 || spiderLinks3 == (char *)-1 ) return (char *)spiderLinks3; bool spideringLinks = *spiderLinks3; // int16_tcut XmlDoc *nd = this; /////////////////////////////////// /////////////////////////////////// // // // if we had an error, do not add us regardless to the index // although we might add SOME things depending on the error. // Like add the redirecting url if we had a ESIMPLIFIEDREDIR error. 
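	// (concretely, EDOCSIMPLIFIEDREDIR and EDOCNONCANONICAL are handled a
	//  bit further down by turning spideringLinks back on, so the
	//  redirect/canonical target still gets a SpiderRequest into spiderdb
	//  even though this doc itself is not indexed.)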
// So what we had to the Rdbs depends on the indexCode. // if ( m_indexCode ) nd = NULL; // OR if deleting from index, we just want to get the metalist // directly from "od" if ( m_deleteFromIndex ) nd = NULL; // // /////////////////////////////////// /////////////////////////////////// if ( ! nd ) spideringLinks = false; // set these for getNewSpiderReply() so it can set // SpiderReply::m_wasIndexed and m_isIndexed... m_wasInIndex = false; m_isInIndex = false; if ( od ) m_wasInIndex = true; if ( nd ) m_isInIndex = true; m_wasInIndexValid = true; m_isInIndexValid = true; // if we are adding a simplified redirect as a link to spiderdb if ( m_indexCode == EDOCSIMPLIFIEDREDIR ) spideringLinks = true; // likewise if there error was ENONCANONICAL treat it like that if ( m_indexCode == EDOCNONCANONICAL ) spideringLinks = true; // // . prepare the outlink info if we are adding links to spiderdb! // . do this before we start hashing so we do not block and re-hash!! // if ( spideringLinks && ! m_doingConsistencyCheck && m_useSpiderdb){ setStatus ( "getting outlink info" ); TagRec ***grv = getOutlinkTagRecVector(); if ( ! grv || grv == (void *)-1 ) return (char *)grv; //char **iiv = getOutlinkIsIndexedVector(); //if ( ! iiv || iiv == (void *)-1 ) return (char *)iiv; int32_t **ipv = getOutlinkFirstIpVector(); if ( ! ipv || ipv == (void *)-1 ) return (char *)ipv; //int8_t *hcv = getOutlinkHopCountVector(); //if ( ! hcv || hcv == (void *)-1 ) return (char *)hcv; //char *ipi = getIsIndexed(); // is the parent indexed? //if ( ! ipi || ipi == (char *)-1 ) return (char *)ipi; } // get the tag buf to add to tagdb SafeBuf *ntb = NULL; if ( m_useTagdb && ! m_deleteFromIndex ) { ntb = getNewTagBuf(); if ( ! ntb || ntb == (void *)-1 ) return (char *)ntb; } char *isRoot = getIsSiteRoot(); if ( ! isRoot || isRoot == (char *)-1 ) return (char *)isRoot; Words *ww = getWords(); if ( ! ww || ww == (void *)-1 ) return (char *)ww; int64_t *pch64 = getExactContentHash64(); //int64_t *pch64 = getLooseContentHash64(); if ( ! pch64 || pch64 == (void *)-1 ) return (char *)pch64; // get the voting table which we will add to sectiondb SectionVotingTable *nsvt = NULL; SectionVotingTable *osvt = NULL; // seems like // sectiondb takes up abotu 15% of the disk space like this. no! // cuz then there is revdb, so we are 30%. so that's a no go. bool addSectionVotes = false; if ( nd ) addSectionVotes = true; if ( ! m_useSectiondb ) addSectionVotes = false; // to save disk space no longer add the roots! nto only saves sectiondb // but also saves space in revdb //if ( nd && *isRoot ) addSectionVotes = true; if ( addSectionVotes ) { nsvt = getNewSectionVotingTable(); if ( ! nsvt || nsvt == (void *)-1 ) return (char *)nsvt; // get the old table too! osvt = getNewSectionVotingTable(); if ( ! osvt || osvt == (void *)-1 ) return (char *)osvt; } // get the addresses for hashing tag hashes that indicate place names Addresses *na = NULL; //Addresses *oa = NULL; if ( nd ) na = getAddresses(); //if ( od ) oa = od->getAddresses(); // get dates ready for hashing Dates *ndp = NULL; //Dates *odp = NULL; if ( nd ) { ndp = nd->getDates(); if ( ! ndp || ndp==(void *)-1) return (char *)ndp; } //if ( od ) { // odp = od->getDates(); // if ( ! odp || odp==(void *)-1) return (char *)odp; //} // need firstip if adding a rebuilt spider request if ( m_useSecondaryRdbs && ! m_isDiffbotJSONObject && m_useSpiderdb ) { int32_t *fip = getFirstIp(); if ( ! 
fip || fip == (void *)-1 ) return (char *)fip; } // shit, we need a spider reply so that it will not re-add the // spider request to waiting tree, we ignore docid-based // recs that have spiderreplies in Spider.cpp SpiderReply *newsr = NULL; if ( m_useSpiderdb ) { // && ! m_deleteFromIndex ) { newsr = getNewSpiderReply(); if ( ! newsr || newsr == (void *)-1 ) return (char *)newsr; } // i guess it is safe to do this after getting the spiderreply SafeBuf *spiderStatusDocMetaList = NULL; if ( cr->m_indexSpiderReplies && m_useSpiderdb && // doing it for diffbot throws off smoketests ! cr->m_isCustomCrawl ) { // get the spiderreply ready to be added to the rdbs w/ msg4 spiderStatusDocMetaList = getSpiderStatusDocMetaList ( newsr ); // block? if ( ! spiderStatusDocMetaList || spiderStatusDocMetaList == (void *)-1) return (char *)spiderStatusDocMetaList; } // the site hash for hashing int32_t *sh32 = getSiteHash32(); if ( ! sh32 || sh32 == (int32_t *)-1 ) return (char *)sh32; // set ptr_clockCandidatesData if ( nd ) { HashTableX *cct = nd->getClockCandidatesTable(); if ( ! cct || cct==(void *)-1) return (char *)cct; } if ( m_useLinkdb && ! m_deleteFromIndex ) { int32_t *linkSiteHashes = getLinkSiteHashes(); if ( ! linkSiteHashes || linkSiteHashes == (void *)-1 ) return (char *)linkSiteHashes; } //SafeBuf *au = getDiffbotApiUrl(); //if ( ! au || au == (void *)-1 ) return (char *)au; // test json parser // /* char *json = "{\"icon\":\"http://www.pixar.com/sites/default/files/pixar_2012_favicon_0.ico\",\"text\":\"\",\"title\":\"Pixar\",\"type\":\"article\",\"media\":[{\"primary\":\"true\",\"link\":\"http://www.pixar.com/sites/default/files/home_billboard_v7.jpg\",\"type\":\"image\"},{\"link\":\"http://www.pixar.com/sites/default/files/roz1_0.jpg\",\"type\":\"image\"},{\"link\":\"http://www.pixar.com/sites/default/files/home_bu-thumb_v1.jpg\",\"type\":\"image\"},{\"link\":\"http://www.pixar.com/sites/default/files/mu_home_thumb.jpg\",\"type\":\"image\"},{\"link\":\"http://www.pixar.com/sites/default/files/brenda_homepage.jpg\",\"type\":\"image\"}],\"url\":\"http://www.pixar.com/\"}"; char *dd = getNextJSONObject ( json ); if ( *dd ) { char *xx=NULL;*xx=0; } */ /////////// // // BEGIN the diffbot json object index hack // // if we are using diffbot, then each json object in the diffbot reply // should be indexed as its own document. // /////////// // . get the reply of json objects from diffbot // . this will be empty if we are a json object! // . will also be empty if not meant to be sent to diffbot // . the TOKENIZED reply consists of \0 separated json objects that // we create from the original diffbot reply SafeBuf *tdbr = getTokenizedDiffbotReply(); if ( ! tdbr || tdbr == (void *)-1 ) return (char *)tdbr; int32_t tdbrLen = tdbr->length(); // do not index json items as separate docs if we are page parser if ( getIsPageParser() ) tdbrLen = 0; // once we have tokenized diffbot reply we can get a unique // hash of the title of each json item. that way, if a page changes // and it gains or loses a diffbot item, the old items will still // have the same url and we can set their m_indexCode to EDOCUNCHANGED // if the individual json item itself has not changed when we // call m_dx->indexDoc() below. int32_t numHashes = 0; int32_t *titleHashBuf = NULL; // // if we got a json object or two from diffbot, index them // as their own child xmldocs. // watch out for reply from diffbot of "-1" indicating error! 
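	//
	// (overview of the loop below: the tokenized reply is a sequence of
	//  NUL-separated json objects. for each one we build a fake child url,
	//  "<parentUrl>-diffbotxyz<titleHash32>", give m_dx that url with a
	//  fake hash-derived firstIp and CT_JSON content via set4(), and then
	//  call m_dx->indexDoc(). if indexDoc() blocks, m_masterLoop re-enters
	//  this function and we drop back into "jsonloop" with m_dx still set;
	//  the m_dx->m_listAdded check keeps us from calling indexDoc() twice
	//  for the same item.)
	//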
// if ( tdbrLen > 3 ) { // get title hashes of the json items titleHashBuf = getDiffbotTitleHashes(&numHashes); if (!titleHashBuf || titleHashBuf == (void *)-1){ char *xx=NULL;*xx=0;} // make sure diffbot reply is valid for sure if ( ! m_diffbotReplyValid ) { char *xx=NULL;*xx=0; } // set status for this setStatus ( "indexing diffbot json doc"); // new guy here if ( ! m_dx ) { try { m_dx = new ( XmlDoc ); } catch ( ... ) { g_errno = ENOMEM; log("xmldoc: failed to alloc m_dx"); return NULL; } mnew ( m_dx , sizeof(XmlDoc),"xmldocdx"); // we now parse the array of products out of the // diffbot reply. each product is an item/object. m_diffbotObj = tdbr->getBufStart(); m_diffbotJSONCount = 0; } // loop back up here to process next json object from below jsonloop: // if m_dx has no url set, call set4 i guess if ( ! m_dx->m_contentValid ) { // sanity. ensure the json item we are trying to // index has a title hash in this buf if(m_diffbotJSONCount>=numHashes){char *xx=NULL;*xx=0;} // get the title of the json we are indexing int32_t jth = titleHashBuf [ m_diffbotJSONCount ]; // make the fake url for this json object for indexing SafeBuf fakeUrl; fakeUrl.set ( m_firstUrl.getUrl() ); // append -diffbot-0 etc. for fake url fakeUrl.safePrintf("-diffbotxyz%"UINT32"", //(int32_t)m_diffbotJSONCount); (uint32_t)jth); m_diffbotJSONCount++; // this can go on the stack since set4() copies it SpiderRequest sreq; sreq.reset(); // string ptr char *url = fakeUrl.getBufStart(); // use this as the url strcpy( sreq.m_url, url ); // parentdocid of 0 int32_t firstIp = hash32n ( url ); if ( firstIp == -1 || firstIp == 0 ) firstIp = 1; sreq.setKey( firstIp,0LL, false ); sreq.m_isInjecting = 1; sreq.m_isPageInject = 1; sreq.m_hopCount = 0; sreq.m_hopCountValid = 1; sreq.m_fakeFirstIp = 1; sreq.m_firstIp = firstIp; // so we can match url filters' "insitelist" directive // in Spider.cpp::getUrlFilterNum() sreq.m_domHash32 = m_domHash32; sreq.m_siteHash32 = m_siteHash32; sreq.m_hostHash32 = m_siteHash32; // set this if (!m_dx->set4 ( &sreq , NULL , cr->m_coll , NULL , // pbuf // give it a niceness of 1, we have // to be careful since we are a // niceness of 0!!!! m_niceness, // 1 , // inject this content m_diffbotObj, false, // deleteFromIndex , 0, // forcedIp , CT_JSON, // contentType , 0, // lastSpidered , false )) // hasMime // g_errno should be set! return NULL; // we are indexing json objects, don't use all these m_dx->m_useClusterdb = false; m_dx->m_useSpiderdb = false; m_dx->m_useTagdb = false; m_dx->m_usePlacedb = false; m_dx->m_useLinkdb = false; m_dx->m_isChildDoc = true; // we like to sort json objects using // 'gbsortby:spiderdate' query to get the most // recent json objects, so this must be valid if ( m_spideredTimeValid ) { m_dx->m_spideredTimeValid = true; m_dx->m_spideredTime = m_spideredTime; } m_dx->m_isDiffbotJSONObject = true; } // when the indexdoc completes, or if it blocks, call us! // we should just pass through here //xd->setCallback ( this , getMetaListWrapper ); m_dx->setCallback ( m_masterState , m_masterLoop ); /////////////// // . inject the content of the json using this fake url // . return -1 if this blocks // . if m_dx got its msg4 reply it ends up here, in which // case do NOT re-call indexDoc() so check for // m_listAdded. /////////////// if ( ! m_dx->m_listAdded && ! m_dx->indexDoc ( ) ) return (char *)-1; // critical error on our part trying to index it? // does not include timeouts or 404s, etc. mostly just // OOM errors. 
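	// A condensed view of the cycle the jsonloop above implements, given
	// the re-entry through m_masterLoop (a sketch only -- the jsonloop
	// code itself is authoritative):
	//
	//   jsonloop:
	//     if ( ! m_dx->m_contentValid ) set4() a fake -diffbotxyz url
	//     m_dx->indexDoc()  -- returns false if it blocked, in which case
	//                          we returned -1 above and the callback
	//                          brings us back through here
	//     check g_errno, bump the objectsAdded counts
	//     m_diffbotObj += gbstrlen(m_diffbotObj) + 1  -- next json item
	//     m_dx->reset()     -- clears m_contentValid for the next set4()
	//     goto jsonloop while m_diffbotObj < tdbr->getBuf()
	//
	// The g_errno check just below is the first step of that post-index
	// bookkeeping.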
if ( g_errno ) return NULL; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // count as deleted cr->m_localCrawlInfo.m_objectsAdded++; cr->m_globalCrawlInfo.m_objectsAdded++; cr->m_needsSave = true; // we successfully index the json object, skip to next one m_diffbotObj += gbstrlen(m_diffbotObj) + 1; // but gotta set this crap back log(LOG_INFO,"diffbot: resetting %s",m_dx->m_firstUrl.m_url); // clear for next guy if there is one. clears // m_dx->m_contentValid so the set4() can be called again above m_dx->reset(); // have we breached the buffer of json objects? if not, do more if ( m_diffbotObj < tdbr->getBuf() ) goto jsonloop; } ///// // // END the diffbot json object index hack // ///// // // CAUTION // // CAUTION // // We should never "block" after this point, lest the hashtables // we create get messed up. // // // // START HASHING // // // store what we hash into this table if ( (m_pbuf || m_storeTermListInfo) && ! m_wts ) { // init it. the value is a TermInfo class. allowDups=true! m_wtsTable.set (12,sizeof(TermDebugInfo), 0,NULL,0,true,m_niceness, "wts-tab"); // point to it, make it active m_wts = &m_wtsTable; } // how much to alloc? compute an upper bound int32_t need = 0; // should we index this doc? bool index1 = true; setStatus ( "hashing posdb and datedb terms" ); // . hash our documents terms into "tt1" // . hash the old document's terms into "tt2" // . by old, we mean the older versioned doc of this url spidered b4 HashTableX tt1; HashTableX tt2; // how many words we got? int32_t nw = m_words.getNumWords(); // . prepare it, 5000 initial terms // . make it nw*8 to avoid have to re-alloc the table!!! // . i guess we can have link and neighborhood text too! we don't // count it here though... but add 5k for it... int32_t need4 = nw * 4 + 5000; if ( nd && index1 && m_usePosdb ) { if ( ! tt1.set ( 18 , 4 , need4,NULL,0,false,m_niceness, "posdb-indx")) return NULL; int32_t did = tt1.m_numSlots; //bool index2 = true; // . hash the document terms into "tt1" // . this is a biggie!!! // . only hash ourselves if m_indexCode is false // . m_indexCode is non-zero if we should delete the doc from // index // . i think this only adds to posdb //log("xmldoc: CALLING HASHALL"); // shit, this blocks which is bad!!! char *nod = hashAll ( &tt1 ) ; // you can't block here because if we are re-called we lose tt1 if ( nod == (char *)-1 ) { char *xx=NULL;*xx=0; } // error? if ( ! nod ) return NULL; int32_t done = tt1.m_numSlots; if ( done != did ) log("xmldoc: reallocated big table! bad. old=%"INT32" " "new=%"INT32" nw=%"INT32"",did,done,nw); } // if indexing the spider reply as well under a different docid // there is no reason we can't toss it into our meta list here if ( spiderStatusDocMetaList ) need += spiderStatusDocMetaList->length(); // now we use revdb // before hashing the old doc into it //if ( od && index2 ) { // // if this hash table init fails, return NULL // if (!tt2.set(12,4,5000,NULL,0,false,m_niceness)) return NULL; // char *rod = od->hash ( &tt2 ) ; // if ( ! rod || rod == (char *)-1 ) return rod; //} // space for indexdb AND DATEDB! +2 for rdbids int32_t needIndexdb = 0; needIndexdb +=tt1.m_numSlotsUsed*(sizeof(key144_t)+2+sizeof(key128_t)); //needIndexdb+=tt2.m_numSlotsUsed * (sizeof(key_t)+2+sizeof(key128_t)); need += needIndexdb; // sanity check //if ( ! od && m_skipIndexing && needIndexdb ) { char *xx=NULL;*xx=0; } // . sanity check - must have one or the other! // . 
well, not in the case of EDOCNOTNEW or EDOCNOTOLD, in which // case we just remove ourselves from spiderdb, and in the case // of EDOCNOTOLD, from tfndb as well //if ( ! od && ! nd ) { char *xx=NULL;*xx=0; } // what pub dates do the old and new doc have? -1 means none. int32_t date1 = -1; if ( nd ) date1 = nd->m_pubDate; //int32_t date2 = -1; if ( od ) date2 = od->m_pubDate; // now we also add the title rec. true = ownsCbuf? ret NULL on error // with g_errno set. //if ( nd && ! nd->compress( true , m_niceness ) ) return NULL; /* now we have the bit in the posdb key, so this should not be needed... use Posdb::isShardedByTermId() to see if it is such a spcial case key like Hostdb::getShardNum() now does... setStatus ( "hashing nosplit keys" ); // hash no split terms into ns1 and ns2 HashTableX ns1; // prepare it, 500 initial terms if ( ! ns1.set ( 18 , 4 , 500,NULL,0,false,m_niceness,"nosplt-indx" )) return NULL; // . hash for no splits // . like above, but these are "no split" termids if ( nd && m_usePosdb && ! hashNoSplit ( &ns1 ) ) return NULL; //if(index2 && od && ! od->hashNoSplit ( &ns2 ) ) return NULL; // needs for hashing no split terms int32_t needNoSplit1 = 0; // add em up. +1 for rdbId. add to both indexdb AND datedb i guess... needNoSplit1 += ns1.m_numSlotsUsed * (18+1); // +16+1); //needNoSplit += ns2.m_numSlotsUsed * (12+1+16+1); // add it in need += needNoSplit1; // sanity check //if ( ! od && m_skipIndexing && needNoSplit ) { char *xx=NULL;*xx=0; } */ setStatus ( "hashing sectiondb keys" ); // add in special sections keys. "ns" = "new sections", etc. // add in the special nosplit datedb terms from the Sections class // these hash into the term table so we can do incremental updating HashTableX st1; // <key128_t,char> dt1; //HashTableX st2; // <key128_t,char> dt2; // set key/data size int32_t svs = sizeof(SectionVote); st1.set(sizeof(key128_t),svs,0,NULL,0,false,m_niceness,"sectdb-indx"); // tell hashtable to use the sectionhash for determining the slot, // not the lower 4 bytes because that is the docid which is the // same for every key st1.m_maskKeyOffset = 6; //st2.set(sizeof(key128_t),svs,0,NULL,0,false,m_niceness); // do not bother if deleting if ( m_indexCode ) nsvt = NULL; // . now we hash the root just to get some section votes i guess //if ( nts && ! *isr ) nsvt = NULL; // if old voting table add more than 100,000 votes forget it!!! do // not bloat sectiondb that big... if ( osvt && osvt->m_totalSiteVoters >= MAX_SITE_VOTERS ) nsvt = NULL; // hash terms into a table that uses full datedb keys if ( nsvt && ! nsvt->hash (m_docId,&st1,*sh64,m_niceness)) return NULL; // needs for hashing no split terms int32_t needSectiondb = 0; // add em up. plus one for rdbId needSectiondb += st1.m_numSlotsUsed * (16+svs+1); //needSectiondb += st2.m_numSlotsUsed * (16+svs+1); // add it in need += needSectiondb; // Sections::respiderLineWaiters() adds one docid-based spider rec // for every url waiting in line. Sections::m_numLineWaiters. assume // 64 bytes per line waiter spider rec i guess //int32_t needLineWaiters = 0; // +1 for rdbId //if ( ns ) needLineWaiters = ns->m_numLineWaiters * 64; // forgot to add this? //need += needLineWaiters; // . for adding Sections.cpp keys // . Sections::hash() does not bother with invalid sections // . 
waitInLine might be true in Sections::hash() too, so always add 12 //if ( ns ) need += (ns->m_numSections - ns->m_numInvalids)*12 + 12; //if ( os ) need += (os->m_numSections - os->m_numInvalids)*12 + 12; // for adding Addresses::m_keys[] (Addresses::hash()) //if ( na ) need += (na->m_numKeys * 16); //if ( oa ) need += (oa->m_numKeys * 16); // don't forget Dates! //if ( ndp ) need += ndp->m_numPubDates * sizeof(key_t); //if ( odp ) need += odp->m_numPubDates * sizeof(key_t); // clusterdb keys. plus one for rdbId int32_t needClusterdb = 0; //if ( nd && ! nd->m_skipIndexing ) needClusterdb += 13; //if ( od && ! od->m_skipIndexing ) needClusterdb += 13; if ( nd ) needClusterdb += 13; //if ( od ) needClusterdb += 13; need += needClusterdb; // . LINKDB // . linkdb records. assume one per outlink // . we may index 2 16-byte keys for each outlink Links *nl2 = NULL; //if ( spideringLinks ) nl2 = &m_links; // if injecting, spideringLinks is false, but then we don't // add the links to linkdb, which causes the qainlinks() test to fail nl2 = &m_links; // do not bother if deleting. but we do add simplified redirects // to spiderdb as SpiderRequests now. int32_t code = m_indexCode; if ( code == EDOCSIMPLIFIEDREDIR ) code = 0; if ( code == EDOCNONCANONICAL ) code = 0; if ( code ) nl2 = NULL; //Links *ol = NULL; if ( od ) ol = od->getLinks(); // . set key/data size // . use a 16 byte key, not the usual 12 // . use 0 for the data, since these are pure keys, which have no // scores to accumulate HashTableX kt1; //HashTableX kt2; int32_t nis = 0; if ( nl2 && m_useLinkdb ) nis = nl2->getNumLinks() * 4; // pre-grow table based on # outlinks kt1.set ( sizeof(key224_t),0,nis,NULL,0,false,m_niceness,"link-indx" ); // use magic to make fast kt1.m_useKeyMagic = true; // linkdb keys will have the same lower 4 bytes, so make hashing fast. // they are 28 byte keys. bytes 20-23 are the hash of the linkEE // so that will be the most random. kt1.m_maskKeyOffset = 20; // faster //kt2.set ( sizeof(key128_t) , 0,0,NULL,0,false,m_niceness ); // do not add these //bool add1 = true; // do not add negative key if no old doc //if ( ! od ) add2 = false; // . we already have a Links::hash into the Termtable for links: terms, // but this will have to be for adding to Linkdb. basically take a // lot of it from Linkdb::fillLinkdbList() // . these return false with g_errno set on error if ( m_useLinkdb && nl2 && ! hashLinksForLinkdb(&kt1) ) return NULL; //if ( add2 && ol && ! !od->m_skipIndexing && // ol->hash(&kt2,od,m_niceness) ) // return NULL; // add up what we need. +1 for rdbId int32_t needLinkdb = 0; needLinkdb += kt1.m_numSlotsUsed * (sizeof(key224_t)+1); //needLinkdb += kt2.m_numSlotsUsed * (sizeof(key128_t)+1); need += needLinkdb; // sanity check //if ( ! od && m_skipIndexing && needLinkdb ) { char *xx=NULL;*xx=0; } // PLACEDB HashTableX pt1; //HashTableX pt2; // . set key/data size // . limit every address to 512 bytes pt1.set(sizeof(key128_t),512,0,NULL,0,false,m_niceness,"placedb-indx"); //pt2.set(sizeof(key128_t),512,0,NULL,0,false,m_niceness); // // if this is true, then we just store the placedb recs // directly into the title rec. That way we do not have // to store the content of the web page, and we save space. 
// // otherwise, we have to parse out the sections and it is much slower //else if (oa && !oa->hashForPlacedb(m_docId,*sh32,*od->getIp(),&pt2) ) // return NULL; // hash terms into a table that uses full datedb keys if ( na && !na->hashForPlacedb(m_docId,*sh32,*nd->getIp(),&pt1)) return NULL; setStatus("hashing place info"); int32_t needPlacedb = 0; // . +1 for rdbId // . up to 512 bytes per address needPlacedb += pt1.m_numSlotsUsed * (sizeof(key128_t)+1+512); //needPlacedb += pt2.m_numSlotsUsed * (sizeof(key128_t)+1+512); need += needPlacedb; // sanity check -- coring here because we respider the page and // the address is gone so it tries to delete it! //if ( ! od && m_skipIndexing && needPlacedb ) { char *xx=NULL;*xx=0; } // we add a negative key to doledb usually (include datasize now) int32_t needDoledb = sizeof(key_t) + 1 ; // + 4; if ( forDelete ) needDoledb = 0; need += needDoledb; // for adding the SpiderReply to spiderdb (+1 for rdbId) int32_t needSpiderdb1 = sizeof(SpiderReply) + 1; if ( forDelete ) needSpiderdb1 = 0; need += needSpiderdb1; // if injecting we add a spiderrequest to be able to update it // but don't do this if it is pagereindex. why is pagereindex // setting the injecting flag anyway? int32_t needSpiderdb3 = 0; if ( m_sreqValid && m_sreq.m_isInjecting && m_sreq.m_fakeFirstIp && ! m_sreq.m_forceDelete && // do not rebuild spiderdb if only rebuilding posdb // this is explicitly for injecting so we need to add // the spider request to spiderdb... //m_useSpiderdb && /// don't add requests like http://xyz.com/xxx-diffbotxyz0 though ! m_isDiffbotJSONObject ) needSpiderdb3 = m_sreq.getRecSize() + 1; // or if we are rebuilding spiderdb else if (m_useSecondaryRdbs && !m_isDiffbotJSONObject && m_useSpiderdb) needSpiderdb3 = sizeof(SpiderRequest) + m_firstUrl.m_ulen+1; need += needSpiderdb3; //int32_t needSpiderdb3 = 0; //if ( m_sreqValid ) needSpiderdb3 = m_sreq.getRecSize() + 1; //need += needSpiderdb3; // . for adding our outlinks to spiderdb // . see SpiderRequest::getRecSize() for description // . SpiderRequest::getNeededSize() will include the null terminator int32_t hsize = SpiderRequest::getNeededSize ( 0 ); int32_t needSpiderdb2 = hsize * m_links.getNumLinks(); // and the url buffer of outlinks. includes \0 terminators i think needSpiderdb2 += m_links.getLinkBufLen(); // don't need this if doing consistecy check if ( m_doingConsistencyCheck ) needSpiderdb2 = 0; // nor for generating the delete meta list for incremental indexing if ( forDelete ) needSpiderdb2 = 0; // accumulate it need += needSpiderdb2; // the new tags for tagdb int32_t needTagdb = 0; if ( ntb ) needTagdb = ntb->length() ; // add 1 byte for up to 128 rdbids //needTagdb += needTagdb/sizeof(Tag) + 1; // add that in need += needTagdb; // . add in title rec size // . should be valid because we called getTitleRecBuf() above // . this should include the key // . add in possible negative key for deleting old title rec //int32_t needTitledb = sizeof(key96_t); // +1 for rdbId //if ( nd && m_useTitledb ) needTitledb = m_titleRecSize + 1; //need += needTitledb; // // . CHECKSUM PARSING CONSISTENCY TEST // // . set m_metaListChecksum member (will be stored in titleRec header) // . gotta set m_metaListCheckSum8 before making titleRec below // . also, if set from titleRec, verify metalist is the same! // if ( ! m_computedMetaListCheckSum ) { // do not call twice! 
m_computedMetaListCheckSum = true; // all keys in tt1, ns1, kt1 and pt1 int32_t ck32 = 0; ck32 ^= tt1.getKeyChecksum32(); //ck32 ^= ns1.getKeyChecksum32(); //ck32 ^= kt1.getKeyChecksum32(); //ck32 ^= pt1.getKeyChecksum32(); // set this before calling getTitleRecBuf() below uint8_t currentMetaListCheckSum8 = (uint8_t)ck32; // see if matches what was in old titlerec if ( m_metaListCheckSum8Valid && // if we were set from a titleRec, see if we got // a different hash of terms to index this time around... m_setFromTitleRec && // fix for import log spam ! m_isImporting && m_version >= 120 && m_metaListCheckSum8 != currentMetaListCheckSum8 ) log("xmldoc: checksum parsing inconsistency for %s " "%i != %i", m_firstUrl.getUrl(), (int)m_metaListCheckSum8, (int)currentMetaListCheckSum8); // assign the new one, getTitleRecBuf() call below needs this m_metaListCheckSum8 = currentMetaListCheckSum8; m_metaListCheckSum8Valid = true; } // // now that we've set all the ptr_* members vars, we can make // the title rec // // . MAKE the title rec from scratch, that is all we need at this point // . sets m_indexCode to EDOCNOTNEW or EDOCNOTOLD sometimes // . if repairing and not rebuilding titledb, we do not need the // titlerec if ( m_useTitledb ) { // this buf includes key/datasize/compressdata SafeBuf *tr = getTitleRecBuf (); // panic if this blocks! it should not at this point because // we'd have to re-hash the crap above if ( tr == (void *) -1 ) { char *xx=NULL;*xx=0; } // return NULL with g_errno set on error if ( ! tr ) return (char *)tr; // sanity check - if the valid title rec is null, // m_indexCode is set! if ( tr->length()==0 && ! m_indexCode ) { char *xx=NULL;*xx=0;} } // . add in title rec size // . should be valid because we called getTitleRecBuf() above // . this should include the key // . add in possible negative key for deleting old title rec int32_t needTitledb = sizeof(key96_t) + 1; // +1 for rdbId if ( nd && m_useTitledb && ! forDelete ) needTitledb += m_titleRecBuf.length(); // set new and old keys for titledb //key_t ok; key_t nk; //ok.setMin(); nk.setMin(); //if ( od ) ok = *od->getTitleRecKey(); if ( nd && m_useTitledb ) nk = *nd->getTitleRecKey(); //if ( od && m_useTitledb && ok != nk ) needTitledb += sizeof(key_t)+1; if ( m_useTitledb ) { // then add it in need += needTitledb; // the titledb unlock key for msg12 in spider.cpp need += sizeof(key_t); } // // now space for the revdb record, which is the meta list itself! // //need = need + 12 + 4 + need; // . alloc mem for metalist // . sanity if ( m_metaListSize > 0 ) { char *xx=NULL;*xx=0; } // make the buffer m_metaList = (char *)mmalloc ( need , "metalist"); if ( ! m_metaList ) return NULL; // save size for freeing later m_metaListAllocSize = need; // ptr and boundary m_p = m_metaList; m_pend = m_metaList + need; // // TITLEDB // setStatus ("adding titledb recs"); // checkpoint char *saved = m_p; // . delete old title rec key if different // . Repair.cpp might set useTitledb to false! //if ( od && m_useTitledb && ok != nk ) { // // rdbId // *m_p++ = RDB_TITLEDB; // // key // *(key_t *)m_p = *od->getTitleRecKey(); // // make it negative // *m_p &= 0xfe; // // skip over it // m_p += sizeof(key_t); // // then data size, 0 // //*(int32_t *)m_p = 0; // //m_p+= 4; //} // . store title rec // . Repair.cpp might set useTitledb to false! if ( nd && m_useTitledb ) { // rdbId if ( m_useSecondaryRdbs ) *m_p++ = RDB2_TITLEDB2; else *m_p++ = RDB_TITLEDB; // sanity if ( ! 
nd->m_titleRecBufValid ) { char *xx=NULL;*xx=0; } // key, dataSize, data is the whole rec int32_t tsize = nd->m_titleRecBuf.length(); // if getting an "oldList" to do incremental posdb updates // then do not include the data portion of the title rec if ( forDelete ) tsize = sizeof(key_t); gbmemcpy ( m_p , nd->m_titleRecBuf.getBufStart() , tsize ); // make it a negative key //if ( forDelete ) *m_p = *m_p & 0xfe; m_p += tsize;//nd->m_titleRecSize; // store a zero datasize, key is still positive until the dt8 // table deletes it //if ( forDelete ) { *(int32_t *)m_p = 0; m_p += 4; } } // sanity check if ( m_p - saved > needTitledb ) { char *xx=NULL;*xx=0; } // sanity check verifyMetaList( m_metaList , m_p , forDelete ); // // ADD BASIC INDEXDB/DATEDB TERMS // setStatus ( "adding posdb and datedb terms"); // checkpoint saved = m_p; // store indexdb terms into m_metaList[] if ( m_usePosdb && ! addTable144 ( &tt1 , m_docId )) return NULL; //if(!addTable96 ( &tt2, &tt1, date2, date1, true ,false)) return NULL; //if ( od ) tt2.clear(); // sanity check if ( m_p - saved > needIndexdb ) { char*xx=NULL;*xx=0; } // free all mem tt1.reset(); //tt2.reset(); // sanity check verifyMetaList( m_metaList , m_p , forDelete ); // // ADD NOSPLIT INDEXDB/DATEDB TERMS // /* we added these now in hashAll() to tt1, no longer ns1 since we have the sharded by termid bit in the actual posdb key now so Rebalance.cpp works setStatus ( "adding posdb shardByTermId terms"); // checkpoint saved = m_p; // no longer anything special now since the // Posdb::isShardedyTermId() bit // is in the key now so Rebalance.cpp can work if ( m_usePosdb && ! addTable144 ( &ns1 )) return NULL; //if(! addTable96 ( &ns2, &ns1, -1, -1, true ,true)) return NULL; // sanity check if ( m_p - saved > needNoSplit1 ) { char*xx=NULL;*xx=0; } // free all mem ns1.reset(); // sanity check verifyMetaList( m_metaList , m_p , forDelete ); */ /* setStatus ( "adding datedb nosplit terms"); // checkpoint saved = m_p; // this is now for datedb if ( m_useDatedb && ! addTableDate(&ns2,m_docId,RDB_DATEDB,true)) return NULL; // sanity check if ( m_p - saved > needNoSplit2 ) { char*xx=NULL;*xx=0; } // free all mem ns2.reset(); // sanity check verifyMetaList( m_metaList , m_p ); */ // // ADD SECTIONS SPECIAL TERMS // setStatus ( "adding sectiondb keys"); // checkpoint saved = m_p; // add that table to the metalist if ( m_useSectiondb && !addTable128(&st1,RDB_SECTIONDB,forDelete)) return NULL; //if(! addTable128 (&st2,&st1, RDB_SECTIONDB,true ,true))return NULL; // sanity check if ( m_p - saved > needSectiondb ) { char *xx=NULL;*xx=0; } // free mem st1.reset(); //st2.reset(); // sanity check verifyMetaList( m_metaList , m_p , forDelete ); // // ADD CLUSTERDB KEYS // setStatus ( "adding clusterdb keys" ); // checkpoint saved = m_p; // . do we have adult content? // . should already be valid! if ( nd && ! m_isAdultValid ) { char *xx=NULL;*xx=0; } // . get new clusterdb key // . we use the host hash for the site hash! hey, this is only 26 bits! key_t newk ; newk.setMin(); if ( nd ) newk = g_clusterdb.makeClusterRecKey ( *nd->getDocId() , *nd->getIsAdult() , *nd->getLangId(), nd->getHostHash32a(), false ); // del? //key_t oldk; oldk.setMin(); //if ( od ) // && add2 ) // oldk = g_clusterdb.makeClusterRecKey ( *od->getDocId(), // *od->getIsAdult() , // *od->getLangId() , // od->getHostHash32a(), // true ); // del? // . store old only if new tr is good and keys are different from old // . 
now we store even if skipIndexing is true because i'd like to // see how many titlerecs we have and count them towards the // docsIndexed count... if ( nd && m_useClusterdb ) { // store rdbid *m_p = RDB_CLUSTERDB; // use secondary if we should if ( m_useSecondaryRdbs ) *m_p = RDB2_CLUSTERDB2; // skip m_p++; // and key *(key_t *)m_p = newk; // skip it m_p += sizeof(key_t); } // store new if different //if ( od && ( ! nd || newk != oldk ) ) { // && !od->m_skipIndexing ) { // // store rdbid // *m_p = RDB_CLUSTERDB; // // use secondary if we should // if ( m_useSecondaryRdbs ) *m_p = RDB2_CLUSTERDB2; // // skip // m_p++; // // turn on last bit (undo del) // //newk.n0 |= 0x01; // // and key // *(key_t *)m_p = oldk; // // skip it // m_p += sizeof(key_t); //} // sanity check if ( m_p - saved > needClusterdb ) { char *xx=NULL;*xx=0; } // sanity check verifyMetaList( m_metaList , m_p , forDelete ); // // ADD LINKDB KEYS // setStatus ( "adding linkdb keys" ); // checkpoint saved = m_p; // add that table to the metalist (LINKDB) if ( m_useLinkdb && !addTable224(&kt1)) return NULL; //if(add2&&!addTable128(&kt2,&kt1,RDB_LINKDB, false))return NULL; // sanity check if ( m_p - saved > needLinkdb ) { char *xx=NULL;*xx=0; } // all done kt1.reset(); //kt2.reset(); // sanity check verifyMetaList( m_metaList , m_p , forDelete ); // // . ADD ADDRESSES TO NAMEDB/PLACEDB // . key is basically a hash of the address (excluding place name // and street indicators) // setStatus ( "adding to placedb" ); // checkpoint saved = m_p; // add that table to the metalist if ( m_usePlacedb && ! addTable128 ( &pt1, RDB_PLACEDB,forDelete)) return NULL; //if(! addTable128 ( &pt2, &pt1, RDB_PLACEDB, true , true))return NULL; // sanity check if ( m_p - saved > needPlacedb ) { char *xx=NULL;*xx=0; } // free mem pt1.reset(); //pt2.reset(); // sanity check verifyMetaList( m_metaList , m_p , forDelete ); /* // // ADD REVDB RECORD // // // . add the metalist to itself // . this way, when we delete this doc from the index, we just // lookup the original metalist in revdb, set all the // delbits, and re-add that. this avoid having to ensure // parsing consistency, which is a royal pain in the ass // . now we also update getMetaList() to check revdb to get // the meta list if the doc is already indexed... // // define current meta list char *x = m_metaList; char *xend = m_p; // skip adding to revdb? if ( ! m_useRevdb ) xend = x; int32_t *dataSizePtr; char *savedp; // if nothing in current list do not add revdb rec bool hadStuff = ( x < xend); if ( hadStuff ) { // put in the rdbId if ( m_useSecondaryRdbs ) *m_p++ = RDB2_REVDB2; else *m_p++ = RDB_REVDB; // the key if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; } *(key_t *)m_p = g_revdb.makeKey ( m_docId , false ); m_p += sizeof(key_t); // data size dataSizePtr = (int32_t *)m_p; // skip for now m_p += 4; // save it savedp = m_p; } // scan the current metalist and add keys to the revdb record for ( ; x < xend ; ) { // breathe QUICKPOLL(m_niceness); // save this char byte = *x; // get rdbId char rdbId = byte & 0x7f; // // convert if adding to secondary rdbids!!!!!!!! 
// if ( m_useSecondaryRdbs ) { if ( rdbId == RDB2_POSDB2 ) rdbId = RDB_POSDB; else if ( rdbId == RDB2_DATEDB2 ) rdbId = RDB_DATEDB; else if ( rdbId == RDB2_SECTIONDB2 ) rdbId = RDB_SECTIONDB; else if ( rdbId == RDB2_PLACEDB2 ) rdbId = RDB_PLACEDB; else if ( rdbId == RDB2_TITLEDB2 ) rdbId = RDB_TITLEDB; else if ( rdbId == RDB2_LINKDB2 ) rdbId = RDB_LINKDB; else if ( rdbId == RDB2_CLUSTERDB2 ) rdbId = RDB_CLUSTERDB; else if ( rdbId == RDB2_SPIDERDB2 ) rdbId = RDB_SPIDERDB; else if ( rdbId == RDB2_TAGDB2 ) rdbId = RDB_TAGDB; // must be covered!! else { char *xx=NULL;*xx=0; } // rewrite byte now b/c we store it below byte = (byte & 0x80) | rdbId; } // skip that x++; // copy that over *m_p++ = byte; // sanity check -- no negative keys allowed in here if ( (x[0] & 0x01) == 0x00 ) { char *xx=NULL;*xx=0; } // get key size int32_t ks = getKeySizeFromRdbId(rdbId); // copy that over gbmemcpy ( m_p , x , ks ); // skip that m_p += ks; x += ks; // datasize? int32_t ds = getDataSizeFromRdbId(rdbId); if ( ds == -1 ) { ds = *(int32_t *)x; x += 4; } // skip data x += ds; } // record size of what we wrote if ( hadStuff ) *dataSizePtr = ( m_p - savedp ); // sanity check if ( m_p > m_pend || m_p < m_metaList ) { char *xx=NULL;*xx=0;} // sanity check verifyMetaList( m_metaList , m_p ); */ ////// // // add SPIDERREPLY BEFORE and SPIDERREQUEST!!! // // add spider reply first so we do not immediately respider // this same url if we were injecting it because no SpiderRequest // may have existed, and SpiderColl::addSpiderRequest() will // spawn a spider of this url again unless there is already a REPLY // in spiderdb!!! crazy... bool addReply = true; // Scraper.cpp uses this if ( m_sreqValid && m_sreq.m_isScraping ) addReply = false; // save it saved = m_p; // now add the new rescheduled time if ( addReply && m_useSpiderdb && ! forDelete ) { // note it setStatus ( "adding SpiderReply to spiderdb" ); // rdbid first *m_p = RDB_SPIDERDB; // use secondary? if ( m_useSecondaryRdbs ) *m_p = RDB2_SPIDERDB2; m_p++; // get this if ( ! m_srepValid ) { char *xx=NULL;*xx=0; } // store the spider rec int32_t newsrSize = newsr->getRecSize(); gbmemcpy ( m_p , newsr , newsrSize ); m_p += newsrSize; m_addedSpiderReplySize = newsrSize; m_addedSpiderReplySizeValid = true; // sanity check - must not be a request, this is a reply if ( g_spiderdb.isSpiderRequest( &newsr->m_key ) ) { char *xx=NULL;*xx=0; } // sanity check if ( m_p - saved != needSpiderdb1 ) { char *xx=NULL;*xx=0; } // sanity check verifyMetaList( m_metaList , m_p , forDelete ); } // if we are injecting we must add the spider request // we are injecting from so the url can be scheduled to be // spidered again if ( needSpiderdb3 ) { // note it setStatus("adding spider request"); // checkpoint saved = m_p; // store it here SpiderRequest revisedReq; // if doing a repair/rebuild of spiderdb... if ( m_useSecondaryRdbs ) getRebuiltSpiderRequest ( &revisedReq ); // this fills it in for doing injections if ( ! m_useSecondaryRdbs ) { getRevisedSpiderRequest ( &revisedReq ); // sanity log if ( ! m_firstIpValid ) { char *xx=NULL;*xx=0; } // sanity log if ( m_firstIp == 0 || m_firstIp == -1 ) { char *url = "unknown"; if ( m_sreqValid ) url = m_sreq.m_url; log("build: error3 getting real firstip of " "%"INT32" for %s. 
not adding new request.", (int32_t)m_firstIp,url); goto skipNewAdd2; } } // copy it if ( m_useSecondaryRdbs ) *m_p++ = RDB2_SPIDERDB2; else *m_p++ = RDB_SPIDERDB; // store it back gbmemcpy ( m_p , &revisedReq , revisedReq.getRecSize() ); // skip over it m_p += revisedReq.getRecSize(); // sanity check if ( m_p - saved > needSpiderdb3 ) { char *xx=NULL;*xx=0; } m_addedSpiderRequestSize = revisedReq.getRecSize(); m_addedSpiderRequestSizeValid = true; } skipNewAdd2: // // ADD SPIDERDB RECORDS of outlinks // // - do this AFTER computing revdb since we do not want spiderdb recs // to be in revdb. // setStatus ( "adding spiderdb keys" ); // sanity check. cannot spider until in sync if ( ! isClockInSync() ) { char *xx=NULL;*xx=0; } // checkpoint saved = m_p; // . should be fixed from Links::setRdbList // . we should contain the msge that msg16 uses! // . we were checking m_msg16.m_recycleContent, but i have not done // that in years!!! MDW // . we were also checking if the # of banned outlinks >= 2, then // we would not do this... // . should also add with a time of now plus 5 seconds to that if // we spider an outlink linkdb should be update with this doc // pointing to it so it can get link text then!! if ( spideringLinks && nl2 && ! m_doingConsistencyCheck && m_useSpiderdb && ! forDelete ){ // returns NULL and sets g_errno on error char *ret = addOutlinkSpiderRecsToMetaList (); // sanity check if ( ! ret && ! g_errno ) { char *xx=NULL;*xx=0; } // return NULL on error if ( ! ret ) return NULL; // this MUST not block down here, to avoid re-hashing above if ( ret == (void *)-1 ) { char *xx=NULL;*xx=0; } } // sanity check if ( m_p - saved > needSpiderdb2 ) { char *xx=NULL;*xx=0; } // sanity check verifyMetaList( m_metaList , m_p , forDelete ); // // ADD TAG RECORDS TO TAGDB // // checkpoint saved = m_p; // . only do this if NOT setting from a title rec // . it might add a bunch of forced spider recs to spiderdb // . store into tagdb even if indexCode is set! if ( ntb && m_useTagdb && ! forDelete ) { // ntb is a safebuf of Tags, which are already Rdb records // so just gbmemcpy them directly over char *src = ntb->getBufStart(); int32_t srcSize = ntb->length(); gbmemcpy ( m_p , src , srcSize ); m_p += srcSize; } // sanity check if ( m_p - saved > needTagdb ) { char *xx=NULL;*xx=0; } // sanity check verifyMetaList( m_metaList , m_p , forDelete ); // // ADD INDEXED SPIDER REPLY with different docid so we can // search index of spider replies! (NEW!) // // . index spider reply with separate docid so they are all searchable. // . see getSpiderStatusDocMetaList() function to see what we index // and the titlerec we create for it if ( spiderStatusDocMetaList ) { gbmemcpy ( m_p , spiderStatusDocMetaList->getBufStart() , spiderStatusDocMetaList->length() ); m_p += spiderStatusDocMetaList->length(); } /* // // ADD FORCED RESPIDER DOCID-BASED SPIDER RECS for Sections // // used by Sections.cpp to respider docs because we just identified an // article section and they need to be re-indexed to take advantage // of that // // checkpoint saved = m_p; // . only do this if NOT setting from a title rec // . it might add a bunch of forced spider recs to spiderdb if ( ! m_setFromTitleRec && nd ) { // && ! m_isInjecting ) { Sections *ss = &m_sections; m_p = ss->respiderLineWaiters ( m_p , m_pend ); if ( ! 
m_p ) return NULL; } // sanity check if ( m_p - saved > needLineWaiters ) { char *xx=NULL;*xx=0; } // sanity check verifyMetaList( m_metaList , m_p ); */ // // NOW UPDATE OURSELVES (OUR URL) IN SPIDERDB // // but not if injecting! //if ( ! m_sreqValid ) { // // set the list size, different from the alloc size // m_metaListSize = m_p - m_metaList; // // all done // return m_metaList; //} // note it //setStatus ( "deleting old spider rec key" ); // rdbid first // *p = RDB_SPIDERDB; // use secondary? //if ( m_useSecondaryRdbs ) *p = RDB2_SPIDERDB2; //p++; // must be legit //if ( ! m_sreqValid ) { char *xx=NULL;*xx=0; } // then the key // *(key_t *)p = m_sreq.m_key; // nukey, clear del bit to delete it // *p &= 0xfe; // skip key //p += sizeof(key_t); // int16_tcut saved = m_p; /* See comment under DOLEDB above! this approach is no longer used. // . remove from doledb if we had a valid key // . DO THIS BEFORE adding the SpiderReply since // Spider.cpp::addSpiderReply() will // decrement the count for firstIp in m_doleIpTable if ( (m_doledbKey.n0 || m_doledbKey.n1) && ! m_useSecondaryRdbs && // do not add if we are generating the meta list for incremental // indexing purposes from an old doc ! forDelete ) { // note it setStatus ( "removing key from doledb" ); // . now remove the original spider rec from "doledb" // . rdbid first *m_p = RDB_DOLEDB; m_p++; // then the key *(key_t *)m_p = m_doledbKey; // nukey, clear del bit to delete it *m_p = *m_p & 0xfe; // skip key m_p += sizeof(key_t); // datasize is 0 // *(int32_t *)m_p = 0; //m_p += 4; // sanity check if ( m_p - saved != needDoledb ) { char *xx=NULL;*xx=0; } // sanity check verifyMetaList( m_metaList , m_p , forDelete ); } */ // note it //setStatus ( "removing spider lock"); // . make a fake titledb key // . remove the spider lock (Msg12 in Spider.cpp) // . no need to do this if called from Repair.cpp // . the uh48 is zero, that means fake! // . i added "&& m_useSpiderdb" here because it was messing up // the cacheTermLists() function which ONLY wants posdb keys and // any other keys in the metalist messes it up. MDW 1/26/13 // . now SPider.cpp uses SpiderReply reception to remove lock // - mdw 9/28/13 //if ( ! m_useSecondaryRdbs && ! forDelete && m_useSpiderdb ) { // *m_p++ = RDB_FAKEDB; // ((key_t *)m_p)->n1 = 0; // ((key_t *)m_p)->n0 = m_docId; // //= g_titledb.makeKey ( m_docId , 0LL , true ); // m_p += sizeof(key_t); //} // MDW: new spider algo does not need this /* // save it saved = m_p; // re-add the same request since it was removed from Spider.cpp's // m_urlBuf and the associated orderTree,ipTree, etc. and now // since we are un-doling (undoling) it we need to re-add and this // is the easiest way. it really was never removed from spiderdb // but it will no longer be in the spider's cache since we delete // it from there when we add it to doledb. so this is just a quick // way of getting it back into the cache. // now, we add this first since now Rdb.cpp calls evaluateAllReqeusts() // AFTER the REPLY now if ( m_sreqValid && // page parser has an invalid firstIp which causes printMetaList() // to core when trying to print this out, so don't add it when // doing page parser ! m_sreq.m_isPageParser ) { // note it setStatus ( "adding SpiderRequest back to spiderdb" ); // rdbid first *m_p = RDB_SPIDERDB; // use secondary? 
if ( m_useSecondaryRdbs ) *m_p = RDB2_SPIDERDB2; m_p++; // store the spider rec int32_t size = m_sreq.getRecSize(); gbmemcpy ( m_p , &m_sreq , size ); // set this one bit SpiderRequest *rr = (SpiderRequest *)m_p; rr->m_readd = 1; // and hafta reset this junk otherwise it cores // (see Spider.h::SpiderRequest::reset()) rr->m_ufn = -1; rr->m_priority = -1; rr->m_doled = 0; // skip over the whole rec m_p += size; // sanity check - must not be a request, this is a reply if ( ! g_spiderdb.isSpiderRequest( &m_sreq.m_key ) ) { char *xx=NULL;*xx=0; } // sanity check if ( m_p - saved != needSpiderdb3 ) { char *xx=NULL;*xx=0; } // sanity check verifyMetaList( m_metaList , m_p ); } */ // sanity check if ( m_p > m_pend || m_p < m_metaList ) { char *xx=NULL;*xx=0;} int32_t now = getTimeGlobal(); ///////////////// // // INCREMENTAL INDEXING / INCREMENTAL UPDATING // // now prune/manicure the metalist to remove records that // were already added, and insert deletes for records that // changed since the last time. this is how we do deletes // now that we have revdb. this allows us to avoid // parsing inconsistency errors. // ///////////////// // disable for parsing consistency testing of already indexed docs //oldList = NULL; if ( oldList ) { // && oldList->m_listSize > 16 ) { // point to start of the old meta list, the first and only // record in the oldList char *om = oldList;// + 12 + 4; // the size int32_t osize = oldListSize;//*(int32_t *)(oldList + 12); // the end char *omend = om + osize; int32_t needx = 0; // init these. data is just the rdbid, a single byte. //HashTableX dt12; //HashTableX dt16; //char dbuf12[30000]; //char dbuf16[40000]; //dt12.set ( 12,1,2048,dbuf12,30000,false,m_niceness); //dt16.set ( 16,1,2048,dbuf16,40000,false,m_niceness); HashTableX dt8; char dbuf8[34900]; // value is the ptr to the rdbId/key in the oldList dt8.set ( 8,sizeof(char *),2048,dbuf8,34900, false,m_niceness,"dt8-tab"); // just for linkdb: //HashTableX dt9; //char dbuf9[30000]; //dt9.set ( 8,4,2048,dbuf9,30000,false,m_niceness,"dt9-tab"); // scan recs in that and hash them for ( char *p = om ; p < omend ; ) { // breathe QUICKPOLL(m_niceness); // save this char byte = *p; // save this char *rec = p; // get the rdbid for this rec char rdbId = byte & 0x7f; // skip that p++; // get the key size int32_t ks = getKeySizeFromRdbId ( rdbId ); // get that char *k = p; // unlike a real meta list, this meta list has // no data field, just rdbIds and keys only! because // we only use it for deleting, which only requires // a key and not the data p += ks; // tally this up in case we have to add the delete // version of this key back (add 1 for rdbId) needx += ks + 1; // always re-add titledb record! // if our current/new list is basically empty // except for a SpiderReply because it got deleted // from the index, we need to store the titledb key // in dt8 so we can add it as a negative! so i // don't really know what this was trying to fix // because it broke that! //if ( rdbId == RDB_TITLEDB ) continue; // for linkdb, sometimes we also add a "lost" link // key in addition to deleting the old key! see below if ( rdbId == RDB_LINKDB ) needx += ks + 1; // do not add it if datasize > 0 uint64_t hk; // do not include discovery or lost dates in the // linkdb key... 
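	// A condensed sketch of this incremental-update pass, assuming the
	// old metalist carries only rdbId+key pairs as noted above:
	//
	//   1. hash every old key into dt8                    (this loop)
	//   2. for each rec in the new metalist, if its key is already in
	//      dt8 (and is dataless) skip it and free the dt8 slot, since
	//      nothing changed                                (next loop)
	//   3. whatever is left in dt8 was lost since the last index pass,
	//      so re-emit it as a negative (delete) key       (final dt8 scan)
	//
	// Linkdb keys get special treatment: their discovery/lost dates can
	// legitimately differ between passes, so the next statement masks
	// them out of the dedup hash.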
if ( rdbId == RDB_LINKDB ) hk = hash64 (k+12,ks-12); else hk = hash64 (k,ks); // sanity check if ( rdbId == RDB_LINKDB && g_linkdb.getLinkerDocId_uk((key224_t *)k)!= m_docId ) { char *xx=NULL;*xx=0; } //if ( getDataSize(rdbId) != 0 ) continue; // hash this key //bool status; // sectiondb keys all have the same last few bits... // so this clogs up the hash table. // so mix up the key bits for hashing //uint64_t hk = hash64 ( k,ks); //if (ks == 12 ) status = dt12.addKey ( k, &byte); //else if (ks == 16 ) status = dt16.addKey ( k, &byte); //else { char *xx=NULL; *xx=0; } if ( ! dt8.addKey(&hk,&rec) ) return NULL; // return NULL with g_errno set on error //if ( ! status ) return NULL; } // also need all the new keys just to be sure, in case none // are already in the rdbs needx += (m_p - m_metaList); // now alloc for our new manicured metalist char *nm = (char *)mmalloc( needx, "newmeta" ); if ( ! nm ) return NULL; char *nptr = nm; char *nmax = nm + needx; // scan each rec in the current meta list, see if its in either // the dt12 or dt16 hash table, if it already is, then // do NOT add it to the new metalist, nm, because there is // no need to. char *p = m_metaList; char *pend = p + (m_p - m_metaList); for ( ; p < pend ; ) { // breathe QUICKPOLL(m_niceness); // save it with the flag char byte = *p; // get rdbId char rdbId = byte & 0x7f; // skip that p++; // key size int32_t ks = getKeySizeFromRdbId(rdbId); // get key char *key = p; // skip that p += ks; // get data size int32_t ds = getDataSizeFromRdbId(rdbId); // assume we do not store the datasize bool neg = false; // . if key is negative, no data is present // . the doledb key is negative for us here if ( (key[0] & 0x01) == 0x00 ) { neg = true; ds = 0; } // if datasize variable, read it in if ( ds == -1 ) { // get data size ds = *(int32_t *)p; // skip data size int32_t p += 4; } // point to data char *data = p; // skip data if not zero p += ds; // mix it up for hashtable speed uint64_t hk ;//= hash64 ( key,ks); // skip if for linkdb, we do that below if ( rdbId == RDB_LINKDB ) hk = hash64(key+12,ks-12); else hk = hash64(key,ks); // was this key already in the "old" list? int32_t slot = dt8.getSlot(&hk); // do we got a linkdb key that existed last time // we indexed this doc? if so, inherit its discovery // date. if ( slot >= 0 && rdbId == RDB_LINKDB ) { /* // get old key from last time char *oldk=*(char**)dt8.getValueFromSlot(slot); // skip rdbid oldk++; // sanity if(g_linkdb.getLinkerDocId_uk((key224_t *)oldk) !=m_docId){ char *xx=NULL;*xx=0; } // copy rdbid into new meta list *nptr++ = byte; // point to where key will be stored in new lst char *nk = nptr; // store the new key in the new meta list gbmemcpy ( nptr , key , ks ); // advance ptr nptr += ks; // get disocvery time of old key from last time int32_t dd = g_linkdb.getDiscoveryDate_uk(oldk); // sanity if ( dd < 0 ) { char *xx=NULL;*xx=0; } // but mod the new key's discovery time g_linkdb.setDiscoveryDate_uk ( nk, dd ); */ // . no need to deal with this any further // . yeah, because there could be dups! // so don't delete it just yet // . but make the data ptr NULL so we // know to disregard it below...??? dt8.removeSlot(slot); // all done for this key continue; } // see if already in an rdb, IFF dataless, otherwise // the keys might be the same but with different data! 
if ( slot >= 0 ) { // dt8.isInTable(&hk) ) { // remove from hashtable so we do not add it // as a delete key below // dt8.removeKey(&hk); dt8.removeSlot(slot); // but do add like a titledb rec that has the // same key, because its data is probably // different... // HACK: enable for now since we lost // the url:www.geico.com term somehow!!! // geico got deleted but not the title rec!! // MAKE SURE TITLEREC gets deleted then!!! if ( ds==0 && g_conf.m_doIncrementalUpdating ) continue; } // ok, it is not already in an rdb, so add it *nptr++ = byte; // store key gbmemcpy ( nptr, key , ks ); // skip over it nptr += ks; // store data size. BUT not if negative key! if ( getDataSizeFromRdbId(rdbId) == -1 && ! neg ) { *(int32_t *)nptr = ds; nptr += 4; } // store data if ( ds ) { gbmemcpy ( nptr , data , ds ); nptr += ds; } } // now scan dt8 and add their keys as del keys for ( int32_t i = 0 ; i < dt8.m_numSlots ; i++ ) { // breathe QUICKPOLL(m_niceness); // skip if empty if ( ! dt8.m_flags[i] ) continue; // store rdbid first char *rec = *(char **)dt8.getValueFromSlot(i); // get rdbId with hi bit possibly set char rdbId = rec[0] & 0x7f; // key size int32_t ks = getKeySizeFromRdbId(rdbId); // sanity test - no negative keys if ( (rec[1] & 0x01) == 0x00 ) { char *xx=NULL;*xx=0;} // copy the rdbId byte and key gbmemcpy ( nptr , rec , 1 + ks ); // skip over rdbid nptr++; // make it a negative key by clearing lsb *nptr = *nptr & 0xfe; // skip it nptr += ks; // if it is from linkdb, and unmet, then it is a // lost link, so set the lost date of it. we keep // these so we can graph lost links if ( rdbId == RDB_LINKDB ) { // the real linkdb rec is at rec+1 int32_t lost = g_linkdb.getLostDate_uk( rec+1 ); // how can it be non-zero? it should have // been freshly made from the old titlerec... if ( lost ) { char *xx=NULL;*xx=0; } // if zero, set it to now! //g_linkdb.setLostDate_uk(realRec,now); // copy the rdbId byte and key gbmemcpy ( nptr , rec , 1 + ks ); // set it in there now g_linkdb.setLostDate_uk(nptr+1,now); // carry it through on revdb, do not delete // it! we want a linkdb history for seomasters nptr += 1 + ks; // and go on to delete the old linkdb key that // did not have a lost date //continue; } } // sanity. check for metalist breach if ( nptr > nmax ) { char *xx=NULL;*xx=0; } // free the old meta list mfree ( m_metaList , m_metaListAllocSize , "fm" ); // now switch over to the new one m_metaList = nm; m_metaListAllocSize = needx; m_p = nptr; } // if we only removed it from index, set this flag if ( oldList && ! nd ) m_didDelete = true; // // repeat this logic special for linkdb since we keep lost links // and may update the discovery date or lost date in the keys // // 1. hash keys of old linkdb keys into dt9 here // 2. do not hash the discovery/lost dates when making key hash for dt9 // 3. scan keys in meta list and add directly into new meta list // if not in dt9 // 4. if in dt9 then add dt9 key instead // 5. remove dt9 keys as we add them // 6. then add remaining dt9 keys into meta list but with lost date // set to now UNLESS it's already set // // // validate us! // m_metaListValid = true; // set the list size, different from the alloc size m_metaListSize = m_p - m_metaList;//end - m_p; // sanity check verifyMetaList( m_metaList , m_metaList + m_metaListSize , forDelete ); // all done return m_metaList; } // . copy from old title rec to us to speed things up! // . returns NULL and set g_errno on error // . returns -1 if blocked // . returns 1 otherwise // . 
when the doc content is unchanged, just inherit crap from the old title
// rec so we can make the spider reply in getNewSpiderReply()
void XmlDoc::copyFromOldDoc ( XmlDoc *od ) {
	// skip if none
	if ( ! od ) return;
	// skip if already did it
	if ( m_copied1 ) return;
	// do not repeat
	m_copied1 = true;
	// set these
	m_percentChanged = 0;
	m_percentChangedValid = true;
	// copy over bit members
	m_contentHash32 = od->m_contentHash32;
	//m_tagHash32 = od->m_tagHash32;
	m_tagPairHash32 = od->m_tagPairHash32;
	//m_sitePop = od->m_sitePop;
	m_httpStatus = od->m_httpStatus;
	m_hasAddress = od->m_hasAddress;
	m_hasTOD = od->m_hasTOD;
	//m_hasSiteVenue = od->m_hasSiteVenue;
	m_isRSS = od->m_isRSS;
	m_isPermalink = od->m_isPermalink;
	m_hasContactInfo= od->m_hasContactInfo;
	m_hopCount = od->m_hopCount;
	m_crawlDelay = od->m_crawlDelay;
	// do not forget the shadow members of the bit members
	m_hasAddress2 = m_hasAddress;
	m_hasTOD2 = m_hasTOD;
	//m_hasSiteVenue2 = m_hasSiteVenue;
	m_isRSS2 = m_isRSS;
	m_isPermalink2 = m_isPermalink;
	// validate them
	m_contentHash32Valid = true;
	//m_tagHash32Valid = true;
	m_tagPairHash32Valid = true;
	//m_sitePopValid = true;
	m_httpStatusValid = true;
	m_hasAddressValid = true;
	m_hasTODValid = true;
	//m_hasSiteVenueValid = true;
	m_isRSSValid = true;
	m_isPermalinkValid = true;
	m_hasContactInfoValid= true;
	m_hopCountValid = true;
	m_crawlDelayValid = true;
	m_pubDate = od->m_pubDate;
	m_langId = od->m_langId;
	m_pubDateValid = true;
	m_langIdValid = true;
	// so get sitenuminlinks doesn't crash when called by getNewSpiderReply
	// because dns timed out. it timed out with EDNSTIMEDOUT before.
	// so overwrite it here...
	if ( m_ip == -1 || m_ip == 0 || ! m_ipValid ) {
		m_ip = od->m_ip;
		m_ipValid = true;
		m_siteNumInlinks = od->m_siteNumInlinks;
		m_siteNumInlinksUniqueIp = od->m_siteNumInlinksUniqueIp;
		m_siteNumInlinksUniqueCBlock= od->m_siteNumInlinksUniqueCBlock;
		m_siteNumInlinksTotal = od->m_siteNumInlinksTotal;
		m_siteNumInlinksValid = od->m_siteNumInlinksValid;
		m_siteNumInlinksUniqueIpValid =
			od->m_siteNumInlinksUniqueIpValid;
		m_siteNumInlinksUniqueCBlockValid =
			od->m_siteNumInlinksUniqueCBlockValid;
		m_siteNumInlinksTotalValid = od->m_siteNumInlinksTotalValid;
	}
	m_indexCode = 0;//od->m_indexCode;
	m_indexCodeValid = true;
	// we need the link info too!
	ptr_linkInfo1 = od->ptr_linkInfo1;
	size_linkInfo1 = od->size_linkInfo1;
	if ( ptr_linkInfo1 && size_linkInfo1 ) m_linkInfo1Valid = true;
	else m_linkInfo1Valid = false;
	// turn off for debug
	ptr_sectiondbData = NULL;
	size_sectiondbData = 0;
}

// for adding a quick reply for EFAKEIP and for diffbot query reindex requests
SpiderReply *XmlDoc::getFakeSpiderReply ( ) {
	if ( ! m_tagRecValid ) { m_tagRec.reset(); m_tagRecValid = true; }
	if ( ! m_siteHash32Valid ) { m_siteHash32 = 1; m_siteHash32Valid = true; }
	if ( ! m_downloadEndTimeValid ) {
		m_downloadEndTime = 0;
		m_downloadEndTimeValid = true;
	}
	if ( ! m_ipValid ) { m_ipValid = true; m_ip = atoip("1.2.3.4"); }
	if ( ! m_spideredTimeValid ) {
		m_spideredTimeValid = true;
		m_spideredTime = getTimeGlobal();//0; use now!
	}
	// don't let it get the diffbot reply either! it should be empty.
	if ( ! m_diffbotReplyValid ) { m_diffbotReplyValid = true; }
	// if doing diffbot query reindex
	// TODO: does this shard the request somewhere else???
	if ( ! m_firstIpValid ) {
		m_firstIp = m_ip;//atoip("1.2.3.4");
		m_firstIpValid = true;
	}
	//if ( ! m_sreqValid ) {
	//	m_sreqValid = true;
	//	m_sreq.m_parentDocId = 0LL;
	//	}
	// if error is EFAKEFIRSTIP, do not core
	//if ( !
m_isIndexedValid ) { // m_isIndexed = false; // m_isIndexedValid = true; //} // if this is EABANDONED or EHITCRAWLLIMIT or EHITPROCESSLIMIT // or ECORRUPTDATA (corrupt gzip reply) // then this should not block. we need a spiderReply to release the // url spider lock in SpiderLoop::m_lockTable. // if m_isChildDoc is true, like for diffbot url, this should be // a bogus one. SpiderReply *nsr = getNewSpiderReply (); if ( nsr == (void *)-1) { char *xx=NULL;*xx=0; } if ( ! nsr ) { log("doc: crap, could not even add spider reply " "to indicate internal error: %s",mstrerror(g_errno)); if ( ! g_errno ) g_errno = EBADENGINEER; //return true; return NULL; } return nsr; //if ( nsr->getRecSize() <= 1) { char *xx=NULL;*xx=0; } //CollectionRec *cr = getCollRec(); //if ( ! cr ) return true; } SpiderReply *XmlDoc::getNewSpiderReply ( ) { if ( m_srepValid ) return &m_srep; setStatus ( "getting spider reply" ); // diffbot guys, robots.txt, frames, sshould not be here if ( m_isChildDoc ) { char *xx=NULL;*xx=0; } // . get the mime first // . if we are setting XmlDoc from a titleRec, this causes // doConsistencyCheck() to block and core //HttpMime *mime = getMime(); //if ( ! mime || mime == (HttpMime *)-1 ) return (SpiderReply *)mime; // if we had a critical error, do not do this int32_t *indexCode = getIndexCode(); if (! indexCode || indexCode == (void *)-1) return (SpiderReply *)indexCode; // if it has been abandoned early, i.e. cut-off, then we should // add a "fake" spider reply to release the lock in // SpiderLoop::m_lockTable at least. see Spider.cpp's addSpiderReply() // to see what parts of this are relevant. /* if ( *indexCode == EABANDONED || // . any internal "error" needs to be here really // . was there an error unzipping the title rec? *indexCode == ECORRUPTDATA || *indexCode == EHITCRAWLLIMIT || *indexCode == EHITPROCESSLIMIT ) { // clear everything m_srep.reset(); // get from spider request, if there int32_t firstIp = 0; if ( m_sreqValid ) firstIp = m_sreq.m_firstIp; // otherwise, wtf? if ( ! firstIp ) log("build: no first ip to make fake spiderReply. " "injected?"); // we at least need this m_srep.m_firstIp = firstIp; Url *fu = getFirstUrl(); // this is the lock key int64_t uh48 = hash64b(fu->m_url) & 0x0000ffffffffffffLL; m_srep.setKey ( firstIp, 0 , uh48 , false ); // tell it we are fake and not to really add us to // spiderdb, but just to release the lock m_srep.m_errCode = *indexCode; m_srepValid = true; return &m_srep; } */ TagRec *gr = getTagRec(); if ( ! gr || gr == (TagRec *)-1 ) return (SpiderReply *)gr; // can't call getIsPermalink() here without entering a dependency loop //char *pp = getIsUrlPermalinkFormat(); //if ( !pp || pp == (char *)-1 ) return (SpiderReply *)pp; // the site hash int32_t *sh32 = getSiteHash32(); if ( ! sh32 || sh32 == (int32_t *)-1 ) return (SpiderReply *)sh32; int64_t *de = getDownloadEndTime(); if ( ! de || de == (void *)-1 ) return (SpiderReply *)de; // need to set m_sentToDiffbot!! SafeBuf *dbr = getDiffbotReply(); if ( ! dbr || dbr == (void *)-1 ) return (SpiderReply *)dbr; // was the doc index when we started trying to spider this url? //char *wasIndexed = getIsIndexed(); //if ( ! 
wasIndexed || wasIndexed == (void *)-1 ) // return (SpiderReply *)wasIndexed; //Tag *vt = m_oldTagRec.getTag("venueaddress"); //bool siteHasVenue = (bool)vt; // int16_tcut Url *fu = NULL; // watch out for titlerec lookup errors for docid based spider reqs if ( m_firstUrlValid ) fu = getFirstUrl(); // reset m_srep.reset(); int32_t firstIp = -1; // inherit firstIp Tag *tag = m_tagRec.getTag("firstip"); // tag must be there? if ( tag ) firstIp = atoip(tag->getTagData()); // this is usually the authority if ( m_firstIpValid ) firstIp = m_firstIp; // otherwise, inherit from oldsr to be safe // BUT NOT if it was a fakeip and we were injecting because // the SpiderRequest was manufactured and not actually taken // from spiderdb! see XmlDoc::injectDoc() because that is where // it came from!! if it has m_sreq.m_isAddUrl and // m_sreq.m_fakeFirstIp then we actually do add the reply with that // fake ip so that they will exist in the same shard. // BUT if it is docid pased from PageReindex.cpp (a query reindex) // we set the injection bit and the pagereindex bit, we should let // thise guys keep the firstip because the docid-based spider request // is in spiderdb. it needs to match up. if ( m_sreqValid && (!m_sreq.m_isInjecting||m_sreq.m_isPageReindex) ) firstIp = m_sreq.m_firstIp; // sanity if ( firstIp == 0 || firstIp == -1 ) { if ( m_firstUrlValid ) log("xmldoc: BAD FIRST IP for %s",m_firstUrl.getUrl()); else log("xmldoc: BAD FIRST IP for %"INT64"",m_docId); firstIp = 12345; //char *xx=NULL;*xx=0; } } // store it m_srep.m_firstIp = firstIp; // assume no error m_srep.m_errCount = 0; // otherwise, inherit from oldsr to be safe //if ( m_sreqValid ) // m_srep.m_firstIp = m_sreq.m_firstIp; // do not inherit this one, it MIGHT HAVE CHANGE! m_srep.m_siteHash32 = m_siteHash32; // need this for updating crawl delay table, m_cdTable in Spider.cpp if ( fu ) m_srep.m_domHash32 = getDomHash32(); else m_srep.m_domHash32 = 0; if ( ! m_tagRecValid ) { char *xx=NULL;*xx=0; } if ( ! m_ipValid ) { char *xx=NULL;*xx=0; } if ( ! m_siteHash32Valid ) { char *xx=NULL;*xx=0; } //if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; } // . set other fields besides key // . crap! if we are the "qatest123" collection then m_spideredTime // was read from disk usually and is way in the past! watch out!! m_srep.m_spideredTime = getSpideredTime();//m_spideredTime; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // crap, for the test coll this is often a very old time and it // causes the spider request to be repeatedly executed, so let's // fix that if ( ! strcmp(cr->m_coll,"qatest123") ) m_srep.m_spideredTime = getTimeGlobal(); // TODO: expire these when "ownershipchanged" tag is newer!! if ( gr->getTag ( "ingoogle" ) ) { m_srep.m_inGoogle = 1; m_srep.m_inGoogleValid = 1; } if ( gr->getTag ( "authorityinlink" ) ) m_srep.m_hasAuthorityInlink = 1; // automatically valid either way m_srep.m_hasAuthorityInlinkValid = 1; // but for this tag, it must exist even if it has no contact info //tag = gr->getTag ( "hascontactinfo" ); //if ( tag ) { int64_t uh48 = 0LL; // we might be a docid based spider request so fu could be invalid // if the titlerec lookup failed if ( fu ) uh48 = hash64b(fu->m_url) & 0x0000ffffffffffffLL; int64_t parentDocId = 0LL; if ( m_sreqValid ) parentDocId = m_sreq.getParentDocId(); //else { char *xx=NULL;*xx=0; } // for docid based urls from PageReindex.cpp we have to make // sure to set the urlhash48 correctly from that. 
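	// Roughly, the reply key assembled just below groups replies by
	// firstIp and identifies the url by its 48-bit hash (a sketch of the
	// pieces, all of which are computed in the surrounding code):
	//
	//   uh48        = hash64b ( fu->m_url ) & 0x0000ffffffffffffLL;  // when fu is available
	//   parentDocId = m_sreq.getParentDocId();                       // 0 if no request
	//   m_srep.setKey ( firstIp , parentDocId , uh48 , false );
	//
	// hence the override below for docid-based requests, so the reply
	// matches up with its SpiderRequest in spiderdb.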
if ( m_sreqValid ) uh48 = m_sreq.getUrlHash48(); // note it if ( g_conf.m_logDebugSpider ) log("xmldoc: uh48=%"UINT64" parentdocid=%"UINT64"",uh48,parentDocId); // set the key, m_srep.m_key m_srep.setKey ( firstIp, parentDocId , uh48 , false ); // . did we download a page? even if indexcode is set we might have // . if this is non-zero that means its valid if ( m_contentHash32Valid ) m_srep.m_contentHash32 = m_contentHash32; // injecting the content (url implied) if ( m_contentInjected ) // m_sreqValid && m_sreq.m_isInjecting ) m_srep.m_fromInjectionRequest = 1; // can be injecting a url too, content not necessarily implied if ( m_sreqValid && m_sreq.m_isInjecting ) m_srep.m_fromInjectionRequest = 1; if ( m_sentToDiffbot ) m_srep.m_sentToDiffbot = true; else m_srep.m_sentToDiffbot = false; if ( m_diffbotReplyError ) m_srep.m_hadDiffbotError = true; else m_srep.m_hadDiffbotError = false; // if we only had an error code in the diffbot reply, record that if ( ! m_indexCode && m_diffbotReplyError ) m_srep.m_errCode = m_diffbotReplyError; // sanity. if being called directly from indexDoc() because of // an error like out of memory, then we do not know if it is // indexed or not or was indexed... //if ( ! m_wasInIndexValid ) { char *xx=NULL;*xx=0; } //if ( ! m_isInIndexValid ) { char *xx=NULL;*xx=0; } // were we already in titledb before we started spidering? m_srep.m_wasIndexed = m_wasInIndex; // note whether m_wasIndexed is valid because if it isn't then // we shouldn't be counting this reply towards the page counts. // if we never made it this far i guess we should not forcibly call // getIsIndexed() at this point so our performance is fast in case // this is an EFAKEFIRSTIP error or something similar where we // basically just add this reply and we're done. // NOTE: this also pertains to SpiderReply::m_isIndexed. m_srep.m_wasIndexedValid = m_wasInIndexValid; // assume no change m_srep.m_isIndexed = m_isInIndex; // we need to know if the m_isIndexed bit is valid or not // because sometimes like if we are being called directly from // indexDoc() because of an error situation, we do not know! if ( m_isInIndexValid ) m_srep.m_isIndexedINValid = false; else m_srep.m_isIndexedINValid = true; // likewise, we need to know if we deleted it so we can decrement the // quota count for this subdomain/host in SpiderColl::m_quotaTable //if ( m_srep.m_wasIndexed ) m_srep.m_isIndexed = true; // treat error replies special i guess, since langId, etc. will be // invalid if ( m_indexCode ) { // validate m_srepValid = true; // set these items if valid already, but don't bother // trying to compute them, since we are not indexing. 
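/* . hedged summary of this error/early-exit branch: nothing new is
     computed here. only fields that already happen to be valid are
     copied into m_srep, the error count from the originating
     SpiderRequest is bumped and clamped to its one-byte field, and
     EDOCUNCHANGED is rewritten as a success so the url filters do
     not count it as a failure. roughly (names from the code below):

         int32_t newc = m_sreq.m_errCount + 1;   // one more failure
         if ( newc > 255 ) newc = 255;           // fits in 8 bits
         m_srep.m_errCount = newc;

         if ( m_indexCode == EDOCUNCHANGED ) {
                 m_srep.m_errCode    = 0;        // not really an error
                 m_srep.m_errCount   = 0;
                 m_srep.m_httpStatus = 200;
         }

   . this restates the branch that follows; it is not extra logic. */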
if ( m_siteNumInlinksValid ) { m_srep.m_siteNumInlinks = m_siteNumInlinks; m_srep.m_siteNumInlinksValid = true; } //if ( m_percentChangedValid ) // m_srep.m_percentChangedPerDay = m_percentChanged; if ( m_crawlDelayValid && m_crawlDelay >= 0 ) // we already multiply x1000 in isAllowed2() m_srep.m_crawlDelayMS = m_crawlDelay;// * 1000; else m_srep.m_crawlDelayMS = -1; if ( m_pubDateValid ) m_srep.m_pubDate = m_pubDate; if ( m_langIdValid ) m_srep.m_langId = m_langId; if ( m_isRSSValid ) m_srep.m_isRSS = m_isRSS; if ( m_isPermalinkValid ) m_srep.m_isPermalink =m_isPermalink; if ( m_httpStatusValid ) m_srep.m_httpStatus = m_httpStatus; // stuff that is automatically valid m_srep.m_isPingServer = 0; if ( fu ) m_srep.m_isPingServer = (bool)fu->isPingServer(); // this was replaced by m_contentHash32 //m_srep.m_newRequests = 0; m_srep.m_errCode = m_indexCode; if ( m_downloadEndTimeValid ) m_srep.m_downloadEndTime = m_downloadEndTime; else m_srep.m_downloadEndTime = 0; // is the original spider request valid? if ( m_sreqValid ) { // preserve the content hash in case m_indexCode is // EDOCUNCHANGED. so we can continue to get that // in the future. also, if we had the doc indexed, // just carry the contentHash32 forward for the other // errors like EDNSTIMEDOUT or whatever. m_srep.m_contentHash32 = m_sreq.m_contentHash32; // int16_tcuts SpiderReply *n = &m_srep; SpiderRequest *o = &m_sreq; // more stuff n->m_inGoogle = o->m_inGoogle; n->m_hasContactInfo = o->m_hasContactInfo; n->m_isContacty = o->m_isContacty; n->m_hasAuthorityInlink = o->m_hasAuthorityInlink; n->m_isPingServer = o->m_isPingServer; // the validator flags n->m_inGoogleValid = o->m_inGoogleValid; n->m_hasContactInfoValid = o->m_hasContactInfoValid; n->m_isContactyValid = o->m_isContactyValid; n->m_hasAuthorityInlinkValid = o->m_hasAuthorityInlinkValid; // get error count from original spider request int32_t newc = m_sreq.m_errCount; // inc for us, since we had an error newc++; // contain to one byte if ( newc > 255 ) newc = 255; // store in our spiderreply m_srep.m_errCount = newc; } // . and do not really consider this an error // . i don't want the url filters treating it as an error reply // . m_contentHash32 should have been carried forward from // the block of code right above if ( m_indexCode == EDOCUNCHANGED ) { // we should have had a spider request, because that's // where we got the m_contentHash32 we passed to // Msg13Request. if ( ! m_sreqValid ) { char *xx=NULL;*xx=0; } // make it a success m_srep.m_errCode = 0; // and no error count, it wasn't an error per se m_srep.m_errCount = 0; // call it 200 m_srep.m_httpStatus = 200; } // copy flags and data from old doc... if ( m_indexCode == EDOCUNCHANGED && m_oldDocValid && m_oldDoc ) { m_srep.m_pubDate = m_oldDoc->m_pubDate; m_srep.m_langId = m_oldDoc->m_langId; m_srep.m_isRSS = m_oldDoc->m_isRSS; m_srep.m_isPermalink = m_oldDoc->m_isPermalink; m_srep.m_hasAddress = m_oldDoc->m_hasAddress; m_srep.m_hasTOD = m_oldDoc->m_hasTOD; //m_srep.m_hasSiteVenue = m_oldDoc->m_hasSiteVenue; m_srep.m_siteNumInlinks = m_oldDoc->m_siteNumInlinks; // they're all valid m_srep.m_hasAddressValid = true; m_srep.m_hasTODValid = true; //m_srep.m_hasSiteVenueValid = true; m_srep.m_siteNumInlinksValid = true; } // do special things if return &m_srep; } // this will help us avoid hammering ips & respect same ip wait if ( ! m_downloadEndTimeValid ) { char *xx=NULL;*xx=0; } m_srep.m_downloadEndTime = m_downloadEndTime; // . if m_indexCode was 0, we are indexed then... // . 
this logic is now above //m_srep.m_isIndexed = 1; // get ptr to old doc/titlerec XmlDoc **pod = getOldXmlDoc ( ); if ( ! pod || pod == (XmlDoc **)-1 ) return (SpiderReply *)pod; // this is non-NULL if it existed XmlDoc *od = *pod; // status is -1 if not found int16_t *hs = getHttpStatus (); if ( ! hs || hs == (void *)-1 ) return (SpiderReply *)hs; int32_t *sni = getSiteNumInlinks(); if ( ! sni || sni == (int32_t *)-1 ) return (SpiderReply *)sni; float *pc = getPercentChanged(); if ( ! pc || pc == (void *)-1 ) return (SpiderReply *)pc; // these are "non-dup" addresses (nondup) bool *hasAddress = getHasAddress(); if ( ! hasAddress || hasAddress == (void *)-1 ) return (SpiderReply *)hasAddress; // does it have a tod (i.e. 6pm) in there somewhere? bool *hasTOD = getHasTOD(); if ( ! hasTOD || hasTOD == (void *)-1 ) return (SpiderReply *)hasTOD; // does it have a venue address? //bool *hasSiteVenue = getHasSiteVenue(); //if ( ! hasSiteVenue || hasSiteVenue == (void *)-1 ) // return (SpiderReply *)hasSiteVenue; // get the content type uint8_t *ct = getContentType(); if ( ! ct ) return NULL; char *isRoot = getIsSiteRoot(); if ( ! isRoot || isRoot == (char *)-1 ) return (SpiderReply *)isRoot; char *hci = getHasContactInfo(); if ( ! hci || hci == (char *)-1 ) return (SpiderReply *)hci; int32_t *pubDate = getPubDate(); if ( ! pubDate || pubDate == (int32_t *)-1 ) return (SpiderReply *)pubDate; uint8_t *langId = getLangId(); if ( ! langId || langId == (uint8_t *)-1 ) return (SpiderReply *)langId; char *isRSS = getIsRSS(); if ( ! isRSS || isRSS == (char *)-1 ) return (SpiderReply *)isRSS; char *pl = getIsPermalink(); if ( ! pl || pl == (char *)-1 ) return (SpiderReply *)pl; if ( ! m_hasContactInfoValid ) { char *xx=NULL;*xx=0; } if ( m_hasContactInfo ) { m_srep.m_hasContactInfo = 1; m_srep.m_hasContactInfoValid = 1; } // this is only know if we download the robots.tt... if ( od && m_recycleContent ) { m_crawlDelay = od->m_crawlDelay; m_crawlDelayValid = true; } // sanity checks //if(! m_sreqValid ) { char *xx=NULL;*xx=0; } if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; } if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; } if ( ! m_pubDateValid ) { char *xx=NULL;*xx=0; } if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; } if ( ! m_isRSSValid ) { char *xx=NULL;*xx=0; } if ( ! m_isPermalinkValid ) { char *xx=NULL;*xx=0; } //if ( ! m_pageNumInlinksValid ) { char *xx=NULL;*xx=0; } if ( ! m_percentChangedValid ) { char *xx=NULL;*xx=0; } //if ( ! m_isSpamValid ) { char *xx=NULL;*xx=0; } if ( ! m_crawlDelayValid ) { char *xx=NULL;*xx=0; } // httpStatus is -1 if not found (like for empty http replies) m_srep.m_httpStatus = *hs; // zero if none //m_srep.m_percentChangedPerDay = 0; // . only if had old one // . we use this in url filters to set the respider wait time usually if ( od ) { int32_t spideredTime = getSpideredTime(); int32_t oldSpideredTime = od->getSpideredTime(); float numDays = spideredTime - oldSpideredTime; m_srep.m_percentChangedPerDay = (m_percentChanged+.5)/numDays; } // . update crawl delay, but we must store now as milliseconds // because Spider.cpp like it better that way // . -1 implies crawl delay unknown or not found if ( m_crawlDelay >= 0 ) // we already multiply x1000 in isAllowed2() m_srep.m_crawlDelayMS = m_crawlDelay;// * 1000; else // -1 means invalid/unknown m_srep.m_crawlDelayMS = -1; if ( ! m_hasAddressValid ) { char *xx=NULL;*xx=0; } if ( ! m_hasTODValid ) { char *xx=NULL;*xx=0; } //if ( ! m_hasSiteVenueValid ) { char *xx=NULL;*xx=0; } if ( ! 
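/* . hedged note on the getter calls used throughout this function:
     they follow the usual XmlDoc convention of returning NULL with
     g_errno set on error, a (T *)-1 sentinel if the call blocked and
     will re-enter via callback, or a valid pointer on success. the
     caller forwards both sentinels unchanged, which is why nearly
     every getter is wrapped in the same two-line check, e.g.

         uint8_t *langId = getLangId();
         if ( ! langId || langId == (uint8_t *)-1 )
                 return (SpiderReply *)langId;  // propagate error/-1

   . illustrative restatement of the checks above, not new logic. */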
m_hasContactInfoValid) { char *xx=NULL;*xx=0; } // . we use this to store "bad" spider recs to keep from respidering // a "bad" url over and over again // . it is up to the url filters whether they want to retry this // again or not! // . TODO: how to represent "ETCPTIMEDOUT"???? // . EUDPTIMEDOUT, EDNSTIMEDOUT, ETCPTIMEDOUT, EDNSDEAD, EBADIP, // ENETUNREACH,EBADMIME,ECONNREFUED,ECHOSTUNREACH m_srep.m_siteNumInlinks = m_siteNumInlinks; m_srep.m_pubDate = *pubDate; // this was replaced by m_contentHash32 //m_srep.m_newRequests = 0; m_srep.m_langId = *langId; m_srep.m_isRSS = (bool)*isRSS; m_srep.m_isPermalink = (bool)*pl; m_srep.m_isPingServer = (bool)fu->isPingServer(); //m_srep.m_isSpam = m_isSpam; m_srep.m_siteNumInlinksValid = true; // . ignore address in dup sections (nondup/non-dup addresses only) // . this way if the place always has their address in the header or // footer of every web page we will ignore it m_srep.m_hasAddress = *hasAddress; m_srep.m_isContacty = *hci;//getIsContacty(fu, // info1, // m_hopCount , // *ct , // contentType // *isRoot , // m_niceness ); m_srep.m_hasTOD = *hasTOD; //m_srep.m_hasSiteVenue = *hasSiteVenue; // validate all m_srep.m_inGoogleValid = 1; m_srep.m_hasContactInfoValid = 1; m_srep.m_hasAuthorityInlinkValid = 1; m_srep.m_isContactyValid = 1; m_srep.m_hasAddressValid = 1; m_srep.m_hasTODValid = 1; //m_srep.m_hasSiteVenueValid = 1; // a quick validation. reply must unlock the url from the lock table. // so the locks must be equal. if ( m_sreqValid && // we create a new spiderrequest if injecting with a fake firstip // so it will fail this test... ! m_sreq.m_isInjecting ) { int64_t lock1 = makeLockTableKey(&m_sreq); int64_t lock2 = makeLockTableKey(&m_srep); if ( lock1 != lock2 ) { log("build: lock1 != lock2 lock mismatch for %s", m_firstUrl.m_url); char *xx=NULL;*xx=0; } } // validate m_srepValid = true; return &m_srep; } // . so Msg20 can see if we are banned now or not... // . we must skip certain rules in getUrlFilterNum() when doing to for Msg20 // because things like "parentIsRSS" can be both true or false since a url // can have multiple spider recs associated with it! void XmlDoc::setSpiderReqForMsg20 ( SpiderRequest *sreq , SpiderReply *srep ) { // sanity checks if ( ! m_ipValid ) { char *xx=NULL;*xx=0; } //if ( ! m_domHash32Valid ) { char *xx=NULL;*xx=0; } //if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; } if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; } if ( ! m_pubDateValid ) { char *xx=NULL;*xx=0; } if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; } if ( ! m_isRSSValid ) { char *xx=NULL;*xx=0; } if ( ! m_isPermalinkValid ) { char *xx=NULL;*xx=0; } //if ( ! m_isUrlPermalinkFormatValid ) { char *xx=NULL;*xx=0; } //if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; } //if ( ! m_pageNumInlinksValid ) { char *xx=NULL;*xx=0; } //if ( ! m_percentChangedValid ) { char *xx=NULL;*xx=0; } Url *fu = getFirstUrl(); // get this //TagRec *gr = (TagRec *)ptr_tagRecData; //Tag *tag = NULL; //if ( gr ) tag = gr->getTag("sitenuminlinks"); // reset sreq->reset(); // assume not valid sreq->m_siteNumInlinks = -1; if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; } // how many site inlinks? 
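/* . hedged summary: setSpiderReqForMsg20() fabricates a request and
     reply pair from this already-indexed document so Msg20 can run
     the url back through the url filters (e.g. to see if it is
     banned now). parent-derived fields (m_parentHostHash32,
     m_sameDom, m_parentIsRSS, ...) are zeroed below because one url
     can have many spider recs with different parents, so those
     answers are not well defined here. a caller would use it roughly
     like this ("xd" is just a hypothetical XmlDoc pointer):

         SpiderRequest sreq;
         SpiderReply   srep;
         xd->setSpiderReqForMsg20 ( &sreq , &srep );
         // sreq/srep are then handed to getUrlFilterNum()

   . the zeroing and the validator flags are set in the code below. */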
sreq->m_siteNumInlinks = m_siteNumInlinks; sreq->m_siteNumInlinksValid = true; // set other fields besides key sreq->m_firstIp = m_ip; sreq->m_hostHash32 = m_hostHash32a; //sreq->m_domHash32 = m_domHash32; //sreq->m_siteNumInlinks = m_siteNumInlinks; //sreq->m_pageNumInlinks = m_pageNumInlinks; sreq->m_hopCount = m_hopCount; sreq->m_parentHostHash32 = 0;//m_sreq.m_parentHostHash32; sreq->m_parentDomHash32 = 0;//m_sreq.m_parentDomHash32; sreq->m_parentSiteHash32 = 0;//m_sreq.m_parentSiteHash32; sreq->m_parentFirstIp = 0;//m_sreq.m_parentFirstIp; sreq->m_isNewOutlink = 0; sreq->m_isAddUrl = 0;//m_isAddUrl; sreq->m_isPingServer = fu->isPingServer(); //sreq->m_isUrlPermalinkFormat = m_isUrlPermalinkFormat; // transcribe from old spider rec, stuff should be the same sreq->m_addedTime = m_firstIndexedDate; sreq->m_sameDom = 0;//m_sreq.m_sameDom; sreq->m_sameHost = 0;//m_sreq.m_sameHost; sreq->m_sameSite = 0;//m_sreq.m_sameSite; sreq->m_wasParentIndexed = 0;//m_sreq.m_parentWasIndexed; sreq->m_parentIsRSS = 0;//m_sreq.m_parentIsRSS; sreq->m_parentIsPermalink = 0;//m_sreq.m_parentIsPermalink; sreq->m_parentIsPingServer = 0;//m_sreq.m_parentIsPingServer; // validate the stuff so getUrlFilterNum() acks it sreq->m_hopCountValid = 1; srep->m_spideredTime = getSpideredTime();//m_spideredTime; //srep->m_isSpam = isSpam; // real-time update this!!! srep->m_isRSS = m_isRSS; srep->m_isPermalink = m_isPermalink; srep->m_httpStatus = 200; //srep->m_retryNum = 0; srep->m_langId = m_langId; srep->m_percentChangedPerDay = 0;//m_percentChanged; // we need this now for ucp ucr upp upr new url filters that do // substring matching on the url if ( m_firstUrlValid ) strcpy(sreq->m_url,m_firstUrl.m_url); } // . add the spiderdb recs to the meta list // . used by XmlDoc::setMetaList() // . returns NULL and sets g_errno on error // . otherwise returns the "new p" // . if Scraper.cpp or PageAddUrl.cpp and Msg7.cpp should all use the XmlDoc // class even if just adding links. they should make a fake html page and // "inject" it, with only m_useSpiderdb set to true... char *XmlDoc::addOutlinkSpiderRecsToMetaList ( ) { if ( m_doingConsistencyCheck ) { char *xx=NULL;*xx=0; } // do not do this if recycling content // UNLESS REBUILDING... if ( m_recycleContent && ! m_useSecondaryRdbs ) return (char *)0x01; // for now skip in repair tool if ( m_useSecondaryRdbs && ! g_conf.m_rebuildAddOutlinks ) return (char *)0x01; Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (char *)xml; Links *links = getLinks(); if ( ! links || links == (Links *)-1 ) return (char *)links; char *spiderLinks = getSpiderLinks(); if ( ! spiderLinks || spiderLinks == (char *)-1 ) return (char *)spiderLinks; TagRec ***grv = getOutlinkTagRecVector(); if ( ! grv || grv == (void *)-1 ) return (char *)grv; //char **iiv = getOutlinkIsIndexedVector(); //if ( ! iiv || iiv == (void *)-1 ) return (char *)iiv; int32_t **ipv = getOutlinkFirstIpVector(); if ( ! ipv || ipv == (void *)-1 ) return (char *)ipv; //int8_t *hcv = getOutlinkHopCountVector(); //if ( ! hcv || hcv == (void *)-1 ) return (char *)hcv; char *ipi = getIsIndexed(); // is the parent indexed? if ( ! ipi || ipi == (char *)-1 ) return (char *)ipi; Addresses *aa = getAddresses (); if ( ! aa || aa == (Addresses *)-1 ) return (char *)aa; // sanity check if ( ! m_hasContactInfoValid ) { char *xx=NULL;*xx=0; } // . ignore address in dup sections // . 
this way if the place always has their address in the header or // footer of every web page we will ignore it (SEC_DUP section flag) bool parentHasAddress = (bool)(aa->getNumNonDupAddresses()>0); // need this int32_t parentDomHash32 = getDomHash32(); if ( parentDomHash32 != m_domHash32 ) { char *xx=NULL;*xx=0; } char *isRoot = getIsSiteRoot(); if ( ! isRoot || isRoot == (char *)-1 ) return (char *)isRoot; int32_t *psni = getSiteNumInlinks(); if ( ! psni || psni == (int32_t *)-1 ) return (char *)psni; int32_t *pfip = getFirstIp(); if ( ! pfip || pfip == (void *)-1 ) return (char *)pfip; int64_t *d = getDocId(); if ( ! d || d == (int64_t *)-1 ) return (char *)d; Url *fu = getFirstUrl(); if ( ! fu || fu == (void *)-1 ) return (char *)fu; Url *cu = getCurrentUrl(); if ( ! cu || cu == (void *)-1 ) return (char *)cu; uint8_t *langId = getLangId(); if ( ! langId || langId == (uint8_t *)-1 ) return (char *)langId; // validate this to prevent core for simplified redirect links int32_t hostHash32a = getHostHash32a(); // so linkSites[i] is site for link #i in Links.cpp class int32_t *linkSiteHashes = getLinkSiteHashes ( ); if ( ! linkSiteHashes || linkSiteHashes == (void *)-1 ) return (char *)linkSiteHashes; XmlDoc *nd = this; // set "od". will be NULL if no old xml doc, i.e. no old title rec //XmlDoc **pod = getOldXmlDoc ( ); //if ( ! pod || pod == (void *)-1 ) return (char *)pod; //XmlDoc *od = *pod; // if this page is hacked, then do not spider external outlinks //char *comp = getIsCompromised(); //if ( ! comp || comp == (char *)-1 ) return (char *)comp; //if ( *comp ) // onlyInternal = true; bool isParentRSS = false; bool parentIsPermalink = false; // PageAddUrl.cpp does not supply a valid new doc, so this is NULL if ( nd ) { isParentRSS = *nd->getIsRSS() ; parentIsPermalink = *nd->getIsPermalink(); } int32_t n = links->m_numLinks; // return early if nothing to do. do not return NULL though cuz we // do not have g_errno set! if ( n <= 0 ) return (char *)0x01; // sanity checks if ( ! m_ipValid ) { char *xx=NULL;*xx=0; } if ( ! m_domHash32Valid ) { char *xx=NULL;*xx=0; } if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; } if ( ! m_hostHash32aValid ) { char *xx=NULL;*xx=0; } if ( ! m_siteHash32Valid ) { char *xx=NULL;*xx=0; } if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; } //if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; } // . pre-allocate a buffer to hold the spider recs // . taken from SpiderRequest::store() int32_t size = 0; for ( int32_t i = 0 ; i < n ; i++ ) size += SpiderRequest::getNeededSize ( links->getLinkLen(i) ); // append spider recs to this list ptr char *p = m_p; // hash table to avoid dups HashTableX ht; char buf2[8192]; if ( ! ht.set ( 4,0,1000,buf2 , 8192,false,m_niceness,"linkdedup" ) ) return NULL; // count how many we add int32_t numAdded = 0; int32_t numAddedFromSameDomain = 0; int32_t linksBanned = 0; int32_t linksFiltered = 0; bool isParentPingServer = false; if ( fu && fu->isPingServer() ) isParentPingServer = true; if ( cu && cu->isPingServer() ) isParentPingServer = true; // int16_tcut bool isScraping = (m_sreqValid && m_sreq.m_isScraping); bool useTestSpiderDir = (m_sreqValid && m_sreq.m_useTestSpiderDir); CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // do not do this if not test collection for now bool isTestColl = (! 
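/* . hedged note on the "linkdedup" table set up just above and used
     inside the outlink loop below: it is a HashTableX keyed on a
     32-bit hash of the outlink url with no data payload (4,0),
     seeded with a stack buffer so typical pages never touch the
     allocator. sketch of the pattern, using only calls that appear
     in this function:

         char buf2[8192];
         HashTableX ht;
         ht.set ( 4 , 0 , 1000 , buf2 , 8192 , false , m_niceness ,
                  "linkdedup" );
         int32_t uh = hash32 ( s , slen );
         if ( uh == 0 ) uh = 1;                 // 0 means empty slot
         if ( ht.isInTable ( &uh ) ) continue;  // already queued
         if ( ! ht.addKey   ( &uh ) ) return NULL;

   . this is why duplicate hrefs on the same page yield only one
     SpiderRequest. */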
strcmp(cr->m_coll,"qatest123") ); // turn off for now isTestColl = false; //char **wptrs = m_words.getWords(); //int32_t *wlens = m_words.getWordLens(); // need this for setting SpiderRequest::m_spiderTime //int32_t nowGlobal = getTimeGlobal(); // for setting LF_CONTACTY bit on the outlinks char disbuf[1000]; HashTableX disqualify; disqualify.set(4,0,32,disbuf,1000,false,m_niceness,"disqual"); int32_t consec = 0; int32_t linkTypes[2000]; int32_t lastType = 0; // if the file we are indexing now has // "<meta name=spiderlinkslinks value=0>" then that means to // add the links to spiderdb, but do not spider their links! // dmozparse uses this to make a file called gbdmoz.urs.txt.0 // that is just filled with urls that are in dmoz. and we want // to index just those urls. // // now just make dmozparse output urls as <a href=> tags. // char mbuf[16]; mbuf[0] = '\0'; char *tag = "spiderlinkslinks"; int32_t tlen = gbstrlen(tag); xml->getMetaContent ( mbuf, 16 , tag , tlen ); bool avoid = false; if ( mbuf[0] == '0' ) avoid = true; // if this is a simplified redir and we should not be spidering // links then turn it off as well! because we now add simplified // redirects back into spiderdb using this function. if ( m_spiderLinksValid && ! m_spiderLinks ) avoid = true; // it also has this meta tag now too mbuf[0] = '\0'; tag = "ignorelinksexternalerrors"; tlen = gbstrlen(tag); xml->getMetaContent ( mbuf, 16 , tag , tlen ); bool ignore = false; if ( mbuf[0] == '1' ) ignore = true; //SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull ( m_collnum ); // // serialize each link into the metalist now // for ( int32_t i = 0 ; i < n ; i++ ) { // breathe QUICKPOLL ( m_niceness ); // grab our info TagRec *gr = (*grv)[i]; int32_t firstIp = (*ipv)[i]; //char isIndexed = (*iiv)[i]; //int32_t hc = hcv[i]; // ip lookup failed? do not add to spiderdb then if ( firstIp == 0 || firstIp == -1 ) continue; // sanity check //if ( firstIp == 0x03 ) {char *xx=NULL;*xx=0; } // get flags linkflags_t flags = links->m_linkFlags[i]; // . skip if we are rss page and this link is an <a href> link // . we only harvest <link> urls from rss feeds, not href links // . or in the case of feedburner, those orig tags if ( isParentRSS && (flags & LF_AHREFTAG) ) continue; // if we have a <feedburner:origLink> tag, then ignore <link> // tags and only get the links from the original links if ( links->m_isFeedBurner && !(flags & LF_FBTAG) ) continue; // do not add self links, pointless if ( flags & LF_SELFLINK ) continue; // do not add if no follow if ( flags & LF_NOFOLLOW ) continue; // point to url char *s = links->getLink (i); int32_t slen = links->getLinkLen(i); // breathe QUICKPOLL(m_niceness); // get hash int32_t uh = hash32 ( s , slen ); // it does not like keys of 0, that means empty slot if ( uh == 0 ) uh = 1; // skip if dup if ( ht.isInTable ( &uh ) ) continue; // add it, returns false and sets g_errno on error if ( ! ht.addKey ( &uh ) ) return NULL; // we now supports HTTPS if ( strncmp(s,"http://",7) && strncmp(s,"https://",8) ) continue; // . do not add if "old" // . Links::set() calls flagOldOutlinks() // . that just means we probably added it the last time // we spidered this page // . no cuz we might have a different siteNumInlinks now // and maybe this next hop count is now allowed where as // before it was not! //if ( flags & LF_OLDLINK ) continue; // set it. addWWW = true! no.. 
make it false because of issues // like tmblr.co/ZHw5yo1E5TAaW injection where // www.tmblr.co has no IP Url url; url.set ( s , slen , false ); // true ); // if hostname length is <= 2 then SILENTLY reject it if ( url.getHostLen() <= 2 ) continue; // are we a new outlink from a ? i.e. a "hot link"? assume so bool newOutlink = true; // if no old links, can not be a new outlink then if ( flags & LF_OLDLINK ) newOutlink = false; // . do not consider outlinks of new pages to be newOutlinks. // that is somewhat redundant. // . you can use "parentisnew" to do what you want in the url // filters table //if ( ! isIndexed ) newOutlink = false; // get # of inlinks to this site... if recorded... int32_t ksni = -1; Tag *st = NULL; if ( gr ) st = gr->getTag ("sitenuminlinks"); if ( st ) ksni = atol(st->getTagData()); //if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; } //int32_t ksni = m_siteNumInlinks; // . get possible pub date from url (.../2008/09/23/page.htm) // . this returns 0 if none found //int32_t urlPubDate = parseDateFromUrl(s); // use zero for the timestamp so SiteGetter does not recompute // any tags in the tagRec thereby blocking! //SiteGetter sg; //sg.getSite ( s , gr , 0, m_coll, m_niceness,false,NULL,NULL); // get this bool issiteroot = isSiteRootFunc3 ( s , linkSiteHashes[i] ); //int32_t siteHash32 = hash32n ( linkSite ); // get it quick bool ispingserver = url.isPingServer(); int32_t hostHash32 = url.getHostHash32(); int32_t domHash32 = url.getDomainHash32(); // is link rss? bool isrss = false; if (slen>6 && !strncasecmp(s+slen-4,".rss",4)) isrss = true; // make the spider request rec for it SpiderRequest ksr; // to defaults (zero out) ksr.reset(); // set other fields besides key ksr.m_firstIp = firstIp; ksr.m_hostHash32 = hostHash32; ksr.m_domHash32 = domHash32; ksr.m_siteHash32 = linkSiteHashes[i];//siteHash32; ksr.m_siteNumInlinks = ksni; ksr.m_siteNumInlinksValid = true; // continue using "test-spider" subdir to cache web pages // if our parent was using that ksr.m_useTestSpiderDir = useTestSpiderDir; // now we need this so we can share Msg12 spider locks with // query reindex docid-based spider requests. that way // we do not spider the same document at the same time. ksr.m_probDocId = g_titledb.getProbableDocId(&url); //ksr.m_pageNumInlinks = 0; // hop count is now 16 bits so do not wrap that around int32_t hc = m_hopCount + 1; if ( hc > 65535 ) hc = 65535; ksr.m_hopCount = hc; // keep hopcount the same for redirs if ( m_indexCodeValid && ( m_indexCode == EDOCSIMPLIFIEDREDIR || m_indexCode == EDOCNONCANONICAL ) ) ksr.m_hopCount = m_hopCount; // for diffbot custom crawls we keep the computed hopcount if ( ! cr->m_isCustomCrawl ) { if ( issiteroot ) ksr.m_hopCount = 0; if ( ispingserver ) ksr.m_hopCount = 0; //if ( isrss ) ksr.m_hopCount = 0; } // validate it ksr.m_hopCountValid = true; ksr.m_addedTime = getSpideredTime();//m_spideredTime; //ksr.m_lastAttempt = 0; //ksr.m_urlPubDate = urlPubDate; //ksr.m_errCode = 0; ksr.m_parentHostHash32 = hostHash32a; ksr.m_parentDomHash32 = m_domHash32; ksr.m_parentSiteHash32 = m_siteHash32; ksr.m_parentFirstIp = *pfip;//m_ip; ksr.m_parentHasAddress = parentHasAddress; // get this bool isupf = ::isPermalink(NULL,&url,CT_HTML,NULL,isrss); // set some bit flags. 
the rest are 0 since we call reset() if ( newOutlink ) ksr.m_isNewOutlink = 1; if ( isupf ) ksr.m_isUrlPermalinkFormat = 1; //if ( isIndexed ) ksr.m_isIndexed = 1; if ( ispingserver ) ksr.m_isPingServer = 1; // is it like www.xxx.com/* (does not include www.xxx.yyy.com) // includes xxx.com/* however ksr.m_isWWWSubdomain = url.isSimpleSubdomain(); // get link text we use for this outlink /* char tbuf[200]; int32_t tlen = links->getLinkText2 ( i , tbuf , 200 , NULL , NULL , NULL , m_niceness ); */ // the updated isContacty algo to fix www.apha.org which // has a ton of apha.org/about/* links int32_t t = getIsContacty ( &url, NULL , ksr.m_hopCount , 0 , // content type (ksr.m_hopCount==0), m_niceness ); // if same type as last one we might disqualify if 3 in a row if ( t && t == lastType ) consec++; else consec = 0; // disqualify this pattern as a contacty link if is abused if ( consec >= 3 ) if ( ! disqualify.addKey(&t) ) return NULL; // remember. use numAdded as the index for this since we do // not add all the outlinks to this list. if ( numAdded < 2000 ) linkTypes[numAdded] = t; // set this lastType = t; // validate ksr.m_isContactyValid = 1; // if parent is a root of a popular site, then it is considered // an authority linker. (see updateTagdb() function above) if ( *isRoot && *psni >= 500 ) ksr.m_hasAuthorityInlink = 1; // this is in request now as well as reply //Tag *tag; // hascontactinfo tag can have a value of 0 or 1 //tag = gr->getTag("hascontactinfo"); //if ( tag ) { if ( ! m_hasContactInfoValid ) { char *xx=NULL;*xx=0; } if ( m_hasContactInfo ) { ksr.m_hasContactInfo = 1; ksr.m_hasContactInfoValid = true; } // if we just set the contact info, use us, more recent if ( linkSiteHashes[i]==m_siteHash32 && m_hasContactInfoValid){ ksr.m_hasContactInfo = m_hasContactInfo; ksr.m_hasContactInfoValid = true; } if ( gr->getTag("ingoogle" ) ) { ksr.m_inGoogle = 1; ksr.m_inGoogleValid = true; } // the mere existence of these tags is good if ( gr->getTag("authorityinlink"))ksr.m_hasAuthorityInlink =1; ksr.m_hasAuthorityInlinkValid = true; // set parent based info if ( domHash32 == m_domHash32 ) ksr.m_sameDom = 1; if ( hostHash32 == m_hostHash32a ) ksr.m_sameHost = 1; if ( linkSiteHashes[i]==m_siteHash32 ) ksr.m_sameSite = 1; if ( *ipi ) ksr.m_wasParentIndexed = 1; if ( isParentRSS ) ksr.m_parentIsRSS = 1; if ( parentIsPermalink ) ksr.m_parentIsPermalink = 1; if ( isParentPingServer ) ksr.m_parentIsPingServer= 1; // this is used for building dmoz. we just want to index // the urls in dmoz, not their outlinks. if ( avoid ) ksr.m_avoidSpiderLinks = 1; // this is used for building dmoz. we need to index this // url even in the case of ETCPTIMEDOUT, etc. if ( ignore ) ksr.m_ignoreExternalErrors = 1; // . if this is the 2nd+ time we were spidered and this outlink // wasn't there last time, then set this! // . if this is the first time spidering this doc then set it // to zero so that m_minPubDate is set to -1 when the outlink // defined by "ksr" is spidered. if ( m_oldDocValid && m_oldDoc ) { int32_t oldSpideredTime = m_oldDoc->getSpideredTime(); ksr.m_parentPrevSpiderTime = oldSpideredTime; } else ksr.m_parentPrevSpiderTime = 0; // // . inherit manual add bit if redirecting to simplified url // . 
so we always spider seed url even if prohibited by // the regex, and even if it simplified redirects // if ( m_indexCodeValid && ( m_indexCode == EDOCSIMPLIFIEDREDIR || m_indexCode == EDOCNONCANONICAL ) && m_sreqValid ) { if ( m_sreq.m_isInjecting ) ksr.m_isInjecting = 1; if ( m_sreq.m_isAddUrl ) ksr.m_isAddUrl = 1; } // it is useful to know the primary langid of the parent // when prioritizing links for spidering in the case of // focussing the search engine on a particular set of langs ksr.m_parentLangId = *langId; // don't forget this one! //ksr.m_spiderTime = nowGlobal; // . is it "spam"? XmlDoc.cpp::isSpam() // . we need to make that root quality into site root quality! // . let's put spam detection logic into url filters //if ( isSpam ( s,gr,m_spideredTime,true ) ) // // set the bit flag // ksr.m_isSpam = 1; // copy the url into SpiderRequest::m_url buffer strcpy(ksr.m_url,s); // this must be valid if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; } // set the key, ksr.m_key. isDel = false ksr.setKey ( firstIp, *d , false ); // if we've recently added this url to spiderdb in Spider.cpp, skip it //if ( sc && sc->isInDupCache ( &ksr , false ) ) // continue; // . technically speaking we do not have any reply so we // should not be calling this! cuz we don't have all the info // . see if banned or filtered, etc. // . at least try to call it. getUrlFilterNum() should // break out and return -1 if it encounters a filter rule // that it does not have enough info to answer. // so if your first X filters all map to a "FILTERED" // priority and this url matches one of them we can // confidently toss this guy out. // . show this for debugging! // int32_t ufn = ::getUrlFilterNum ( &ksr , NULL, m_spideredTime , // false, m_niceness, cr, // false,//true , // outlink? // NULL ); // quotatable // logf(LOG_DEBUG,"build: ufn=%"INT32" for %s", // ufn,ksr.m_url); // bad? //if ( ufn < 0 ) { // log("build: link %s had bad url filter." // , ksr.m_url ); // g_errno = EBADENGINEER; // return NULL; //} //int32_t priority = -1; //if ( ufn >= 0 ) // priority = cr->m_spiderPriorities[ufn]; // debug if ( g_conf.m_logDebugUrlAttempts || isScraping ) { // print the tag rec out into sb2 SafeBuf sb2; if ( gr ) gr->printToBuf ( &sb2 ); // get it //SafeBuf sb1; char *action = "add"; if ( isScraping ) action = "scrape"; logf(LOG_DEBUG, "spider: attempting to %s link. " "%s " "tags=%s " "onpage=%s" , action , ksr.m_url, //sb1.getBufStart(), sb2.getBufStart(), m_firstUrl.m_url); } // do not add if bad priority, SPIDER_PRIORITY_FILTERED, ... // . mdw: oct 24, 2013. now i add so the urls show up in // the pagecrawlbot.cpp spiderdb dump, so you can examine // exactly why a url was crawled or not. plus if you change // your mind about banning/filtering then it'd be nice to // have these urls readily available. //if ( priority == SPIDER_PRIORITY_FILTERED ) { // linksFiltered++; continue; } //if ( priority == SPIDER_PRIORITY_BANNED ) { // linksBanned++; continue; } // serialize into the buffer int32_t need = ksr.getRecSize(); // is that what we thought it would be? 
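/* . hedged sketch of the store done just below: each accepted
     outlink is appended to the metalist as a one-byte rdbId
     (RDB_SPIDERDB, or RDB2_SPIDERDB2 when writing the secondary
     rdbs during a rebuild) followed by the serialized SpiderRequest,
     whose size varies because the url is stored inline:

         int32_t need = ksr.getRecSize();   // key + fields + url
         *p++ = RDB_SPIDERDB;               // which rdb this rec is for
         gbmemcpy ( p , &ksr , need );      // the request itself
         p += need;

   . space was estimated earlier from SpiderRequest::getNeededSize()
     per link, so the m_pend overflow sanity check below is not
     expected to fire. this mirrors the code below, it is not extra
     logic. */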
//int32_t thought = links->m_linkLens[i] + 1 + hsize; // sanity check //if ( need + 12 + 4 > thought ) { char *xx=NULL;*xx=0; } // sanity check if ( p + 1 + need > m_pend ) { char *xx=NULL;*xx=0; } // store the rdbId if ( m_useSecondaryRdbs ) *p++ = RDB2_SPIDERDB2; else *p++ = RDB_SPIDERDB; // print it for debug if ( isTestColl ) { SafeBuf tmp; ksr.print(&tmp); log("spider: attempting to add outlink " "%s",tmp.getBufStart()); } // store the spider rec gbmemcpy ( p , &ksr , need ); // skip it p += need; // count it numAdded++; // check domain if ( domHash32 == m_domHash32 ) numAddedFromSameDomain++; } // // scan through requests and set m_isContacty // char *s = m_p; int32_t k = 0; for ( ; s < p ; k++ ) { // advance over rdbid s++; // breathe QUICKPOLL(m_niceness); // cast SpiderRequest *ksr = (SpiderRequest *)s; // set size size = ksr->getRecSize(); // advance over that s += size; // stop if breach if ( k >= 2000 ) break; // must be isContacty if ( ! linkTypes[k] ) continue; // and not disqualified if ( disqualify.isInTable(&linkTypes[k] )) continue; // ok, we are good to go ksr->m_isContacty = 1; } // . this is just how many urls we tried to index // . move into Spider::addSpiderRequest() //cr->m_localCrawlInfo.m_urlsHarvested += numAdded; //cr->m_globalCrawlInfo.m_urlsHarvested += numAdded; //cr->m_needsSave = true; // save it m_numOutlinksAdded = numAdded; m_numOutlinksAddedValid = true; m_numOutlinksAddedFromSameDomain = numAddedFromSameDomain; m_numOutlinksFiltered = linksFiltered; m_numOutlinksBanned = linksBanned; // update end of list once we have successfully added all spider recs m_p = p; // return current ptr return m_p ; } /* // add keys/recs from the table into the metalist bool XmlDoc::addTable96 ( HashTableX *tt1 , int32_t date1 , bool nosplit ) { // sanity check if ( tt1->m_numSlots ) { if ( tt1->m_ks != sizeof(key96_t) ) {char *xx=NULL;*xx=0;} if ( tt1->m_ds != 4 ) {char *xx=NULL;*xx=0;} } // docid is handy int64_t d = *getDocId(); uint8_t f = 0; if ( nosplit ) f = 0x80; // use secondary rdbs if repairing //bool useRdb2 = ( g_repair.isRepairActive() && // ! g_repair.m_fullRebuild && // ! g_repair.m_removeBadPages ); char rdbId1 = RDB_INDEXDB; char rdbId2 = RDB_DATEDB; if ( m_useSecondaryRdbs ) { // useRdb2 ) { rdbId1 = RDB2_INDEXDB2; rdbId2 = RDB2_DATEDB2; } // store terms from "tt1" table for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) { // breathe QUICKPOLL(m_niceness); // skip if empty if ( tt1->m_flags[i] == 0 ) continue; // get its key int64_t *termId1 = (int64_t *)tt1->getKey ( i ); // get the score uint8_t score1 = score32to8( tt1->getScoreFromSlot(i) ); // sanity check if ( score1 <= 0 ) { char *xx=NULL;*xx=0; } // store rdbid *m_p++ = (rdbId1 | f); // store it. not a del key. *(key_t *)m_p=g_indexdb.makeKey(*termId1,score1,d,false); // skip it m_p += sizeof(key_t); // add to datedb? if ( date1 == -1 ) continue; // yes *m_p++ = (rdbId2 | f); // store it. not a del key. 
*(key128_t *)m_p= g_datedb.makeKey(*termId1,date1,score1,d,false); // advance over that m_p += sizeof(key128_t); } return true; } */ bool XmlDoc::addTable128 ( HashTableX *tt1 , // T <key128_t,char> *tt1 uint8_t rdbId , bool forDelete ) { // sanity check if ( rdbId == 0 ) { char *xx=NULL;*xx=0; } bool useRdb2 = m_useSecondaryRdbs;//g_repair.isRepairActive(); //if ( g_repair.m_fullRebuild ) useRdb2 = false; //if ( g_repair.m_removeBadPages ) useRdb2 = false; // store this rdbId into the list char useRdbId = rdbId; //if ( useRdb2 && rdbId == RDB_CLUSTERDB ) useRdbId = RDB2_CLUSTERDB2; if ( useRdb2 && rdbId == RDB_LINKDB ) useRdbId = RDB2_LINKDB2; if ( useRdb2 && rdbId == RDB_DATEDB ) useRdbId = RDB2_DATEDB2; if ( useRdb2 && rdbId == RDB_PLACEDB ) useRdbId = RDB2_PLACEDB2; if ( useRdb2 && rdbId == RDB_SECTIONDB ) useRdbId = RDB2_SECTIONDB2; // sanity checks if ( tt1->m_ks != 16 ) { char *xx=NULL;*xx=0; } if ( rdbId == RDB_PLACEDB ) { if ( tt1->m_ds != 512 ) { char *xx=NULL;*xx=0; } } else if ( rdbId == RDB_SECTIONDB ) { int32_t svs = sizeof(SectionVote); if ( tt1->m_ds != svs ) { char *xx=NULL;*xx=0; } } else { if ( tt1->m_ds != 0 ) { char *xx=NULL;*xx=0; } } int32_t count = 0; // store terms from "tt1" table for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) { // skip if empty if ( tt1->m_flags[i] == 0 ) continue; // breathe QUICKPOLL(m_niceness); // get its key key128_t *k = (key128_t *)tt1->getKey ( i ); // no key is allowed to have the del bit clear at this point // because we reserve that for making negative keys! if ( ! ( k->n0 & 0x0000000000000001LL ) ){char*xx=NULL;*xx=0;} // store rdbid *m_p++ = useRdbId; // (useRdbId | f); // store it // *(key128_t *)m_p = *k; does this work? gbmemcpy ( m_p , k , sizeof(key128_t) ); // all keys must be positive at this point if ( ! ( m_p[0] & 0x01 ) ) { char *xx=NULL;*xx=0; } // or if getting for incremental indexing and this is // from the "oldList" //if ( forDelete ) *m_p = *m_p & 0xfe; // skip key m_p += sizeof(key128_t); // count it count++; // do not add the data if deleting if ( forDelete ) continue; // skip if not sectiondb or placedb if ( rdbId != RDB_SECTIONDB && rdbId != RDB_PLACEDB ) continue; // ok test it out (MDW) //logf(LOG_DEBUG,"doc: UNDO ME!!!!!!!!"); // this below //if ( count > 1 ) continue; // get the data value char *val = (char *)tt1->getValue ( k ); // get the size of the data to store. assume Sectiondb vote. int32_t ds = sizeof(SectionVote); // placedb is special even. include the \0 terminator if ( rdbId == RDB_PLACEDB ) { // "ds" is how many bytes we store as data ds = gbstrlen(val)+1; // store dataSize first *(int32_t *)m_p = ds; // skip it m_p += 4; } // store possible accompanying date of the rdb record gbmemcpy (m_p,val, ds ); // skip it m_p += ds; } //if(rdbId==RDB_LINKDB ) log("doc: added %"INT32" linkdb keys" ,count); //if(rdbId==RDB_SECTIONDB ) log("doc: added %"INT32" sectiondb keys",count); return true; } int32_t XmlDoc::getSiteRank ( ) { if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; } return ::getSiteRank ( m_siteNumInlinks ); } // . add keys/recs from the table into the metalist // . 
we store the keys into "m_p" unless "buf" is given bool XmlDoc::addTable144 ( HashTableX *tt1 , int64_t docId , SafeBuf *buf ) { // sanity check if ( tt1->m_numSlots ) { if ( tt1->m_ks != sizeof(key144_t) ) {char *xx=NULL;*xx=0;} if ( tt1->m_ds != 4 ) {char *xx=NULL;*xx=0;} } // assume we are storing into m_p char *p = m_p; // reserve space if we had a safebuf and point into it if there if ( buf ) { int32_t slotSize = (sizeof(key144_t)+2+sizeof(key128_t)); int32_t need = tt1->getNumSlotsUsed() * slotSize; if ( ! buf->reserve ( need ) ) return false; // get cursor into buf, NOT START of buf p = buf->getBufStart(); } int32_t siteRank = getSiteRank (); if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; } char rdbId = RDB_POSDB; if ( m_useSecondaryRdbs ) rdbId = RDB2_POSDB2; // store terms from "tt1" table for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) { // breathe QUICKPOLL(m_niceness); // skip if empty if ( tt1->m_flags[i] == 0 ) continue; // get its key char *kp = (char *)tt1->getKey ( i ); // store rdbid *p++ = rdbId; // (rdbId | f); // store it as is gbmemcpy ( p , kp , sizeof(key144_t) ); // sanity check //int64_t final = hash64n("products.offerprice",0); //int64_t prefix = hash64n("gbsortby",0); //int64_t h64 = hash64 ( final , prefix); //h64 &= TERMID_MASK; //if ( g_posdb.getTermId(kp) == h64 ) { // log("hey: docid=%"INT64" float=%f",m_docId, // g_posdb.getFloat(kp) ); //} /* // get the score int32_t score = tt1->getScoreFromSlot ( i ) ; // set the M-bits to the score. used to accumulate link texts // that are the same so pages like google.com do not have // the word 'google' like 1 million times. this should reduce // our "score" logarithmacly into the 7-bits or whatever. // // NO! now we just always increment the distance cursor // m_dist so there will never be a collision of any posdb // key we add... so we think if ( score ) { int32_t newScore = score; if ( score >= 65 ) newScore = 65 +(score/100); //if ( score >= 65+3200) newScore = 65 +(score/100); if ( newScore > MAXMULTIPLIER ) newScore = MAXMULTIPLIER; g_posdb.setMultiplierBits(m_p,(unsigned char)newScore); } */ // this was zero when we added these keys to zero, so fix it g_posdb.setDocIdBits ( p , docId ); // if this is a numeric field we do not want to set // the siterank or langid bits because it will mess up // sorting by the float which is basically in the position // of the word position bits. if ( g_posdb.isAlignmentBitClear ( p ) ) { // make sure it is set again. it was just cleared // to indicate that this key contains a float // like a price or something, and we should not // set siterank or langid so that its termlist // remains sorted just by that float g_posdb.setAlignmentBit ( p , 1 ); } // otherwise, set the siterank and langid else { // this too g_posdb.setSiteRankBits ( p , siteRank ); // set language here too g_posdb.setLangIdBits ( p , m_langId ); } // advance over it p += sizeof(key144_t); } // all done if ( ! 
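/* . hedged recap of how addTable144() finishes each 18-byte posdb
     key before it goes into the metalist: the docid (zero at hash
     time) is patched in, then either the siterank/langid bits are
     set, or, when the alignment bit was cleared to mark a float key
     (gbsortby prices and the like), the key is left untouched so its
     termlist stays sorted by that float. sketch using the calls in
     the loop above:

         g_posdb.setDocIdBits ( p , docId );
         if ( g_posdb.isAlignmentBitClear ( p ) ) {
                 g_posdb.setAlignmentBit ( p , 1 );     // float key
         }
         else {
                 g_posdb.setSiteRankBits ( p , siteRank );
                 g_posdb.setLangIdBits   ( p , m_langId );
         }

   . this mirrors the loop body above; it is not additional logic. */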
buf ) { m_p = p; return true; } // update safebuf otherwise char *start = buf->getBufStart(); // fix SafeBuf::m_length buf->setLength ( p - start ); // sanity if ( buf->length() > buf->getCapacity() ) { char *xx=NULL;*xx=0; } return true; } // add keys/recs from the table into the metalist bool XmlDoc::addTable224 ( HashTableX *tt1 ) { // sanity check if ( tt1->m_numSlots ) { if ( tt1->m_ks != sizeof(key224_t) ) {char *xx=NULL;*xx=0;} if ( tt1->m_ds != 0 ) {char *xx=NULL;*xx=0;} } char rdbId = RDB_LINKDB; if ( m_useSecondaryRdbs ) rdbId = RDB2_LINKDB2; // store terms from "tt1" table for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) { // breathe QUICKPOLL(m_niceness); // skip if empty if ( tt1->m_flags[i] == 0 ) continue; // get its key char *kp = (char *)tt1->getKey ( i ); // store rdbid *m_p++ = rdbId; // (rdbId | f); // store it as is gbmemcpy ( m_p , kp , sizeof(key224_t) ); // advance over it m_p += sizeof(key224_t); } return true; } /* // . add table into our metalist pointed to by m_p // . k.n1 = date (see hashWords() below) // . k.n0 = termId (see hashWords() below) // . and the value is the score, 32-bits bool XmlDoc::addTableDate ( HashTableX *tt1 , // T <key128_t,char> *tt1 uint64_t docId , uint8_t rdbId , bool nosplit ) { if ( tt1->m_numSlotsUsed == 0 ) return true; uint8_t f = 0; if ( nosplit ) f = 0x80; // sanity check if ( rdbId == 0 ) { char *xx=NULL;*xx=0; } // sanity checks if ( nosplit ) { if ( rdbId == RDB_LINKDB ) { char *xx=NULL;*xx=0; } } bool useRdb2 = m_useSecondaryRdbs;//g_repair.isRepairActive(); //if ( g_repair.m_fullRebuild ) useRdb2 = false; //if ( g_repair.m_removeBadPages ) useRdb2 = false; //if ( useRdb2 && rdbId == RDB_CLUSTERDB ) rdbId = RDB2_CLUSTERDB2; if ( useRdb2 && rdbId == RDB_LINKDB ) rdbId = RDB2_LINKDB2; if ( useRdb2 && rdbId == RDB_DATEDB ) rdbId = RDB2_DATEDB2; // sanity checks if ( tt1->m_ks != 12 ) { char *xx=NULL;*xx=0; } if ( tt1->m_ds != 4 ) { char *xx=NULL;*xx=0; } // store terms from "tt1" table for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) { // skip if empty if ( tt1->m_flags[i] == 0 ) continue; // breathe QUICKPOLL(m_niceness); // get its key key96_t *k = (key96_t *)tt1->getKey ( i ); // get its value uint32_t v = *(uint32_t *)tt1->getValueFromSlot ( i ); // convert to 8 bits v = score32to8 ( v ); // . make the meta list key for datedb // . a datedb key (see Datedb.h) key128_t mk = g_datedb.makeKey ( k->n0 , // termId k->n1 , // date v , // score (8 bits) docId , false );// del key? // store rdbid with optional "nosplit" flag *m_p++ = (rdbId | f); // store it. it is a del key. *(key128_t *)m_p = mk; // skip it m_p += sizeof(key128_t); } return true; } */ /* // add keys/recs from the table into the metalist bool XmlDoc::addTable96 ( HashTableX *tt1 , HashTableX *tt2 , int32_t date1 , int32_t date2 , bool del , bool nosplit ) { // sanity check if ( tt1->m_numSlots ) { if ( tt1->m_ks != sizeof(key96_t) ) {char *xx=NULL;*xx=0;} if ( tt1->m_ds != 4 ) {char *xx=NULL;*xx=0;} } if ( tt2->m_numSlots ) { if ( tt2->m_ks != sizeof(key96_t) ) {char *xx=NULL;*xx=0;} if ( tt2->m_ds != 4 ) {char *xx=NULL;*xx=0;} } // docid is handy int64_t d = *getDocId(); uint8_t f = 0; if ( nosplit ) f = 0x80; // use secondary rdbs if repairing //bool useRdb2 = ( g_repair.isRepairActive() && // ! g_repair.m_fullRebuild && // ! 
g_repair.m_removeBadPages ); char rdbId1 = RDB_INDEXDB; char rdbId2 = RDB_DATEDB; if ( m_useSecondaryRdbs ) { // useRdb2 ) { rdbId1 = RDB2_INDEXDB2; rdbId2 = RDB2_DATEDB2; } // store terms from "tt1" table for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) { // skip if empty if ( tt1->m_flags[i] == 0 ) continue; // breathe QUICKPOLL(m_niceness); // get its key int64_t *termId1 = (int64_t *)tt1->getKey ( i ); // get the score uint8_t score1 = score32to8( tt1->getScoreFromSlot(i) ); // sanity check if ( score1 <= 0 ) { char *xx=NULL;*xx=0; } // see if in "tt2" int32_t slot = tt2->getSlot ( termId1 ); // assume 0 uint8_t score2 = 0; // look it up in the positive key table if ( slot >= 0 ) { score2 = score32to8 ( tt2->getScoreFromSlot(slot) ); // sanity check if ( score2 <= 0 ) { char *xx=NULL;*xx=0; } } // we annihilate! if ( score1 != score2 ) { // store rdbid *m_p++ = (rdbId1 | f); // store it. it is a del key. *(key_t *)m_p=g_indexdb.makeKey(*termId1,score1,d,del); // skip it m_p += sizeof(key_t); } // add to datedb? if ( date1 == -1 ) continue; // same dates too? if ( date1 == date2 && score1 == score2 ) continue; // yes *m_p++ = (rdbId2 | f); // store it. it is a del key. *(key128_t *)m_p=g_datedb.makeKey(*termId1,date1,score1,d,del); // advance over that m_p += sizeof(key128_t); } return true; } // . add table into our metalist pointed to by m_p // . k.n1 = date (see hashWords() below) // . k.n0 = termId (see hashWords() below) // . and the value is the score, 32-bits bool XmlDoc::addTableDate ( HashTableX *tt1 , // T <key128_t,char> *tt1 HashTableX *tt2 , // <key128_t,char> *tt2 uint64_t docId , uint8_t rdbId , bool del , bool nosplit ) { uint8_t f = 0; if ( nosplit ) f = 0x80; // sanity check if ( rdbId == 0 ) { char *xx=NULL;*xx=0; } // sanity checks if ( nosplit ) { if ( rdbId == RDB_LINKDB ) { char *xx=NULL;*xx=0; } } bool useRdb2 = m_useSecondaryRdbs;//g_repair.isRepairActive(); //if ( g_repair.m_fullRebuild ) useRdb2 = false; //if ( g_repair.m_removeBadPages ) useRdb2 = false; if ( useRdb2 && rdbId == RDB_CLUSTERDB ) rdbId = RDB2_CLUSTERDB2; if ( useRdb2 && rdbId == RDB_LINKDB ) rdbId = RDB2_LINKDB2; if ( useRdb2 && rdbId == RDB_DATEDB ) rdbId = RDB2_DATEDB2; // sanity checks if ( tt1->m_ks != 12 ) { char *xx=NULL;*xx=0; } if ( tt2->m_ks != 12 ) { char *xx=NULL;*xx=0; } if ( tt1->m_ds != 4 ) { char *xx=NULL;*xx=0; } if ( tt2->m_ds != 4 ) { char *xx=NULL;*xx=0; } // store terms from "tt1" table for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) { // skip if empty if ( tt1->m_flags[i] == 0 ) continue; // breathe QUICKPOLL(m_niceness); // get its key key96_t *k = (key96_t *)tt1->getKey ( i ); // get its value uint32_t v = *(uint32_t *)tt1->getValueFromSlot ( i ); // convert to 8 bits v = score32to8 ( v ); // see if in "tt2" int32_t slot = tt2->getSlot ( k ); // get value if there if ( slot >= 0 ) { // get it uint32_t val =*(uint32_t *)tt2->getValueFromSlot(slot); // convert to 8 bits val = score32to8 ( val ); // compare, if same, skip it! if ( val == v ) continue; } // . make the meta list key for datedb // . a datedb key (see Datedb.h) key128_t mk = g_datedb.makeKey ( k->n0 , // termId k->n1 , // date v , // score (8 bits) docId , del );// del key? // store rdbid with optional "nosplit" flag *m_p++ = (rdbId | f); // store it. it is a del key. 
*(key128_t *)m_p = mk; // skip it m_p += sizeof(key128_t); } return true; } bool XmlDoc::addTable128 ( HashTableX *tt1 , // T <key128_t,char> *tt1 HashTableX *tt2 , // <key128_t,char> *tt2 uint8_t rdbId , bool del , bool nosplit ) { uint8_t f = 0; if ( nosplit ) f = 0x80; // sanity check if ( rdbId == 0 ) { char *xx=NULL;*xx=0; } // sanity checks if ( nosplit ) { if ( rdbId == RDB_LINKDB ) { char *xx=NULL;*xx=0; } if ( rdbId == RDB_DATEDB ) { char *xx=NULL;*xx=0; } } bool useRdb2 = m_useSecondaryRdbs;//g_repair.isRepairActive(); //if ( g_repair.m_fullRebuild ) useRdb2 = false; //if ( g_repair.m_removeBadPages ) useRdb2 = false; if ( useRdb2 && rdbId == RDB_CLUSTERDB ) rdbId = RDB2_CLUSTERDB2; if ( useRdb2 && rdbId == RDB_LINKDB ) rdbId = RDB2_LINKDB2; if ( useRdb2 && rdbId == RDB_DATEDB ) rdbId = RDB2_DATEDB2; // sanity checks if ( tt1->m_ks != 16 ) { char *xx=NULL;*xx=0; } if ( tt2->m_ks != 16 ) { char *xx=NULL;*xx=0; } if ( rdbId == RDB_PLACEDB ) { if ( tt1->m_ds != 512 ) { char *xx=NULL;*xx=0; } if ( tt2->m_ds != 512 ) { char *xx=NULL;*xx=0; } } else if ( rdbId == RDB_SECTIONDB ) { int32_t svs = sizeof(SectionVote); if ( tt1->m_ds != svs ) { char *xx=NULL;*xx=0; } if ( tt2->m_ds != svs ) { char *xx=NULL;*xx=0; } } else { if ( tt1->m_ds != 0 ) { char *xx=NULL;*xx=0; } if ( tt2->m_ds != 0 ) { char *xx=NULL;*xx=0; } } int32_t count = 0; // store terms from "tt1" table for ( int32_t i = 0 ; i < tt1->m_numSlots ; i++ ) { // skip if empty if ( tt1->m_flags[i] == 0 ) continue; // breathe QUICKPOLL(m_niceness); // get its key key128_t *k = (key128_t *)tt1->getKey ( i ); // no key is allowed to have the del bit clear at this point // because we reserve that for making negative keys! if ( ! ( k->n0 & 0x0000000000000001LL ) ){char*xx=NULL;*xx=0;} // see if in "tt2" int32_t slot = tt2->getSlot ( k ); // . skip if already indexed // . do not do incremental indexing for sectiondb/placedb since // it may have the same key but different data!!!!!!! if ( slot >= 0 && rdbId != RDB_SECTIONDB && rdbId != RDB_PLACEDB ) continue; // store rdbid with optional "nosplit" flag *m_p++ = (rdbId | f); // store it // *(key128_t *)m_p = *k; does this work? gbmemcpy ( m_p , k , sizeof(key128_t) ); // all keys must be positive at this point if ( ! ( m_p[0] & 0x01 ) ) { char *xx=NULL;*xx=0; } // clear the del bit if we are an unmatched key and "del" // is true. we need to be a negative key now if ( del ) m_p[0] = m_p[0] & 0xfe; // skip key m_p += sizeof(key128_t); // count it count++; // skip if not sectiondb or placedb if ( rdbId != RDB_SECTIONDB && rdbId != RDB_PLACEDB ) continue; // ok test it out (MDW) //logf(LOG_DEBUG,"doc: UNDO ME!!!!!!!!"); // this below //if ( count > 1 ) continue; // if we were a negative key, do not add a value, even for // sectiondb if ( del ) continue; // get the data value char *val = (char *)tt1->getValue ( k ); // get the size of the data to store. assume Sectiondb vote. int32_t ds = sizeof(SectionVote); // placedb is special even. include the \0 terminator if ( rdbId == RDB_PLACEDB ) { // "ds" is how many bytes we store as data ds = gbstrlen(val)+1; // store dataSize first *(int32_t *)m_p = ds; // skip it m_p += 4; } // store possible accompanying date of the rdb record gbmemcpy (m_p,val, ds ); // skip it m_p += ds; } //if(rdbId==RDB_LINKDB ) log("doc: added %"INT32" linkdb keys" ,count); //if(rdbId==RDB_SECTIONDB ) log("doc: added %"INT32" sectiondb keys",count); return true; } */ // // . hash terms that are sharded by TERMID not DOCID!! // // . 
returns false and sets g_errno on error // . these terms are stored in indexdb/datedb, but all terms with the same // termId reside in one and only one group. whereas normally the records // are split based on docid and every group gets 1/nth of the termlist. // . we do this "no splitting" so that only one disk seek is required, and // we know the termlist is small, or the termlist is being used for spidering // or parsing purposes and is usually not sent across the network. bool XmlDoc::hashNoSplit ( HashTableX *tt ) { //if ( m_pbuf ) // m_pbuf->safePrintf("<h3>Terms which are immune to indexdb " // "splitting:</h3>"); //if ( m_skipIndexing ) return true; // this should be ready to go and not block! int64_t *pch64 = getExactContentHash64(); //int64_t *pch64 = getLooseContentHash64(); if ( ! pch64 || pch64 == (void *)-1 ) { char *xx=NULL;*xx=0; } // int16_tcut Url *fu = getFirstUrl(); if ( ! hashVectors ( tt ) ) return false; // constructor should set to defaults automatically HashInfo hi; hi.m_hashGroup = HASHGROUP_INTAG; hi.m_tt = tt; // usually we shard by docid, but these are terms we shard by termid! hi.m_shardByTermId = true; // for exact content deduping setStatus ( "hashing gbcontenthash (deduping) no-split keys" ); char cbuf[64]; int32_t clen = sprintf(cbuf,"%"UINT64"",*pch64); hi.m_prefix = "gbcontenthash"; if ( ! hashString ( cbuf,clen,&hi ) ) return false; //// // // let's stop here for now, until other stuff is actually used again // //// // let's bring back image thumbnail support for the widget project //return true; char *host = fu->getHost (); //int32_t hlen = fu->getHostLen (); /* setStatus ( "hashing no-split qdom keys" ); char *dom = fu->getDomain (); int32_t dlen = fu->getDomainLen(); // desc is NULL, prefix will be used as desc hi.m_prefix = "qdom"; if ( ! hashString ( dom,dlen,&hi ) ) return false; setStatus ( "hashing no-split qhost keys" ); // desc is NULL, prefix will be used as desc hi.m_prefix = "qhost"; if ( ! hashString ( host,hlen,&hi ) ) return false; */ // now hash the site setStatus ( "hashing no-split SiteGetter terms"); // // HASH terms for SiteGetter.cpp // // these are now no-split terms // char *s = fu->getUrl (); int32_t slen = fu->getUrlLen(); // . this termId is used by SiteGetter.cpp for determining subsites // . matches what is in SiteGet::getSiteList() // for www.xyz.com/a/ HASH www.xyz.com // for www.xyz.com/a/b/ HASH www.xyz.com/a/ // for www.xyz.com/a/b/c/ HASH www.xyz.com/a/b/ bool add = true; // we only hash this for urls that end in '/' if ( s[slen-1] != '/' ) add = false; // and no cgi if ( fu->isCgi() ) add = false; // skip if root if ( fu->m_plen <= 1 ) add = false; // sanity check if ( ! m_linksValid ) { char *xx=NULL; *xx=0; } // . skip if we have no subdirectory outlinks // . that way we do not confuse all the pages in dictionary.com or // wikipedia.org as subsites!! if ( ! m_links.hasSubdirOutlink() ) add = false; // hash it if ( add ) { // remove the last path component char *end2 = s + slen - 2; // back up over last component for ( ; end2 > fu->m_path && *end2 != '/' ; end2-- ) ; // hash that part of the url hi.m_prefix = "siteterm"; if ( ! hashSingleTerm ( host,end2-host,&hi) ) return false; } //Dates *dp = getDates (); // hash the clocks into indexdb //if ( ! dp->hash ( m_docId , tt , this ) ) return false; // . hash special site/hopcount thing for permalinks // . used by Images.cpp for doing thumbnails // . this returns false and sets g_errno on error // . let's try thumbnails for all... //if ( ! 
*getIsPermalink() ) return true; setStatus ( "hashing no-split gbsitetemplate keys" ); // must be valid if ( ! m_siteValid ) { char *xx=NULL;*xx=0; } char buf[MAX_URL_LEN+20]; //uint32_t th = m_tagVector.getVectorHash(); uint32_t tph = *getTagPairHash32(); // . skip this so we can do site:xyz.com queries // . but if this is https:// then you will have to // specify that... char *site = getSite(); // sanity check, must NOT start with http:// if ( ! strncmp ( site , "http://", 7 ) ) { char *xx=NULL;*xx=0;} // this must match what we search in Images.cpp::getThumbnail() int32_t blen = sprintf(buf,"%"UINT32"%s",tph,site); // use the prefix as the description if description is NULL hi.m_prefix = "gbsitetemplate"; //if ( ! hashString ( buf,blen,&hi ) ) return false; if ( ! hashSingleTerm ( buf,blen,&hi ) ) return false; setStatus ( "hashing no-split gbimage keys" ); hi.m_prefix = "gbimage"; // hash gbimage: for permalinks only for Images.cpp for ( int32_t i = 0 ; i < m_images.m_numImages ; i++ ) { // get the node number //int32_t nn = m_images.m_imageNodes[i]; // get the url of the image //XmlNode *xn = m_xml.getNodePtr(nn); int32_t srcLen; char *src = m_images.getImageUrl(i,&srcLen); // set it to the full url Url iu; // use "pageUrl" as the baseUrl Url *cu = getCurrentUrl(); // we can addwww to normalize since this is for deduping kinda iu.set ( cu , src , srcLen , true ); // addWWW? yes... char *u = iu.getUrl (); int32_t ulen = iu.getUrlLen(); // hash each one //if ( ! hashString ( u,ulen,&hi ) ) return false; // hash a single entity if ( ! hashSingleTerm ( u,ulen,&hi) ) return false; //log("test: %s",u); } return true; } // . returns -1 if blocked, returns NULL and sets g_errno on error // . "sr" is the tagdb Record // . "ws" store the terms for PageParser.cpp display char *XmlDoc::hashAll ( HashTableX *table ) { setStatus ( "hashing document" ); if ( m_allHashed ) return (char *)1; // sanity checks if ( table->m_ks != 18 ) { char *xx=NULL;*xx=0; } if ( table->m_ds != 4 ) { char *xx=NULL;*xx=0; } if ( m_wts && m_wts->m_ks != 12 ) { char *xx=NULL;*xx=0; } // ptr to term = 4 + score = 4 + ptr to sec = 4 if ( m_wts && m_wts->m_ds!=sizeof(TermDebugInfo)){char *xx=NULL;*xx=0;} unsigned char *hc = (unsigned char *)getHopCount(); if ( ! hc || hc == (void *)-1 ) return (char *)hc; // need this for hashing HashTableX *cnt = getCountTable(); if ( ! cnt ) return (char *)cnt; if ( cnt == (void *)-1 ) { char *xx=NULL;*xx=0; } // and this //Weights *we = getWeights(); //if ( ! we || we == (void *)-1 ) return (char *)we; // and this Links *links = getLinks(); if ( ! links ) return (char *)links; if ( links == (Links *)-1 ) { char *xx=NULL;*xx=0; } // and now this //Synonyms *syn = getSynonyms(); //if ( ! syn || syn == (void *)-1 ) return (char *)syn; char *wordSpamVec = getWordSpamVec(); if (!wordSpamVec) return (char *)wordSpamVec; if (wordSpamVec==(void *)-1) {char *xx=NULL;*xx=0;} char *fragVec = getFragVec();//m_fragBuf.getBufStart(); if ( ! fragVec ) return (char *)fragVec; if ( fragVec == (void *)-1 ) { char *xx=NULL;*xx=0; } // why do we need this? if ( m_wts ) { uint8_t *lv = getLangVector(); if ( ! lv ) return (char *)lv; if ( lv == (void *)-1 ) { char *xx=NULL;*xx=0; } } TagRec *gr = getTagRec(); if ( ! gr ) return (char *)gr; if ( gr == (void *)-1 ) {char *xx=NULL;*xx=0; } CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // just keep it somewhat sane... //if ( nw > 30000 ) nw = 30000; // then each singleton has one phrase, and 1 empty for good hashing //if ( ! 
table->setTableSize ( nw * 4 ) ) // return log("build: Could not allocate %"INT32" bytes for table " // "for indexing document.", // (nw*4)*(8+sizeof(int32_t))); /* const char *help = "<table><td bgcolor=lightgreen>\n" "Each document has several associated pieces. Each piece " "is indexed individually. The pieces are listed below and " "are preceeded with a table dictating the parameters with " "which the piece was indexed." "<br><br>" "Below that table the actual text of the piece is displayed. " "Each alphanumeric word in the text has two subscripts of the " "form <i>X/Y</i> where X and Y are percentage weights on the " "score of that particular alphanumeric word. X is the weight " "on the word itself and Y is the weight on the phrase which " "is started by that word. A weight of 100% " "indicates a weight which does not affect the score." "<br><br>" "Words that are struck out and in a box with a red background " "instead of light blue are considered to be spam, meaning " "they are repeated in a pattern. They " "contain a number in that box which indicates the probability " "they are spam and 100 minus that probability is weighted " "with their score to get a new, spam-adjusted score. " "<br>\n" "</tr>\n" "</table>\n" "</td></table>\n" "<br><br>\n"; if ( m_pbuf ) m_pbuf->safePrintf("%s",help); */ /* int32_t inlinks = *getSiteNumInlinks(); int32_t boost1 = getBoostFromSiteNumInlinks ( inlinks ); // . now we hard code "boost2" // . based on # of alnum words // . this makes us look at keyword density, not just the // plain keyword count int32_t naw = m_words.getNumAlnumWords(); // . keep at 100% for up to 200 words then reduce linearly // . only do this for newer title recs to avoid undeletable data // . if we have a huge document, it can still contain a very // relevant paragraph that is dense in the query terms, so // we really only want to punish enough so the post query // reranking has some good candidates for doing proximity // scoring. // . back off by .90 every 1000 words float nn = naw; float bb = 100.0; while ( nn > 1000 ) { nn *= .9; bb *= .9; } // never drop below %1 if ( bb < 1.0 ) bb = 1.0; // set it int64_t boost2 = (int64_t)bb; */ /* int32_t siteNumInlinks = *getSiteNumInlinks(); if ( m_pbuf ) m_pbuf->safePrintf( "<table border=1 cellpadding=2>" "<tr><td>siteNumInlinks</td><td><b>%"INT32"%%</b></td></tr>" "<tr><td>siteNumInlinksBoost</td>" "<td>%"INT32"%%</td></tr>" "<tr><td>numAlnumWords</td>" "<td>%"INT32"</td></tr> " "<tr><td>scoreWeightFromNumAlnumWords" "</td><td>%"INT32"%%</td></tr>" "<tr><td>headerWeight</td>" "<td>%"INT32"%%</td></tr>" "<tr><td>urlPathWeight</td>" "<td>%"INT32"%%</td></tr>" "<tr><td>externalLinkTextWeight</td>" "<td>%"INT32"%%</td></tr>" "<tr><td>internalLinkTextWeight</td>" "<td>%"INT32"%%</td></tr>" "<tr><td>conceptWeight</td>" "<td>%"INT32"%%</td></tr>" "<tr><td>titleWeight</td>" "<td>%"INT32"%%</td></tr>" "</table>" "<br>" , (int32_t)siteNumInlinks, (int32_t)boost1, //(int32_t)len, (int32_t)naw, (int32_t)boost2, (int32_t)boost1, (int32_t)boost2, //(int32_t)boost1, (int32_t)m_headerWeight, (int32_t)m_urlPathWeight, (int32_t)m_externalLinkTextWeight, (int32_t)m_internalLinkTextWeight, (int32_t)m_conceptWeight, (int32_t)m_titleWeight, (int32_t)m_titleWeight, (int32_t)boost1, (int32_t)boost1, ); */ // do not repeat this if the cachedb storage call blocks m_allHashed = true; // reset distance cursor m_dist = 0; // hash diffbot's json output here uint8_t *ct = getContentType(); if ( ! 
ct ) return NULL; /* if ( *ct == CT_JSON ) { // && m_isDiffbotJSONObject ) { // hash the content type for type:json query if ( ! hashContentType ( table ) ) return NULL; // and the url: query support if ( ! hashUrl ( table ) ) return NULL; // language support if ( ! hashLanguage ( table ) ) return NULL; // country? if ( ! hashCountry ( table ) ) return NULL; if ( ! hashTagRec ( table ) ) return NULL; // hash for gbsortby:gbspiderdate if ( ! hashDateNumbers ( table ) ) return NULL; // has gbhasthumbnail:1 or 0 if ( ! hashImageStuff ( table ) ) return NULL; // and the json itself return hashJSON ( table ); } */ if ( ! hashContentType ( table ) ) return NULL; if ( ! hashUrl ( table ) ) return NULL; if ( ! hashLanguage ( table ) ) return NULL; if ( ! hashCountry ( table ) ) return NULL; if ( ! hashSiteNumInlinks( table ) ) return NULL; if ( ! hashTagRec ( table ) ) return NULL; if ( ! hashAds ( table ) ) return NULL; if ( ! hashSubmitUrls ( table ) ) return NULL; if ( ! hashIsAdult ( table ) ) return NULL; // has gbhasthumbnail:1 or 0 if ( ! hashImageStuff ( table ) ) return NULL; // . hash sectionhash:xxxx terms // . diffbot still needs to hash this for voting info if ( ! hashSections ( table ) ) return NULL; // now hash the terms sharded by termid and not docid here since they // just set a special bit in posdb key so Rebalance.cpp can work. // this will hash the content checksum which we need for deduping // which we use for diffbot custom crawls as well. if ( ! hashNoSplit ( table ) ) return NULL; // MDW: i think we just inject empty html with a diffbotreply into // global index now, so don't need this... 9/28/2014 // hash json fields if ( *ct == CT_JSON ) { // this hashes both with and without the fieldname hashJSONFields ( table ); goto skip; } // same for xml now, so we can search for field:value like w/ json if ( *ct == CT_XML ) { // this hashes both with and without the fieldname hashXMLFields ( table ); goto skip; } // global index unless this is a json object in which case it is // hashed above in the call to hashJSON(). this will decrease disk // usage by about half, posdb* files are pretty big. if ( cr->m_isCustomCrawl || ! cr->m_indexBody ) return (char *)1; // hash the body of the doc first so m_dist is 0 to match // the rainbow display of sections if ( ! hashBody2 (table ) ) return NULL; // hash the title now too so neighborhood singles have more // to match. plus, we only hash these title terms iff they // are not already in the hash table, so as to avoid hashing // repeated title terms because we do not do spam detection // on them. thus, we need to hash these first before anything // else. give them triple the body score if ( ! hashTitle ( table )) return NULL; // . hash the keywords tag, limited to first 2k of them so far // . hash above the neighborhoods so the neighborhoods only index // what is already in the hash table if ( ! hashMetaKeywords(table ) ) return NULL; // then hash the incoming link text, NO ANOMALIES, because // we index the single words in the neighborhoods next, and // we had songfacts.com coming up for the 'street light facts' // query because it had a bunch of anomalous inlink text. if ( ! hashIncomingLinkText(table,false,true)) return NULL; // then the meta summary and description tags with half the score of // the body, and only hash a term if was not already hashed above // somewhere. if ( ! 
hashMetaSummary(table) ) return NULL; skip: // this will only increment the scores of terms already in the table // because we neighborhoods are not techincally in the document // necessarily and we do not want to ruin our precision if ( ! hashNeighborhoods ( table ) ) return NULL; if ( ! hashLinks ( table ) ) return NULL; if ( ! hashDateNumbers ( table ) ) return NULL; if ( ! hashMetaTags ( table ) ) return NULL; if ( ! hashMetaZip ( table ) ) return NULL; if ( ! hashDMOZCategories( table ) ) return NULL; if ( ! hashCharset ( table ) ) return NULL; if ( ! hashRSSInfo ( table ) ) return NULL; if ( ! hashPermalink ( table ) ) return NULL; // hash gblang:de last for parsing consistency if ( ! hashLanguageString ( table ) ) return NULL; // we set this now in hashWords3() if ( m_doingSEO ) m_wordPosInfoBufValid = true; // store the m_wordPosInfoBuf into cachedb // NO! we are not allowed to block in here it messes shit up!!! //if ( m_doingSEO && ! storeWordPosInfoBufIntoCachedb ( ) ) // return (char *)-1; // . hash gbkeyword:gbmininlinks where the score is the inlink count // . the inlink count can go from 1 to 255 // . an ip neighborhood can vote no more than once // . this is in LinkInfo::hash //if ( ! hashMinInlinks ( table , linkInfo ) ) return NULL; // return true if we don't need to print parser info //if ( ! m_pbuf ) return true; // print out the table into g_bufPtr now if we need to //table->print ( ); return (char *)1; } // . "inlinks" is # of inlinks to the SITE // . returns a percentage boost int32_t XmlDoc::getBoostFromSiteNumInlinks ( int32_t inlinks ) { // . base on # of site inlinks // . just hard code this for now int32_t boost1 = 100; if ( inlinks >= 10 ) boost1 = 150; if ( inlinks >= 50 ) boost1 = 200; if ( inlinks >= 100 ) boost1 = 250; if ( inlinks >= 200 ) boost1 = 300; if ( inlinks >= 400 ) boost1 = 350; if ( inlinks >= 800 ) boost1 = 400; if ( inlinks >= 1600 ) boost1 = 450; if ( inlinks >= 3200 ) boost1 = 500; if ( inlinks >= 6400 ) boost1 = 550; if ( inlinks >= 12800 ) boost1 = 600; if ( inlinks >= 25600 ) boost1 = 650; if ( inlinks >= 51200 ) boost1 = 700; return boost1; } // . this is kinda hacky because it uses a short XmlDoc on the stack // . no need to hash this stuff for regular documents since all the terms // are fielded by gberrorstr, gberrornum or gbisreply. // . normally we might use a separate xmldoc class for this but i wanted // something more lightweight SafeBuf *XmlDoc::getSpiderStatusDocMetaList ( SpiderReply *reply ) { // set status for this setStatus ( "getting spider reply meta list"); if ( m_spiderStatusDocMetaListValid ) return &m_spiderStatusDocMetaList; // if docid based do not hash a spider reply. docid-based spider // requests are added to spiderdb from the query reindex tool. // do not do for diffbot subdocuments either, usespiderdb should be // false for those. // MDW: i disagree, i want to see when these get updated! 9/6/2014 //if ( m_setFromDocId || ! m_useSpiderdb ) { if ( ! m_useSpiderdb ) { m_spiderStatusDocMetaListValid = true; return &m_spiderStatusDocMetaList; } // we double add regular html urls in a query reindex because the // json url adds the parent, so the parent gets added twice sometimes, // and for some reason it is adding a spider status doc the 2nd time // so cut that out. this is kinda a hack b/c i'm not sure what's // going on. but you can set a break point here and see what's up if // you want. // MDW: likewise, take this out, i want these recorded as well.. 
// if ( m_indexCodeValid && m_indexCode == EDOCFORCEDELETE ) { // m_spiderStatusDocMetaListValid = true; // return &m_spiderStatusDocMetaList; // } // . fake this out so we do not core // . hashWords3() uses it i guess bool forcedLangId = false; if ( ! m_langIdValid ) { forcedLangId = true; m_langIdValid = true; m_langId = langUnknown; } // prevent more cores bool forcedSiteNumInlinks = false; if ( ! m_siteNumInlinksValid ) { forcedSiteNumInlinks = true; m_siteNumInlinks = 0; m_siteNumInlinksValid = true; } SafeBuf *mbuf = getSpiderStatusDocMetaList2 ( reply ); if ( forcedLangId ) m_langIdValid = false; if ( forcedSiteNumInlinks ) { m_siteNumInlinksValid = false; } return mbuf; } // the spider status doc SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) { setStatus ( "making spider reply meta list"); // . we also need a unique docid for indexing the spider *reply* // as a separate document // . use the same url, but use a different docid. // . use now to mix it up //int32_t now = getTimeGlobal(); //int64_t h = hash64(m_docId, now ); // to keep qa test consistent this docid should be consistent // so base it on spidertime of parent doc if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; } int64_t h = hash64(m_docId, m_spideredTime ); // mask it out int64_t d = h & DOCID_MASK; // try to get an available docid, preferring "d" if available int64_t *uqd = getAvailDocIdOnly ( d ); if ( ! uqd || uqd == (void *)-1 ) return (SafeBuf *)uqd; unsigned char *hc = (unsigned char *)getHopCount(); if ( ! hc || hc == (void *)-1 ) return (SafeBuf *)hc; // reset just in case m_spiderStatusDocMetaList.reset(); // sanity if ( *uqd <= 0 || *uqd > MAX_DOCID ) { log("xmldoc: avail docid = %"INT64". could not index spider " "reply or %s",*uqd,m_firstUrl.m_url); //char *xx=NULL;*xx=0; } m_spiderStatusDocMetaListValid = true; return &m_spiderStatusDocMetaList; } // the posdb table HashTableX tt4; if ( !tt4.set(18,4,256,NULL,0,false,m_niceness,"posdb-spindx")) return NULL; // BEFORE ANY HASHING int32_t savedDist = m_dist; // re-set to 0 m_dist = 0; // sanity if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; } // why isn't gbhopcount: being indexed consistently? if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; } // hash like gbstatus:"Tcp Timed out" or gbstatus:"Doc unchanged" HashInfo hi; hi.m_hashGroup = HASHGROUP_INTAG; hi.m_tt = &tt4; hi.m_useCountTable = false; hi.m_useSections = false; char buf[64]; int32_t bufLen; // hash 'type:status' similar to 'type:json' etc. hi.m_prefix = "type"; if ( ! hashString("status" , &hi ) ) return NULL; // . hash gbstatus:0 for no error, otherwise the error code // . this also hashes it as a number so we don't have to // . so we can do histograms on this # hi.m_prefix = "gbstatus"; hi.m_desc = "spider error number as string"; bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)m_indexCode ); if ( ! 
hashString( buf , &hi ) ) return NULL; /* logf(LOG_DEBUG,"url: %s",m_firstUrl.m_url); logf(LOG_DEBUG,"hashing indexcode=%"INT32"",m_indexCode); bool ok = false; if ( m_indexCode ) ok = true; // scan the keys in tt and make sure the termid fo addTable144 ( &tt4 , *uqd , &m_spiderStatusDocMetaList ); int32_t recSize = 0; int32_t rcount = 0; char *p = m_spiderStatusDocMetaList.getBufStart(); char *pend =m_spiderStatusDocMetaList.getBuf(); for ( ; p < pend ; p += recSize ) { // get rdbid, RDB_POSDB uint8_t rdbId = *p & 0x7f; // skip p++; // get key size int32_t ks = getKeySizeFromRdbId ( rdbId ); // init this int32_t recSize = ks; // convert into a key128_t, the biggest possible key //key224_t k ; char k[MAX_KEY_BYTES]; if ( ks > MAX_KEY_BYTES ) { char *xx=NULL;*xx=0; } //k.setMin(); gbmemcpy ( &k , p , ks ); // is it a negative key? char neg = false; if ( ! ( p[0] & 0x01 ) ) neg = true; // this is now a bit in the posdb key so we can rebalance char shardByTermId = false; if ( rdbId==RDB_POSDB && g_posdb.isShardedByTermId(k)) shardByTermId = true; // skip it p += ks; // . always zero if key is negative // . this is not the case unfortunately... if ( neg ) {char *xx=NULL;*xx=0; } // print dbname if ( rdbId != RDB_POSDB ) { char *xx=NULL;*xx=0; } // get termid et al key144_t *k2 = (key144_t *)k; int64_t tid = g_posdb.getTermId(k2); log("db: tid=%"INT64"",tid); if ( tid == 199947062354729LL ) ok = true; //if ( m_indexCode == 0 && tid != 199947062354729LL ) { // char *xx=NULL;*xx=0; } } if ( ! ok ) { char *xx=NULL;*xx=0; } goto SKIP; // was here.... */ // gbstatus:"tcp timed out" hi.m_prefix = "gbstatusmsg"; hi.m_desc = "spider error msg"; if ( ! hashString( mstrerror(m_indexCode) , &hi ) ) return NULL; //hi.m_prefix = "gbdocid"; //hi.m_desc = "docid"; //bufLen = sprintf ( buf , "%"UINT64"", *uqd ) ; //if ( ! hashString( buf , &hi ) ) return NULL; // . then the url. url: site: ip: etc. terms // . do NOT hash non-fielded terms so we do not get "status" // results poluting the serps => false if ( ! hashUrl ( &tt4 , true ) ) return NULL; // false --> do not hash the gbdoc* terms (CT_STATUS) hashDateNumbers ( &tt4 , true ); // store keys in safebuf then to make our own meta list addTable144 ( &tt4 , *uqd , &m_spiderStatusDocMetaList ); // debug this shit //SafeBuf tmpsb; //printMetaList ( m_spiderStatusDocMetaList.getBufStart() , // m_spiderStatusDocMetaList.getBuf(), // &tmpsb ); //logf(LOG_DEBUG,"%s\n",tmpsb.getBufStart()); // now make the titlerec char xdhead[2048]; // just the head of it. this is the hacky part. XmlDoc *xd = (XmlDoc *)xdhead; // clear it out memset ( xdhead, 0 , 2048); // copy stuff from THIS so the spider reply "document" has the same // header info stuff int32_t hsize = (char *)&ptr_firstUrl - (char *)this; if ( hsize > 2048 ) { char *xx=NULL;*xx=0; } gbmemcpy ( xdhead , (char *)this , hsize ); // override spider time in case we had error to be consistent // with the actual SpiderReply record //xd->m_spideredTime = reply->m_spideredTime; //xd->m_spideredTimeValid = true; // sanity //if ( reply->m_spideredTime != m_spideredTime ) {char *xx=NULL;*xx=0;} // this will cause the maroon box next to the search result to // say "STATUS" similar to "PDF" "DOC" etc. 
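// (clarifying the hack above: xdhead holds only the fixed-size header
//  region of this XmlDoc -- the bytes up to ptr_firstUrl -- so "xd" is
//  not a fully constructed object. the assignments below fill in the few
//  fields this synthetic status doc actually needs: the content type, a
//  small utf8 content buffer holding the title, and the parent's url and
//  site, before setTitleRecBuf() serializes it into a titlerec.)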
xd->m_contentType = CT_STATUS; int32_t fullsize = &m_dummyEnd - (char *)this; if ( fullsize > 2048 ) { char *xx=NULL;*xx=0; } // the ptr_* were all zero'd out, put the ones we want to keep back in SafeBuf tmp; // was "Spider Status: %s" but that is unnecessary tmp.safePrintf("<title>%s", mstrerror(m_indexCode)); // if we are a dup... if ( m_indexCode == EDOCDUP ) tmp.safePrintf("Dup of docid %"INT64"
", m_docIdWeAreADupOf ); if ( m_redirUrlPtr && m_redirUrlValid ) tmp.safePrintf("Redirected to %s
",m_redirUrlPtr->getUrl()); // put stats like we log out from logIt //tmp.safePrintf("
\n"); // store log output into doc //logIt(&tmp); //tmp.safePrintf("\n
"); // the content is just the title tag above xd->ptr_utf8Content = tmp.getBufStart(); xd->size_utf8Content = tmp.length()+1; // keep the same url as the doc we are the spider reply for xd->ptr_firstUrl = ptr_firstUrl; xd->size_firstUrl = size_firstUrl; // serps need site, otherwise search results core xd->ptr_site = ptr_site; xd->size_site = size_site; // use the same uh48 of our parent int64_t uh48 = m_firstUrl.getUrlHash48(); // then make into a titlerec but store in metalistbuf, not m_titleRec SafeBuf titleRecBuf; // this should not include ptrs that are NULL when compressing // using its m_internalFlags1 if ( ! xd->setTitleRecBuf( &titleRecBuf,*uqd,uh48 ) ) return NULL; // concat titleRec to our posdb key records if ( ! m_spiderStatusDocMetaList.pushChar((char)RDB_TITLEDB) ) return NULL; if ( ! m_spiderStatusDocMetaList.cat(titleRecBuf) ) return NULL; // return the right val m_dist = savedDist; // ok, good to go, ready to add to posdb and titledb m_spiderStatusDocMetaListValid = true; return &m_spiderStatusDocMetaList; } // returns false and sets g_errno on error bool XmlDoc::hashMetaTags ( HashTableX *tt ) { setStatus ( "hashing meta tags" ); // assume it's empty char buf [ 32*1024 ]; int32_t bufLen = 32*1024 - 1; buf[0] = '\0'; int32_t n = m_xml.getNumNodes(); XmlNode *nodes = m_xml.getNodes(); // set up the hashing parms HashInfo hi; hi.m_hashGroup = HASHGROUP_INMETATAG; hi.m_tt = tt; hi.m_desc = "custom meta tag"; // find the first meta summary node for ( int32_t i = 0 ; i < n ; i++ ) { // continue if not a meta tag if ( nodes[i].m_nodeId != 68 ) continue; // only get content for not int32_t tagLen; char *tag = m_xml.getString ( i , "name" , &tagLen ); char *tptr = tag; char tagLower[128]; int32_t j ; int32_t code; // skip if empty if ( ! tag || tagLen <= 0 ) continue; // make tag name lower case and do not allow bad chars if ( tagLen > 126 ) tagLen = 126 ; to_lower3_a ( tag , tagLen , tagLower ); for ( j = 0 ; j < tagLen ; j++ ) { // bail if has unacceptable chars if ( ! is_alnum_a ( tag[j] ) && tag[j] != '-' && tag[j] != '_' && tag[j] != '.' ) break; // convert to lower tagLower[j] = to_lower_a ( tag[j] ); } // skip this meta if had unacceptable chars if ( j < tagLen ) continue; // is it recognized? code = getFieldCode ( tag , tagLen ); // after version 45 or more, do not allow gbrss // meta tags, because those are now reserved for us if ( code == FIELD_GBRSS ) continue; // allow gbrss: fields for earlier versions though if ( code == FIELD_GBRSS ) code = FIELD_GENERIC; // . do not allow reserved tag names // . title,url,suburl, if ( code != FIELD_GENERIC ) continue; // this is now reserved // do not hash keyword, keywords, description, or summary metas // because that is done in hashRange() below based on the // tagdb (ruleset) record if ((tagLen== 7&&strncasecmp(tag,"keyword" , 7)== 0)|| (tagLen== 7&&strncasecmp(tag,"summary" , 7)== 0)|| (tagLen== 8&&strncasecmp(tag,"keywords" , 8)== 0)|| (tagLen==11&&strncasecmp(tag,"description",11)== 0) ) continue; // . don't allow reserved names: site, url, suburl, link and ip // . actually, the colon is included as part of those // field names, so we really lucked out...! // . index this converted tag name tptr = tagLower; // get the content int32_t len; char *s = m_xml.getString ( i , "content" , &len ); if ( ! s || len <= 0 ) continue; // . ensure not too big for our buffer (keep room for a \0) // . 
TODO: this is wrong, should be len+1 > bufLen, // but can't fix w/o resetting the index (COME BACK HERE // and see where we index meta tags besides this place!!!) // remove those other places, except... what about keywords // and description? if ( len+1 >= bufLen ) { //len = bufLen - 1; // assume no punct to break on! len = 0; // only cut off at punctuation char *p = s; char *pend = s + len; char *last = NULL; int32_t size ; for ( ; p < pend ; p += size ) { // skip if utf8 char size = getUtf8CharSize(*p); // skip if 2+ bytes if ( size > 1 ) continue; // skip if not punct if ( is_alnum_a(*p) ) continue; // mark it last = p; } if ( last ) len = last - s; // this old way was faster...: //while ( len > 0 && is_alnum(s[len-1]) ) len--; } // convert html entities to their chars len = saftenTags ( buf , bufLen , s , len ); // NULL terminate the buffer buf[len] = '\0'; // temp null term char c = tptr[tagLen]; tptr[tagLen] = 0; // custom hi.m_prefix = tptr; // desc is NULL, prefix will be used as desc bool status = hashString ( buf,len,&hi ); // put it back tptr[tagLen] = c; // bail on error, g_errno should be set if ( ! status ) return false; // return false with g_errno set on error //if ( ! hashNumber ( buf , bufLen , &hi ) ) // return false; } return true; } // slightly greater than m_spideredTime, which is the download time. // we use this for sorting as well, like for the widget so things // don't really get added out of order and not show up in the top spot // of the widget list. int32_t XmlDoc::getIndexedTime() { if ( m_indexedTimeValid ) return m_indexedTime; m_indexedTime = getTimeGlobal(); return m_indexedTime; } // . hash dates for sorting by using gbsortby: and gbrevsortby: // . do 'gbsortby:gbspiderdate' as your query to see this in action bool XmlDoc::hashDateNumbers ( HashTableX *tt , bool isStatusDoc ) { // stop if already set if ( ! m_spideredTimeValid ) return true; int32_t indexedTime = getIndexedTime(); // first the last spidered date HashInfo hi; hi.m_hashGroup = 0;// this doesn't matter, it's a numeric field hi.m_tt = tt; hi.m_desc = "last spidered date"; hi.m_prefix = "gbspiderdate"; char buf[64]; int32_t bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)m_spideredTime ); if ( ! hashNumber ( buf , buf , bufLen , &hi ) ) return false; // and index time is >= spider time, so you want to sort by that for // the widget for instance hi.m_desc = "last indexed date"; hi.m_prefix = "gbindexdate"; bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)indexedTime ); if ( ! hashNumber ( buf , buf , bufLen , &hi ) ) return false; // do not index the rest if we are a "spider reply" document // which is like a fake document for seeing spider statuses //if ( isStatusDoc == CT_STATUS ) return true; if ( isStatusDoc ) return true; // now for CT_STATUS spider status "documents" we also index // gbspiderdate so index this so we can just do a // gbsortby:gbdocspiderdate and only get real DOCUMENTS not the // spider status "documents" hi.m_desc = "doc last spidered date"; hi.m_prefix = "gbdocspiderdate"; bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)m_spideredTime ); if ( ! hashNumber ( buf , buf , bufLen , &hi ) ) return false; hi.m_desc = "doc last indexed date"; hi.m_prefix = "gbdocindexdate"; bufLen = sprintf ( buf , "%"UINT32"", (uint32_t)indexedTime ); if ( ! hashNumber ( buf , buf , bufLen , &hi ) ) return false; // all done return true; } bool XmlDoc::hashMetaZip ( HashTableX *tt ) { setStatus ( "hashing meta zip" ); // . set the score based on quality // . 
scores are multiplied by 256 to preserve fractions for adding uint32_t score = *getSiteNumInlinks8() * 256 ; if ( score <= 0 ) score = 1; // search for meta date char buf [ 32 ]; int32_t bufLen = m_xml.getMetaContent ( buf, 32, "zipcode", 7 ); if ( bufLen <= 0 ) bufLen = m_xml.getMetaContent ( buf, 32, "zip",3); char *p = buf; char *pend = buf + bufLen ; if ( bufLen <= 0 ) return true; // set up the hashing parms HashInfo hi; hi.m_hashGroup = HASHGROUP_INTAG; hi.m_tt = tt; //hi.m_prefix = "zipcode"; hi.m_prefix = "gbzipcode"; nextZip: // . parse out the zip codes, may be multiple ones // . skip non-digits while ( p < pend && ! is_digit(*p) ) p++; // skip if no digits if ( p == pend ) return true; // need at least 5 consecutive digits if ( p + 5 > pend ) return true; // if not a zip code, skip it if ( ! is_digit(p[1]) ) { p += 1; goto nextZip; } if ( ! is_digit(p[2]) ) { p += 2; goto nextZip; } if ( ! is_digit(p[3]) ) { p += 3; goto nextZip; } if ( ! is_digit(p[4]) ) { p += 4; goto nextZip; } // do we have too many consectuive digits? if ( p + 5 != pend && is_digit(p[5]) ) { // if so skip this whole string of digits p += 5; while ( p < pend && is_digit(*p) ) p++; goto nextZip; } // 90210 --> 90 902 9021 90210 for ( int32_t i = 0 ; i <= 3 ; i++ ) // use prefix as description if ( ! hashString ( p,5-i,&hi ) ) return false; p += 5; goto nextZip; } // returns false and sets g_errno on error bool XmlDoc::hashContentType ( HashTableX *tt ) { CollectionRec *cr = getCollRec(); if ( ! cr ) return false; uint8_t ctype = *getContentType(); char *s = NULL; setStatus ( "hashing content type" ); // hash numerically so we can do gbfacetint:type on it HashInfo hi; hi.m_hashGroup = HASHGROUP_INTAG; hi.m_tt = tt; hi.m_prefix = "type"; char tmp[6]; sprintf(tmp,"%"UINT32"",(uint32_t)ctype); if ( ! hashString (tmp,gbstrlen(tmp),&hi ) ) return false; // these ctypes are defined in HttpMime.h switch (ctype) { case CT_HTML: s = "html"; break; case CT_TEXT: s = "text"; break; case CT_XML : s = "xml" ; break; case CT_PDF : s = "pdf" ; break; case CT_DOC : s = "doc" ; break; case CT_XLS : s = "xls" ; break; case CT_PPT : s = "ppt" ; break; case CT_PS : s = "ps" ; break; // for diffbot. so we can limit search to json objects // in Diffbot.cpp case CT_JSON: s = "json" ; break; } // bail if unrecognized content type if ( ! s ) return true; // hack for diffbot. do not hash type:json because diffbot uses // that for searching diffbot json objects if ( cr->m_isCustomCrawl && ctype==CT_JSON && !m_isDiffbotJSONObject ) return true; // . now hash it // . use a score of 1 for all // . TODO: ensure doc counting works ok with this when it does // it's interpolation return hashString (s,gbstrlen(s),&hi ); } // . hash the link: terms // . ensure that more useful linkers are scored higher // . useful for computing offsite link text for qdb-ish algorithm // . NOTE: for now i do not hash links to the same domain in order to // hopefully save 10%-25% index space // . NOTE: PLUS, they may clog up the link-adjusted quality ratings since // different site links with no link text will be ranked behind them // . the 8-bit bitmap of the score of a link: term: // . 00ubdcss u = link is Unbanned? b = link isBanned? // d = link dirty? c = link clean? // s = 01 if no link text, 10 if link text // . NOTE: this is used in Msg18.cpp for extraction // . 
CAUTION: IndexList::score32to8() will warp our score if its >= 128 // so i moved the bits down bool XmlDoc::hashLinks ( HashTableX *tt ) { setStatus ( "hashing links" ); // int16_tcuts bool isRSSFeed = *getIsRSS(); Url *cu = getCurrentUrl() ; Url *ru = *getRedirUrl() ; char dbuf[8*4*1024]; HashTableX dedup; dedup.set( 8,0,1024,dbuf,8*4*1024,false,m_niceness,"hldt"); // see ../url/Url2.cpp for hashAsLink() algorithm for ( int32_t i = 0 ; i < m_links.m_numLinks ; i++ ) { // skip links with zero 0 length if ( m_links.m_linkLens[i] == 0 ) continue; // . skip if we are rss page and this link is an
link // . we only harvest/index urls from rss feeds // . or in the case of feedburner, those orig tags if ( isRSSFeed && (m_links.m_linkFlags[i] & LF_AHREFTAG) ) continue; // if we have a tag, then ignore // tags and only get the links from the original links if ( m_links.m_isFeedBurner && !(m_links.m_linkFlags[i] & LF_FBTAG) ) continue; // normalize the link Url link; // now we always add "www" to these links so that any link // to cnn.com is same as link to www.cnn.com, because either // we index cnn.com or www.cnn.com but not both providing // their content is identical (deduping). This way whichever // one we index, we can take advantage of all link text whether // it's to cnn.com or www.cnn.com. // Every now and then we add new session ids to our list in // Url.cpp, too, so we have to version that. // Since this is just for hashing, it shouldn't matter that // www.tmblr.co has no IP whereas only tmblr.co does. link.set ( m_links.m_linkPtrs[i] , m_links.m_linkLens[i] , true , // addWWW? m_links.m_stripIds , false , // stripPound? false , // stripCommonFile? m_version );// used for new session id stripping // breathe QUICKPOLL(m_niceness); // . the score depends on some factors: // . NOTE: these are no longer valid! (see score bitmap above) // . 4 --> if link has different domain AND has link text // . 3 --> if link has same domain AND has link text // . 2 --> if link has different domain AND no link text // . 1 --> if link has sam domain AND no link text // . is domain the same as ours? // . NOTE: ideally, using the IP domain would be better, but // we do not know the ip of the linker right now... so scores // may be topped with a bunch of same-ip domain links so that // we may not get as much link text as we'd like, since we // only sample from one link text per ip domain // . now we also just use the mid domain! (excludes TLD) bool internal = false; int32_t mdlen = cu->getMidDomainLen(); if ( mdlen == link.getMidDomainLen() && strncmp(cu->getMidDomain(),link.getMidDomain(),mdlen)==0) //continue; // sameMidDomain = true; internal = true; // also check the redir url if ( ru ) { mdlen = ru->getMidDomainLen(); if ( mdlen == link.getMidDomainLen() && strncmp(ru->getMidDomain(), link.getMidDomain(),mdlen)==0) //continue; // sameMidDomain = true; internal = true; } // now make the score //unsigned char score ; // . TODO: consider not hashing link w/o text! // . otherwise, give it a higher score if it's got link TEXT //bool gotLinkText = m_links.hasLinkText ( i, m_version ); // otherwise, beginning with version 21, allow internal links, // but with lower scores // score // internal, no link text: 2 // internal, w/ link text: 4 // external, no link text: 6 // external, w/ link text: 8 //if ( internal ) { // if ( ! gotLinkText ) score = 0x02; // else score = 0x04; //} //else { // if ( ! gotLinkText ) score = 0x06; // else score = 0x08; //} // dedup this crap int64_t h = hash64 ( link.getUrl(), link.getUrlLen() ); if ( dedup.isInTable ( &h ) ) continue; if ( ! dedup.addKey ( &h ) ) return false; // set up the hashing parms HashInfo hi; hi.m_hashGroup = HASHGROUP_INTAG; hi.m_tt = tt; hi.m_prefix = "link"; // hash link: if ( ! hashSingleTerm ( link.getUrl(),link.getUrlLen(),&hi )) return false; h = hash64 ( link.getHost() , link.getHostLen() ); if ( dedup.isInTable ( &h ) ) continue; if ( ! dedup.addKey ( &h ) ) return false; // fix parm hi.m_prefix = "sitelink"; // hash sitelink: if ( ! 
hashSingleTerm ( link.getHost(),link.getHostLen(),&hi)) return false; // breathe QUICKPOLL(m_niceness); } // skip this for now return true; /* setStatus ("hashing gbhasbannedoutlink" ); // only lets a domain vote once int32_t numBannedOutlinks = *getNumBannedOutlinks(); //if ( numBannedOutlinks <= 0 ) return true; // a score of 235 seems to give a negative return for score8to32() uint32_t score = score8to32 ( numBannedOutlinks ); // make score at least 1! if ( score <= 0 ) score = 1; // a hack fix if ( score > 0x7fffffff ) score = 0x7fffffff; // set up the hashing parms HashInfo hi; hi.m_tt = tt; hi.m_prefix = "gbhasbannedoutlink"; // hash this special thing to help us de-spam the index if ( numBannedOutlinks > 0 ) return hashString ("1",1,&hi ); else return hashString ("0",1,&hi ); */ } // . returns false and sets g_errno on error // . hash for linkdb bool XmlDoc::hashLinksForLinkdb ( HashTableX *dt ) { // sanity check if ( dt->m_ks != sizeof(key224_t) ) { char *xx=NULL;*xx=0; } if ( dt->m_ds != 0 ) { char *xx=NULL;*xx=0; } // this will be different with our new site definitions uint32_t linkerSiteHash32 = *getSiteHash32(); char siteRank = getSiteRank(); if ( ! m_linksValid ) { char *xx=NULL;*xx=0; } // we need to store this in the title rec for re-building // the meta list from the title rec... // is this just site info? //TagRec ***pgrv = getOutlinkTagRecVector(); //if ( ! pgrv || pgrv == (void *)-1 ) { char *xx=NULL;*xx=0; } //TagRec **grv = *pgrv; int32_t *linkSiteHashes = getLinkSiteHashes(); if ( ! linkSiteHashes || linkSiteHashes == (void *)-1 ){ char *xx=NULL;*xx=0;} // convert siteNumInlinks into a score //int32_t numSiteInlinks = *xd->getSiteNumInlinks(); unsigned char hopCount = *getHopCount(); // use spidered time! might not be current time! like if rebuilding // or injecting from a past spider time int32_t discoveryDate = getSpideredTime();//TimeGlobal(); int32_t lostDate = 0; // add in new links for ( int32_t i = 0 ; i < m_links.m_numLinks ; i++ ) { // give up control QUICKPOLL ( m_niceness ); // skip if empty if ( m_links.m_linkLens[i] == 0 ) continue; // . skip if spam, ALWAYS allow internal outlinks though!! // . CAUTION: now we must version islinkspam() bool spam = m_links.isLinkSpam(i) ; // or if it has no link text, skip it //if ( ! links->hasLinkText(i,TITLEREC_CURRENT_VERSION) ) //continue; // get site of outlink from tagrec if in there int32_t linkeeSiteHash32 = linkSiteHashes[i]; /* TagRec *gr = grv[i]; char *site = NULL; int32_t siteLen = 0; if ( gr ) { int32_t dataSize = 0; site = gr->getString("site",NULL,&dataSize); if ( dataSize ) siteLen = dataSize - 1; } // otherwise, make it the host or make it cut off at // a "/user/" or "/~xxxx" or whatever path component if ( ! site ) { // GUESS link site... TODO: augment for /~xxx char *s = m_links.getLink(i); //int32_t slen = m_links.getLinkLen(i); //siteLen = slen; site = ::getHost ( s , &siteLen ); } uint32_t linkeeSiteHash32 = hash32 ( site , siteLen , 0 ); */ // // when setting the links class it should set the site hash // // set this key, it is the entire record key224_t k; k = g_linkdb.makeKey_uk ( linkeeSiteHash32 , m_links.getLinkHash64(i) , spam , // link spam? siteRank , // was quality hopCount, *getIp() , *getDocId() , discoveryDate , lostDate , false , // new add? linkerSiteHash32 , false );// delete? 
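// (rough sketch of what each linkdb key built above records; the exact
//  bit layout lives in g_linkdb.makeKey_uk(), this just restates the
//  intent of the arguments passed in:
//    linkeeSiteHash32        - site hash of the page being linked TO
//    getLinkHash64(i)        - hash of the outlink url itself
//    spam                    - Links::isLinkSpam(i) verdict
//    siteRank / hopCount     - quality signals of the linking doc
//    ip / docId              - identify the linking doc
//    discoveryDate / lostDate- spider time of this doc / 0 for not lost
//    linkerSiteHash32        - site hash of the linking page
//  plus the new-add and delete flags, both passed as false here.)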
/* // debug if ( m_links.getLinkHash64(i) != 0x3df1c439a364e18dLL ) continue; //char c = site[siteLen]; //site[siteLen]=0; //char tmp[1024]; //sprintf(tmp,"xmldoc: hashinglink site=%s sitelen=%"INT32" ", // site,siteLen); //site[siteLen] = c; log(//"%s " "url=%s " "linkeesitehash32=0x%08"XINT32" " "linkersitehash32=0x%08"XINT32" " "urlhash64=0x%16llx " "docid=%"INT64" k=%s", //tmp, m_links.getLink(i), (int32_t)linkeeSiteHash32, linkerSiteHash32, m_links.getLinkHash64(i), *getDocId(), KEYSTR(&k,sizeof(key224_t)) ); */ // store in hash table if ( ! dt->addKey ( &k , NULL ) ) return false; } return true; } // . returns false and sets g_errno on error // . copied Url2.cpp into here basically, so we can now dump Url2.cpp bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) { setStatus ( "hashing url colon" ); // get the first url Url *fu = getFirstUrl(); // set up the hashing parms HashInfo hi; hi.m_hashGroup = HASHGROUP_INTAG; hi.m_tt = tt; // we do not need diversity bits for this hi.m_useCountTable = false; // // HASH url: term // // append a "www." for doing url: searches Url uw; uw.set ( fu->getUrl() , fu->getUrlLen() , true ); hi.m_prefix = "url"; if ( isStatusDoc ) hi.m_prefix = "url2"; if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) ) return false; // use hash of url as score so we can get a # of docs per site est. //uint16_t score = hash16 ( fu->getUrl() , fu->getUrlLen() ); setStatus ( "hashing inurl colon" ); // // HASH inurl: terms // char *s = fu->getUrl (); int32_t slen = fu->getUrlLen(); hi.m_prefix = "inurl"; if ( isStatusDoc ) hi.m_prefix = "inurl2"; if ( ! hashString ( s,slen, &hi ) ) return false; setStatus ( "hashing ip colon" ); // // HASH ip:a.b.c.d // if ( ! m_ipValid ) { char *xx=NULL;*xx=0; } // copy it to save it char ipbuf[64]; int32_t iplen = sprintf(ipbuf,"%s",iptoa(m_ip)); //char *tmp = iptoa ( m_ip ); //int32_t tlen = gbstrlen(tmp); hi.m_prefix = "ip"; if ( isStatusDoc ) hi.m_prefix = "ip2"; if ( ! hashSingleTerm(ipbuf,iplen,&hi) ) return false; // // HASH ip:a.b.c // char *end1 = ipbuf + iplen - 1; while ( *end1 != '.' ) end1--; if ( ! hashSingleTerm(ipbuf,end1-ipbuf,&hi) ) return false; // . sanity check if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; } // get the boost //floatboost1=(float)getBoostFromSiteNumInlinks(m_siteNumInlinks)/100.0 // // HASH the url path plain as if in body // // get number of components in the path int32_t pathDepth = fu->getPathDepth(); // make it a density thing //pathScore /= ( pathDepth + 1 ); // ensure score positive //if ( pathScore <= 0 ) pathScore = 1; // get it char *path = fu->getPath(); int32_t plen = fu->getPathLen(); /* // update it float boost2 = (float)m_urlPathWeight / 100; // again float boost3 = 1.0 / ((float)pathDepth + 1.0) ; // make a description char tmp3[190]; sprintf( tmp3 , "path score = " "siteInlinksBoost * " "urlPathWeight * " "pathDepthBoost * " "256 = %.02f * %.02f * %.02f * 256 " , boost1 , boost2 , boost3 ); */ //int32_t pathScore = (int32_t) (256.0 * boost1 * boost2 * boost3); // update parms //hi.m_desc = tmp3; hi.m_prefix = NULL; hi.m_desc = "url path"; hi.m_hashGroup = HASHGROUP_INURL; // if parm "index article content only" is true, do not index this! 
//if ( m_eliminateMenus ) skipIndex=true; setStatus ( "hashing gbpathdepth"); // // HASH gbpathdepth:X // // xyz.com/foo --> 0 // xyz.com/foo/ --> 1 // xyz.com/foo/boo --> 1 // xyz.com/foo/boo/ --> 2 char buf[20]; int32_t blen = sprintf(buf,"%"INT32"",pathDepth); // update parms hi.m_prefix = "gbpathdepth"; if ( isStatusDoc ) hi.m_prefix = "gbpathdepth2"; hi.m_hashGroup = HASHGROUP_INTAG; // hash gbpathdepth:X if ( ! hashString ( buf,blen,&hi) ) return false; // // HASH gbhopcount:X // setStatus ( "hashing gbhopcount"); if ( ! m_hopCountValid ) { char *xx=NULL;*xx=0; } blen = sprintf(buf,"%"INT32"",(int32_t)m_hopCount); // update parms hi.m_prefix = "gbhopcount"; if ( isStatusDoc ) hi.m_prefix = "gbhopcount2"; hi.m_hashGroup = HASHGROUP_INTAG; // hash gbpathdepth:X if ( ! hashString ( buf,blen,&hi) ) return false; setStatus ( "hashing gbhasfilename"); // // HASH gbhasfilename:0 or :1 // char *hm; if ( fu->getFilenameLen() ) hm = "1"; else hm = "0"; // update parms hi.m_prefix = "gbhasfilename"; if ( isStatusDoc ) hi.m_prefix = "gbhasfilename2"; // hash gbhasfilename:[0|1] if ( ! hashString ( hm,1,&hi) ) return false; setStatus ( "hashing gbiscgi"); // // HASH gbiscgi:0 or gbiscgi:1 // if ( fu->isCgi() ) hm = "1"; else hm = "0"; hi.m_prefix = "gbiscgi"; if ( isStatusDoc ) hi.m_prefix = "gbiscgi2"; if ( ! hashString ( hm,1,&hi) ) return false; setStatus ( "hashing gbext"); // // HASH gbhasext:0 or gbhasext:1 (does it have a fileextension) // // . xyz.com/foo --> gbhasext:0 // . xyz.com/foo.xxx --> gbhasext:1 if ( fu->getExtensionLen() ) hm = "1"; else hm = "0"; hi.m_prefix = "gbhasext"; if ( isStatusDoc ) hi.m_prefix = "gbhasext2"; if ( ! hashString ( hm,1,&hi) ) return false; // // HASH the url's mid domain and host as they were in the body // setStatus ( "hashing site colon terms"); // // HASH the site: terms // // . hash the pieces of the site // . http://host.domain.com/~harry/level1/ should hash to: // . site:host.domain.com/~harry/level1/ // . site:host.domain.com/~harry/ // . site:host.domain.com/~ // . site:host.domain.com/ // . site:domain.com/~harry/level1/ // . site:domain.com/~harry/ // . site:domain.com/~ // . site:domain.com/ // ensure score is positive //if ( siteScore <= 0 ) siteScore = 1; // get the hostname (later we set to domain name) char *name = fu->getHost(); int32_t nameLen = fu->getHostLen(); // . point to the end of the whole thing, including port field // . add in port, if non default char *end3 = name + fu->getHostLen() + fu->getPortLen(); loop: // now loop through the sub paths of this url's path for ( int32_t i = 0 ; ; i++ ) { // get the subpath int32_t len = fu->getSubPathLen(i); // FIX: always include first / if ( len == 0 ) len = 1; // write http://www.whatever.com/path into buf char buf[MAX_URL_LEN+10]; char *p = buf; gbmemcpy ( p , "http://" , 7 ); p += 7; gbmemcpy ( p , name , nameLen ); p += nameLen; gbmemcpy ( p , fu->getPath() , len ); p += len; *p = '\0'; // update hash parms hi.m_prefix = "site"; if ( isStatusDoc ) hi.m_prefix = "site2"; hi.m_hashGroup = HASHGROUP_INURL; // this returns false on failure if ( ! hashSingleTerm (buf,p-buf,&hi ) ) return false; // break when we hash the root path if ( len <=1 ) break; } // now keep moving the period over in the hostname while ( name < end3 && *name != '.' ) { name++; nameLen--; } // skip the '.' name++; nameLen--; // if not '.' we're done if ( name < end3 ) goto loop; setStatus ( "hashing ext colon"); // // HASH ext: term // // i.e. ext:gif ext:html ext:htm ext:pdf, etc. 
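// (worked example, using a hypothetical url http://host.com/dir/page.html:
//  getExtension() points at "html", so the single term ext:html is hashed
//  below; the gbhasext:1 term for such a url was already hashed above.)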
char *ext = fu->getExtension(); int32_t elen = fu->getExtensionLen(); // update hash parms hi.m_prefix = "ext"; if ( isStatusDoc ) hi.m_prefix = "ext2"; if ( ! hashSingleTerm(ext,elen,&hi ) ) return false; setStatus ( "hashing gbdocid" ); hi.m_prefix = "gbdocid"; if ( isStatusDoc ) hi.m_prefix = "gbdocid2"; char buf2[32]; sprintf(buf2,"%"UINT64"",(m_docId) ); if ( ! hashSingleTerm(buf2,gbstrlen(buf2),&hi) ) return false; // if indexing a json diffbot object, index // gbparenturl:xxxx of the original url from which the json was // datamined. we use this so we can act as a diffbot json cache. if ( m_isDiffbotJSONObject ) { setStatus ( "hashing gbparenturl term"); char *p = fu->getUrl() + fu->getUrlLen() - 1; // back up to - as in "http://xyz.com/foo-diffbotxyz123456" for ( ; *p && *p != '-' ; p-- ); // set up the hashing parms hi.m_hashGroup = HASHGROUP_INTAG; hi.m_tt = tt; hi.m_desc = "diffbot parent url"; // append a "www." as part of normalization uw.set ( fu->getUrl() , p - fu->getUrl() , true ); hi.m_prefix = "gbparenturl"; if ( isStatusDoc ) hi.m_prefix = "gbparenturl2"; if ( ! hashSingleTerm(uw.getUrl(),uw.getUrlLen(),&hi) ) return false; } if ( isStatusDoc ) return true; setStatus ( "hashing SiteGetter terms"); // // HASH terms for SiteGetter.cpp // // . this termId is used by SiteGetter.cpp for determining subsites // . matches what is in SiteGet::getSiteList() // for www.xyz.com/a/ HASH www.xyz.com // for www.xyz.com/a/b/ HASH www.xyz.com/a/ // for www.xyz.com/a/b/c/ HASH www.xyz.com/a/b/ bool add = true; // we only hash this for urls that end in '/' if ( s[slen-1] != '/' ) add = false; // and no cgi if ( fu->isCgi() ) add = false; // skip if root if ( fu->m_plen <= 1 ) add = false; // sanity check if ( ! m_linksValid ) { char *xx=NULL; *xx=0; } // . skip if we have no subdirectory outlinks // . that way we do not confuse all the pages in dictionary.com or // wikipedia.org as subsites!! if ( ! m_links.hasSubdirOutlink() ) add = false; char *host = fu->getHost (); int32_t hlen = fu->getHostLen (); // tags from here out hi.m_hashGroup = HASHGROUP_INTAG; hi.m_shardByTermId = true; // hash it if ( add ) { // remove the last path component char *end2 = s + slen - 2; // back up over last component for ( ; end2 > fu->m_path && *end2 != '/' ; end2-- ) ; // hash that part of the url hi.m_prefix = "siteterm"; if ( ! hashSingleTerm ( host,end2-host,&hi) ) return false; } hi.m_shardByTermId = false; setStatus ( "hashing urlhashdiv10 etc"); // // HASH urlhash: urlhashdiv10: urlhashdiv100: terms // // this is for proving how many docs are in the index uint32_t h = hash32 ( s , slen ); blen = sprintf(buf,"%"UINT32"",h); hi.m_prefix = "urlhash"; if ( ! hashString(buf,blen,&hi) ) return false; blen = sprintf(buf,"%"UINT32"",h/10); // update hashing parms hi.m_prefix = "urlhashdiv10"; if ( ! hashString(buf,blen,&hi) ) return false; blen = sprintf(buf,"%"UINT32"",h/100); // update hashing parms hi.m_prefix = "urlhashdiv100"; if ( ! hashString(buf,blen,&hi) ) return false; setStatus ( "hashing url mid domain"); // the final score //int32_t plainScore = (int32_t)(256.0 * boost1 * boost2 * fw); // update parms hi.m_prefix = NULL; hi.m_desc = "middle domain";//tmp3; hi.m_hashGroup = HASHGROUP_INURL; // if parm "index article content only" is true, do not index this! //if ( m_eliminateMenus ) plainScore = 0; //char *mid = fu->getMidDomain (); //int32_t mlen = fu->getMidDomainLen(); //hi.m_desc = "url mid dom"; //if ( ! hashString ( mid,mlen ,&hi ) ) return false; //hi.m_desc = "url host"; if ( ! 
hashString ( host,hlen,&hi)) return false; setStatus ( "hashing url path"); // hash the path plain if ( ! hashString (path,plen,&hi) ) return false; return true; } ///////////// // // CHROME DETECTION // // we search for these terms we hash here in getSectionsWithDupStats() // so we can remove chrome. // ///////////// // . returns false and sets g_errno on error // . copied Url2.cpp into here basically, so we can now dump Url2.cpp bool XmlDoc::hashSections ( HashTableX *tt ) { //if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; } //if ( m_contentType == CT_HTML ) return true; setStatus ( "hashing sections" ); if ( ! m_sectionsValid ) { char *xx=NULL;*xx=0; } if ( ! m_siteValid ) { char *xx=NULL;*xx=0; } Sections *ss = &m_sections; int32_t siteHash32 = *getSiteHash32(); // set up the hashing parms HashInfo hi; hi.m_hashGroup = HASHGROUP_INTAG; hi.m_tt = tt; // the prefix is custom set for each section below //hi.m_prefix = "gbsectionhash"; // put all guys with the same xpath/site on the same shard hi.m_shardByTermId = true; Section *si = ss->m_rootSection; for ( ; si ; si = si->m_next ) { // breathe QUICKPOLL(m_niceness); // . skip if empty // . this needs to be like 48 bits because 32 bits is not // big enought! //uint64_t ih64 = si->m_sentenceContentHash64; // don't bother with the section if it doesn't have this set // because this eliminates parent dupage to reduce amount // of gbxpathsitehash123456 terms we index if ( ! ( si->m_flags & SEC_HASHXPATH ) ) continue; // skip if sentence, only hash tags now i guess for diffbot //if ( si->m_sentenceContentHash64 ) // continue; // get hash of sentences this tag contains indirectly uint32_t val32 = (uint32_t)si->m_indirectSentHash64; if ( ! val32 ) continue; // the termid is now the xpath and the sitehash, the "value" // will be the hash of the innerhtml, m_sentenceContentHash64 uint64_t thash64 = (uint32_t)si->m_turkTagHash32; // combine with site hash thash64 ^= (uint32_t)siteHash32; // this is a special hack we need to make it the // hash of the inner html //hi.m_sentHash32 = (uint32_t)ih64; // . get section xpath & site hash // . now if user does a gbfacets:gbxpathsitehashxxxxxx query // he will get back a histogram of the values it hash, // which are 32-bit hashes of the innerhtml for that // xpath on this site. char prefix[96]; sprintf(prefix,"gbxpathsitehash%"UINT64"",thash64); // like a normal key but we store "ih64" the innerHTML hash // of the section into the key instead of wordbits etc. // similar to hashNumber*() functions. //if ( ! hashSectionTerm ( term , &hi, (uint32_t)ih64 ) ) // return false; // i guess use facets hi.m_prefix = prefix; // we already have the hash of the inner html of the section hashFacet2 ( "gbfacetstr", prefix, //(int32_t)(uint32_t)ih64 , val32, hi.m_tt , // shard by termId? true ); } return true; } // . returns false and sets g_errno on error bool XmlDoc::hashIncomingLinkText ( HashTableX *tt , bool hashAnomalies , bool hashNonAnomalies ) { // do not index ANY of the body if it is NOT a permalink and // "menu elimination" technology is enabled. //if ( ! *getIsPermalink() && m_eliminateMenus ) return true; setStatus ( "hashing link text" ); // . now it must have an rss item to be indexed in all its glory // . but if it tells us it has an rss feed, toss it and wait for // the feed.... BUT sometimes the rss feed outlink is 404! // . NO, now we discard with ENORSS at Msg16.cpp //if ( ! 
*getHasRSSItem() && m_eliminateMenus ) return true; // sanity check if ( hashAnomalies == hashNonAnomalies ) { char *xx = NULL; *xx =0; } // display this note in page parser char *note = "hashing incoming link text"; // sanity if ( ! m_linkInfo1Valid ) { char *xx=NULL;*xx=0; } if ( ! m_linkInfo2Valid ) { char *xx=NULL;*xx=0; } // . finally hash in the linkText terms from the LinkInfo // . the LinkInfo class has all the terms of hashed anchor text for us // . if we're using an old TitleRec linkTermList is just a ptr to // somewhere in TitleRec // . otherwise, we generated it from merging a bunch of LinkInfos // and storing them in this new TitleRec LinkInfo *info1 = getLinkInfo1 (); LinkInfo **pinfo2 = getLinkInfo2 (); LinkInfo *info2 = *pinfo2; LinkInfo *linkInfo = info1; // pick the one with the most inlinks with valid incoming link text, // otherwise, we end up with major bias when we stop importing // link text from another cluster, because some pages will have // twice as many links as they should! if ( info2 && info2->getNumLinkTexts() > info1->getNumLinkTexts() ) { linkInfo = info2; note = "hashing incoming link text from other cluster"; } // sanity checks if ( ! m_ipValid ) { char *xx=NULL;*xx=0; } if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; } // // brought the following code in from LinkInfo.cpp // int32_t noteLen = 0; if ( note ) noteLen = gbstrlen ( note ); // count "external" inlinkers int32_t ecount = 0; // update hash parms HashInfo hi; hi.m_tt = tt; hi.m_useSynonyms = true; // hashstring should update this like a cursor. hi.m_startDist = 0; // loop through the link texts and hash them for ( Inlink *k = NULL; (k = linkInfo->getNextInlink(k)) ; ) { // is this inlinker internal? bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff)); // count external inlinks we have for indexing gbmininlinks: if ( ! internal ) ecount++; // get score //int64_t baseScore = k->m_baseScore; // get the weight //int64_t ww ; //if ( internal ) ww = m_internalLinkTextWeight; //else ww = m_externalLinkTextWeight; // modify the baseScore //int64_t final = (baseScore * ww) / 100LL; // get length of link text int32_t tlen = k->size_linkText; if ( tlen > 0 ) tlen--; // get the text char *txt = k->getLinkText(); // sanity check if ( ! verifyUtf8 ( txt , tlen ) ) { log("xmldoc: bad link text 2 from url=%s for %s", k->getUrl(),m_firstUrl.m_url); continue; } // if it is anomalous, set this, we don't //if ( k->m_isAnomaly ) // hi.m_hashIffNotUnique = true; //hi.m_baseScore = final; if ( internal ) hi.m_hashGroup = HASHGROUP_INTERNALINLINKTEXT; else hi.m_hashGroup = HASHGROUP_INLINKTEXT; // store the siterank of the linker in this and use that // to set the multiplier M bits i guess hi.m_linkerSiteRank = k->m_siteRank; // now record this so we can match the link text to // a matched offsite inlink text term in the scoring info k->m_wordPosStart = m_dist; // hi.m_startDist; // . hash the link text into the table // . returns false and sets g_errno on error // . we still have the score punish from # of words though! // . for inlink texts that are the same it should accumulate // and use the reserved bits as a multiplier i guess... if ( ! hashString ( txt,tlen,&hi) ) return false; // now record this so we can match the link text to // a matched offsite inlink text term in the scoring info //k->m_wordPosEnd = hi.m_startDist; // spread it out hi.m_startDist += 20; } /* // . hash gbkeyword:numinlinks where score is # of inlinks from 1-255 // . 
do not hash gbkeyword:numinlinks if we don't got any if ( ecount <= 0 ) return true; // limit it since our score can't be more than 255 (8-bits) //if ( ecount > 255 ) ecount = 255; // convert our 32 bit score to 8-bits so we trick it! //int32_t score = score8to32 ( (uint8_t)ecount ); // watch out for wrap //if ( score < 0 ) score = 0x7fffffff; // update hash parms HashInfo hi; hi.m_tt = tt; hi.m_prefix = "gbkeyword"; hi.m_hashGroup = HASHGROUP_INTAG; // for terms where word position/density/diversity is irrelevant, // we can store this value... hi.m_fakeValue = ecount; // hash gbkeyword:numinlinks term if ( ! hashString ( "numinlinks",10,&hi ) )return false; */ return true; } // . returns false and sets g_errno on error bool XmlDoc::hashNeighborhoods ( HashTableX *tt ) { // seems like iffUnique is off, so do this //if ( ! *getIsPermalink() && m_eliminateMenus ) return true; setStatus ( "hashing neighborhoods" ); //g_tt = table; // . now we also hash the neighborhood text of each inlink, that is, // the text surrounding the inlink text. // . this is also destructive in that it will remove termids that // were not in the document being linked to in order to save // space in the titleRec // . now we only do one or the other, not both LinkInfo *info1 = getLinkInfo1 (); LinkInfo **pinfo2 = getLinkInfo2 (); LinkInfo *info2 = *pinfo2; LinkInfo *linkInfo = info1; char *note = " (internal cluster)"; // pick the one with the most inlinks with valid incoming link text // otherwise, we end up with major bias when we stop importing // link text from another cluster, because some pages will have // twice as many links as they should! if ( info2 && info2->getNumLinkTexts() > info1->getNumLinkTexts() ) { linkInfo = info2; note = " (external cluster)"; } // loop over all the Inlinks Inlink *k = NULL; loop: // get the next inlink k = linkInfo->getNextInlink( k ); // break if done if ( ! k ) return true; // skip if internal, they often have the same neighborhood text if ( (k->m_ip&0x0000ffff)==(m_ip&0x0000ffff) ) goto loop; // get the left and right texts and hash both char *s = k->getSurroundingText(); if ( ! s || k->size_surroundingText <= 1 ) goto loop; //int32_t inlinks = *getSiteNumInlinks(); // HACK: to avoid having to pass a flag to TermTable, then to // Words::hash(), Phrases::hash(), etc. just flip a bit in the // table to make it not add anything unless it is already in there. tt->m_addIffNotUnique = true; // update hash parms HashInfo hi; hi.m_tt = tt; hi.m_desc = "surrounding text"; hi.m_hashGroup = HASHGROUP_NEIGHBORHOOD; // . hash that // . this returns false and sets g_errno on error int32_t len = k->size_surroundingText - 1; if ( ! hashString ( s, len, &hi ) ) return false; // now turn it back off tt->m_addIffNotUnique = false; // get the next Inlink goto loop; return true; } // . returns false and sets g_errno on error bool XmlDoc::hashRSSInfo ( HashTableX *tt ) { setStatus ( "hashing rss info" ); uint8_t *ct = getContentType(); if ( ! ct || ct == (void *)-1 ) { char *xx=NULL;*xx=0; } // . finally hash in the linkText terms from the LinkInfo // . the LinkInfo class has all the terms of hashed anchor text for us // . if we're using an old TitleRec linkTermList is just a ptr to // somewhere in TitleRec // . otherwise, we generated it from merging a bunch of LinkInfos // and storing them in this new TitleRec LinkInfo *linkInfo = getLinkInfo1(); // get the xml of the first rss/atom item/entry referencing this url Xml xml; // . returns NULL if no item xml // . 
this could also be a "channel" blurb now, so we index channel pgs if ( ! linkInfo->getItemXml ( &xml , m_niceness ) ) return false; if ( xml.isEmpty() ) // hash gbrss:0 return hashRSSTerm ( tt , false ); // parser info msg //if ( m_pbuf ) { // m_pbuf->safePrintf( // "
// --BEGIN RSS/ATOM INFO HASH--
"); //} // hash nothing if not a permalink and eliminating "menus" //if ( ! *getIsPermalink() && m_eliminateMenus ) return true; // . IMPORTANT: you must be using the new link algo, so turn it on // in the spider controls. this allows us to include LinkTexts from // the same IP in our LinkInfo class in the TitleRec. // . is it rss or atom? both use title tag, so doesn't matter // . get the title tag bool isHtmlEncoded; int32_t titleLen; char *title = xml.getRSSTitle ( &titleLen , &isHtmlEncoded ); char c = 0; // sanity check if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; } bool hashIffUnique = true; // but if we had no content because we were an mp3 or whatever, // do not worry about avoiding double hashing if ( size_utf8Content <= 0 ) hashIffUnique = false; // decode it? // should we decode it? if they don't use [CDATA[]] then we should // ex: http://www.abc.net.au/rn/podcast/feeds/lawrpt.xml has CDATA, // but most other feeds do not use it if ( isHtmlEncoded && title && titleLen > 0 ) { // it is html encoded so that the <'s are encoded to <'s so // we must decode them back. this could turn latin1 into utf8 // though? no, because the &'s should have been encoded, too! int32_t newLen =htmlDecode(title,title,titleLen,false,m_niceness); // make sure we don't overflow the buffer if ( newLen > titleLen ) { char *xx = NULL; *xx = 0; } // reassign the length titleLen = newLen; // NULL terminate it c = title[titleLen]; title[titleLen] = '\0'; } // update hash parms HashInfo hi; hi.m_tt = tt; hi.m_hashGroup = HASHGROUP_TITLE; hi.m_desc = "rss title"; // . hash the rss title // . only hash the terms if they are unique to stay balanced with docs // that are not referenced by an rss feed bool status = hashString ( title,titleLen,&hi ) ; // pop the end back just in case if ( c ) title[titleLen] = c; // return false with g_errno set on error if ( ! status ) return false; // get the rss description int32_t descLen; char *desc = xml.getRSSDescription ( &descLen , &isHtmlEncoded ); // for adavanced hashing Xml xml2; Words w; //Scores scores; Words *wordsPtr = NULL; //Scores *scoresPtr = NULL; c = 0; // should we decode it? if they don't use [CDATA[]] then we should // ex: http://www.abc.net.au/rn/podcast/feeds/lawrpt.xml has CDATA, // but most other feeds do not use it if ( isHtmlEncoded && desc && descLen > 0 ) { // it is html encoded so that the <'s are encoded to <'s so // we must decode them back. this could turn latin1 into utf8 // though? no, because the &'s should have been encoded, too! int32_t newLen = htmlDecode(desc,desc,descLen,false,m_niceness); // make sure we don't overflow the buffer if ( newLen > descLen ) { char *xx = NULL; *xx = 0; } // reassign the length descLen = newLen; } // NULL terminate it if ( desc ) { c = desc[descLen]; desc[descLen] = '\0'; // set the xml class from the decoded html if ( ! xml2.set ( desc , descLen , false , // own data? 0 , // allocSize false , // pure xml? m_version , true , // set parents? m_niceness , *ct ) ) return false; // set the words class from the xml, returns false and sets // g_errno on error if ( ! w.set ( &xml2 , true , // compute Ids true ))// has html ents? (WERE encoded twice!) return false; // pass it in to TermTable::hash() below wordsPtr = &w; } // update hash parms hi.m_tt = tt; hi.m_desc = "rss body"; hi.m_hashGroup = HASHGROUP_BODY; // . hash the rss/atom description // . 
	//   only hash the terms if they are unique to stay balanced with docs
	//   that are not referenced by an rss feed
	status = hashString ( desc, descLen, &hi );
	// pop the end back just in case
	if ( c ) desc[descLen] = c;
	// return false with g_errno set
	if ( ! status ) return false;
	// hash gbrss:1
	if ( ! hashRSSTerm ( tt , true ) ) return false;
	// parser info msg
	//if ( m_pbuf ) {
	//	m_pbuf->safePrintf ( "--END RSS/ATOM INFO HASH--
"); //} return true; } bool XmlDoc::hashRSSTerm ( HashTableX *tt , bool inRSS ) { // hash gbrss:0 or gbrss:1 char *value; if ( inRSS ) value = "1"; else value = "0"; // update hash parms HashInfo hi; hi.m_tt = tt; hi.m_prefix = "gbinrss"; hi.m_hashGroup = HASHGROUP_INTAG; // returns false and sets g_errno on error if ( ! hashString(value,1,&hi ) ) return false; // hash gbisrss:1 if we are an rss page ourselves if ( *getIsRSS() ) value = "1"; else value = "0"; // update hash parms hi.m_prefix = "gbisrss"; // returns false and sets g_errno on error if ( ! hashString(value,1,&hi) ) return false; return true; } // . we now do the title hashing here for newer titlerecs, version 80+, rather // than use the block in the ruleset for titles. // . this is not to be confused with hashing the title: terms which still // does have an block in the ruleset. // . the new Weights class hashes title as part of body now with a high weight // given by "titleWeight" parm bool XmlDoc::hashTitle ( HashTableX *tt ) { // sanity check if ( m_hashedTitle ) { char *xx=NULL ; *xx=0; } setStatus ( "hashing title" ); // this has been called, note it m_hashedTitle = true; nodeid_t *tids = m_words.m_tagIds; int32_t nw = m_words.m_numWords; // find the first tag in the doc int32_t i ; for ( i = 0 ; i < nw ; i++ ) if ( tids[i] == TAG_TITLE ) break; // return true if no title if ( i >= nw ) return true; // skip tag i++; // mark it as start of title int32_t a = i; // limit end int32_t max = i + 40; if ( max > nw ) max = nw; // find end of title, either another <title> or a <title> tag for ( ; i < max ; i++ ) if ( (tids[i] & BACKBITCOMP) == TAG_TITLE ) break; // ends on a <title> tag? if ( i == a ) return true; HashInfo hi; hi.m_tt = tt; hi.m_prefix = "title"; hi.m_useSynonyms= true; // the new posdb info hi.m_hashGroup = HASHGROUP_TITLE; // . hash it up! use 0 for the date // . use XmlDoc::hashWords() // . use "title" as both prefix and description //if ( ! hashWords (a,i,&hi ) ) return false; char **wptrs = m_words.getWords(); int32_t *wlens = m_words.getWordLens(); char *title = wptrs[a]; char *titleEnd = wptrs[i-1] + wlens[i-1]; int32_t titleLen = titleEnd - title; if ( ! hashString ( title, titleLen, &hi) ) return false; // now hash as without title: prefix hi.m_prefix = NULL; if ( ! hashString ( title, titleLen, &hi) ) return false; return true; } // . we now do the title hashing here for newer titlerecs, version 80+, rather // than use the <index> block in the ruleset for titles. // . this is not to be confused with hashing the title: terms which still // does have an <index> block in the ruleset. bool XmlDoc::hashBody2 ( HashTableX *tt ) { // do not index ANY of the body if it is NOT a permalink and // "menu elimination" technology is enabled. //if ( ! *getIsPermalink() && m_eliminateMenus ) return true; setStatus ( "hashing body" ); // if more than X% of words are spammed to some degree, index all // words with a minimum score //int64_t x[] = {30,40,50,70,90}; //int64_t y[] = {6,8,10,20,30}; //int32_t mp = getY ( *getSiteNumInlinks8() , x , y , 5 ); //int32_t nw = m_words.getNumWords(); // record this m_bodyStartPos = m_dist; m_bodyStartPosValid = true; HashInfo hi; hi.m_tt = tt; hi.m_desc = "body"; hi.m_useSynonyms= true; hi.m_hashGroup = HASHGROUP_BODY; // use NULL for the prefix return hashWords (&hi ); } bool XmlDoc::hashMetaKeywords ( HashTableX *tt ) { // do not index meta tags if "menu elimination" technology is enabled. 
	//if ( m_eliminateMenus ) return true;

	setStatus ( "hashing meta keywords" );

	// hash the meta keywords tag
	//char buf [ 2048 + 2 ];
	//int32_t len=m_xml.getMetaContentPointer ( buf , 2048 , "keywords" , 8 );
	int32_t mklen;
	char *mk = getMetaKeywords( &mklen );

	// update hash parms
	HashInfo hi;
	hi.m_tt = tt;
	hi.m_desc = "meta keywords";
	hi.m_hashGroup = HASHGROUP_INMETATAG;

	// call XmlDoc::hashString
	return hashString ( mk , mklen , &hi );
}

// . hash the meta summary, description and keyword tags
// . we now do the title hashing here for newer titlerecs, version 80+, rather
//   than use the <index> block in the ruleset for titles.
bool XmlDoc::hashMetaSummary ( HashTableX *tt ) {

	// sanity check
	if ( m_hashedMetas ) { char *xx=NULL ; *xx=0; }

	// this has been called, note it
	m_hashedMetas = true;

	// do not index meta tags if "menu elimination" technology is enabled.
	//if ( m_eliminateMenus ) return true;

	setStatus ( "hashing meta summary" );

	// hash the meta summary tag
	//char buf [ 2048 + 2 ];
	//int32_t len = m_xml.getMetaContent ( buf , 2048 , "summary" , 7 );
	int32_t mslen;
	char *ms = getMetaSummary ( &mslen );

	// update hash parms
	HashInfo hi;
	hi.m_tt = tt;
	hi.m_hashGroup = HASHGROUP_INMETATAG;
	hi.m_desc = "meta summary";

	// hash it
	if ( ! hashString ( ms , mslen , &hi ) ) return false;

	//len = m_xml.getMetaContent ( buf , 2048 , "description" , 11 );
	int32_t mdlen;
	char *md = getMetaDescription ( &mdlen );

	// update hashing parms
	hi.m_desc = "meta desc";

	// . TODO: only hash if unique????? set a flag on ht then i guess
	if ( ! hashString ( md , mdlen , &hi ) ) return false;

	return true;
}

//bool XmlDoc::linksToGigablast ( ) {
//	// check m_links for a link to gigablast.com or www.gigablast.com
//	return m_links.linksToGigablast();
//}

bool XmlDoc::searchboxToGigablast ( ) {
	// . they may have a form variable like
	// . <form method=get action=http://www.gigablast.com/cgi/0.cgi name=f>
	return m_xml.hasGigablastForm();
}

// . bring back support for dmoz integration
// .
when clicking on a "search within this category" it does a gbpdcat:<catid> // search to capture all pages that have that dmoz category as one of their // parent topics bool XmlDoc::hashDMOZCategories ( HashTableX *tt ) { getDmozTitles(); char *titlePtr = ptr_dmozTitles; char *sumPtr = ptr_dmozSumms; //char *anchPtr = ptr_dmozAnchors; char buf[128]; HashInfo hi; hi.m_tt = tt; hi.m_hashGroup = HASHGROUP_INTAG; int32_t *catIds = (int32_t *)ptr_catIds; int32_t numCatIds = size_catIds / 4; // go through the catIds and hash them for (int32_t i = 0; i < numCatIds; i++) { // write the catid as a string sprintf(buf, "%"UINT32"", (uint32_t)catIds[i]); // term prefix for hashing hi.m_prefix = "gbcatid"; // hash it hashString ( buf , gbstrlen(buf) , &hi ); // we also want to hash the parents int32_t currCatId = catIds[i]; int32_t currParentId = catIds[i]; int32_t currCatIndex; // loop to the Top, Top = 1 while ( currCatId > 1 ) { // hash the parent sprintf(buf, "%"UINT32"", (uint32_t)currParentId); hi.m_prefix = "gbpcatid"; hashString ( buf , gbstrlen(buf), &hi ); // next cat currCatId = currParentId; // get the index for this cat currCatIndex = g_categories->getIndexFromId(currCatId); if ( currCatIndex <= 0 ) break; // get the parent for this cat currParentId = g_categories->m_cats[currCatIndex].m_parentid; } // do not hash titles or summaries if "index article content // only" parm is on //if ( tr->eliminateMenus() ) continue; // hash dmoz title hi.m_prefix = NULL; // call this DMOZ title as regular title i guess hi.m_hashGroup = HASHGROUP_TITLE; // hash the DMOZ title hashString ( titlePtr , gbstrlen(titlePtr), &hi ); // next title titlePtr += gbstrlen(titlePtr) + 1; // hash DMOZ summary hi.m_prefix = NULL; // call this DMOZ summary as body i guess hi.m_hashGroup = HASHGROUP_BODY; // hash the DMOZ summary hashString ( sumPtr , gbstrlen(sumPtr), &hi ); // next summary sumPtr += gbstrlen(sumPtr) + 1; } int32_t numIndCatIds = size_indCatIds / 4; int32_t *indCatIds = (int32_t *)ptr_indCatIds; // go through the INDIRECT catIds and hash them for (int32_t i = 0 ; i < numIndCatIds; i++) { // write the catid as a string sprintf(buf, "%"UINT32"", (uint32_t)indCatIds[i]); // use prefix hi.m_prefix = "gbicatid"; hi.m_hashGroup = HASHGROUP_INTAG; // hash it hashString ( buf , gbstrlen(buf), &hi ); // we also want to hash the parents int32_t currCatId = indCatIds[i]; int32_t currParentId = indCatIds[i]; int32_t currCatIndex; // loop to the Top, Top = 1 while (currCatId > 1) { // hash the parent sprintf(buf, "%"UINT32"", (uint32_t)currParentId); // new prefix hi.m_prefix = "gbipcatid"; // hash it hashString ( buf , gbstrlen(buf), &hi ); // next cat currCatId = currParentId; // get the index for this cat currCatIndex = g_categories->getIndexFromId(currCatId); if ( currCatIndex <= 0 ) break; // get the parent for this cat currParentId = g_categories->m_cats[currCatIndex].m_parentid; } } return true; } bool XmlDoc::hashLanguage ( HashTableX *tt ) { setStatus ( "hashing language" ); int32_t langId = (int32_t)*getLangId(); char s[32]; // numeric langid int32_t slen = sprintf(s, "%"INT32"", langId ); // update hash parms HashInfo hi; hi.m_tt = tt; hi.m_hashGroup = HASHGROUP_INTAG; hi.m_prefix = "gblang"; if ( ! hashString ( s, slen, &hi ) ) return false; // try lang abbreviation sprintf(s , "%s ", getLangAbbr(langId) ); // go back to broken way to try to fix parsing consistency bug // by adding hashLanguageString() function below //sprintf(s , "%s ", getLangAbbr(langId) ); if ( ! 
hashString ( s, slen, &hi ) ) return false; return true; } bool XmlDoc::hashLanguageString ( HashTableX *tt ) { setStatus ( "hashing language string" ); int32_t langId = (int32_t)*getLangId(); // update hash parms HashInfo hi; hi.m_tt = tt; hi.m_hashGroup = HASHGROUP_INTAG; hi.m_prefix = "gblang"; // try lang abbreviation char s[32]; int32_t slen = sprintf(s , "%s ", getLangAbbr(langId) ); // go back to broken way to try to fix parsing consistency bug if ( ! hashString ( s, slen, &hi ) ) return false; return true; } bool XmlDoc::hashCountry ( HashTableX *tt ) { setStatus ( "hashing country" ); //uint16_t *cids = getCountryIds(); //if ( ! cids ) return true; //if ( cids == (uint16_t *)-1 ) return false; uint16_t *cid = getCountryId(); if ( ! cid || cid == (uint16_t *)-1 ) return false; // update hash parms HashInfo hi; hi.m_tt = tt; hi.m_hashGroup = HASHGROUP_INTAG; hi.m_prefix = "gbcountry"; for ( int32_t i = 0 ; i < 1 ; i++ ) { // get the ith country id //int32_t cid = cids[i]; // convert it char buf[32]; int32_t blen = sprintf(buf,"%s", g_countryCode.getAbbr(*cid) ); // hash it if ( ! hashString ( buf, blen, &hi ) ) return false; } // all done return true; } bool XmlDoc::hashSiteNumInlinks ( HashTableX *tt ) { setStatus ( "hashing site num inlinks" ); char s[32]; int32_t slen = sprintf(s, "%"INT32"", (int32_t)*getSiteNumInlinks() ); // update hash parms HashInfo hi; hi.m_tt = tt; hi.m_hashGroup = HASHGROUP_INTAG; hi.m_prefix = "gbsitenuminlinks"; // hack test // slen = sprintf(s,"%"UINT32"", // ((uint32_t)m_firstUrl.getUrlHash32()) % 1000); // log("xmldoc: sitenuminlinks for %s is %s",m_firstUrl.getUrl(),s); return hashString ( s, slen, &hi ); } bool XmlDoc::hashCharset ( HashTableX *tt ) { setStatus ( "hashing charset" ); char s[128]; // charset string int32_t slen; // hash the charset as a string if ( ! get_charset_str(*getCharset())) slen = sprintf(s, "unknown"); else slen = sprintf(s, "%s", get_charset_str(*getCharset())); // update hash parms HashInfo hi; hi.m_tt = tt; hi.m_hashGroup = HASHGROUP_INTAG; hi.m_prefix = "gbcharset"; if ( ! hashString ( s,slen, &hi ) ) return false; // hash charset as a number slen = sprintf(s, "%d", *getCharset()); return hashString ( s,slen, &hi ) ; } // . only hash certain tags (single byte scores and ST_COMMENT) // . do not hash clocks, ST_SITE, ST_COMMENT // . term = gbtag:blog1 score=0-100 // . term = gbtag:blog2 score=0-100 // . term = gbtag:english1 score=0-100 // . term = gbtag:pagerank1 score=0-100, etc. ... // . term = gbtagmeta:"this site"(special hashing,ST_META,score=qlty) // . later we can support query like gbtag:english1>30 bool XmlDoc::hashTagRec ( HashTableX *tt ) { setStatus ( "hashing tag rec" ); //char *field = "gbtag:"; //int32_t fieldlen = gbstrlen(field); //bool retval = true; // . this tag rec does not have the ST_SITE tag in it to save space // . it does not have clocks either? TagRec *gr = getTagRec(); // count occurence of each tag id //int16_t count [ LAST_TAG ]; //memset ( count , 0 , 2 * LAST_TAG ); // loop over all tags in the title rec for ( Tag *tag = gr->getFirstTag(); tag ; tag = gr->getNextTag(tag) ) { // breathe QUICKPOLL(m_niceness); // get id int32_t type = tag->m_type; // skip tags we are not supposed to index, like // ST_CLOCK, etc. or anything with a dataSize not 1 if ( ! tag->isIndexable() ) continue; // hash these metas below //if ( type == ST_META ) continue; //if ( tag->isType("meta") ) continue; // only single byters. this should have been covered by the // isIndexable() function. 
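		// For illustration, a minimal sketch of what the rest of this
		// loop does for one indexable tag. The tag name/value here are
		// made up, mirroring the "gbtagspam:33" example mentioned
		// below:
		//
		//	// a tag of type "spam" whose data is the string "33"
		//	char prefix[64];
		//	sprintf ( prefix , "gbtag%s" , "spam" );   // "gbtagspam"
		//	HashInfo hi;
		//	hi.m_tt = tt;
		//	hi.m_prefix = prefix;
		//	hi.m_hashGroup = HASHGROUP_INTAG;
		//	// indexes the term gbtagspam:33
		//	hashString ( "33" , 2 , &hi );
		//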
//if ( tag->getTagDataSize() != 1 ) continue; // get the name char *str = getTagStrFromType ( type ); // get data size //uint8_t *data = (uint8_t *)tag->getTagData(); // make it a string //char dataStr[6]; //sprintf ( dataStr , "%"INT32"",(int32_t)*data ); // skip if has non numbers //bool num = true; //for ( int32_t i = 0 ; i < tag->getTagDataSize() ; i++ ) // if ( ! is_digit(tag->getTagData()[i]) ) num = false; // skip if it has more than just digits, we are not indexing // strings at this point //if ( ! num ) continue; // point to it, should be a NULL terminated string char *dataStr = tag->getTagData(); // skip if number is too big //int32_t val = atol ( dataStr ); // boost by one so we can index "0" score //val++; // we really only want to index scores from 0-255 //if ( val > 255 ) continue; // no negatives //if ( val <= 0 ) continue; // count occurence //count [ type ]++; // . make the term name to hash after the gbtag: // . we want to hash "gbtag:english3" for example, for the // ST_ENGLISH tag id. char prefix[64]; // . do not include the count for the first occurence // . follows the gbruleset:36 convention // . index gbtagspam:0 or gbtagspam:1, etc.!!! //if ( count[type] == 1 ) sprintf ( prefix , "gbtag%s",str); // assume that is good enough //char *prefix = tmp; // store prefix into m_wbuf so XmlDoc::print() works! //if ( m_pbuf ) { // int32_t tlen = gbstrlen(tmp); // m_wbuf.safeMemcpy(tmp,tlen+1); // prefix = m_wbuf.getBuf() - (tlen+1); //} //else // sprintf ( tmp , "gbtag%s%"INT32"",str,(int32_t)count[type]); // "unmap" it so when it is hashed it will have the correct // 8-bit score. IndexList.cpp will convert it back to 8 bits // in IndexList::set(table), which sets our termlist from // this "table". //int32_t score = score8to32 ( val ); // we already incorporate the score as a string when we hash // gbtagtagname:tagvalue so why repeat it? //int32_t score = 1; // update hash parms HashInfo hi; hi.m_tt = tt; hi.m_prefix = prefix; hi.m_hashGroup = HASHGROUP_INTAG; // meta is special now if ( tag->isType("meta") ) { hi.m_prefix = NULL; } // hash it. like "gbtagenglish:1" with a score of 1, etc. // or "gbtagspam:33" with a score of 33. this would also // hash gbtagclock:0xfe442211 type things as well. int32_t dlen = gbstrlen(dataStr); if ( ! hashString ( dataStr,dlen,&hi ) ) return false; } return true; } bool XmlDoc::hashPermalink ( HashTableX *tt ) { setStatus ( "hashing is permalink" ); // put a colon in there so it can't be faked using a meta tag. char *s = "0"; if ( *getIsPermalink() ) s = "1"; // update hash parms HashInfo hi; hi.m_tt = tt; hi.m_hashGroup = HASHGROUP_INTAG; hi.m_prefix = "gbpermalink"; return hashString ( s,1,&hi ); } //hash the tag pair vector, the gigabit vector and the sample vector bool XmlDoc::hashVectors ( HashTableX *tt ) { setStatus ( "hashing vectors" ); int32_t score = *getSiteNumInlinks8() * 256; if ( score <= 0 ) score = 1; char buf[32]; uint32_t h; //char *field; //char *descr; //h = m_tagVector.getVectorHash(); uint32_t tph = *getTagPairHash32(); int32_t blen = sprintf(buf,"%"UINT32"", tph); //field = "gbtagvector"; //descr = "tag vector hash"; // update hash parms HashInfo hi; hi.m_tt = tt; hi.m_hashGroup = HASHGROUP_INTAG; hi.m_prefix = "gbtagvector"; hi.m_desc = "tag vector hash"; hi.m_shardByTermId = true; // this returns false on failure if ( ! 
hashString ( buf,blen, &hi ) ) return false; h = *getGigabitVectorScorelessHash(); blen = sprintf(buf,"%"UINT32"",(uint32_t)h); // udpate hash parms hi.m_prefix = "gbgigabitvector"; hi.m_desc = "gigabit vector hash"; // this returns false on failure if ( ! hashString ( buf,blen,&hi) ) return false; // . dup checking uses the two hashes above, not this hash!!! MDW // . i think this vector is just used to see if the page changed // significantly since last spidering // . it is used by getPercentChanged() and by Dates.cpp // . sanity check //if ( ! m_pageSampleVecValid ) { char *xx=NULL;*xx=0; } //int32_t *pc = m_pageSampleVec; //h = hash32((char *)m_pageSampleVec, SAMPLE_VECTOR_SIZE); //blen = sprintf(buf,"%"UINT32"",(int32_t unsigned int)h); //field = "gbsamplevector"; //descr = "sample vector hash"; // this returns false on failure //if ( ! hashString ( tt,buf,blen,score,field,descr) ) // return false; // . hash combined for Dup Dectection // . must match XmlDoc::getDupList ( ); //uint64_t h1 = m_tagVector.getVectorHash(); //uint64_t h2 = getGigabitVectorScorelessHash(gigabitVec); //uint64_t h64 = hash64 ( h1 , h2 ); // take this out for now /* uint64_t *dh = getDupHash ( ); blen = sprintf(buf,"%"UINT64"", *dh );//h64); //field = "gbduphash"; //descr = "dup vector hash"; // update hash parms hi.m_prefix = "gbduphash"; hi.m_desc = "dup vector hash"; // this returns false on failure if ( ! hashString ( buf,blen,&hi ) ) return false; */ // hash the wikipedia docids we match if ( ! m_wikiDocIdsValid ) { char *xx=NULL;*xx=0; } for ( int32_t i = 0 ; i < size_wikiDocIds/8 ; i++ ) { blen = sprintf(buf,"%"UINT64"",ptr_wikiDocIds[i]); // convert to int32_t //int32_t convScore = (int32_t)ptr_wikiScores[i]; // get score //uint32_t ws = score8to32 ( convScore ); // update hash parms hi.m_prefix = "gbwikidocid"; hi.m_desc = "wiki docid"; hi.m_hashGroup = HASHGROUP_INTAG; // this returns false on failure if ( ! hashString ( buf,blen,&hi ) ) return false; } return true; } bool XmlDoc::hashAds ( HashTableX *tt ) { setStatus ( "hashing ad ids" ); for(int32_t i = 0; i < size_adVector / 8 ; i++) { int32_t score = *getSiteNumInlinks8() * 256; if ( score <= 0 ) score = 1; char buf[128]; char *field; char *descr; //buflen = snprintf(buf,128,"%s-%s", // m_adProvider[i],m_adClient[i]); int32_t buflen = snprintf(buf,128,"%"UINT64"",ptr_adVector[i] ); field = "gbad"; descr = "ad provider and id"; // update hash parms HashInfo hi; hi.m_tt = tt; hi.m_hashGroup = HASHGROUP_INTAG; hi.m_prefix = "gbad"; hi.m_desc = "ad provider and id"; //log(LOG_WARN, "build: url %s indexing ad termid %s:%s", // getFirstUrl()->getUrl(), field, buf); //this returns false on failure if ( ! hashString ( buf,buflen,&hi ) ) return false; } return true; } Url *XmlDoc::getBaseUrl ( ) { if ( m_baseUrlValid ) return &m_baseUrl; // need this Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (Url *)xml; Url *cu = getCurrentUrl(); if ( ! cu || cu == (void *)-1 ) return (Url *)cu; // no longer set addWWW to true since tmblr.co has an IP but // www.tmblr.co does not m_baseUrl.set ( cu , false ); // addWWW = true // look for base url for ( int32_t i=0 ; i < xml->getNumNodes() ; i++ ) { // 12 is the <base href> tag id if ( xml->getNodeId ( i ) != TAG_BASE ) continue; // get the href field of this base tag int32_t linkLen; char *link = (char *) xml->getString ( i, "href", &linkLen ); // skip if not valid if ( ! link || linkLen == 0 ) continue; // set base to it. 
addWWW=true m_baseUrl.set(link, linkLen, false);//true); break; } m_baseUrlValid = true; return &m_baseUrl; } // hash gbhasthumbnail:0|1 bool XmlDoc::hashImageStuff ( HashTableX *tt ) { setStatus ("hashing image stuff"); char *val = "0"; char **td = getThumbnailData(); if ( *td ) val = "1"; // update hash parms HashInfo hi; hi.m_tt = tt; hi.m_hashGroup = HASHGROUP_INTAG; hi.m_prefix = "gbhasthumbnail"; hi.m_desc = "has a thumbnail"; // this returns false on failure if ( ! hashString ( val,1,&hi ) ) return false; return true; } // returns false and sets g_errno on error bool XmlDoc::hashIsAdult ( HashTableX *tt ) { setStatus ("hashing isadult"); char *ia = getIsAdult(); // this should not block or return error! should have been // set in prepareToMakeTitleRec() before hashAll() was called! if ( ! ia || ia == (void *)-1 ) {char *xx=NULL;*xx=0; } // index gbisadult:1 if adult or gbisadult:0 if not char *val; if ( *ia ) val = "1"; else val = "0"; // update hash parms HashInfo hi; hi.m_tt = tt; hi.m_hashGroup = HASHGROUP_INTAG; hi.m_prefix = "gbisadult"; hi.m_desc = "is document adult content"; // this returns false on failure if ( ! hashString ( val,1,&hi ) ) return false; return true; } // hash destination urls for embedded gb search boxes bool XmlDoc::hashSubmitUrls ( HashTableX *tt ) { setStatus ( "hashing submit urls" ); Url *baseUrl = getBaseUrl(); if ( ! baseUrl || baseUrl == (Url *)-1) { char*xx=NULL;*xx=0;} for ( int32_t i = 0 ; i < m_xml.getNumNodes() ; i++ ) { // Find forms if ( m_xml.getNodeId(i) != TAG_FORM ) continue; if ( m_xml.isBackTag(i) ) continue; int32_t score = *getSiteNumInlinks8() * 256; if ( score <= 0 ) score = 1; int32_t len; char *s = m_xml.getString ( i , "action" , &len ); if (!s || len == 0) continue; Url url; url.set(baseUrl, s, len, true); char *buf = url.getUrl(); int32_t blen = url.getUrlLen(); // update hash parms HashInfo hi; hi.m_tt = tt; hi.m_hashGroup = HASHGROUP_INTAG; hi.m_prefix = "gbsubmiturl"; hi.m_desc = "submit url for form"; // this returns false on failure if ( ! hashString ( buf,blen,&hi ) ) return false; } return true; } // // STUFF IMPORTED FROM INDEXLIST.CPP // // we also assume all scores are above 256, too uint8_t score32to8 ( uint32_t score ) { // ensure score is > 0... no! not any more if ( score <= 0 ) return (unsigned char) 0; // extremely large scores need an adjustment to avoid wrapping if ( score < (uint32_t)0xffffffff - 128 ) score += 128; // scores are multiplied by 256 to preserve fractions, so undo that score /= 256; // ensure score is > 0 if ( score <= 0 ) return (unsigned char) 1; // if score < 128 return it now if ( score < 128 ) return (unsigned char) score; // now shrink it so it's now from 1 upwards score -= 127; // . take NATURAL log of score now // . PROBLEM: for low scores logscore may increase by close to 1.0 // for a score increase of 1.0. and since s_maxscore is about 22.0 // we end up moving 1.0/22.0 of 128 total pts causing a jump of // 2 or more score points!! oops!!! 
to fix, let's add 10 pts // to the score score += 10; double logscore = ::log ( (double)score ); // now the max it can be //double maxscore = ::log ( (double)(0x00ffffff - 127)); static double s_maxscore = -1.0; static double s_minscore = -1.0; if ( s_maxscore == -1.0 ) { uint32_t max = ((0xffffffff + 0)/256) - 127 + 10; uint32_t min = ( 128 ) - 127 + 10; s_maxscore = ::log((double)max); s_minscore = ::log((double)min); // adjust s_maxscore -= s_minscore; } // adjust it logscore -= s_minscore; // scale it into [126,0] (add .5 for rounding) double scaled = (logscore* 127.0) / s_maxscore + .5; // sanity check if ( (unsigned char)scaled >= 128 ) { char *xx=NULL;*xx=0; } // . go into the 8 bit score now // . set the hi bit so they know we took its log unsigned char score8 = (unsigned char)scaled | 128; return score8; } // for score8to32() below static uint32_t s_scoreMap[] = { 0UL, 1UL, 385UL, 641UL, 897UL, 1153UL, 1409UL, 1665UL, 1921UL, 2177UL, 2433UL, 2689UL, 2945UL, 3201UL, 3457UL, 3713UL, 3969UL, 4225UL, 4481UL, 4737UL, 4993UL, 5249UL, 5505UL, 5761UL, 6017UL, 6273UL, 6529UL, 6785UL, 7041UL, 7297UL, 7553UL, 7809UL, 8065UL, 8321UL, 8577UL, 8833UL, 9089UL, 9345UL, 9601UL, 9857UL, 10113UL, 10369UL, 10625UL, 10881UL, 11137UL, 11393UL, 11649UL, 11905UL, 12161UL, 12417UL, 12673UL, 12929UL, 13185UL, 13441UL, 13697UL, 13953UL, 14209UL, 14465UL, 14721UL, 14977UL, 15233UL, 15489UL, 15745UL, 16001UL, 16257UL, 16513UL, 16769UL, 17025UL, 17281UL, 17537UL, 17793UL, 18049UL, 18305UL, 18561UL, 18817UL, 19073UL, 19329UL, 19585UL, 19841UL, 20097UL, 20353UL, 20609UL, 20865UL, 21121UL, 21377UL, 21633UL, 21889UL, 22145UL, 22401UL, 22657UL, 22913UL, 23169UL, 23425UL, 23681UL, 23937UL, 24193UL, 24449UL, 24705UL, 24961UL, 25217UL, 25473UL, 25729UL, 25985UL, 26241UL, 26497UL, 26753UL, 27009UL, 27265UL, 27521UL, 27777UL, 28033UL, 28289UL, 28545UL, 28801UL, 29057UL, 29313UL, 29569UL, 29825UL, 30081UL, 30337UL, 30593UL, 30849UL, 31105UL, 31361UL, 31617UL, 31873UL, 32129UL, 32385UL, 32641UL, 32897UL, 33488UL, 33842UL, 34230UL, 34901UL, 35415UL, 35979UL, 36598UL, 37278UL, 38025UL, 39319UL, 40312UL, 41404UL, 43296UL, 44747UL, 46343UL, 48098UL, 51138UL, 53471UL, 56037UL, 58859UL, 61962UL, 65374UL, 71287UL, 75825UL, 80816UL, 86305UL, 92342UL, 98982UL, 110492UL, 119326UL, 129042UL, 139728UL, 151481UL, 171856UL, 187496UL, 204699UL, 223622UL, 244437UL, 267333UL, 307029UL, 337502UL, 371022UL, 407893UL, 448450UL, 493062UL, 570408UL, 629783UL, 695095UL, 766938UL, 845965UL, 982981UL, 1088163UL, 1203862UL, 1331130UL, 1471124UL, 1625117UL, 1892110UL, 2097072UL, 2322530UL, 2570533UL, 2843335UL, 3143416UL, 3663697UL, 4063102UL, 4502447UL, 4985726UL, 5517332UL, 6439034UL, 7146599UL, 7924919UL, 8781070UL, 9722836UL, 10758778UL, 12554901UL, 13933735UL, 15450451UL, 17118838UL, 18954063UL, 20972809UL, 24472927UL, 27159874UL, 30115514UL, 33366717UL, 36943040UL, 43143702UL, 47903786UL, 53139877UL, 58899576UL, 65235244UL, 72204478UL, 84287801UL, 93563849UL, 103767501UL, 114991518UL, 127337936UL, 140918995UL, 164465962UL, 182542348UL, 202426372UL, 224298798UL, 248358466UL, 290073346UL, 322096762UL, 357322519UL, 396070851UL, 438694015UL, 485579494UL, 566869982UL, 629274552UL, 697919578UL, 773429105UL, 856489583UL, 947856107UL, 1106268254UL, 1227877095UL, 1361646819UL, 1508793514UL, 1670654878UL, 1951291651UL, 2166729124UL, 2403710344UL, 2664389686UL, 2951136962UL, 3266558965UL, 3813440635UL, 4233267317UL }; uint32_t score8to32 ( uint8_t score8 ) { /* int32_t test = score32to8((uint32_t)0xffffffff); static bool s_set = false; if ( ! 
s_set ) { s_set = true; uint8_t lasts = 0; int32_t step = 128; int64_t start = gettimeofdayInMilliseconds(); for ( uint64_t i=1 ; i<(uint32_t)0xffffffff ; i+=step) { // get the score uint8_t s = score32to8(i); // print it out now if ( s != lasts ) { fprintf(stderr,"\t%"UINT32"UL,\n",i); } // if no change, skip it if (lasts != 0 && s == lasts ) { if ( s > 128 ) step = (int32_t)((float)step * 1.1); continue; } // otherwise set it s_scoreMap[s] = i; // reset lasts = s; } // sanity test for ( int32_t j = 1 ; j < 256 ; j++ ) { uint32_t big = s_scoreMap[j]; if ( score32to8(big) != j ) { char *xx=NULL;*xx=0;} } int64_t end = gettimeofdayInMilliseconds(); logf(LOG_DEBUG, "gb: took %"INT64" ms to build score table.", end-start); } // sanity test static bool s_set = false; if ( ! s_set ) { for ( int32_t j = 1 ; j < 256 ; j++ ) { uint32_t big = s_scoreMap[j]; uint8_t tt; tt = score32to8(big); if ( tt != j ) { char *xx=NULL;*xx=0;} } s_set = true; } */ return(s_scoreMap[score8]); } //////////////////////////////////////////////////////////// // // Summary/Title generation for Msg20 // //////////////////////////////////////////////////////////// void XmlDoc::set20 ( Msg20Request *req ) { // clear it all out reset(); // this too m_reply.reset(); m_pbuf = NULL;//pbuf; m_niceness = req->m_niceness; // remember this m_req = req; // and this! //m_coll = req->ptr_coll; //setCollNum ( req->ptr_coll ); m_collnum = req->m_collnum; m_collnumValid = true; // make this stuff valid if ( m_req->m_docId > 0 ) { m_docId = m_req->m_docId; m_docIdValid = true; } // set url too if we should if ( m_req->size_ubuf > 1 ) setFirstUrl ( m_req->ptr_ubuf , false ); } #define MAX_LINK_TEXT_LEN 512 #define MAX_RSSITEM_SIZE 30000 void getMsg20ReplyWrapper ( void *state ) { XmlDoc *THIS = (XmlDoc *)state; // make sure has not been freed from under us! if ( THIS->m_freed ) { char *xx=NULL;*xx=0;} // return if it blocked if ( THIS->getMsg20Reply ( ) == (void *)-1 ) return; // otherwise, all done, call the caller callback if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state ); else THIS->m_callback2 ( THIS->m_state ); } // . returns NULL with g_errno set on error // . returns -1 if blocked Msg20Reply *XmlDoc::getMsg20Reply ( ) { // return it right away if valid if ( m_replyValid ) return &m_reply; // . internal callback // . so if any of the functions we end up calling directly or // indirectly block, this callback will be called if ( ! m_masterLoop ) { m_masterLoop = getMsg20ReplyWrapper; m_masterState = this; } // used by Msg20.cpp to time this XmlDoc::getMsg20Reply() function if ( ! m_startTimeValid && isClockInSync() ) { m_startTime = gettimeofdayInMilliseconds(); m_startTimeValid = true; } // caller shouldhave the callback set if ( ! m_callback1 && ! m_callback2 ) { char *xx=NULL;*xx=0; } //char safeStack[100000]; //safeStack[0] = 0; //safeStack[90000] = 0; // int16_tcut Msg20Reply *reply = &m_reply; m_niceness = m_req->m_niceness; m_collnum = m_req->m_collnum;//cr->m_collnum; m_collnumValid = true; //char *coll = m_req->ptr_coll; CollectionRec *cr = g_collectiondb.getRec ( m_collnum ); if ( ! cr ) { g_errno = ENOCOLLREC; return NULL; } //CollectionRec *cr = getCollRec(); //if ( ! cr ) return NULL; // set this important member var //if (!cr ) cr=g_collectiondb.getRec(cr->m_coll,gbstrlen(cr->m_coll)); // return NULL with g_errno set on error //if ( ! cr ) return NULL; // . cache it for one hour // . this will set our ptr_ and size_ member vars char **otr = getOldTitleRec ( ); if ( ! 
otr || otr == (void *)-1 ) return (Msg20Reply *)otr; // must have a title rec in titledb if ( ! *otr ) { g_errno = ENOTFOUND; return NULL; } // sanity if ( *otr != m_oldTitleRec ) { char *xx=NULL;*xx=0; } // what is this? int32_t maxSize = 0; // . set our ptr_ and size_ member vars from it after uncompressing // . returns false and sets g_errno on error if ( ! m_setTr ) { // . this completely resets us // . this returns false with g_errno set on error bool status = set2( *otr, maxSize, cr->m_coll, NULL, m_niceness); // sanity check if ( ! status && ! g_errno ) { char *xx=NULL;*xx=0; } // if there was an error, g_errno should be set. if ( ! status ) return NULL; m_setTr = true; } // breathe QUICKPOLL(m_niceness); // init reply->m_nextMerged = NULL; reply->m_collnum = m_collnum; // MsgE uses this one if ( m_req->m_getTitleRec ) { // this is the original compressed titleRec, preceeded // by key and dataSize and followed by the data reply-> ptr_tr = m_oldTitleRec; reply->size_tr = m_oldTitleRecSize; m_replyValid = true; return reply; } // if they provided a query with gbfacet*: terms then we have // to get those facet values. if ( ! m_gotFacets ) { // only do this once m_gotFacets = true; // get facet term char *qs = m_req->ptr_qbuf; facetPrintLoop: for ( ; qs && *qs ; qs++ ) { if ( qs[0] != 'g' ) continue; if ( qs[1] != 'b' ) continue; if ( qs[2] != 'f' ) continue; if ( strncasecmp(qs,"gbfacet",7) ) continue; qs += 7; // gbfacetstr: gbfacetint: gbfacetfloat: if ( strncasecmp(qs,"str:" ,4) == 0 ) qs += 4; else if ( strncasecmp(qs,"int:" ,4) == 0 ) qs += 4; else if ( strncasecmp(qs,"float:",6) == 0 ) qs += 6; else continue; break; } // if we had a facet, get the values it has in the doc if ( qs && *qs ) { // need this for storeFacetValues() if we are json if ( m_contentType == CT_JSON ) { Json *jp = getParsedJson(); if ( ! jp || jp == (void *)-1) return (Msg20Reply *)jp; } if ( m_contentType == CT_HTML || m_contentType == CT_XML ) { Xml *xml = getXml(); if ( ! xml || xml==(void *)-1) return (Msg20Reply *)xml; } // find end of it char *e = qs; for ( ; *e && ! is_wspace_a(*e) ; e++ ); // tmp null it char c = *e; *e = '\0'; // this is zero if unspecifed FacetValHash_t fvh = m_req->m_facetValHash; // . this will store facetField/facetValue pairs // . stores into safebuf, m_tmpBuf2 // . it will terminate all stored strings with \0 // . we check meta tags for html docs // . otherwise we check xml/json doc fields // . returns false with g_errno set on error bool ret = storeFacetValues ( qs , &m_tmpBuf2 , fvh ) ; // revert the \0 *e = c; // return NULL with g_errno set on error if ( ! ret ) return NULL; // advance qs = e; // do another one goto facetPrintLoop; } // assign reply-> ptr_facetBuf = m_tmpBuf2.getBufStart(); reply->size_facetBuf = m_tmpBuf2.length(); } if ( m_req->m_justGetFacets ) { m_replyValid = true; return reply; } if ( m_req->m_getTermListBuf ) { // ensure content is recycled from title rec m_recycleContent = true; //xd->m_recycleLinkInfo = true; // only get posdb keys really for this stuff m_useTitledb = false; m_useTagdb = false; m_useClusterdb = false; m_useSpiderdb = false; m_useLinkdb = false; // time it if ( m_tlbufTimer == 0 ) m_tlbufTimer = gettimeofdayInMilliseconds(); // . shit limit content for speed!!! // . 
this is for getting matching queries/relatedqueries // anyway, so should be ok if ( size_utf8Content > 150000 ) { char *p = ptr_utf8Content + 150000 - 1; char *pstart = ptr_utf8Content; // back up until we hit punct for ( ; p > pstart ; p-- ) if ( is_punct_utf8(p) ) break; // set new size then *p = '\0'; size_utf8Content = p - pstart + 1; } // hack: should be sorted by lower 32bits of termids // so handleRequest8e does not have to sort before doing // its query matching algo with queries in g_qbuf. // but these termlists are really mostly used for doing // the gbdocid:|xxxx queries in handleRequest8e. SafeBuf *tbuf = getTermListBuf(); if ( ! tbuf || tbuf == (void *)-1 ) return (Msg20Reply *)tbuf; SafeBuf *tibuf = getTermId32Buf(); if ( ! tibuf || tibuf == (void *)-1)return (Msg20Reply *)tibuf; // time it int64_t took = gettimeofdayInMilliseconds() - m_tlbufTimer; log("seo: tlistbuf gen took %"INT64" ms for docid %"INT64"", took,m_docId); // just that reply-> ptr_tlistBuf = tbuf->getBufStart(); reply->size_tlistBuf = tbuf->length(); reply-> ptr_tiBuf = tibuf->getBufStart(); reply->size_tiBuf = tibuf->length(); m_replyValid = true; return reply; } // lookup the tagdb rec fresh if setting for a summary. that way we // can see if it is banned or not. but for getting m_getTermListBuf // and stuff above, skip the tagrec lookup! // save some time when SPIDERING/BUILDING by skipping fresh // tagdb lookup and using tags in titlerec if ( m_req && ! m_req->m_getLinkText && ! m_checkedUrlFilters ) m_tagRecDataValid = false; // set and validate member vars //if ( ! m_setFromTitleRec ) // // return NULL with g_errno set on error // if ( ! set ( tr , NULL , m_niceness ) ) return NULL; TagRec *gr = getTagRec(); if ( ! gr || gr == (void *)-1 ) return (Msg20Reply *)gr; //reply-> ptr_tagRec = (char *)gr; //reply->size_tagRec = gr->getSize(); // we use this instead of nowGlobal //if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; } // this should be valid, it is stored in title rec if ( m_contentHash32Valid ) reply->m_contentHash32 = m_contentHash32; else reply->m_contentHash32 = 0; // if this page is potential spam, toss it! //char *isSpam = getIsSpam(); //if ( ! isSpam || isSpam == (char *)-1 ) return (Msg20Reply *)isSpam; if ( ! m_checkedUrlFilters ) { // do it //int32_t *rn = getRegExpNum2(-1); //if ( ! rn || rn == (int32_t *)-1 ) return (Msg20Reply *)rn; // do not re-check m_checkedUrlFilters = true; // a non-www url? /* now we allow domain-only urls in the index, so this is hurting us... if ( ! m_req->m_getLinkText ) { Url tmp; tmp.set ( ptr_firstUrl ); if ( tmp.getHostLen() == tmp.getDomainLen() ) { // set m_errno reply->m_errno = EDOCFILTERED; // tmp debug log("xmldoc: filtering non www url %s", ptr_firstUrl); // and this reply->m_isFiltered = true; // give back the url at least reply->ptr_ubuf = getFirstUrl()->getUrl(); reply->size_ubuf =getFirstUrl()->getUrlLen()+1; // validate m_replyValid = true; // and return return reply; } } */ // get this //time_t nowGlobal = getTimeGlobal(); // get this SpiderRequest sreq; SpiderReply srep; setSpiderReqForMsg20 ( &sreq , &srep );//, *isSpam ); int32_t spideredTime = getSpideredTime(); // get it int32_t ufn; ufn=::getUrlFilterNum(&sreq,&srep,spideredTime,true, m_niceness,cr, false, // isOutlink? 
NULL ); // sanity check if ( ufn < 0 ) { log("msg20: bad url filter for url %s", sreq.m_url); } // save it reply->m_urlFilterNum = ufn; // get spider priority if ufn is valid int32_t pr = 0; if ( ufn >= 0 ) pr = cr->m_spiderPriorities[ufn]; // this is an automatic ban! if ( gr->getLong("manualban",0) ) pr = SPIDER_PRIORITY_BANNED; // is it banned if ( pr == SPIDER_PRIORITY_BANNED ) { // -2 // set m_errno reply->m_errno = EDOCBANNED; // and this reply->m_isBanned = true; } // // for now always allow it until we can fix this better // we probably should assume NOT filtered unless it matches // a string match only url filter... but at least we will // allow it to match "BANNED" filters for now... // pr = 0; if ( pr == SPIDER_PRIORITY_FILTERED ) { // -3 // set m_errno reply->m_errno = EDOCFILTERED; // and this reply->m_isFiltered = true; } // done if we are if ( reply->m_errno && ! m_req->m_showBanned ) { // give back the url at least reply->ptr_ubuf = getFirstUrl()->getUrl(); reply->size_ubuf = getFirstUrl()->getUrlLen() + 1; m_replyValid = true; return reply; } } // breathe QUICKPOLL ( m_niceness ); // a special hack for XmlDoc::getRecommendedLinksBuf() so we exclude // links that link to the main url's site/domain as well as a // competitor url (aka related docid) Links *links = NULL; if ( m_req->m_ourHostHash32 || m_req->m_ourDomHash32 ) { links = getLinks(); if ( ! links || links==(Links *)-1) return (Msg20Reply *)links; } // breathe QUICKPOLL ( m_niceness ); // truncate content length if we should // this was hurting our linkdb lookups! do not do it for those! /* if ( size_utf8Content > cr->m_contentLenMaxForSummary && // fix for link text fetching! ! req->m_getLinkText ) { logf(LOG_DEBUG,"summary: truncating doc of len %"INT32" to %"INT32" for " "generating summary", size_utf8Content,cr->m_contentLenMaxForSummary); size_utf8Content = cr->m_contentLenMaxForSummary ; // null term just in case ptr_utf8Content[size_utf8Content-1] = '\0'; } */ // does they want a summary? if ( m_req->m_numSummaryLines>0 && ! reply->ptr_displaySum ) { char *hsum = getHighlightedSummary(); if ( ! hsum || hsum == (void *)-1 ) return (Msg20Reply *)hsum; //Summary *s = getSummary(); //if ( ! s || s == (void *)-1 ) return (Msg20Reply *)s; //int32_t sumLen = m_finalSummaryBuf.length(); // is it size and not length? int32_t hsumLen = 0; // seems like it can return 0x01 if none... if ( hsum == (char *)0x01 ) hsum = NULL; // get len. this is the HIGHLIGHTED summary so it is ok. if ( hsum ) hsumLen = gbstrlen(hsum); // must be \0 terminated. not any more, it can be a subset // of a larger summary used for deduping if ( hsumLen > 0 && hsum[hsumLen] ) { char *xx=NULL;*xx=0; } // assume size is 0 //int32_t sumSize = 0; // include the \0 in size //if ( sum ) sumSize = sumLen + 1; // do not get any more than "me" lines/excerpts of summary //int32_t max = m_req->m_numSummaryLines; // grab stuff from it! //reply->m_proximityScore = s->getProximityScore(); reply-> ptr_displaySum = hsum;//s->getSummary(); reply->size_displaySum = hsumLen+1;//sumSize;//s->getSummaryLen // this is unhighlighted for deduping, and it might be longer // . seems like we are not using this for deduping but using // the gigabit vector in Msg40.cpp, so take out for now //reply-> ptr_dedupSum = s->m_summary; //reply->size_dedupSum = s->m_summaryLen+1; //if ( s->m_summaryLen == 0 ) reply->size_dedupSum = 0; //reply->m_diversity = s->getDiversity(); } reply->m_numAlnumWords = 0; if ( m_wordsValid ) reply->m_numAlnumWords = m_words.m_numAlnumWords; // . 
we filter out search results that do not have all the query terms // . Matches.cpp checks the link text, dmoz, etc. for all query terms // . it must get into the results form indexdb corruption? // . this filtering method is/was known as the "BIG HACK" if ( m_req->size_qbuf > 1 ) { reply->m_hasAllQueryTerms = true; // Matches *mm = getMatches(); // if ( ! mm || mm == (Matches *)-1 ) return (Msg20Reply *)mm; // reply->m_hasAllQueryTerms = mm->m_matchesQuery; } // breathe QUICKPOLL ( m_niceness ); // copy the link info stuff? if ( ! m_req->m_getLinkText ) { reply->ptr_linkInfo = (char *)ptr_linkInfo1; reply->size_linkInfo = size_linkInfo1; } // breathe QUICKPOLL ( m_niceness ); bool getThatTitle = true; if ( m_req->m_titleMaxLen <= 0 ) getThatTitle = false; if ( reply->ptr_tbuf ) getThatTitle = false; // if steve's requesting the inlink summary we will want to get // the title of each linker even if they are spammy! // only get title here if NOT getting link text otherwise // we only get it down below if not a spammy voter, because // this sets the damn slow sections class if ( m_req->m_getLinkText && ! m_useSiteLinkBuf && ! m_usePageLinkBuf && // m_pbuf is used by pageparser.cpp now, not the other two things // above this. ! m_pbuf ) getThatTitle = false; // if steve is getting the inlinks, bad and good, for displaying // then get the title here now... otherwise, if we are just spidering // and getting the inlinks, do not bother getting the title because // the inlink might be linkspam... and we check down below... if ( ! m_req->m_onlyNeedGoodInlinks ) getThatTitle = true; // ... no more seo so stop it... disable this for sp if ( m_req->m_getLinkText ) getThatTitle = false; if ( getThatTitle ) { Title *ti = getTitle(); if ( ! ti || ti == (Title *)-1 ) return (Msg20Reply *)ti; char *tit = ti->getTitle(); int32_t titLen = ti->getTitleLen(); reply-> ptr_tbuf = tit; reply->size_tbuf = titLen + 1; // include \0 // sanity if ( tit && tit[titLen] != '\0' ) { char *xx=NULL;*xx=0; } if ( ! tit || titLen <= 0 ) { reply->ptr_tbuf = NULL; reply->size_tbuf = 0; } } // this is not documented because i don't think it will be popular if ( m_req->m_getHeaderTag ) { SafeBuf *htb = getHeaderTagBuf(); if ( ! htb || htb == (SafeBuf *)-1 ) return (Msg20Reply *)htb; // . it should be null terminated // . actually now it is a \0 separated list of the first // few h1 tags // . we call SafeBuf::pushChar(0) to add each one reply->ptr_htag = htb->getBufStart(); reply->size_htag = htb->getLength(); } // breathe QUICKPOLL ( m_niceness ); if ( m_req->m_getMatches && ! reply->ptr_mbuf ) { MatchOffsets *mo = getMatchOffsets(); if ( ! mo || mo == (MatchOffsets *)-1) return (Msg20Reply *)mo; reply-> ptr_mbuf = (char *)mo->m_matchOffsets; reply->size_mbuf = mo->m_numMatches*4; } // breathe QUICKPOLL ( m_niceness ); // get site reply->ptr_site = ptr_site; reply->size_site = size_site; // assume unknown reply->m_noArchive = 0; // are we noarchive? only check this if not getting link text if ( ! m_req->m_getLinkText ) { char *na = getIsNoArchive(); if ( ! na || na == (char *)-1 ) return (Msg20Reply *)na; reply->m_noArchive = *na; } // breathe QUICKPOLL ( m_niceness ); int32_t nowUTC2 = m_req->m_nowUTC; if ( m_req->m_clockSet ) nowUTC2 = m_req->m_clockSet; // . summary vector for deduping // . does not compute anything if we should not! (svSize will be 0) if ( ! reply->ptr_vbuf && m_req->m_getSummaryVector && cr->m_percentSimilarSummary > 0 && cr->m_percentSimilarSummary < 100 ) { int32_t *sv = getSummaryVector ( ); if ( ! 
sv || sv == (void *)-1 ) return (Msg20Reply *)sv; reply-> ptr_vbuf = (char *)m_summaryVec; reply->size_vbuf = m_summaryVecSize; } // breathe QUICKPOLL ( m_niceness ); if ( m_req->m_numSummaryLines > 0 ) { // turn off for now since we added this to posdb uint8_t *sl = getSummaryLangId(); if ( ! sl || sl == (void *)-1 ) return (Msg20Reply *)sl; reply->m_summaryLanguage = *sl; } // breathe QUICKPOLL ( m_niceness ); // returns values of specified meta tags if ( ! reply->ptr_dbuf && m_req->size_displayMetas > 1 ) { int32_t dsize; char *d; d = getDescriptionBuf(m_req->ptr_displayMetas,&dsize); if ( ! d || d == (char *)-1 ) return (Msg20Reply *)d; reply->ptr_dbuf = d; reply->size_dbuf = dsize; // includes \0 } // breathe QUICKPOLL ( m_niceness ); // . sample buffer for doing gigabit generation // . Msg40.cpp calls intersectGigabits on all these samples from // all the Msg20Replies it gets in the search results //if ( ! reply->ptr_gigabitQuery && m_req->m_bigSampleMaxLen > 0 ) { if ( ! reply->ptr_gigabitSample && m_req->m_bigSampleMaxLen > 0 ) { // before we got a chunk of text from teh doc SafeBuf *gsbuf = getSampleForGigabits(); if ( ! gsbuf||gsbuf ==(void *)-1) return (Msg20Reply *)gsbuf; reply->ptr_gigabitSample = gsbuf->getBufStart(); reply->size_gigabitSample = gsbuf->length(); // . now we use the gigabit query! // . this is really used to find out what wikipedia pages // we match the best... // . this also sets the vector /* char *gq = getGigabitQuery(); if ( ! gq || gq == (char *)-1) return (Msg20Reply *)gq; reply-> ptr_gigabitQuery = m_gigabitQuery; reply->size_gigabitQuery = gbstrlen(m_gigabitQuery)+1; reply-> ptr_gigabitScores = ptr_gigabitScores; reply->size_gigabitScores = size_gigabitScores; */ } // get full image url. but not if we already have a thumbnail... if ( ! reply->ptr_imgUrl&&!reply->ptr_imgData&&!m_req->m_getLinkText){ // && m_req->m_getImageUrl ) { char **iu = getImageUrl(); if ( ! iu || iu == (char **)-1 ) return (Msg20Reply *)iu; reply-> ptr_imgUrl = *iu; reply->size_imgUrl = 0; if ( *iu ) reply->size_imgUrl = gbstrlen(*iu)+1; } // get thumbnail image DATA if ( ! reply->ptr_imgData ) { // && m_req->m_getImageUrl ) { reply-> ptr_imgData = ptr_imageData; reply->size_imgData = size_imageData; } // . adids contained in the doc // . get from title rec rather than generating // . but we need to generate to store in titleRec at index time // . they are 32 bits each int64_t **avp = getAdVector(); if ( ! avp || avp == (void *)-1 ) return (Msg20Reply *)avp; // get firstip int32_t *fip = getFirstIp(); if ( ! fip || fip == (void *)-1 ) return (Msg20Reply *)fip; //Url **redir = getRedirUrl(); //if ( ! redir || redir == (Url **)-1 ) return (Msg20Reply *)redir; //int32_t redirSize = 0; //if ( *redir ) redirSize = (*redir)->getUrlLen() + 1; //char *ru = NULL; //if ( *redir ) ru = (*redir)->getUrl(); char *ru = ptr_redirUrl; int32_t rulen = 0; if ( ru ) rulen = gbstrlen(ru)+1; // . Msg25.cpp uses m_adIdHash for restricting voting // . these are 64 bit termids hashes reply-> ptr_gbAdIds = (char *)*avp; // this size is in bytes and includes the \0 reply->size_gbAdIds = size_adVector; // need full cached page of each search result? if ( m_req->m_includeCachedCopy ) { reply-> ptr_content = ptr_utf8Content; reply->size_content = size_utf8Content; } // if ( m_req->m_getSectionVotingInfo && m_tmpBuf3.getCapacity() <=0) { // Sections *ss = getSections(); // if ( ! 
ss || ss == (void *)-1) return (Msg20Reply *)ss; // // will at least store a \0 in there, but will not count // // as part of the m_tmpBuf.length() // ss->printVotingInfoInJSON ( &m_tmpBuf3 ); // reply-> ptr_sectionVotingInfo = m_tmpBuf3.getBufStart(); // reply->size_sectionVotingInfo = m_tmpBuf3.length() + 1; // } // breathe QUICKPOLL ( m_niceness ); // do they want to know if this doc has an outlink to a url // that has the provided site and domain hash, Msg20Request:: // m_ourHostHash32 and m_ourDomHash32? int32_t nl = 0; if ( links ) nl = links->getNumLinks(); // scan all outlinks we have on this page int32_t i ; for ( i = 0 ; i < nl ; i++ ) { // get the normalized url //char *url = links->getLinkPtr(i); // get the site. this will not block or have an error. int32_t hh32 = (int32_t)((uint32_t)links->getHostHash64(i)); if ( hh32 == m_req->m_ourHostHash32 ) break; int32_t dh32 = links->getDomHash32(i); if ( dh32 == m_req->m_ourDomHash32 ) break; } reply->m_hasLinkToOurDomOrHost = false; if ( i < nl ) reply->m_hasLinkToOurDomOrHost = true; // easy ones reply->m_isPermalink = m_isPermalink; reply->m_ip = m_ip; reply->m_firstIp = *fip; reply->m_domHash = getDomHash32();//domHash; reply->m_docId = m_docId; reply->m_urlHash48 = getFirstUrlHash48(); reply->m_contentLen = size_utf8Content; reply->m_lastSpidered = getSpideredTime();//m_spideredTime; reply->m_datedbDate = m_pubDate; reply->m_firstIndexedDate = m_firstIndexedDate; reply->m_firstSpidered = m_firstIndexedDate; reply->m_contentType = m_contentType; reply->m_hostHash = getHostHash32a(); //reply->m_contentHash = *getContentHash32(); reply->m_language = m_langId; reply->m_country = *getCountryId(); //reply->m_hasAllQueryTerms = false; reply->m_hopcount = m_hopCount; reply->m_siteRank = getSiteRank(); reply->ptr_ubuf = getFirstUrl()->getUrl(); reply->ptr_rubuf = ru; reply->ptr_catIds = ptr_catIds; reply->ptr_indCatIds = ptr_indCatIds; reply->ptr_dmozTitles = ptr_dmozTitles; reply->ptr_dmozSumms = ptr_dmozSumms; reply->ptr_dmozAnchors = ptr_dmozAnchors; reply->size_ubuf = getFirstUrl()->getUrlLen() + 1; reply->size_rubuf = rulen; reply->size_catIds = size_catIds; reply->size_indCatIds = size_indCatIds; reply->size_dmozTitles = size_dmozTitles; reply->size_dmozSumms = size_dmozSumms; reply->size_dmozAnchors = size_dmozAnchors; // breathe QUICKPOLL( m_req->m_niceness ); /* // truncate if necessary (buzz) int32_t maxLen = 150000; // truncate it? bool trunc = true; // not if getting link text if ( req->m_getLinkText ) trunc = false; // or outlinks if ( req->m_getOutlinks ) trunc = false; // or any niceness 1+ for that matter, that indicates a build operation if ( req->m_niceness > 0 ) trunc = false; // this is causing us to get EMISSINGQUERYTERMS errors!!! trunc = false; // MDW: int16_ten for speed test //int32_t maxLen = 1000; if ( trunc && contentLen > maxLen+1 ) { contentLen = maxLen; content [maxLen ] = '\0'; } */ // check the tag first if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; } //if ( ! m_sitePopValid ) { char *xx=NULL;*xx=0; } //Tag *tag1 = gr->getTag ("sitenuminlinks"); //Tag *tag2 = gr->getTag ("sitepop"); //int32_t sni = 0; //int32_t spop = 0; //if ( tag1 ) sni = atol(tag1->m_data); //if ( tag2 ) spop = atol(tag2->m_data); reply->m_siteNumInlinks = m_siteNumInlinks; reply->m_siteNumInlinksTotal = m_siteNumInlinksTotal; reply->m_siteNumUniqueIps = m_siteNumInlinksUniqueIp; reply->m_siteNumUniqueCBlocks = m_siteNumInlinksUniqueCBlock; //reply->m_sitePop = m_sitePop; // . get stuff from link info // . 
this is so fast, just do it for all Msg20 requests // . no! think about it -- this can be huge for pages like // google.com!!! LinkInfo *info1 = ptr_linkInfo1; if ( info1 ) { // && m_req->m_getLinkInfo ) { reply->m_pageNumInlinks = info1->m_totalInlinkingDocIds; reply->m_pageNumGoodInlinks = info1->m_numGoodInlinks; reply->m_pageNumUniqueIps = info1->m_numUniqueIps; reply->m_pageNumUniqueCBlocks = info1->m_numUniqueCBlocks; reply->m_pageInlinksLastUpdated = info1->m_lastUpdated; //reply->m_pagePop = 0;//info1->m_pagePop; //reply->m_siteNumInlinks = info1->m_siteNumInlinks; //reply->m_sitePop = info1->m_sitePop; } // breathe QUICKPOLL ( m_niceness ); // getLinkText is true if we are getting the anchor text for a // supplied url as part of the SPIDER process.. // this was done by Msg23 before if ( ! m_req->m_getLinkText ) { m_replyValid = true; return &m_reply; } // use the first url of the linker by default Url *linker = &m_firstUrl; // the base url, used for doing links: terms, is the final url, // just in case there were any redirects Url redir; if ( ru ) { redir.set ( ru ); linker = &redir; } // breathe QUICKPOLL( m_niceness ); // . get score weight of link text // . phase out the sitedb*.xml files //int64_t x[] = {0,20,30,40,50,70,90,100}; qualities! // map these siteNumInlinks (x) to a weight (y) //int64_t x[] = {0,50,100,200,500,3000,10000,50000}; // these are the weights the link text will receive //int64_t y[] = {10,30,2000,3000,4000,5000,6000,7000}; // sanity check //if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; } // int16_tcut //int32_t sni = m_siteNumInlinks;// *getSiteNumInlinks(); // get the final link text weight as a percentage //int32_t ltw = getY ( m_siteNumInlinks , x , y , 8 ); // store the weight in the reply //reply->m_linkTextScoreWeight = ltw; //log(LOG_DEBUG,"build: got score weight of %"INT32" for sni=%"INT32"", // (int32_t)reply->m_linkTextScoreWeight, m_siteNumInlinks); // breathe QUICKPOLL( m_niceness ); // . we need the mid doma hash in addition to the ip domain because // chat.yahoo.com has different ip domain than www.yahoo.com , ... // and we don't want them both to be able to vote // . the reply is zeroed out in call the reply->reset() above so // if this is not yet set it will be 0 if ( reply->m_midDomHash == 0 ) { char *m = linker->getMidDomain(); int32_t mlen = linker->getMidDomainLen(); reply->m_midDomHash = hash32 ( m , mlen ); } // breathe QUICKPOLL( m_niceness ); int64_t start = gettimeofdayInMilliseconds(); // if not set from above, set it here if ( ! links ) links = getLinks ( true ); // do quick set? if ( ! links || links == (Links *)-1 ) return (Msg20Reply *)links; Pos *pos = getPos(); if ( ! pos || pos == (Pos *)-1 ) return (Msg20Reply *)pos; Words *ww = getWords(); if ( ! ww || ww == (Words *)-1 ) return (Msg20Reply *)ww; Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (Msg20Reply *)xml; //Sections *ss = getSections(); //if ( ! ss || ss == (void *)-1) return (Msg20Reply *)ss; // . is this page a dynamic page? // . like a guestbook, access log stats, etc. // . we don't like to count such pages for links analysis because // they can be spammed so easily // . TODO: guestbooks and message boards typically contain cgi links // can we use that to identify? // . the coll size includes the \0 //CollectionRec *cr ; //cr = g_collectiondb.getRec ( m_req->ptr_coll,m_req->size_coll-1); // g_errno should be ENOCOLLREC //if ( ! cr ) return NULL; // . we want link text for this url, "linkee" // . 
TODO: true --> add "www" to see if that fixes our problem // i guess Links.cpp does that with the outlinks, so when // Linkdb::fillList() uses Links.cpp, the outlinks have "www" // prepended on them... //Url linkee; //linkee.set ( m_req->ptr_linkee , m_req->size_linkee ); // get a ptr to the link in the content. will point to the // stuff in the href field of the anchor tag. used for seeing if // we have bad links or not. int32_t linkNode = -1; int32_t linkNum = -1; // . get associated link text from the linker's document for our "url" // . only gets from FIRST link to us // . TODO: allow more link text from better quality pages? // . TODO: limit score based on link text length? // . should always be NULL terminated // . should not break in the middle of a word // . this will return the item/entry if we are extracting from an // rss/atom feed char *rssItem = NULL; int32_t rssItemLen = 0; // store link text in here char linkTextBuf[MAX_LINK_TEXT_LEN]; // // TODO: for getting siteinlinks just match the site in the url // not the full url... and maybe match the one with the int16_test path. // // . get the link text // . linkee might be a site if m_isSiteLinkInfo is true in which // case we get the best inlink to that site, and linkee is // something like blogspot.com/mary/ or some other site. int32_t blen = links->getLinkText ( m_req->ptr_linkee ,//&linkee, m_req->m_isSiteLinkInfo , linkTextBuf , MAX_LINK_TEXT_LEN-2 , &rssItem , &rssItemLen , &linkNode , &linkNum , m_niceness ); int64_t took = gettimeofdayInMilliseconds() - start; if ( took > 100 ) log("build: took %"INT64" ms to get link text for " "%s from linker %s", took, m_req->ptr_linkee, m_firstUrl.m_url ); // . BUT this skips the news topic stuff too. bad? // . THIS HAPPENED before because we were truncating the xml(see above) if ( linkNode < 0 ) { logf(LOG_DEBUG,"build: Got linknode = %"INT32" < 0. Cached " "linker %s does not have outlink to %s like linkdb " "says it should. page is probably too big and the " "outlink is past our limit. contentLen=%"INT32". or " "a sitehash collision, or an area tag link.", linkNode,getFirstUrl()->getUrl(),m_req->ptr_linkee, m_xml.getContentLen()); //g_errno = ECORRUPTDATA; // do not let multicast forward to a twin! so use this instead // of ECORRUTPDATA g_errno = EBADENGINEER; //char *xx=NULL;*xx=0; return NULL; } // breathe QUICKPOLL(m_niceness); if ( ! verifyUtf8 ( linkTextBuf , blen ) ) { log("xmldoc: bad OUT link text from url=%s for %s", m_req->ptr_linkee,m_firstUrl.m_url); linkTextBuf[0] = '\0'; blen = 0; } // verify for rss as well. seems like we end up coring because // length/size is not in cahoots and [size-1] != '\0' sometimes if ( ! verifyUtf8 ( rssItem , rssItemLen ) ) { log("xmldoc: bad RSS ITEM text from url=%s for %s", m_req->ptr_linkee,m_firstUrl.m_url); rssItem[0] = '\0'; rssItemLen = 0; } // point to it, include the \0. if ( blen > 0 ) { reply->ptr_linkText = linkTextBuf; // save the size into the reply, include the \0 reply->size_linkText = blen + 1; // sanity check if ( blen + 2 > MAX_LINK_TEXT_LEN ) { char *xx=NULL;*xx=0; } // sanity check. null termination required. if ( linkTextBuf[blen] ) { char *xx=NULL;*xx=0; } } // . the link we link to // . important when getting site info because the link url // can be different than the root url! 
reply-> ptr_linkUrl = links->getLink (linkNum); reply->size_linkUrl = links->getLinkLen(linkNum)+1; // save the rss item in our state so we can point to it, include \0 if ( rssItemLen > MAX_RSSITEM_SIZE-2 ) rssItemLen = MAX_RSSITEM_SIZE-2; char rssItemBuf[MAX_RSSITEM_SIZE]; if ( rssItemLen > 0) { gbmemcpy ( rssItemBuf, rssItem , rssItemLen ); // NULL terminate it rssItemBuf[rssItemLen] = 0; } // point to it, include the \0 if ( rssItemLen > 0 ) { reply->ptr_rssItem = rssItemBuf; reply->size_rssItem = rssItemLen + 1; } // breathe QUICKPOLL( m_niceness ); if ( ! m_req->m_doLinkSpamCheck ) reply->m_isLinkSpam = false; if ( m_req->m_doLinkSpamCheck ) { // reset to NULL to avoid gbstrlen segfault char *note = NULL; // need this if ( ! m_xmlValid ) { char *xx=NULL;*xx=0; } // time it //int64_t start = gettimeofdayInMilliseconds(); Url linkeeUrl; linkeeUrl.set ( m_req->ptr_linkee ); // get it. does not block. reply->m_isLinkSpam = ::isLinkSpam ( linker , m_ip , ptr_indCatIds , size_indCatIds / 4 , m_siteNumInlinks, &m_xml, links, MAXDOCLEN,//150000, &note , &linkeeUrl , // url , linkNode , cr->m_coll , m_niceness ); // store it if ( note ) { // include the \0 reply->ptr_note = note; reply->size_note = gbstrlen(note)+1; } // log the reason why it is a link spam page if ( reply->m_isLinkSpam ) log(LOG_DEBUG,"build: linker %s: %s.", linker->getUrl(),note); // sanity if ( reply->m_isLinkSpam && ! note ) log("linkspam: missing note for d=%"INT64"!",m_docId); // store times... nah, might have yielded cpu! reply->m_timeLinkSpam = 0; } // breathe QUICKPOLL(m_niceness); // sanity check if ( reply->ptr_rssItem && reply->size_rssItem>0 && reply->ptr_rssItem[reply->size_rssItem-1]!=0) { char *xx=NULL;*xx=0; } //log ("nogl=%"INT32"",(int32_t)m_req->m_onlyNeedGoodInlinks ); // . skip all this junk if we are a spammy voter // . we get the title above in "getThatTitle" if ( reply->m_isLinkSpam ) { m_replyValid = true; return reply; } // . this vector is set from a sample of the entire doc // . it is used to dedup voters in Msg25.cpp // . this has pretty much been replaced by vector2, it was // also saying a doc was a dup if all its words were // contained by another, like if it was a small subset, which // wasn't the best behaviour. // . yeah neighborhood text is much better and this is setting // the slow sections class, so i took it out getPageSampleVector (); // must not block or error out. sanity check if ( ! m_pageSampleVecValid ) { char *xx=NULL;*xx=0; } //st->m_v1.setPairHashes ( ww , -1 , m_niceness ); // breathe QUICKPOLL( m_niceness ); //st->m_v2.setPairHashes ( ww,linkWordNum, m_niceness ); // . this vector is set from the text after the link text // . it terminates at a breaking tag // . check it out in ~/fff/src/Msg20.cpp getPostLinkTextVector ( linkNode ); // must not block or error out. sanity check //if ( ! m_postLinkTextVecValid ) { char *xx=NULL;*xx=0; } // breathe QUICKPOLL( m_niceness ); // set from the hashes of the tag id pairs //st->m_v3.setTagPairHashes ( xml , m_niceness ); // get it getTagPairHashVector(); // must not block or error out. sanity check if ( ! m_tagPairHashVecValid ) { char *xx=NULL;*xx=0; } // breathe QUICKPOLL( m_niceness ); // this vector is set from the hashes of the path components // with punctuation stripped out //v4.set ( xml, NULL , linker, -1 ,buf4,size); // . the 4th vector is provided, this will point to m_topIps[] buffer // . this is temporarily disabled // . this is the top 2 bytes of the ips of each inlink // . we were looking this info up in linkdb // .
so if two good inlinkers had their inlinks from the same ip // neighborhoods, then one would have its voting power "deduped". // . see the old LinkText.cpp for the logic that read these from linkdb //v5.set2 ( (char *)incomingIps , numIncomingIps ); // reference the vectors in our reply reply-> ptr_vector1 = m_pageSampleVec;//(char *)&st->m_v1; reply->size_vector1 = m_pageSampleVecSize;//st->m_v1.getSize(); reply-> ptr_vector2 = m_postVec;//(char *)&st->m_v2; reply->size_vector2 = m_postVecSize;//st->m_v2.getSize(); reply-> ptr_vector3 = m_tagPairHashVec; // (char *)&st->m_v3; reply->size_vector3 = m_tagPairHashVecSize;//st->m_v3.getSize(); // crap, we gotta bubble sort these i think // but only tag pair hash vec bool flag = true; uint32_t *d = (uint32_t *)m_tagPairHashVec; // exclude the terminating 0 int32_t nd = (m_tagPairHashVecSize / 4) - 1; while ( flag ) { // breathe QUICKPOLL ( m_niceness ); flag = false; for ( int32_t i = 1 ; i < nd ; i++ ) { if ( d[i-1] <= d[i] ) continue; uint32_t tmp = d[i-1]; d[i-1] = d[i]; d[i] = tmp; flag = true; } } // just always do it //if ( ! req->m_getInlinkNeighborhoods ) return true; // convert "linkNode" into a string ptr into the document char *node = xml->getNodePtr(linkNode)->m_node; // . find the word index, "n" for this node // . this is INEFFICIENT!! char **wp = ww->getWords(); int32_t nw = ww->getNumWords(); int32_t n; for ( n = 0; n < nw && wp[n] < node ; n++ ) QUICKPOLL(m_niceness); // sanity check //if ( n >= nw ) { char *xx=NULL; *xx=0; } if ( n >= nw ) { log("links: crazy! could not get word before linknode"); g_errno = EBADENGINEER; return NULL; } //int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE; // get the ptrs to the sections, 1-1 with words //Section **sp = NULL; //if ( ss ) sp = ss->m_sectionPtrs; // . even tags in the article section have positive scores // . the scores array is 1-1 with the words in Words, not the nodes // in Xml. so we had to do that conversion. //if ( ! sp || !(sp[n]->m_flags & NOINDEXFLAGS) ) // reply->m_outlinkInContent = true; // // get the surrounding link text, around "linkNode" // // radius of 80 characters around n char sbuf[1201]; int32_t radius = 80; char *p = sbuf; char *pend = sbuf + 600; // . make a neighborhood in the "words" space [a,b] // . radius is in characters, so "convert" into words by dividing by 5 int32_t a = n - radius / 5; int32_t b = n + radius / 5; if ( a < 0 ) a = 0; if ( b > nw ) b = nw; int32_t *pp = pos->m_pos; int32_t len; // if too big shrink the biggest, a or b? while ( (len=pp[b]-pp[a]) >= 2 * radius + 1 ) { // decrease the largest, a or b if ( a<n && (pp[n]-pp[a])>(pp[b]-pp[n])) a++; else if ( b>n ) b--; } // only store it if we can if ( p + len + 1 < pend ) { // store it // FILTER the html entities!! int32_t len2 = pos->filter(p,pend,ww,a,b,NULL);//ss); // ensure NULL terminated p[len2] = '\0'; // store in reply. it will be serialized when sent. reply->ptr_surroundingText = p; reply->size_surroundingText = len2 + 1; } // breathe QUICKPOLL ( m_niceness ); // get title? it's slow because it sets the sections class if ( m_req->m_titleMaxLen > 0 && ! reply->ptr_tbuf && // don't get it anymore if getting link info because it // is slow... getThatTitle ) { Title *ti = getTitle(); if ( ! ti || ti == (Title *)-1 ) return (Msg20Reply *)ti; char *tit = ti->getTitle(); int32_t titLen = ti->getTitleLen(); reply-> ptr_tbuf = tit; reply->size_tbuf = titLen + 1; // include \0 if ( !
tit || titLen <= 0 ) { reply->ptr_tbuf = NULL; reply->size_tbuf = 0; } } m_replyValid = true; return reply; } //static void gotMsg5ListWrapper ( void *state , RdbList *list , Msg5 *msg5 ) { // XmlDoc *THIS = (XmlDoc *)state; // THIS->m_masterLoop ( THIS->m_masterState ); //} char **XmlDoc::getDiffbotPrimaryImageUrl ( ) { // use new json parser Json *jp = getParsedJson(); if ( ! jp || jp == (void *)-1 ) return (char **)jp; JsonItem *ji = jp->getFirstItem(); // assume none m_imageUrl2 = NULL; m_imageUrl2Valid = true; //logf(LOG_DEBUG,"ch32: url=%s",m_firstUrl.m_url); for ( ; ji ; ji = ji->m_next ) { QUICKPOLL(m_niceness); // skip if not number or string if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING ) continue; //char *topName = NULL; // what name level are we? // int32_t numNames = 1; // JsonItem *pi = ji->m_parent; // for ( ; pi ; pi = pi->m_parent ) { // // empty name? // if ( ! pi->m_name ) continue; // if ( ! pi->m_name[0] ) continue; // topName = pi->m_name; // numNames++; // } char *name0 = ji->m_name; char *name1 = NULL; char *name2 = NULL; if ( ji->m_parent ) name1 = ji->m_parent->m_name; if ( ji->m_parent->m_parent ) name2 = ji->m_parent->m_parent->m_name; // stop at first image for "images":[{ indicator if ( strcmp(name0,"url") == 0 && name1 && strcmp(name1,"images") == 0 ) break; // for products if ( strcmp(name0,"link") == 0 && name1 && strcmp(name1,"media") == 0 ) break; } if ( ! ji ) return &m_imageUrl2; int32_t vlen; char *val = ji->getValueAsString( &vlen ); // ok, we got it, just copy that m_imageUrlBuf2.safeMemcpy ( val , vlen ); m_imageUrlBuf2.nullTerm(); m_imageUrl2 = m_imageUrlBuf2.getBufStart(); return &m_imageUrl2; } // get the image url SPECIFIED by the page, so there is no guesswork here // unlike with the Images.cpp class char **XmlDoc::getImageUrl() { // return if valid if ( m_imageUrlValid ) return &m_imageUrl; // get first url Url *f = getFirstUrl(); if ( ! f || f == (Url *)-1 ) return (char **)f; // assume none m_imageUrl = NULL; m_imageUrlValid = true; // we use getDiffbotPrimaryImageUrl() above for doing thumbs if ( m_isDiffbotJSONObject || m_contentType == CT_JSON ) return &m_imageUrl; // all done if not youtube or meta cafe char *host = f->getHost(); char found = 0; if ( ! strncmp ( host , "www.youtube.com/" , 16 ) ) found = 1; if ( ! strncmp ( host , "youtube.com/" , 12 ) ) found = 1; if ( ! strncmp ( host , "www.metacafe.com/" , 17 ) ) found = 2; if ( ! strncmp ( host , "metacafe.com/" , 13 ) ) found = 2; if ( ! found ) return &m_imageUrl; // char ptr char *u = f->getUrl(); // make it if ( found == 1 ) { char *s = strstr(u,"v="); // if url does not contain a "v=" then forget it if ( ! s ) return &m_imageUrl; // point to the id s += 2; //m_imageUrl = m_imageUrlBuf; //char *p = m_imageUrlBuf; m_imageUrlBuf.safeStrcpy("http://img.youtube.com/vi/"); // do not break //char *pend = m_imageUrlBuf + 80; // copy the id/number //for ( ; is_digit(*s) && p < pend ; ) *p++ = *s++; for ( ; is_digit(*s) ; s++ ) m_imageUrlBuf.pushChar(*s); // wrap it up m_imageUrlBuf.safeStrcpy ( "/2.jpg" ); // size includes \0; //m_imageUrlSize = p - m_imageUrl ; // sanity check //if ( m_imageUrlSize > 100 ) { char *xx=NULL;*xx=0; } m_imageUrl = m_imageUrlBuf.getBufStart(); return &m_imageUrl; } // must be meta cafe now // http://www.metacafe.com/watch/559561/surfer_girls_vol_2/ // http://s2.mcstatic.com/thumb/559561.jpg // scan url path for first digit for ( char *t = f->getPath() ; *t ; t++ ) { // look for digit if ( ! 
is_digit ( *t ) ) t++; // grab that int32_t id = atol ( t ); // skip ifnot good if ( id <= 0 ) continue; // make the url //m_imageUrl = m_imageUrlBuf; //char *p = m_imageUrlBuf; //gbmemcpy ( p , "http://s2.mcstatic.com/thumb/" , 29 ); //p += 29; //p += sprintf ( p , "%"INT32"" , id ); //gbmemcpy ( p , ".jpg\0" , 5 ); //p += 5; m_imageUrlBuf.safePrintf("http://s2.mcstatic." "com/thumb/%"INT32".jpg", id); m_imageUrl = m_imageUrlBuf.getBufStart(); // size includes \0; //m_imageUrlSize = p - m_imageUrl ; // sanity check //if ( m_imageUrlSize > 100 ) { char *xx=NULL;*xx=0; } break; } return &m_imageUrl; } MatchOffsets *XmlDoc::getMatchOffsets () { // return it if it is set if ( m_matchOffsetsValid ) return &m_matchOffsets; // need a buncha crap Words *ww = getWords(); if ( ! ww || ww == (Words *)-1 ) return (MatchOffsets *)ww; Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (MatchOffsets *)xml; Matches *mm = getMatches(); if ( ! mm || mm == (Matches *)-1 ) return (MatchOffsets *)mm; m_matchOffsets.set ( xml , ww , mm , true ); // getMatches=true m_matchOffsetsValid = true; return &m_matchOffsets; } Query *XmlDoc::getQuery() { if ( m_queryValid ) return &m_query; // bail if no query if ( ! m_req || ! m_req->ptr_qbuf ) { m_queryValid = true; return &m_query; } // return NULL with g_errno set on error if ( ! m_query.set2( m_req->ptr_qbuf , m_req->m_langId , true ) ) return NULL; m_queryValid = true; return &m_query; } Matches *XmlDoc::getMatches () { // return it if it is set if ( m_matchesValid ) return &m_matches; // if no query, matches are empty if ( ! m_req->ptr_qbuf ) { m_matchesValid = true; return &m_matches; } // cache it for one hour //XmlDoc *od = getOldXmlDoc ( 3600 ); //if ( ! od || od == (XmlDoc *)-1 ) return (Matches *)od; //if ( od->isEmpty() ) od = NULL; // need a buncha crap Words *ww = getWords(); if ( ! ww || ww == (Words *)-1 ) return (Matches *)ww; Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (Matches *)xml; Bits *bits = getBitsForSummary(); if ( ! bits || bits == (Bits *)-1 ) return (Matches *)bits; Sections *ss = getSections(); if ( ! ss || ss == (void *)-1) return (Matches *)ss; Pos *pos = getPos(); if ( ! pos || pos == (Pos *)-1 ) return (Matches *)pos; Title *ti = getTitle(); if ( ! ti || ti == (Title *)-1 ) return (Matches *)ti; //Synonyms *syn = getSynonyms(); //if ( ! syn || syn == (void *)-1 ) return (Matches *)syn; Phrases *phrases = getPhrases(); if ( ! phrases || phrases == (void *)-1 ) return (Matches *)phrases; Query *q = getQuery(); if ( ! q ) return (Matches *)q; // set it up m_matches.setQuery ( q ); // returns false and sets g_errno on error if ( ! m_matches.set ( this , ww , //syn , phrases , ss , bits , pos , xml , ti , m_niceness ) ) return NULL; // we got it m_matchesValid = true; return &m_matches; } // sender wants meta description, custom tags, etc. char *XmlDoc::getDescriptionBuf ( char *displayMetas , int32_t *dsize ) { // return the buffer if we got it if ( m_dbufValid ) { *dsize = m_dbufSize; return m_dbuf; } Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (char *)xml; // now get the content of the requested display meta tags //char dbuf [ 1024*64 ]; char *dbufEnd = m_dbuf + 1024;//1024*64; char *dptr = m_dbuf; char *pp = displayMetas; char *ppend = pp + gbstrlen(displayMetas); // loop over the list of requested meta tag names while ( pp < ppend && dptr < dbufEnd ) { // skip initial spaces. 
meta tag names are ascii always i guess while ( *pp && is_wspace_a(*pp) ) pp++; // that's the start of the meta tag name char *s = pp; // . find end of that meta tag name // . can end in :<integer> which specifies max len while ( *pp && ! is_wspace_a(*pp) && *pp != ':' ) pp++; // assume no max length to the content of this meta tag int32_t maxLen = 0x7fffffff; // save current char char c = *pp; // . NULL terminate the name // . before, overflowed the request buffer and caused core! // . seems like it is already NULL terminated if ( *pp ) *pp = '\0'; // always advance regardless though pp++; // if ':' was specified, get the max length if ( c == ':' ) { if ( is_digit(*pp) ) maxLen = atoi ( pp ); // skip over the digits while ( *pp && ! is_wspace_a (*pp) ) pp++; } // don't exceed our total buffer size (save room for \0 at end) int32_t avail = dbufEnd - dptr - 1; if ( maxLen > avail ) maxLen = avail; // store the content at "dptr" (do not exceed "maxLen" bytes) int32_t wlen = xml->getMetaContent ( dptr , // write buf maxLen , // buf length s , // name value gbstrlen(s) , // name len "name" , // http-equiv/name false );// convert &#'s? dptr[wlen] = '\0'; // test it out if ( ! verifyUtf8 ( dptr ) ) { log("xmldoc: invalid utf8 content for meta tag %s.",s); continue; } // advance and NULL terminate dptr += wlen; *dptr++ = '\0'; // bitch if we truncated if ( dptr >= dbufEnd ) log("query: More than %"INT32" bytes of meta tag " "content " "was encountered. Truncating.", (int32_t)(dbufEnd-m_dbuf)); } // what is the size of the content of displayed meta tags? m_dbufSize = dptr - m_dbuf; m_dbufValid = true; *dsize = m_dbufSize; return m_dbuf; } SafeBuf *XmlDoc::getHeaderTagBuf() { if ( m_htbValid ) return &m_htb; Sections *ss = getSections(); if ( ! ss || ss == (void *)-1) return (SafeBuf *)ss; int32_t count = 0; // scan sections Section *si = ss->m_rootSection; moreloop: for ( ; si ; si = si->m_next ) { // breathe QUICKPOLL(m_niceness); if ( si->m_tagId != TAG_H1 ) continue; // if it contains now text, this will be -1 // so give up on it if ( si->m_firstWordPos < 0 ) continue; if ( si->m_lastWordPos < 0 ) continue; // ok, it works, get it break; } // if no h1 tag then make buf empty if ( ! si ) { m_htb.nullTerm(); m_htbValid = true; return &m_htb; } // otherwise, set it char *a = m_words.m_words[si->m_firstWordPos]; char *b = m_words.m_words[si->m_lastWordPos] ; b += m_words.m_wordLens[si->m_lastWordPos]; // copy it m_htb.safeMemcpy ( a , b - a ); m_htb.pushChar('\0'); si = si->m_next; // add more? if ( count++ < 3 ) goto moreloop; m_htbValid = true; return &m_htb; } Title *XmlDoc::getTitle ( ) { if ( m_titleValid ) return &m_title; // need a buncha crap Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (Title *)xml; Words *ww = getWords(); if ( ! ww || ww == (Words *)-1 ) return (Title *)ww; Sections *sections = getSections(); if ( ! sections ||sections==(Sections *)-1) return (Title *)sections; Pos *pos = getPos(); if ( ! pos || pos == (Pos *)-1 ) return (Title *)pos; Query *q = getQuery(); if ( ! q ) return (Title *)q; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; int32_t titleMaxLen = cr->m_titleMaxLen; if ( m_req ) titleMaxLen = m_req->m_titleMaxLen; // limit for speed, some guys have a 100k word title! if ( titleMaxLen > 256 ) titleMaxLen = 256; m_titleValid = true; if ( ! 
m_title.setTitle ( this , xml , ww , sections , pos , titleMaxLen , 0xffff , NULL , q , cr , m_niceness ) ) return NULL; return &m_title; } Summary *XmlDoc::getSummary () { if ( m_summaryValid ) return &m_summary; // xml and json docs have empty summaries for now uint8_t *ct = getContentType(); if ( ! ct || ct == (void *)-1 ) return (Summary *)ct; if ( *ct == CT_JSON || *ct == CT_XML ) { m_summaryValid = true; return &m_summary; } // need a buncha crap Words *ww = getWords(); if ( ! ww || ww == (Words *)-1 ) return (Summary *)ww; Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (Summary *)xml; Bits *bits = getBitsForSummary(); if ( ! bits || bits == (Bits *)-1 ) return (Summary *)bits; Sections *sections = getSections(); if ( ! sections ||sections==(Sections *)-1) return (Summary *)sections; Pos *pos = getPos(); if ( ! pos || pos == (Pos *)-1 ) return (Summary *)pos; char *site = getSite (); if ( ! site || site == (char *)-1 ) return (Summary *)site; int64_t *d = getDocId(); if ( ! d || d == (int64_t *)-1 ) return (Summary *)d; Matches *mm = getMatches(); if ( ! mm || mm == (Matches *)-1 ) return (Summary *)mm; Title *ti = getTitle(); if ( ! ti || ti == (Title *)-1 ) return (Summary *)ti; Query *q = getQuery(); if ( ! q ) return (Summary *)q; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // . get the highest number of summary lines that we need // . the summary vector we generate for doing summary-based deduping // typically has more lines in it than the summary we generate for // displaying to the user int32_t numLines = m_req->m_numSummaryLines; if ( cr->m_percentSimilarSummary > 0 && cr->m_percentSimilarSummary < 100 && m_req->m_getSummaryVector && cr->m_summDedupNumLines > numLines ) // request more lines than we will display numLines = cr->m_summDedupNumLines; // int16_tcut Summary *s = &m_summary; // time cpu set time int64_t start = gettimeofdayInMilliseconds(); m_cpuSummaryStartTime = start; // make sure summary does not include title char *tbuf = ti->m_title; // this does not include the terminating \0 int32_t tbufLen = ti->m_titleBytes; // compute the summary bool status; status = s->set2( xml , ww , bits , sections , pos , q , (int64_t *)m_req->ptr_termFreqs , (float *)m_req->ptr_affWeights , false , // doStemming m_req->m_summaryMaxLen , numLines , // . displayLines, # lines we are displaying // . Summary::getDisplayLen() will return the // length of the summary to display m_req->m_numSummaryLines , m_req->m_summaryMaxNumCharsPerLine, m_req->m_ratInSummary , getFirstUrl() , //&reply->m_queryProximityScore , mm , tbuf , tbufLen ); // error, g_errno should be set! if ( ! status ) return NULL; m_summaryValid = true; return &m_summary; } char *XmlDoc::getHighlightedSummary ( ) { if ( m_finalSummaryBufValid ) { //char *fsum = m_finalSummaryBuf.getBufStart(); //if ( ! fsum ) fsum = (char *)0x01; return m_finalSummaryBuf.getBufStart(); } Summary *s = getSummary(); if ( ! s || s == (void *)-1 ) return (char *)s; Query *q = getQuery(); if ( ! q ) return (char *)q; // get the summary char *sum = s->getSummary(); //int32_t sumLen = s->getSummaryLen(); int32_t sumLen = s->getSummaryDisplayLen(); //sum[sumLen] = 0; // assume no highlighting? if ( ! m_req->m_highlightQueryTerms || sumLen == 0 ) { m_finalSummaryBuf.safeMemcpy ( sum , sumLen ); m_finalSummaryBuf.nullTerm(); m_finalSummaryBufValid = true; return m_finalSummaryBuf.getBufStart(); //char *fsum = m_finalSummaryBuf.getBufStart(); //if ( ! fsum ) fsum = (char *)0x01; //return fsum; } if ( ! 
m_langIdValid ) { char *xx=NULL;*xx=0; } //char tt[5000]; Highlight hi; SafeBuf hb; // highlight the query in it int32_t hlen = hi.set ( &hb, //tt , //4999 , sum, sumLen, m_langId, q, false , // doStemming? false , //click&scroll? NULL , // base url "<b>" , // front tag "</b>" , // back tag 0, m_niceness ); // highlight::set() returns 0 on error if ( hlen < 0 ) { log("build: highlight class error = %s",mstrerror(g_errno)); if ( ! g_errno ) { char *xx=NULL;*xx=0; } return NULL; } // store into our safebuf then m_finalSummaryBuf.safeMemcpy ( &hb );//tt , hlen + 1 ); m_finalSummaryBufValid = true; m_finalSummaryBuf.nullTerm(); return m_finalSummaryBuf.getBufStart(); //char *fsum = m_finalSummaryBuf.getBufStart(); //if ( ! fsum ) fsum = (char *)0x01; //return fsum; } // // GET GIGABIT SAMPLE // // // This will get samples surrounding all the query terms for purposes // of gigabits generation. We don't just generate gigabits from the // WHOLE document because it takes much longer?? is that still true? // We assume that the first call to getTopLines() above set // matches/numMatches. We use those arrays to // skip directly to just the query terms in the document and save time. // We may have to reset the Scores array here if we want to use it ltr. // // aka getGigabitSample. get gigabit sample // SafeBuf *XmlDoc::getSampleForGigabits ( ) { if ( m_gsbufValid ) return &m_gsbuf; // assume empty //m_gsbuf = NULL; // basically, exit now if no sample needed if ( m_req->m_bigSampleMaxLen <= 0 || m_req->m_bigSampleRadius <= 0 ) { m_gsbufValid = true; return &m_gsbuf; } uint8_t *ct = getContentType(); if ( ! ct || ct == (void *)-1 ) return (SafeBuf *)ct; // if it is json then only return the json fields that are strings // and json decode them... separate each field with a \0. if ( *ct == CT_JSON ) return getSampleForGigabitsJSON(); Words *ww = getWords(); if ( ! ww || ww == (Words *)-1 ) return (SafeBuf *)ww; // just send back the whole page, but separate each section // with \0. make only sentences end with ? ! or ., headers // not with anything, and no menu items Sections *sections = getSections(); if ( ! sections ||sections==(Sections *)-1) return (SafeBuf *)sections; Section *sp = sections->m_rootSection; SafeBuf reply; reply.setLabel("gbtrepbuf"); // m_contentLen is invalid, don't use that here use size_utf8Content if ( ! reply.reserve ( size_utf8Content + 1000 ) ) return NULL; // scan the sections of the document for ( ; sp ; sp = sp->m_next ) { QUICKPOLL(m_niceness); // do not allow menu crap if ( sp->m_flags & ( SEC_MENU | SEC_MENU_SENTENCE | SEC_MENU_HEADER ) ) continue; // must be sentence or header bool ok = false; if ( sp->m_flags & SEC_SENTENCE ) ok = true; // headings are ok, just don't use as sentences... if ( sp->m_flags & SEC_HEADING ) ok = true; if ( ! ok ) continue; // store without tags char *p = ww->m_words[sp->m_a]; // include period after final word in section int32_t b = sp->m_b - 1; char *e = ww->m_words[b] + ww->m_wordLens[b]; // if 3+ commas and one comma for every 4 words, forget it, // it is probably a list! well, process it, but make sure it // does not end in a period so we do not display it // as a fast fact, but we use it for gigabits. bool isList = false; int32_t commaCount = 0; int32_t bracketCount = 0; for ( char *z = p ; z < e ; z++ ) { if ( *z == ',' ) commaCount++; // fix ] [AllTheWeb] [Gigablast] [Google] [HotBot]... if ( *z == '[' ) bracketCount++; } int32_t naw = (b - sp->m_a) / 2; // just skip even for gigabits if too long. 
most likely // a spammy list of nouns. if ( naw >= 130 ) continue; if ( commaCount >= 3 && commaCount *4 >= naw ) isList = true; if ( commaCount >= 10 ) isList = true; if ( bracketCount >= 3 ) isList = true; // too much uppercase? bool yelling = false; int32_t upper = 0; int32_t lower = 0; char cs = 0; for ( char *z = p ; z < e ; z += cs ) { cs = getUtf8CharSize(z); if ( ! is_alpha_utf8(z) ) continue; if ( is_upper_utf8(z) ) upper++; if ( is_lower_utf8(z) ) lower++; } if ( upper > lower ) yelling = true; // ending ) or ] if ( e[0] == ')' ) e++; else if ( e[0] == ']' ) e++; // incorporate period etc. if ( e[0] == '.' ) e++; else if ( e[0] == '!' ) e++; else if ( e[0] == '?' ) e++; else if ( e[0] == ';' ) e++; // must end in a period, or .) or .] bool endsInPeriod = false; if ( e-2 >= p && ( e[-1] =='.' || e[-1] =='!' || e[-1] =='?' ) ) endsInPeriod = true; if ( (e[-1] == ')' || e[-1] == ']' ) && (e[-2] == '.' || e[-2] == '?' || e[-2] == '!' ) ) endsInPeriod = true; //int32_t off = reply.length(); // filter out tags and \n's and \r's and store into "reply" if ( ! reply.safePrintFilterTagsAndLines ( p , e-p ,false ) ) return NULL; // if a sentence and does not end in period, toss one in //if ( sp->m_flags & SEC_SENTENCE ) { // if ( e[-1] !='.' && // e[-1] !='!' && // e[-1] !='?' && // e[-1] !=']' && // e[-1] !=')' ) // reply.pushChar('.'); //} // too huge? if # of ALNUM words > 70 it's too big. bool isHuge = false; if ( naw > 70 ) isHuge = true; // ending in a * indicates a printable sentence for fast facts if ( (sp->m_flags & SEC_SENTENCE) && ! isList && ! isHuge && ! yelling && endsInPeriod ) reply.pushChar('*'); // delineate sentences/headers/sections with | now so // we can still allow a word to be a gigabit even if it is // not in a sentence with a query term //reply.pushChar('\0'); reply.pushChar('|'); char *pc = reply.getBufStart() + reply.length() - 1; *pc = '\0'; // debug //char *x = reply.getBufStart() + off; // turn off fast fact debug for now //log("fastfact: fastfact: %s",x); // revert back to | *pc = '|'; // stop? this fixes the query 'lesbain vedeo porno' on // my cluster taking 10 seconds to get gigabits for. // bigsamplemaxlen is 1000 as of 12/4/2013. if ( reply.length() >= m_req->m_bigSampleMaxLen ) break; } // a final \0 reply.pushChar('\0'); // move it over to m_gsbuf now m_gsbuf.stealBuf ( &reply ); // we are valid m_gsbufValid = true; // success return &m_gsbuf; // need a buncha crap Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (SafeBuf *)xml; Pos *pos = getPos(); if ( ! pos || pos == (Pos *)-1 ) return (SafeBuf *)pos; Matches *mm = getMatches(); if ( ! mm || mm == (Matches *)-1 ) return (SafeBuf *)mm; // convert length to number of words int32_t bigSampleRadius = m_req->m_bigSampleRadius / 5; // at least 1 if ( bigSampleRadius <= 0 ) bigSampleRadius = 1; // alloc for whole document? int32_t max = xml->getContentLen() ; // do not exceed if ( max > m_req->m_bigSampleMaxLen ) max = m_req->m_bigSampleMaxLen; // make sure we have something in words too. i guess no sample? if ( max <= 2 ) { m_gsbufValid = true; return &m_gsbuf; } // a flag so we don't overlap samples... int32_t lastb = -1; // . set m_buf to where we write the sample // . add a byte for the terminating \0 int32_t gsbufAllocSize = max + 1; // temp hack //m_gsbuf = (char *)mmalloc(m_gsbufAllocSize,"gsbuf"); if ( ! m_gsbuf.reserve ( gsbufAllocSize, "gsbuf" ) ) return NULL; // g_errno should be set... //if ( ! 
m_gsbuf ) return NULL; //m_freeBuf = true; // set our pointer char *pstart = m_gsbuf.getBufStart(); char *p = pstart; char *pend = pstart + max; int32_t nw = ww->m_numWords; // skip to first query term for ( int32_t i = 0 ; i < mm->m_numMatches ; i++ ) { // breathe QUICKPOLL ( m_niceness ); // get the match Match *m = &mm->m_matches[i]; // break out if match is not from the document's Words class if ( m->m_words != ww ) break; // the word # int32_t n = m->m_wordNum; // got a match, add this samplet, [a,b] int32_t a = n - bigSampleRadius; int32_t b = n + bigSampleRadius; if ( a < 0 ) a = 0; if ( b > nw ) b = nw; if ( a < lastb ) a = lastb; // ensure the samples are separated by \0 else if ( p > pstart && p + 2 < pend ) { *p++ = '\0'; } Pos *pos = m->m_pos; int32_t *pp = pos->m_pos; int32_t len = pp[b+1] - pp[a]; // if match would send us over, we are done if ( p + len >= pend ) break; len = pos->filter(p,pend,m->m_words,a,b,m->m_sections); // for debug (mdw) //log("query: gigabitsample#%"INT32"=%s",i,p); p += len; // we are the new lastb lastb = b; } // always null terminate *p++ = '\0'; // . set sample size // . this includes terminating 0\'s in this case //int32_t gsbufSize = p - m_gsbuf; m_gsbuf.setLength( p - m_gsbuf.getBufStart() ); // we are valid m_gsbufValid = true; // for debug (mdw) //log("query: finalgigabitsample=%s",m_gsbuf); // success return &m_gsbuf; } // if it is json then only return the json fields that are strings // and json decode them... separate each field with a \0. SafeBuf *XmlDoc::getSampleForGigabitsJSON ( ) { SafeBuf tmp; // use new json parser Json *jp = getParsedJson(); if ( ! jp || jp == (void *)-1 ) return (SafeBuf *)jp; JsonItem *ji = jp->getFirstItem(); for ( ; ji ; ji = ji->m_next ) { QUICKPOLL(m_niceness); // skip if not string if ( ji->m_type != JT_STRING ) continue; // store field value char *val = ji->getValue(); int valLen = ji->getValueLen(); // if it contains html then skip it as a gigabit candidate. // otherwise our fast facts end up including html tags in them // in computeFastFacts() in Msg40.cpp int i; for ( i = 0 ; i < valLen ; i++ ) if ( val[i] == '<' ) break; if ( i < valLen ) continue; if ( ! tmp.pushChar('\n') ) return NULL; // if ( ! tmp.safePrintf("<p>")) // return NULL; // decode the json //SafeBuf xx; if ( ! tmp.safeDecodeJSONToUtf8(val,valLen,m_niceness)) return NULL; // escape out the html // if ( ! tmp.htmlEncode ( xx.getBufStart() )) // return NULL; // two new lines if ( ! tmp.safePrintf("<hr>")) return NULL; if ( ! tmp.pushChar('\n') ) return NULL; if ( ! tmp.pushChar('\n') ) return NULL; if ( ! tmp.pushChar('\n') ) return NULL; } if ( ! tmp.nullTerm() ) return NULL; Xml xml; if ( ! xml.set ( tmp.getBufStart() , tmp.length() , false , // ownData? 0 , // allocSize false , // pure xml? m_version , false , // setParentsArg? m_niceness , CT_HTML ) ) // *ct ) ) return NULL; Words ww; if ( ! ww.set ( &xml , true , m_niceness ) ) return NULL; Bits bb; if ( ! bb.set ( &ww ,0 ,m_niceness ) ) return NULL; Phrases pp; if ( ! pp.set ( &ww , &bb , true,false,0,m_niceness) ) return NULL; // this uses the sectionsReply to see which sections are // "text", etc. rather than compute it expensively Sections sec; if ( !sec.set ( &ww , &pp , &bb , getFirstUrl() , 0,//*d , 0,//*sh64 , // 64 bits "",//cr->m_coll , m_niceness , NULL,//m_masterState , // state NULL,//m_masterLoop , // callback CT_JSON, // *ct , NULL,//&m_dates , NULL , // sd // sections data true , // sections data valid? 
NULL , // sv // for m_nsvt NULL , // buf 0 )) { // bufSize return NULL; } // now add each sentence section into the buffer // scan the sentences if we got those char **wptrs = ww.getWords(); int32_t *wlens = ww.getWordLens(); Section *ss = sec.m_firstSent; for ( ; ss ; ss = ss->m_nextSent ) { // breathe QUICKPOLL(m_niceness); // count of the alnum words in sentence int32_t count = ss->m_alnumPosB - ss->m_alnumPosA; // start with one word! count--; // how can it be less than one alnum word if ( count < 0 ) continue; // store it char *wp1 = wptrs[ss->m_senta]; char *wp2 = wptrs[ss->m_sentb-1] + wlens[ss->m_sentb-1]; bool gotTerm = (wp2[0]=='.' || wp2[0]=='?' || wp2[0]=='!' ) ; //if ( ! gotTerm ) continue; if ( ! m_gsbuf.safeMemcpy ( wp1 , wp2 - wp1 ) ) return NULL; // puncty? if ( gotTerm && ! m_gsbuf.pushChar(wp2[0])) return NULL; // to indicate end of header or sentence, in order to // qualify as a fast fact, we must add a '*'. see // PageResults.cpp, search for ''*'' if ( gotTerm && ! m_gsbuf.pushChar('*') ) return NULL; if ( ! m_gsbuf.pushChar('\0') ) return NULL; } m_gsbufValid = true; return &m_gsbuf; } // . good sites sometimes have hacked pages // . try to identify those char *XmlDoc::getIsCompromised ( ) { if ( m_isCompromisedValid ) return &m_isCompromised; Xml *xml = getXml(); if ( ! xml || xml == (void *)-1 ) return (char *)xml; int32_t n = xml->getNumNodes(); XmlNode *nodes = xml->getNodes(); // assume compromised m_isCompromised = true; m_isCompromisedValid = true; // find the first meta summary node for ( int32_t i = 0 ; i < n ; i++ ) { // continue if not a meta tag if ( nodes[i].m_nodeId != TAG_FONT ) continue; // only get content for <meta name=..> not <meta http-equiv=..> int32_t stlen; char *style = nodes[i].getFieldValue ( "style" , &stlen ); // skip if none if ( ! style || stlen <= 6 ) continue; // NULL term char c = style[stlen]; style[stlen] = '\0'; char *hc = strstr(style,"height"); char *wc = strstr(style,"width"); // skip if neighter if ( ! hc && ! wc ) continue; // advance if ( hc ) hc += 6; if ( wc ) wc += 5; while ( is_wspace_a(*hc) ) hc++; while ( is_wspace_a(*wc) ) wc++; if ( hc && *hc == ':' ) hc++; if ( wc && *wc == ':' ) hc++; while ( is_wspace_a(*hc) ) hc++; while ( is_wspace_a(*wc) ) wc++; style[stlen] = c; // a zero height or width is a signal of invisble text and of // our syzygy compromised site to compromised site spammer if ( *hc == '0' ) return &m_isCompromised; if ( *wc == '0' ) return &m_isCompromised; } m_isCompromised = false; return &m_isCompromised; } // <meta name=robots value=noarchive> // <meta name=gigabot value=noarchive> char *XmlDoc::getIsNoArchive ( ) { if ( m_isNoArchiveValid ) return &m_isNoArchive; Xml *xml = getXml(); if ( ! xml || xml == (void *)-1 ) return (char *)xml; m_isNoArchive = false; m_isNoArchiveValid = true; int32_t n = xml->getNumNodes(); XmlNode *nodes = xml->getNodes(); // find the meta tags for ( int32_t i = 0 ; i < n ; i++ ) { // breathe QUICKPOLL(m_niceness); // continue if not a meta tag if ( nodes[i].m_nodeId != TAG_META ) continue; // get robots attribute int32_t alen; char *att; // <meta name=robots value=noarchive> att = nodes[i].getFieldValue ( "name" , &alen ); // need a name! if ( ! att ) continue; // get end char *end = att + alen; // skip leading spaces while ( att < end && *att && is_wspace_a(*att) ) att++; // must be robots or gigabot. 
skip if not if ( strncasecmp(att,"robots" ,6) && strncasecmp(att,"gigabot",7) ) continue; // get the content value att = nodes[i].getFieldValue("content",&alen); // skip if none if ( ! att ) continue; // get end end = att + alen; // skip leading spaces while ( att < end && *att && is_wspace_a(*att) ) att++; // is it noarchive? skip if no such match if ( strncasecmp(att,"noarchive",9) ) continue; // ok, we got it m_isNoArchive = true; break; } // return what we got return &m_isNoArchive; } // this vector's components are 64-bit, not the usual 32-bit int64_t **XmlDoc::getAdVector ( ) { if ( m_adVectorValid ) return &ptr_adVector; Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (int64_t **)xml; setStatus ( "parsing out ad ids"); // assume valid m_adVectorValid = true; int32_t na = 0; int32_t n = xml->getNumNodes(); XmlNode *nodes = xml->getNodes(); // find the script tags for ( int32_t i = 0 ; i < n ; i++ ) { // breathe QUICKPOLL(m_niceness); // continue if not a script tag if ( nodes[i].m_nodeId != TAG_SCRIPT ) continue; // 83 // must be a front tag, not a back tag if ( xml->isBackTag ( i ) ) continue; // find the back tag for it int32_t j; for ( j = i ; j < n ; j++ ) { // another script tag if( nodes[j].m_nodeId != TAG_SCRIPT ) continue; // must be a back tag this time if ( ! xml->isBackTag ( j ) ) continue; // ok, we got it break; } // if no back tag, give up if ( j == n ) break; // buf/len defines the script area char *buf = xml->getNode(i); int32_t len = xml->getNode(j) - buf; // skip this script tag for next loop i = j; bool found = false; // start off looking for google char *needles[3] = { "google_ad_client" , "ctxt_ad_partner", "http://ad" }; char *providers[3] = { "google" , "yahoo", "doubleclick" }; for ( int32_t k = 0 ; k < 3 ; k++ ) { // try to match this needle char *match = needles[k]; // try to get a match char *p = strnstr ( buf, match , len ); // go again if ( ! p ) continue; // do not exceed the script area char *pend = buf + len; // it is in quotes // pub-uint64_t for google ad, uint32_t for yahoo // check for double or single quote while (k<2 && p<pend && *p != '"' && *p != '\'') p++; // it must have them!... i guess if ( p >= pend ) continue; // point to after the quote char *pbegin = ++p; // find the ending quote while (k<2 && p<pend && *p != '"' && *p != '\'') p++; // if none, bail if ( p >= pend ) continue; // get length of the ad client id between the quotes int32_t adClientLen = p - pbegin; if ( k == 2 ) { p = strnstr(p,".doubleclick.net/",pend-p); if ( ! p ) continue; p += 17; // look for doubleclick ads // user name is the second element of the path while(p < pend && *p != '/') p++; pbegin = ++p; while(p < pend && *p != '/') p++; if(p >= pend) continue; adClientLen = p - pbegin; found = true; } char *f = pbegin; char *fend = pbegin + adClientLen; for ( ; f < fend ; f++ ) { if ( is_alnum_a ( *f ) ) continue; if ( *f == '-' || *f == '_' || *f == '.' ) continue; break; } if ( f < fend ) continue; if ( adClientLen >= 400 ) continue; if ( adClientLen < 4 ) continue; // null term temp char c = *fend; *fend = '\0'; // hash it char buf[512]; sprintf(buf,"gbad:%s-%s",providers[k],pbegin); // put it back *fend = c; // . make the query term id // . first hash the field uint64_t h = hash64 ( "gbad" , 4 ); // then add in the other junk h = hash64 ( buf , gbstrlen(buf) , h ); // . now we will index that as-is // . and Msg25/LinkInfo can use to dedup voters! m_adIds[na++] = h; // stop if too many. save room for NULL termination.
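// NOTE: rough sketch of the composite term id built just above, assuming
// hash64(buf,len,seed) chains the seed the same way it does here; the ad
// client id shown is hypothetical:
//   uint64_t h = hash64 ( "gbad" , 4 );                   // field prefix
//   h = hash64 ( "gbad:google-pub-1234567890" , 26 , h ); // provider-clientid
// two linkers embedding the same ad client id therefore hash to the same
// term, which is what lets Msg25/LinkInfo dedup those voters.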
if ( na + 1 >= XD_MAX_AD_IDS ) break; } //look for another if not found or not ok. } // null term it like a good vector! no, those are 32-bit components, // we are a 64-bit component vector //m_adIds[na++] = 0; // point to where we should put them ptr_adVector = m_adIds; // store this i guess size_adVector = na * 8; // *lastNode = nn; return &ptr_adVector; } char *XmlDoc::getIsLinkSpam ( ) { if ( m_isLinkSpamValid ) return &m_isLinkSpam2; setStatus ( "checking if linkspam" ); Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (char *)xml; Links *links = getLinks(); if ( ! links || links == (Links *)-1 ) return (char *)links; int32_t *ip = getIp(); if ( ! ip || ip == (int32_t *)-1 ) return (char *)ip; int32_t **pici = getIndCatIds(); if ( ! pici || pici == (void *)-1 ) return (char *)pici; //LinkInfo *info1 = getLinkInfo1(); //if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1; int32_t *sni = getSiteNumInlinks(); if ( ! sni || sni == (int32_t *)-1 ) return (char *)sni; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // reset note m_note = NULL; // . if a doc is "link spam" then it cannot vote, or its // voting power is reduced // . look for indications that the link is from a guestbook // . doc length over 100,000 bytes consider it link spam m_isLinkSpamValid = true; m_isLinkSpam = ::isLinkSpam ( getFirstUrl(), // linker *ip , ptr_indCatIds , size_indCatIds / 4 , *sni , xml, links, MAXDOCLEN,//150000,//maxDocLen , &m_note , NULL , // &linkee , // url , -1 , // linkNode , cr->m_coll , m_niceness ); // set shadow m_isLinkSpam2 = (bool)m_isLinkSpam; return &m_isLinkSpam2; } void *zliballoc ( void *opaque , unsigned int items , unsigned int size ) { //log("db: got zlib alloc"); return (void *)mmalloc ( items * size , "zlib" ); } void zlibfree ( void *opaque , void *address ) { //log("db: got zlib free"); // -1 will tell Mem.cpp to look it up in the table mfree ( address , -1 , "zlib" ); } void *malloc_replace (void *pf , unsigned int nitems , unsigned int size ) { return g_mem.gbmalloc(size*nitems,"malloc_replace"); } void free_replace ( void *pf , void *s ) { // -1 means we don't know the size g_mem.gbfree(s,-1,"free_replace"); } int gbuncompress ( unsigned char *dest , uint32_t *destLen , unsigned char *source , uint32_t sourceLen ) { z_stream stream; int err; stream.next_in = (Bytef*)source; stream.avail_in = (uInt)sourceLen; // Check for source > 64K on 16-bit machine: if ((uLong)stream.avail_in != sourceLen) return Z_BUF_ERROR; stream.next_out = dest; stream.avail_out = (uInt)*destLen; if ((uLong)stream.avail_out != *destLen) return Z_BUF_ERROR; //stream.zalloc = (alloc_func)0; //stream.zfree = (free_func)0; stream.zalloc = malloc_replace;//zliballoc; stream.zfree = free_replace;//zlibfree; // this calls memcpy so make sure Profiler.cpp doesn't crash // since when it calls backtrace() that calls memcpy() too // and it's not async safe g_inMemcpy = 2; //we can be gzip or deflate err = inflateInit2(&stream, 47); g_inMemcpy = 0; if (err != Z_OK) return err; err = inflate(&stream, Z_FINISH); if (err != Z_STREAM_END) { inflateEnd(&stream); if (err == Z_NEED_DICT || (err == Z_BUF_ERROR && stream.avail_in == 0)) return Z_DATA_ERROR; return err; } *destLen = stream.total_out; err = inflateEnd(&stream); return err; } void deflateQuickPoll ( ) { QUICKPOLL(1); } int gbcompress ( unsigned char *dest , uint32_t *destLen , unsigned char *source , uint32_t sourceLen , int32_t encoding ) { int level = Z_DEFAULT_COMPRESSION; z_stream stream; int err; int method = 
Z_DEFLATED; //lots of mem, faster, more compressed, see zlib.h int windowBits = 31; int memLevel = 8; int strategy = Z_DEFAULT_STRATEGY; stream.next_in = (Bytef*)source; stream.avail_in = (uInt)sourceLen; #ifdef MAXSEG_64K // Check for source > 64K on 16-bit machine: if ((uLong)stream.avail_in != sourceLen) return Z_BUF_ERROR; #endif stream.next_out = dest; stream.avail_out = (uInt)*destLen; if ((uLong)stream.avail_out != *destLen) return Z_BUF_ERROR; //stream.zalloc = (alloc_func)0; //stream.zfree = (free_func)0; stream.zalloc = malloc_replace;//zliballoc; stream.zfree = free_replace;//zlibfree; stream.opaque = (voidpf)0; //we can be gzip or deflate if(encoding == ET_DEFLATE) err = deflateInit (&stream, level); else err = deflateInit2(&stream, level, method, windowBits, memLevel, strategy); if (err != Z_OK) { // zlib's incompatible version error? if ( err == -6 ) { log("zlib: zlib did you forget to add #pragma pack(4) to " "zlib.h when compiling libz.a so it aligns on 4-byte " "boundaries because we have that pragma in " "gb-include.h so its used when including zlib.h"); } return err; } // cygwin uses the system libz.a which is not hacked for our quickpoll #ifndef CYGWIN // tell deflat() to call quickpoll // MDW: 11/14/2014 don't do this for the 64bit zlib for now just to // save some time. do it later when it proves to be an issue. //setQuickPoll ( (char *)&g_loop.m_needsToQuickPoll, deflateQuickPoll); #endif // this calls memcpy so make sure Profiler.cpp doesn't crash // since when it calls backtrace() that calls memcpy() too // and it's not async safe g_inMemcpy = 3; err = deflate(&stream, Z_FINISH); g_inMemcpy = 0; if (err != Z_STREAM_END) { deflateEnd(&stream); return err == Z_OK ? Z_BUF_ERROR : err; } *destLen = stream.total_out; err = deflateEnd(&stream); return err; } // // NO NO don't use until use replace in[64] with SafeBuf in and out below // int gbcompress7 ( unsigned char *dest , uint32_t *destLen , unsigned char *source , uint32_t sourceLen , bool compress ) { //int32_t id = 1; // pass the input to the program through this file // rather than a pipe, since popen() seems broken char in[64]; if ( compress ) sprintf ( in , "%s/in.7z", g_hostdb.m_dir ); else sprintf ( in , "%s/out.7z", g_hostdb.m_dir ); unlink ( in ); // collect the output from the filter from this file char out[64]; if ( compress ) sprintf ( out , "%s/out.7z", g_hostdb.m_dir ); else sprintf ( out , "%s/in.7z", g_hostdb.m_dir ); if ( ! compress ) unlink ( out ); // ignore errno from those unlinks errno = 0; // open the input file retry11: int fd = open ( in , O_WRONLY | O_CREAT , S_IRWXU ); if ( fd < 0 ) { // valgrind if ( errno == EINTR ) goto retry11; log("build: Could not open file %s for writing: %s.", in,mstrerror(errno)); return -1; } retry12: // write the content into the input file int32_t w = write ( fd , source , sourceLen ); // valgrind if ( w < 0 && errno == EINTR ) goto retry12; // did we get an error if ( w != (int32_t)sourceLen ) { log("build: Error writing to %s: %s.",in,mstrerror(errno)); close(fd); return -1; } // close the file close ( fd ); // . open a pipe to pdf2html program // . the output will go to stdout //char cmd[2048]; SafeBuf cmd; // different commands to filter differt ctypes // -i : ignore images // -stdout: send output to stdout // -c : generate complex document // Google generates complex docs, but the large ones are horribly slow // in the browser, but docs with 2 cols don't display right w/o -c. // damn, -stdout doesn't work when -c is specified. 
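// NOTE: the pdf2html/-stdout/-c remarks above (and the ulimit note below)
// appear to be carried over from the external document-filter code; only the
// 7za commands built below are actually run here, through temp files in
// g_hostdb.m_dir and gbsystem(), since popen() seems broken (see above).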
// These ulimit sizes are max virtual memory in kilobytes. let's // keep them to 25 Megabytes // . the newer 2.6 kernels do not support ulimit !!! if ( compress ) // 7za a out.7z in.7z cmd.safePrintf( "%s7za a %s %s > /dev/null", g_hostdb.m_dir , out,in); else // -y = yes on all. so we overwrite "in.7z" cmd.safePrintf( "%s7za -o%s -y e %s > /dev/null", g_hostdb.m_dir,g_hostdb.m_dir , in);//,in); // breach sanity check //if ( gbstrlen(cmd) > 2040 ) { char *xx=NULL;*xx=0; } // exectue it int retVal = gbsystem ( cmd.getBufStart() ); if ( retVal == -1 ) log("gb: system(%s) : %s",cmd.getBufStart(), mstrerror(g_errno)); // all done with input file // clean up the binary input file from disk //if ( unlink ( in ) != 0 ) { // // log error // log("gbfilter: unlink (%s): %s\n",in,strerror(errno)); // // ignore it, since it was not a processing error per se // errno = 0; //} retry13: fd = open ( out , O_RDONLY ); if ( fd < 0 ) { // valgrind if ( errno == EINTR ) goto retry13; log("7zip: Could not open file %s for reading: %s.", out,mstrerror(errno)); return -1; } // to read - leave room for \0 int32_t toRead = MAXDOCLEN + 1000; retry14: // read right from pipe descriptor int32_t r = read (fd, dest,toRead); // note errors if ( r < 0 ) { // valgrind if ( errno == EINTR ) goto retry14; log("7zip: reading output: %s",mstrerror(errno)); // this is often bad fd from an oom error, so ignore it errno = 0; r = 0; } // clean up shop close ( fd ); // delete output file //unlink ( out ); if ( r > (int32_t)*destLen ) { char *xx=NULL;*xx=0; } // assign *destLen = r; // debug for now char *pre = ""; if ( ! compress ) pre = "un"; log("7zip: %scompressed %"UINT32" to %"UINT32" bytes" , pre,sourceLen , *destLen ); return Z_OK; } int gbuncompress7 ( unsigned char *dest , uint32_t *destLen , unsigned char *source , uint32_t sourceLen ) { return gbcompress7(dest,destLen,source,sourceLen,false); } /* bool XmlDoc::hashSingleTerm ( int64_t termId , HashInfo *hi ) { // combine with a non-NULL prefix if ( hi->m_prefix ) { int64_t prefixHash = hash64b ( hi->m_prefix ); // sanity test, make sure it is in supported list if ( getFieldCode3 ( prefixHash ) == FIELD_GENERIC ) { char *xx=NULL;*xx=0; } termId = hash64 ( termId , prefixHash ); } // save it? if ( m_wts && ! ::storeTerm ( "binary",6,termId,hi,0,0, MAXDENSITYRANK, MAXDIVERSITYRANK, MAXWORDSPAMRANK, hi->m_hashGroup, false,&m_wbuf,m_wts,false) ) return false; // int16_tcut HashTableX *dt = hi->m_tt; // sanity check if ( dt->m_ks != sizeof(key_t) ) { char *xx=NULL;*xx=0; } // make the key like we do in hashWords() key96_t k; k.n1 = hi->m_date; k.n0 = termId; // get current score for this wordid int32_t slot = dt->getSlot ( &k ); // does this termid/date already exist? if ( slot >= 0 ) { // done return true; } // otherwise, add a new slot char val = 1; if ( ! hi->m_tt->addKey ( (char *)k , &val ) ) return false; // return true on success return true; } */ bool storeTerm ( char *s , int32_t slen , int64_t termId , HashInfo *hi , int32_t wordNum , int32_t wordPos , char densityRank, char diversityRank , char wordSpamRank , char hashGroup, //bool isPhrase , SafeBuf *wbuf , HashTableX *wts , char synSrc , char langId , POSDBKEY key ) { // store prefix int32_t poff = wbuf->length(); // int16_tcut char *p = hi->m_prefix; // add the prefix too! if ( p && ! wbuf->safeMemcpy(p,gbstrlen(p)+1)) return false; // none? if ( ! p ) poff = -1; // store description int32_t doff = wbuf->length(); // int16_tcut char *d = hi->m_desc; // add the desc too! if ( d && ! 
wbuf->safeMemcpy(d,gbstrlen(d)+1) ) return false; // none? if ( ! d ) doff = -1; // store term int32_t toff = wbuf->length(); // add it if ( ! wbuf->safeMemcpy ( s , slen ) ) return false; // make this TermDebugInfo ti; ti.m_termOff = toff; ti.m_termLen = slen; ti.m_descOff = doff; ti.m_prefixOff = poff; ti.m_date = hi->m_date; ti.m_shardByTermId = hi->m_shardByTermId; ti.m_termId = termId; //ti.m_weight = 1.0; //ti.m_spam = -1.0; ti.m_diversityRank = diversityRank; ti.m_densityRank = densityRank; ti.m_wordSpamRank = wordSpamRank; ti.m_hashGroup = hashGroup; ti.m_wordNum = wordNum; ti.m_wordPos = wordPos; ti.m_langId = langId; ti.m_key = key; // was sitehash32 //ti.m_facetVal32 = hi->m_facetVal32;//sentHash32 = hi->m_sentHash32; // save for printing out an asterisk ti.m_synSrc = synSrc; // isSynonym = isSynonym; // get language bit vec ti.m_langBitVec64 = g_speller.getLangBits64(&termId); //if ( isPhrase ) ti.m_synSrc = SOURCE_NGRAM; /* // the weight vec for the words and phrases for ( int32_t j = 0 ; j < MAX_RULES ; j++ ) ti.m_rv[j] = 1.0; int32_t *wscores = NULL; if ( weights && ! isPhrase ) wscores = weights->m_ww; if ( weights && isPhrase ) wscores = weights->m_pw; // int16_tcut int32_t i = wordNum; if ( weights && ! weights->m_rvw ) { char *xx=NULL;*xx=0; } if ( weights && ! weights->m_rvp ) { char *xx=NULL;*xx=0; } float *rv = NULL; if ( weights && ! isPhrase ) rv = &weights->m_rvw[i*MAX_RULES]; if ( weights && isPhrase ) rv = &weights->m_rvp[i*MAX_RULES]; if ( weights ) ti.m_weight = (float)wscores[i] / (float)DW; if ( weights ) gbmemcpy ( &ti.m_rv, rv , MAX_RULES*sizeof(float)); // no, because if this is zero we force it up to 1! //if ( weights ) // ti.m_score32 = (int32_t)((float)ti.m_score32 * ti.m_weight); ti.m_score32 = score; if ( isSynonym ) ti.m_score32 = score; */ // make the key key96_t k; k.n1 = 0; // date k.n0 = termId; // store it return wts->addKey ( &k , &ti ) ; } bool XmlDoc::hashSingleTerm ( char *s , int32_t slen , HashInfo *hi ) { // empty? if ( slen <= 0 ) return true; if ( ! m_versionValid ) { char *xx=NULL;*xx=0; } if ( hi->m_useCountTable && ! m_countTableValid){char *xx=NULL;*xx=0; } // // POSDB HACK: temporarily turn off posdb until we hit 1B pages! // //if ( ! m_storeTermListInfo ) // return true; // a single blob hash int64_t termId = hash64 ( s , slen ); // combine with prefix int64_t final = termId; // combine with a non-NULL prefix int64_t prefixHash = 0LL; if ( hi->m_prefix ) { prefixHash = hash64b ( hi->m_prefix ); final = hash64 ( termId , prefixHash ); } // call the other guy now //return hashSingleTerm ( final , hi ); // int16_tcut HashTableX *dt = hi->m_tt; // sanity check if ( dt->m_ks != sizeof(key144_t) ) { char *xx=NULL;*xx=0; } // make the key like we do in hashWords() key144_t k; g_posdb.makeKey ( &k , final, 0LL, // docid 0, // dist MAXDENSITYRANK, // density rank MAXDIVERSITYRANK, // diversity rank MAXWORDSPAMRANK, // wordspamrank 0, // siterank hi->m_hashGroup, // we set to docLang in final hash loop langUnknown,// langid 0, // multiplier 0, // syn? false , // delkey? hi->m_shardByTermId ); // // HACK: mangle the key if its a gbsitehash:xxxx term // used for doing "facets" like stuff on section xpaths. // // no longer do this because we just hash the term // gbxpathsitehash1234567 where 1234567 is that hash. // but // //static int64_t s_gbsectionhash = 0LL; //if ( ! s_gbsectionhash ) s_gbsectionhash = hash64b("gbsectionhash"); //if ( prefixHash == s_gbsectionhash ) // g_posdb.setSectionSentHash32 ( &k, hi->m_sentHash32 ); // . 
otherwise, add a new slot // . key should NEVER collide since we are always // incrementing the distance cursor, m_dist if ( ! dt->addTerm144 ( &k ) ) return false; // add to wts for PageParser.cpp display if ( m_wts && ! storeTerm ( s,slen,final,hi, 0, // wordnum 0, // wordPos, MAXDENSITYRANK, MAXDIVERSITYRANK, MAXWORDSPAMRANK, hi->m_hashGroup, //false, &m_wbuf, m_wts, SOURCE_NONE, // synsrc langUnknown, k) ) return false; return true; } bool XmlDoc::hashString ( char *s, HashInfo *hi ) { return hashString ( s , gbstrlen(s), hi ); } bool XmlDoc::hashString ( char *s , int32_t slen , HashInfo *hi ) { if ( ! m_versionValid ) { char *xx=NULL;*xx=0; } if ( hi->m_useCountTable && ! m_countTableValid){char *xx=NULL;*xx=0; } if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; } int32_t *sni = getSiteNumInlinks(); return hashString3( s , slen , hi , &m_countTable , m_pbuf , m_wts , &m_wbuf , m_version , *sni , m_niceness ); } bool XmlDoc::hashString3( char *s , int32_t slen , HashInfo *hi , HashTableX *countTable , SafeBuf *pbuf , HashTableX *wts , SafeBuf *wbuf , int32_t version , int32_t siteNumInlinks , int32_t niceness ) { Words words; Bits bits; Phrases phrases; //Weights weights; //Synonyms synonyms; if ( ! words.set ( s , slen , version , true , niceness ) ) return false; if ( ! bits.set ( &words , version , niceness ) ) return false; if ( ! phrases.set(&words,&bits,true,false,version,niceness ) ) return false; // use primary langid of doc if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; } // words //SafeBuf myLangVec; //if ( ! setLangVec ( &words , &myLangVec , m_niceness ) ) // return false; //char *langVec = (char *)myLangVec.getBufStart(); /* // debugBuf for synonyms? yes if we are debugging SafeBuf synDebugBuf; SafeBuf *sdbp = NULL; if ( pbuf || m_storeTermListInfo ) sdbp = &synDebugBuf; // now we can set it... if ( hi->m_useSynonyms && !synonyms.set(&words, NULL, // langVec, m_langId, &phrases, niceness, sdbp)) return false; */ // set weights because of count table //if ( countTable && ! weights.set ( &words , /* if ( hi->m_useWeights && ! weights.set ( &words , &phrases , &bits , NULL , pbuf , false , false , version , 100 , // titleWeight 100 , // headerWeight countTable , false , // isLinkText false , // isCntTable? siteNumInlinks , niceness ) ) return false; Weights *wp = &weights; if ( ! hi->m_useWeights ) wp = NULL; */ //Synonyms *sp = NULL; //if ( hi->m_useSynonyms ) sp = &synonyms; return hashWords3 ( //0 , //words.getNumWords() , hi , &words , &phrases , NULL,//sp , synonyms NULL , // sections countTable , NULL , // fragvec NULL , // wordspamvec NULL , // langvec langUnknown , // default langid doclangid pbuf , wts , wbuf , niceness ); } bool XmlDoc::hashWords ( //int32_t wordStart , //int32_t wordEnd , HashInfo *hi ) { // sanity checks if ( ! m_wordsValid ) { char *xx=NULL; *xx=0; } if ( ! m_phrasesValid ) { char *xx=NULL; *xx=0; } if ( hi->m_useCountTable &&!m_countTableValid){char *xx=NULL; *xx=0; } if ( ! m_bitsValid ) { char *xx=NULL; *xx=0; } if ( ! m_sectionsValid) { char *xx=NULL; *xx=0; } //if ( ! m_synonymsValid) { char *xx=NULL; *xx=0; } if ( ! m_fragBufValid ) { char *xx=NULL; *xx=0; } if ( ! m_wordSpamBufValid ) { char *xx=NULL; *xx=0; } if ( m_wts && ! m_langVectorValid ) { char *xx=NULL; *xx=0; } if ( ! m_langIdValid ) { char *xx=NULL; *xx=0; } // . is the word repeated in a pattern? // . this should only be used for document body, for meta tags, // inlink text, etc. 
we should make sure words are unique char *wordSpamVec = getWordSpamVec(); char *fragVec = m_fragBuf.getBufStart(); char *langVec = m_langVec.getBufStart(); return hashWords3( //wordStart , //wordEnd , hi , &m_words , &m_phrases , NULL,//&m_synonyms , &m_sections , &m_countTable , fragVec , wordSpamVec , langVec , m_langId , // defaultLangId docLangId m_pbuf , m_wts , &m_wbuf , m_niceness ); } // . this now uses posdb exclusively bool XmlDoc::hashWords3 ( //int32_t wordStart , //int32_t wordEnd , HashInfo *hi , Words *words , Phrases *phrases , Synonyms *synonyms , Sections *sectionsArg , HashTableX *countTable , char *fragVec , char *wordSpamVec , char *langVec , char docLangId , // default lang id //Weights *weights , SafeBuf *pbuf , HashTableX *wts , SafeBuf *wbuf , int32_t niceness ) { // // POSDB HACK: temporarily turn off posdb until we hit 1B pages! // //if ( ! m_storeTermListInfo ) // return true; Sections *sections = sectionsArg; // for getSpiderStatusDocMetaList() we don't use sections it'll // mess us up if ( ! hi->m_useSections ) sections = NULL; // int16_tcuts uint64_t *wids = (uint64_t *)words->getWordIds(); //nodeid_t *tids = words->m_tagIds; uint64_t *pids2 = (uint64_t *)phrases->m_phraseIds2; //uint64_t *pids3 = (uint64_t *)phrases->m_phraseIds3; HashTableX *dt = hi->m_tt; // . sanity checks // . posdb just uses the full keys with docid if ( dt->m_ks != 18 ) { char *xx=NULL;*xx=0; } if ( dt->m_ds != 4 ) { char *xx=NULL;*xx=0; } // if provided... if ( wts ) { if ( wts->m_ks != 12 ) { char *xx=NULL;*xx=0; } if ( wts->m_ds != sizeof(TermDebugInfo)){char *xx=NULL;*xx=0; } if ( ! wts->m_allowDups ) { char *xx=NULL;*xx=0; } } // ensure caller set the hashGroup if ( hi->m_hashGroup < 0 ) { char *xx=NULL;*xx=0; } // handy char **wptrs = words->getWordPtrs(); int32_t *wlens = words->getWordLens(); // hash in the prefix uint64_t prefixHash = 0LL; int32_t plen = 0; if ( hi->m_prefix ) plen = gbstrlen ( hi->m_prefix ); if ( hi->m_prefix && plen ) { // we gotta make this case insensitive, and skip spaces // because if it is 'focal length' we can't search // 'focal length:10' because that comes across as TWO terms. prefixHash = hash64Lower_utf8_nospaces ( hi->m_prefix , plen ); // . sanity test, make sure it is in supported list // . hashing diffbot json output of course fails this so // skip in that case if diffbot //if ( ! m_isDiffbotJSONObject && // getFieldCode3 ( prefixHash ) == FIELD_GENERIC ) { // if (hi->m_desc&&strcmp(hi->m_desc,"custom meta tag")) { // char *xx=NULL;*xx=0; } //} } bool hashIffUnique = false; //if ( hi->m_hashGroup == HASHGROUP_INLINKTEXT ) hashIffUnique = true; if ( hi->m_hashGroup == HASHGROUP_INMETATAG ) hashIffUnique = true; if ( hi->m_hashGroup == HASHGROUP_INTAG ) hashIffUnique = true; HashTableX ut; ut.set ( 8,0,0,NULL,0,false,niceness,"uqtbl"); /////// // // diversity rank vector. // /////// // the final diversity which is a multiplier // is converted into a rank from 0-15 i guess. // so 'mexico' in "new mexico" should receive a low word score but high // phrase score. thus, a search for 'mexico' should not bring up // the page for university of new mexico! SafeBuf dwbuf; if(!getDiversityVec ( words,phrases,countTable,&dwbuf,niceness)) return false; char *wdv = dwbuf.getBufStart(); int32_t nw = words->getNumWords(); ///// // // calculate density ranks // ///// // // this now varies depending on the length of the sentence/header etc. // so if the hasgroup is not title, link text or meta tag, we have to // use a safebuf. 
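// NOTE: rough intuition for the buffer filled in below: getDensityRanks()
// appears to leave one 0..MAXDENSITYRANK byte per word, scaled by the size of
// the containing sentence/header, so a word in a short heading ranks denser
// than the same word inside a very long sentence; the title and inlink-text
// hash paths pass fixed ranks instead of using this safebuf.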
SafeBuf densBuf; // returns false and sets g_errno on error if ( ! getDensityRanks((int64_t *)wids, nw,//wordStart, //wordEnd, hi->m_hashGroup, &densBuf, sections, m_niceness)) return false; // a handy ptr char *densvec = (char *)densBuf.getBufStart(); //////////// // // get word positions // /////////// Section **sp = NULL; if ( sections ) sp = sections->m_sectionPtrs; SafeBuf wpos; if ( ! getWordPosVec ( words , sections, //wordStart, //wordEnd, m_dist, // hi->m_startDist, fragVec, niceness, &wpos) ) return false; // a handy ptr int32_t *wposvec = (int32_t *)wpos.getBufStart(); /* // show that for debug if ( m_docId == 192304365235LL ) { for ( int32_t i = 0 ; i < nw ; i++ ) { char buf[1000]; int32_t len = wlens[i]; if ( len > 900 ) len = 900; gbmemcpy(buf,wptrs[i],len); buf[len]='\0'; log("seopipe: wptr=%s pos[%"INT32"]=%"INT32"",buf,i,wposvec[i]); } } */ //int32_t wc = 0; //int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT; int32_t i; for ( i = 0 ; i < nw ; i++ ) { // breathe QUICKPOLL(niceness); if ( ! wids[i] ) continue; // ignore if in repeated fragment if ( fragVec && i<MAXFRAGWORDS && fragVec[i] == 0 ) continue; // ignore if in style section if ( sp && (sp[i]->m_flags & NOINDEXFLAGS) ) continue; // do not breach wordpos bits if ( wposvec[i] > MAXWORDPOS ) break; // . hash the startHash with the wordId for this word // . we must mask it before adding it to the table because // this table is also used to hash IndexLists into that come // from LinkInfo classes (incoming link text). And when // those IndexLists are hashed they used masked termIds. // So we should too... //uint64_t h = g_indexdb.getTermId ( startHash , wids[i] ) ; uint64_t h ; if ( plen > 0 ) h = hash64 ( wids[i] , prefixHash ); else h = wids[i]; // . get word spam rank. 0 means not spammed // . just mod Weights class to ues a weight rank... // . and diversity rank // . need to separate weights by spam vs. diversity. // . maybe just have a diversity class and a pattern class // and leave the poor weights class alone //int32_t wsr = 0; int32_t hashGroup = hi->m_hashGroup; Section *sx = NULL; if ( sp ) { sx = sp[i]; // . this is taken care of in hashTitle() // . it is slightly different if the title is // multiple sentences because when hashing the // body the density rank is per sentence, but in // hashTitle we count all the words in the title // towards the density rank even if they are // in different sentences if ( sx->m_flags & SEC_IN_TITLE ) //hashGroup = HASHGROUP_TITLE; continue; if ( sx->m_flags & SEC_IN_HEADER ) hashGroup = HASHGROUP_HEADING; if ( sx->m_flags & ( SEC_MENU | SEC_MENU_SENTENCE | SEC_MENU_HEADER ) ) hashGroup = HASHGROUP_INMENU; } // this is for link text and meta tags mostly if ( hashIffUnique ) { // skip if already did it if ( ut.isInTable ( &h ) ) continue; if ( ! ut.addKey ( &h ) ) return false; } char ws = 15; if ( wordSpamVec ) ws = wordSpamVec[i]; // HACK: // if this is inlink text, use the wordspamrank to hold the // inlinker's site rank! if ( hashGroup == HASHGROUP_INLINKTEXT ) ws = hi->m_linkerSiteRank; // default to the document's primary language if it is not // clear what language this word belongs to. // if the word is only in german it should be german, // otherwise it will be the document's primary language. char langId = langUnknown; if ( m_wts && langVec ) langId = langVec[i]; // keep it as the original vector. 
i'm not sure we use // this for anything but for display, so show the user // how we made our calculation of the document's primary lang //if ( langId == langUnknown ) langId = docLangId; char wd; if ( hi->m_useCountTable ) wd = wdv[i]; else wd = MAXDIVERSITYRANK; // if using posdb key144_t k; g_posdb.makeKey ( &k , h , 0LL,//docid wposvec[i], // dist, densvec[i],// densityRank , // 0-15 wd, // diversityRank 0-15 ws, // wordSpamRank 0-15 0, // siterank hashGroup , // we set to docLang final hash loop langUnknown, // langid 0 , // multiplier false , // syn? false , // delkey? hi->m_shardByTermId ); // key should NEVER collide since we are always incrementing // the distance cursor, m_dist dt->addTerm144 ( &k ); // . make the m_wordPosInfoBuf here because we need to set // WordPosInfo::m_wordPtr/m_wordLen. // . could also use instead of the "wts" buffer? if ( m_doingSEO ) { // alloc in 10k chunks if ( m_wordPosInfoBuf.getAvail() < (int32_t)sizeof(WordPosInfo) ) { int32_t newSize = m_wordPosInfoBuf.length(); newSize += 10000; if ( ! m_wordPosInfoBuf.reserve ( newSize ) ) return false; } // make it WordPosInfo wi; wi.m_wordPtr = wptrs[i]; wi.m_wordLen = wlens[i]; wi.m_wordPos = wposvec[i]; wi.m_densityRank = densvec[i]; wi.m_wordSpamRank = ws; wi.m_diversityRank = wd;//v[i]; wi.m_hashGroup = hashGroup; wi.m_trafficGain = 0; int32_t cs = sizeof(WordPosInfo); if(!m_wordPosInfoBuf.safeMemcpy(&wi,cs)) return false; } // add to wts for PageParser.cpp display if ( wts ) { if ( ! storeTerm ( wptrs[i],wlens[i],h,hi,i, wposvec[i], // wordPos densvec[i],// densityRank , // 0-15 wd,//v[i], ws, hashGroup, //false, // is phrase? wbuf, wts, SOURCE_NONE, // synsrc langId , k)) return false; } // // STRIP POSSESSIVE WORDS for indexing // // . for now do simple stripping here // . if word is "bob's" hash "bob" // if ( wlens[i] >= 3 && wptrs[i][wlens[i]-2] == '\'' && to_lower_a(wptrs[i][wlens[i]-1]) == 's' ) { int64_t nah ; nah = hash64Lower_utf8 ( wptrs[i], wlens[i]-2 ); if ( plen>0 ) nah = hash64 ( nah , prefixHash ); g_posdb.makeKey ( &k , nah, 0LL,//docid wposvec[i], // dist, densvec[i],// densityRank , // 0-15 wd,//v[i], // diversityRank , ws, // wordSpamRank , 0, //siterank hashGroup, // we set to docLang final hash loop langUnknown, // langid 0 , // multiplier true , // syn? false , // delkey? hi->m_shardByTermId ); // key should NEVER collide since we are always // incrementing the distance cursor, m_dist dt->addTerm144 ( &k ); // keep going if not debug if ( ! wts ) continue; // print the synonym if ( ! storeTerm(wptrs[i], // synWord, wlens[i] -2, // gbstrlen(synWord), nah, // termid hi, i, // wordnum wposvec[i], // wordPos densvec[i],// densityRank , // 0-15 wd,//v[i], ws, hashGroup, //false, // is phrase? wbuf, wts, SOURCE_GENERATED, langId, k) ) return false; } ///////////// // // synonyms (alt words,morphs,synonyms) // ///////////// /* int64_t *aids = NULL; int16_t naids = 0; int64_t syh; if ( synonyms ) { aids = synonyms->getAltIds (i); naids = synonyms->getNumAlts(i); //ascore = saved / 4; //if ( ascore <= 0 ) ascore = 1; //asaved = ascore; } for ( int32_t j = 0 ; j < naids ; j++ ) { // skip if same as original if ( (uint64_t)aids[j] == wids[i] ) continue; // . hash it with the prefix if any // . fixes gbwhere:galleries bug... 
if ( plen>0 ) syh = hash64 ( aids[j] , prefixHash ); else syh = aids[j]; g_posdb.makeKey ( &k , syh , 0LL,//docid wposvec[i], // dist, densvec[i],// densityRank , // 0-15 wdv[i], // diversityRank , ws, // wordSpamRank , 0, //siterank hashGroup, // we set to docLang final hash loop langUnknown, // langid 0 , // multiplier true , // syn? false ); // delkey? // key should NEVER collide since we are always // incrementing the distance cursor, m_dist dt->addTerm144 ( &k ); // keep going if not debug if ( ! wts ) continue; // get the junk char *synWord = synonyms->getStringFromId(&aids[j]); // sanity if ( ! synWord ) { char *xx=NULL;*xx=0; } // print the synonym if ( ! storeTerm(synWord, gbstrlen(synWord), syh, // termid hi, i, // wordnum wposvec[i], // wordPos densvec[i],// densityRank , // 0-15 wdv[i], ws, hashGroup, //false, // is phrase? wbuf, wts, synonyms->m_source[i], // synsrc langId) ) return false; } */ //////// // // two-word phrase // //////// int64_t npid = pids2[i]; int32_t npw = 2; uint64_t ph2 = 0; // repeat for the two word hash if different! if ( npid ) { // hash with prefix if ( plen > 0 ) ph2 = hash64 ( npid , prefixHash ); else ph2 = npid; g_posdb.makeKey ( &k , ph2 , 0LL,//docid wposvec[i],//dist, densvec[i],// densityRank , // 0-15 MAXDIVERSITYRANK, //phrase ws, // wordSpamRank , 0,//siterank hashGroup, // we set to docLang final hash loop langUnknown, // langid 0 , // multiplier true , // syn? false , // delkey? hi->m_shardByTermId ); // key should NEVER collide since we are always // incrementing the distance cursor, m_dist dt->addTerm144 ( &k ); } // add to wts for PageParser.cpp display if ( wts && npid ) { // get phrase as a string int32_t plen; char *phr=phrases->getPhrase(i,&plen,npw); // store it if ( ! storeTerm ( phr,plen,ph2,hi,i, wposvec[i], // wordPos densvec[i],// densityRank , // 0-15 MAXDIVERSITYRANK,//phrase ws, hashGroup, //true, wbuf, wts, SOURCE_BIGRAM, // synsrc langId, k) ) return false; } //////// // // three-word phrase // //////// /* npid = pids3[i]; npw = 3; // repeat for the two word hash if different! if ( npid ) { // hash with prefix uint64_t ph2 ; if ( plen > 0 ) ph2 = hash64 ( npid , prefixHash ); else ph2 = npid; g_posdb.makeKey ( &k , ph2 , 0LL,//docid wposvec[i],//dist, densvec[i],// densityRank , // 0-15 MAXDIVERSITYRANK, //phrase ws, // wordSpamRank , 0,//siterank hashGroup, // we set to docLang final hash loop langUnknown, // langid 0 , // multiplier true , // syn? false ); // delkey? // key should NEVER collide since we are always // incrementing the distance cursor, m_dist dt->addTerm144 ( &k ); } // add to wts for PageParser.cpp display if ( wts && npid ) { // get phrase as a string int32_t plen; char *phr=phrases->getPhrase(i,&plen,npw); // store it if ( ! storeTerm ( phr,plen,ph2,hi,i, wposvec[i], // wordpos densvec[i],// densityRank , // 0-15 MAXDIVERSITYRANK,//phrase ws, hashGroup, //true, // is phrase? wbuf, wts, SOURCE_TRIGRAM, // synsrc langId ) ) return false; } */ // update for hashIncomingLinkText() //hi->m_startDist = wposvec[i]; // debug point //if ( ph2 == (uint64_t)-233869093807964777LL ) { // log("hey slot=%"INT32" date=%"UINT32" n0=%"INT64" score=%"INT32"", // slot, // k.n1,k.n0, // score); // //char *xx=NULL;*xx=0; //} // // NUMERIC SORTING AND RANGES // // only store numbers in fields this way if ( prefixHash == 0 ) continue; // this may or may not be numeric. if ( ! is_digit ( wptrs[i][0] ) ) continue; // this might have to "back up" before any '.' or '-' symbols if ( ! 
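// Illustrative sketch: the single-word term, the bigram term above and the
// numeric sort terms hashed just below all fold the field prefix into the
// termid the same way, which is what lets something like "focal length:10"
// resolve to one field term. This mirrors the hash64Lower_utf8_nospaces() and
// hash64() calls made earlier in this function; it is kept inside "#if 0" so
// it plays no part in the loop.
#if 0
static uint64_t sketchPrefixedTermId ( uint64_t wordOrPhraseId , char *prefix ) {
        // no prefix: the bare word/phrase id is the termid
        if ( ! prefix || ! prefix[0] ) return wordOrPhraseId;
        // prefix hash is case-insensitive and space-free, like above
        uint64_t ph = hash64Lower_utf8_nospaces ( prefix , gbstrlen(prefix) );
        // fold the word/phrase id into the prefix hash
        return hash64 ( wordOrPhraseId , ph );
}
#endif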
hashNumber ( wptrs[0] , wptrs[i] , wlens[i] , hi ) ) return false; } // hash a single term so they can do gbfacet:ext or // gbfacet:siterank or gbfacet:price. a field on a field. if ( prefixHash && words->m_numWords ) // hash gbfacet:price with and store the price in the key hashFacet1 ( hi->m_prefix, words ,hi->m_tt);//, hi ); // between calls? i.e. hashTitle() and hashBody() //if ( wc > 0 ) m_dist = wposvec[wc-1] + 100; if ( i > 0 ) m_dist = wposvec[i-1] + 100; return true; } // just like hashNumber*() functions but we use "gbfacet" as the // primary prefix, NOT gbminint, gbmin, gbmax, gbmaxint, gbsortby, // gbsortbyint, gbrevsortby, gbrevsortbyint bool XmlDoc::hashFacet1 ( char *term , Words *words , HashTableX *tt ) { // need a prefix //if ( ! hi->m_prefix ) return true; // hash the ENTIRE content, all words as one blob int32_t nw = words->getNumWords(); char *a = words->m_words[0]; char *b = words->m_words[nw-1]+words->m_wordLens[nw-1]; // hash the whole string as one value, the value of the facet int32_t val32 = hash32 ( a , b - a ); if ( ! hashFacet2 ( "gbfacetstr",term, val32 , tt ) ) return false; // // why do this if we already do it for hashNumber() using gbsortby: ? // /* // if it's a number hash as float and int if ( nw != 1 ) return true; char **wptrs = words->m_words; if ( ! is_digit ( wptrs[0][0] ) ) return true; // hash with a float val float f = atof(wptrs[0]); int32_t vf32 = *(int32_t *)&f; if ( ! hashFacet2 ( "gbfacetfloat",term, vf32 , tt ) ) return false; // and an int val int32_t vi32 = atoi(wptrs[0]); if ( ! hashFacet2 ( "gbfacetint",term, vi32 , tt ) ) return false; */ return true; } bool XmlDoc::hashFacet2 ( char *prefix, char *term , int32_t val32 , HashTableX *tt , // we only use this for gbxpathsitehash terms: bool shardByTermId ) { // need a prefix //if ( ! hi->m_prefix ) return true; //int32_t plen = gbstrlen ( hi->m_prefix ); //if ( plen <= 0 ) return true; // we gotta make this case insensitive, and skip spaces // because if it is 'focal length' we can't search // 'focal length:10' because that comes across as TWO terms. //int64_t prefixHash =hash64Lower_utf8_nospaces ( hi->m_prefix,plen); // now any field has to support gbfacet:thatfield // and store the 32-bit termid into where we normally put // the word position bits, etc. //static int64_t s_facetPrefixHash = 0LL; //if ( ! s_facetPrefixHash ) // s_facetPrefixHash = hash64n ( "gbfacet" ); // this is case-sensitive int64_t prefixHash = hash64n ( prefix ); // term is like something like "object.price" or whatever. // it is the json field itself, or the meta tag name, etc. int64_t termId64 = hash64n ( term ); // combine with the "gbfacet" prefix. old prefix hash on right. // like "price" on right and "gbfacetfloat" on left... see Query.cpp. int64_t ph2 = hash64 ( termId64, prefixHash ); // . now store it // . use field hash as the termid. normally this would just be // a prefix hash // . use mostly fake value otherwise key144_t k; g_posdb.makeKey ( &k , ph2 , 0,//docid 0,// word pos # 0,// densityRank , // 0-15 0 , // MAXDIVERSITYRANK 0 , // wordSpamRank , 0 , //siterank 0 , // hashGroup, // we set to docLang final hash loop //langUnknown, // langid // unless already set. so set to english here // so it will not be set to something else // otherwise our floats would be ordered by langid! // somehow we have to indicate that this is a float // termlist so it will not be mangled any more. //langEnglish, langUnknown, 0 , // multiplier false, // syn? false , // delkey? 
shardByTermId ); //int64_t final = hash64n("products.offerprice",0); //int64_t prefix = hash64n("gbsortby",0); //int64_t h64 = hash64 ( final , prefix); //if ( ph2 == h64 ) // log("hey: got offer price"); // now set the float in that key g_posdb.setInt ( &k , val32 ); // HACK: this bit is ALWAYS set by Posdb::makeKey() to 1 // so that we can b-step into a posdb list and make sure // we are aligned on a 6 byte or 12 byte key, since they come // in both sizes. but for this, hack it off to tell // addTable144() that we are a special posdb key, a "numeric" // key that has a float stored in it. then it will NOT // set the siterank and langid bits which throw our sorting // off!! g_posdb.setAlignmentBit ( &k , 0 ); HashTableX *dt = tt;//hi->m_tt; // the key may indeed collide, but that's ok for this application if ( ! dt->addTerm144 ( &k ) ) return false; if ( ! m_wts ) return true; bool isFloat = false; if ( strcmp(prefix,"gbfacetfloat")==0 ) isFloat = true; // store in buffer for display on pageparser.cpp output char buf[128]; int32_t bufLen; if ( isFloat ) bufLen=sprintf(buf,"facetField=%s facetVal32=%f",term, *(float *)&val32); else bufLen=sprintf(buf,"facetField=%s facetVal32=%"UINT32"", term,(uint32_t)val32); // make a special hashinfo for this facet HashInfo hi; hi.m_tt = tt; // the full prefix char fullPrefix[64]; snprintf(fullPrefix,64,"%s:%s",prefix,term); hi.m_prefix = fullPrefix;//"gbfacet"; // add to wts for PageParser.cpp display // store it if ( ! storeTerm ( buf, bufLen, ph2, // prefixHash, // s_facetPrefixHash, &hi, 0, // word#, i, 0, // wordPos 0,// densityRank , // 0-15 0, // MAXDIVERSITYRANK,//phrase 0, // ws, 0, // hashGroup, //true, &m_wbuf, m_wts, // a hack for display in wts: SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc langUnknown , k) ) return false; return true; } bool XmlDoc::hashFieldMatchTerm ( char *val , int32_t vlen , HashInfo *hi ) { HashTableX *tt = hi->m_tt; uint64_t val64 = hash64 ( val , vlen ); // term is like something like "object.price" or whatever. // it is the json field itself, or the meta tag name, etc. uint64_t middlePrefix = hash64n ( hi->m_prefix ); // hash "This is a new product." with "object.desc". // "object.desc" (termId64) is case-sensitive. uint64_t composite = hash64 ( val64 , middlePrefix ); // hash that with "gbfieldmatch" char *prefix = "gbfieldmatch"; uint64_t prefixHash = hash64n ( prefix ); uint64_t ph2 = hash64 ( composite , prefixHash ); // . now store it // . use field hash as the termid. normally this would just be // a prefix hash // . use mostly fake value otherwise key144_t k; g_posdb.makeKey ( &k , ph2 , 0,//docid 0,// word pos # 0,// densityRank , // 0-15 0 , // MAXDIVERSITYRANK 0 , // wordSpamRank , 0 , //siterank 0 , // hashGroup, // we set to docLang final hash loop //langUnknown, // langid // unless already set. so set to english here // so it will not be set to something else // otherwise our floats would be ordered by langid! // somehow we have to indicate that this is a float // termlist so it will not be mangled any more. //langEnglish, langUnknown, 0 , // multiplier false, // syn? false , // delkey? false ) ; // shardByTermId? no, by docid. HashTableX *dt = tt;//hi->m_tt; // the key may indeed collide, but that's ok for this application if ( ! dt->addTerm144 ( &k ) ) return false; if ( ! 
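// Illustrative sketch of the hash chain this function (hashFieldMatchTerm())
// just used to build the key above: the exact-value hash is folded into the
// case-sensitive field-name hash, and that composite is folded into the
// "gbfieldmatch" prefix hash. The field/value pair in the usage comment is
// the same example mentioned above; kept inside "#if 0" so it is not compiled.
#if 0
static uint64_t sketchFieldMatchTermId ( char *field , char *val ) {
        uint64_t val64        = hash64  ( val , gbstrlen(val) ); // whole value
        uint64_t middlePrefix = hash64n ( field );            // e.g. "object.desc"
        uint64_t composite    = hash64  ( val64 , middlePrefix );
        uint64_t prefixHash   = hash64n ( "gbfieldmatch" );
        return hash64 ( composite , prefixHash );
}
// usage: sketchFieldMatchTermId ( "object.desc" , "This is a new product." );
#endif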
m_wts ) return true;
        // store in buffer for display on pageparser.cpp output
        char buf[128];
        int32_t bufLen ;
        bufLen = sprintf(buf,"gbfieldmatch:%s:%"UINT64"",hi->m_prefix,val64);
        // make a special hashinfo for this facet
        HashInfo hi2;
        hi2.m_tt = tt;
        // the full prefix
        char fullPrefix[64];
        snprintf(fullPrefix,64,"%s:%s",prefix,hi->m_prefix);
        hi2.m_prefix = fullPrefix;//"gbfacet";
        // add to wts for PageParser.cpp display
        // store it
        if ( ! storeTerm ( buf,
                           bufLen,
                           ph2, // prefixHash, // s_facetPrefixHash,
                           &hi2,
                           0, // word#, i,
                           0, // wordPos
                           0, // densityRank , // 0-15
                           0, // MAXDIVERSITYRANK,//phrase
                           0, // ws,
                           0, // hashGroup,
                           //true,
                           &m_wbuf,
                           m_wts,
                           // a hack for display in wts:
                           SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
                           langUnknown ,
                           k) )
                return false;
        return true;
}

// . we store numbers as floats in the top 4 bytes of the lower 6 bytes of
//   the posdb key
// . the termid is the hash of the preceding field
// . in json docs a field is like "object.details.price"
// . in meta tags it is just the meta tag name
// . credit card numbers are 16 digits. we'd need like 58 bits to store those
//   so we can't do that here, but we can approximate as a float
// . the binary representation of floating point numbers is ordered in the
//   same order as the floating points themselves! so we are lucky and can
//   keep our usual KEYCMP sorting algos to keep the floats in order.
bool XmlDoc::hashNumber ( char *beginBuf ,
                          char *buf ,
                          int32_t bufLen ,
                          HashInfo *hi ) {
        if ( ! is_digit(buf[0]) ) return true;
        char *p = buf;
        char *bufEnd = buf + bufLen;
        // back-up over any .
        if ( p > beginBuf && p[-1] == '.' ) p--;
        // negative sign?
        if ( p > beginBuf && p[-1] == '-' ) p--;
        // . convert it to a float
        // . this now allows for commas in numbers like "1,500.62"
        float f = atof2 ( p , bufEnd - p );
        // debug
        //log("build: hashing %s %f",hi->m_prefix,f);
        if ( ! hashNumber2 ( f , hi , "gbsortby" ) ) return false;
        // also hash in reverse order for sorting from low to high
        f = -1.0 * f;
        if ( ! hashNumber2 ( f , hi , "gbrevsortby" ) ) return false;
        //
        // also hash as an int, a 4-byte integer, so our lastSpidered
        // timestamps don't lose 128 seconds of resolution
        //
        int32_t i = (int32_t) atoll2 ( p , bufEnd - p );
        if ( ! hashNumber3 ( i , hi , "gbsortbyint" ) ) return false;
        // also hash in reverse order for sorting from low to high
        i = -1 * i;
        if ( ! hashNumber3 ( i , hi , "gbrevsortbyint" ) ) return false;
        return true;
}

// . THIS IS NOW replaced by ::hashFacet2() being called by hashSections()
//   above. it is a more generic, faceted approach.
// . the term is gbxpathsite123456, the prefix is gbfacet, the val32
//   stored in the posdb key is the inner html hash of the section, and
//   the "123456" is the hash of the xpath and site. so the field names
//   are very custom, not your typical "ext" or "title"
// . CHROME DETECTION
// . hash a special "gbxpathsitehash12345678" term which has the hash of the
//   innerHTML content embedded in it.
// . we do this for doing gbfacetstr:gbxpathsitehash12345678 etc. on every
//   section with innerHTML so we can figure out the histogram of each
//   section on this page relative to its subdomain. like the distribution
//   of the innerHTML for this section as it appears on other pages from
//   this site. this allows killer CHROME DETECTION!!!!
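// A small illustrative check of the property the comment above
// XmlDoc::hashNumber() relies on: for non-negative IEEE-754 floats the raw
// 32-bit patterns compare in the same order as the float values themselves,
// so a float dropped into the key bytes still sorts correctly under the usual
// key compare. hashNumber() additionally indexes -1.0*f under "gbrevsortby"
// so the opposite sort order is available too. This sketch is illustrative
// only and is kept inside "#if 0".
#if 0
static bool sketchFloatBitsPreserveOrder ( ) {
        float a = 1.5;     // e.g. a small price
        float b = 1500.62; // a bigger price
        // same pointer-cast trick used elsewhere in this file
        int32_t ai = *(int32_t *)&a;
        int32_t bi = *(int32_t *)&b;
        // for non-negative floats: ( a < b ) if and only if ( ai < bi )
        return ( a < b ) == ( ai < bi ); // evaluates to true
}
#endif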
/* bool XmlDoc::hashSectionTerm ( char *term , HashInfo *hi , int32_t sentHash32 ) { int64_t termId = hash64 ( term , gbstrlen(term) ); key144_t k; g_posdb.makeKey ( &k , termId, 0,//docid 0,// word pos # 0,// densityRank , // 0-15 0 , // MAXDIVERSITYRANK 0 , // wordSpamRank , 0 , //siterank 0 , // hashGroup, // we set to docLang final hash loop //langUnknown, // langid // unless already set. so set to english here // so it will not be set to something else // otherwise our floats would be ordered by langid! // somehow we have to indicate that this is a float // termlist so it will not be mangled any more. //langEnglish, langUnknown, 0 , // multiplier false, // syn? false , // delkey? hi->m_shardByTermId ); //int64_t final = hash64n("products.offerprice",0); //int64_t prefix = hash64n("gbsortby",0); //int64_t h64 = hash64 ( final , prefix); //if ( ph2 == h64 ) // log("hey: got offer price"); // now set the float in that key g_posdb.setInt ( &k , sentHash32 ); // HACK: this bit is ALWAYS set by Posdb::makeKey() to 1 // so that we can b-step into a posdb list and make sure // we are aligned on a 6 byte or 12 byte key, since they come // in both sizes. but for this, hack it off to tell // addTable144() that we are a special posdb key, a "numeric" // key that has a float stored in it. then it will NOT // set the siterank and langid bits which throw our sorting // off!! g_posdb.setAlignmentBit ( &k , 0 ); // sanity int t = g_posdb.getInt ( &k ); if ( t != sentHash32 ) { char *xx=NULL;*xx=0; } HashTableX *dt = hi->m_tt; // the key may indeed collide, but that's ok for this application if ( ! dt->addTerm144 ( &k ) ) return false; if ( ! m_wts ) return true; // store in buffer //char buf[128]; //int32_t bufLen = sprintf(buf,"%"UINT32"",sentHash32); // if no gbmin or gbmax or gbsorty or gbrevsortby we need gbfacet //int64_t truePrefix64 = hash64n ( "gbfacet" ); // add to wts for PageParser.cpp display // store it if ( ! storeTerm ( term,//buf, gbstrlen(term),//bufLen, 0LL,//truePrefix64, hi, 0, // word#, i, 0, // wordPos 0,// densityRank , // 0-15 0, // MAXDIVERSITYRANK,//phrase 0, // ws, 0, // hashGroup, //true, &m_wbuf, m_wts, // a hack for display in wts: SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc langUnknown , k)) return false; return true; } */ bool XmlDoc::hashNumber2 ( float f , HashInfo *hi , char *sortByStr ) { // prefix is something like price. like the meta "name" or // the json name with dots in it like "product.info.price" or something int64_t nameHash = 0LL; int32_t nameLen = 0; if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix ); if ( hi->m_prefix && nameLen ) nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen ); // need a prefix for hashing numbers... for now else { char *xx=NULL; *xx=0; } // combine prefix hash with a special hash to make it unique to avoid // collisions. this is the "TRUE" prefix. int64_t truePrefix64 = hash64n ( sortByStr ); // "gbsortby"); // hash with the "TRUE" prefix int64_t ph2 = hash64 ( nameHash , truePrefix64 ); // . now store it // . use field hash as the termid. normally this would just be // a prefix hash // . use mostly fake value otherwise key144_t k; g_posdb.makeKey ( &k , ph2 , 0,//docid 0,// word pos # 0,// densityRank , // 0-15 0 , // MAXDIVERSITYRANK 0 , // wordSpamRank , 0 , //siterank 0 , // hashGroup, // we set to docLang final hash loop //langUnknown, // langid // unless already set. so set to english here // so it will not be set to something else // otherwise our floats would be ordered by langid! 
// somehow we have to indicate that this is a float // termlist so it will not be mangled any more. //langEnglish, langUnknown, 0 , // multiplier false, // syn? false , // delkey? hi->m_shardByTermId ); //int64_t final = hash64n("products.offerprice",0); //int64_t prefix = hash64n("gbsortby",0); //int64_t h64 = hash64 ( final , prefix); //if ( ph2 == h64 ) // log("hey: got offer price"); // now set the float in that key g_posdb.setFloat ( &k , f ); // HACK: this bit is ALWAYS set by Posdb::makeKey() to 1 // so that we can b-step into a posdb list and make sure // we are aligned on a 6 byte or 12 byte key, since they come // in both sizes. but for this, hack it off to tell // addTable144() that we are a special posdb key, a "numeric" // key that has a float stored in it. then it will NOT // set the siterank and langid bits which throw our sorting // off!! g_posdb.setAlignmentBit ( &k , 0 ); // sanity float t = g_posdb.getFloat ( &k ); if ( t != f ) { char *xx=NULL;*xx=0; } HashTableX *dt = hi->m_tt; // the key may indeed collide, but that's ok for this application if ( ! dt->addTerm144 ( &k ) ) return false; if ( ! m_wts ) return true; // store in buffer char buf[128]; int32_t bufLen = sprintf(buf,"%s:%s float32=%f",sortByStr,hi->m_prefix,f); // add to wts for PageParser.cpp display // store it if ( ! storeTerm ( buf, bufLen, truePrefix64, hi, 0, // word#, i, 0, // wordPos 0,// densityRank , // 0-15 0, // MAXDIVERSITYRANK,//phrase 0, // ws, 0, // hashGroup, //true, &m_wbuf, m_wts, // a hack for display in wts: SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc langUnknown , k) ) return false; return true; } bool XmlDoc::hashNumber3 ( int32_t n , HashInfo *hi , char *sortByStr ) { // prefix is something like price. like the meta "name" or // the json name with dots in it like "product.info.price" or something int64_t nameHash = 0LL; int32_t nameLen = 0; if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix ); if ( hi->m_prefix && nameLen ) nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen ); // need a prefix for hashing numbers... for now else { char *xx=NULL; *xx=0; } // combine prefix hash with a special hash to make it unique to avoid // collisions. this is the "TRUE" prefix. int64_t truePrefix64 = hash64n ( sortByStr ); // "gbsortby"); // hash with the "TRUE" prefix int64_t ph2 = hash64 ( nameHash , truePrefix64 ); // . now store it // . use field hash as the termid. normally this would just be // a prefix hash // . use mostly fake value otherwise key144_t k; g_posdb.makeKey ( &k , ph2 , 0,//docid 0,// word pos # 0,// densityRank , // 0-15 0 , // MAXDIVERSITYRANK 0 , // wordSpamRank , 0 , //siterank 0 , // hashGroup, // we set to docLang final hash loop //langUnknown, // langid // unless already set. so set to english here // so it will not be set to something else // otherwise our floats would be ordered by langid! // somehow we have to indicate that this is a float // termlist so it will not be mangled any more. //langEnglish, langUnknown, 0 , // multiplier false, // syn? false , // delkey? hi->m_shardByTermId ); //int64_t final = hash64n("products.offerprice",0); //int64_t prefix = hash64n("gbsortby",0); //int64_t h64 = hash64 ( final , prefix); //if ( ph2 == h64 ) // log("hey: got offer price"); // now set the float in that key //g_posdb.setFloat ( &k , f ); g_posdb.setInt ( &k , n ); // HACK: this bit is ALWAYS set by Posdb::makeKey() to 1 // so that we can b-step into a posdb list and make sure // we are aligned on a 6 byte or 12 byte key, since they come // in both sizes. 
but for this, hack it off to tell // addTable144() that we are a special posdb key, a "numeric" // key that has a float stored in it. then it will NOT // set the siterank and langid bits which throw our sorting // off!! g_posdb.setAlignmentBit ( &k , 0 ); // sanity //float t = g_posdb.getFloat ( &k ); int32_t x = g_posdb.getInt ( &k ); if ( x != n ) { char *xx=NULL;*xx=0; } HashTableX *dt = hi->m_tt; // the key may indeed collide, but that's ok for this application if ( ! dt->addTerm144 ( &k ) ) return false; if ( ! m_wts ) return true; // store in buffer char buf[128]; int32_t bufLen = sprintf(buf,"%s:%s int32=%"INT32"",sortByStr,hi->m_prefix,n); // add to wts for PageParser.cpp display // store it if ( ! storeTerm ( buf, bufLen, truePrefix64, hi, 0, // word#, i, 0, // wordPos 0,// densityRank , // 0-15 0, // MAXDIVERSITYRANK,//phrase 0, // ws, 0, // hashGroup, //true, &m_wbuf, m_wts, // a hack for display in wts: SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc langUnknown , k ) ) return false; return true; } // . many many websites got hijacked pages in them... // . revkim.org/mcdrt/mgntf/sata/sata.htm // . collegefootballweekly.net/hswsj/riime/sata/sata.htm char *XmlDoc::getIsHijacked() { bool hj = false; if ( ! hj ) hj = isHijackerFormat ( ptr_firstUrl ); if ( ! hj ) hj = isHijackerFormat ( ptr_redirUrl ); if ( ! hj ) { m_isHijacked = false; m_isHijackedValid = true; return &m_isHijacked; } uint32_t *h1 = getTagPairHash32(); if ( ! h1 || h1 == (void *)-1 ) return (char *)h1; // TODO: check it for the malicious tag formats here!! m_isHijacked = false; m_isHijackedValid = true; return &m_isHijacked; } // is it a custom error page? ppl do not always use status 404! char *XmlDoc::getIsErrorPage ( ) { if ( m_isErrorPageValid ) return &m_isErrorPage; setStatus ( "getting is error page"); // need a buncha crap Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (char *)xml; // get local link info LinkInfo *info1 = getLinkInfo1(); // error or blocked if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char *)info1; // get remote link info LinkInfo **pinfo2 = getLinkInfo2(); // error or blocked if ( ! pinfo2 || pinfo2 == (void *)-1 ) return (char *)pinfo2; // convenience LinkInfo *info2 = *pinfo2; // default LinkInfo *li = info1; //we have to be more sophisticated with longer pages because they //are could actually be talking about an error message. 
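// Illustrative summary of the heuristic implemented below: only the first ~32
// XML nodes' <title>/<h1>-<h3>/<span> text is matched against a list of known
// error phrases (see matchErrorMsg() below), and a match is then discarded if
// the same phrase also appears in the page's inlink anchor text, or the page
// has more than a handful of good inlinks, since such a page is probably
// *about* errors rather than being an error page. Minimal sketch of that
// veto, assuming both strings are already lowercased; kept inside "#if 0".
#if 0
static bool sketchLooksLikeErrorPage ( char *headingText , char *anchorText ) {
        char *phrase = "page not found";  // one of the known error phrases
        if ( ! strstr ( headingText , phrase ) ) return false; // no match
        if ( anchorText && strstr ( anchorText , phrase ) )
                return false; // vetoed: linkers use the phrase deliberately
        return true;          // treat as a custom error page
}
#endif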
//if(xml->getContentLen() > 4096) return false; // assume not m_isErrorPage = false; m_isErrorPageValid = true; int32_t nn = xml->getNumNodes(); int32_t i; char* s; int32_t len; int32_t len2; char* errMsg = NULL; int32_t numChecked = 0; // check the first header and title tag // limit it to first 32 nodes if(nn > 32) nn = 32; for ( i = 0 ; i < nn ; i++ ) { switch(xml->getNodeId(i)) { case TAG_TITLE: case TAG_H1: case TAG_H2: case TAG_H3: case TAG_SPAN: char* p = xml->getString(i,true,&len); if(len == 0 || len > 1024) continue; char* pend = p + len; errMsg = matchErrorMsg(p, pend ); ++numChecked; break; } if(errMsg || numChecked > 1) break; } if(!errMsg) return &m_isErrorPage; len = gbstrlen(errMsg); // make sure the error message was not present in the link text loop: if ( li && li->getNumGoodInlinks() > 5 ) return &m_isErrorPage; for (Inlink *k=NULL;li && (k=li->getNextInlink(k)); ) { //int32_t nli = li->getNumLinkTexts(); //if we can index some link text from the page, then do it //if(nli > 5) return false; //for ( int32_t i = 0 ; i < nli ; i++ ) { s = k->getLinkText(); len2 = k->size_linkText - 1; // exclude \0 //if(!s) break; //allow error msg to contain link text or vice versa if(len < len2) { if(strncasestr(errMsg, s,len,len2) != NULL) return &m_isErrorPage; } else { if(strncasestr(s, errMsg,len2,len) != NULL) return &m_isErrorPage; } } if ( li ) { li = info2; info2 = NULL; goto loop; } m_isErrorPage = true; return &m_isErrorPage; } char* XmlDoc::matchErrorMsg(char* p, char* pend ) { char utf8Buf[1024]; // int32_t utf8Len = 0; int32_t len = pend - p; if(len > 1024) len = 1024; pend = p + len; char* tmp = utf8Buf; while(p < pend) { *tmp = to_lower_a(*p); tmp++; p++; } p = utf8Buf; pend = p + len; char* errMsg = NULL; while(p < pend) { int32_t r = pend - p; switch (*p) { //sorted by first letter, then by frequency case '4': errMsg = "404 error"; if(r>=9&&strncmp(p, errMsg, 9) == 0) return errMsg; errMsg = "403 forbidden"; if(r>=13&&strncmp(p, errMsg, 13) == 0) return errMsg; break; case 'd': errMsg = "detailed error information follows"; if(r>=34&&strncmp(p, errMsg, 34) == 0) return errMsg; break; case 'e': errMsg = "error 404"; if(r>=9&&strncmp(p, errMsg, 9) == 0) return errMsg; errMsg = "error was encountered while processing " "your request"; if(r>=51&&strncmp(p, errMsg,51) == 0) return errMsg; errMsg = "error occurred while processing request"; if(r>=39&&strncmp(p, errMsg, 39) == 0) return errMsg; errMsg = "exception error has occurred"; if(r>=28&&strncmp(p, errMsg,28) == 0) return errMsg; errMsg = "error occurred"; if(r>=14&&strncmp(p, errMsg,14) == 0) return errMsg; //http://www.gnu.org/fun/jokes/unix.errors.html //errMsg = "error message"; //if(strncmp(p, errMsg, 13) == 0) return errMsg; break; case 'f': errMsg = "file not found"; if(r>=14&&strncmp(p, errMsg, 14) == 0) return errMsg; break; case 'h': errMsg = "has moved"; if(r>=9&&strncmp(p, errMsg, 9) == 0) return errMsg; break; case 'n': errMsg = "no referrer"; if(r>=12&&strncmp(p, errMsg,12) == 0) return errMsg; break; case 'o': errMsg = "odbc error code = "; if(r>=18&&strncmp(p, errMsg,18) == 0) return errMsg; errMsg = "object not found"; if(r>=16&&strncmp(p, errMsg,16) == 0) return errMsg; break; case 'p': errMsg = "page not found"; if(r>=14&&strncmp(p, errMsg,14) == 0) return errMsg; break; case 's': errMsg = "system error"; if(r>=12&&strncmp(p, errMsg, 12) == 0) return errMsg; break; case 't': errMsg = "the application encountered an " "unexpected problem"; if(r>=49&&strncmp(p, errMsg, 49) == 0) return errMsg; errMsg = 
"the page you requested has moved"; if(r>=32&&strncmp(p, errMsg, 32) == 0) return errMsg; errMsg = "this page has moved"; if(r>=19&&strncmp(p, errMsg, 19) == 0) return errMsg; break; case 'u': errMsg = "unexpected problem has occurred"; if(r>=31&&strncmp(p, errMsg, 31) == 0) return errMsg; errMsg = "unexpected error has occurred"; if(r>=29&&strncmp(p, errMsg, 29) == 0) return errMsg; errMsg = "unexpected problem occurred"; if(r>=27&&strncmp(p, errMsg, 27) == 0) return errMsg; errMsg ="unexpected error occurred"; if(r>=25&&strncmp(p, errMsg, 25) == 0) return errMsg; errMsg ="unexpected result has occurred"; if(r>=33&&strncmp(p, errMsg, 33) == 0) return errMsg; errMsg ="unhandled exception"; if(r>=19&&strncmp(p, errMsg, 19) == 0) return errMsg; break; case 'y': errMsg = "you have been blocked"; if(r>=21&&strncmp(p, errMsg, 21) == 0) return errMsg; break; } //skip to the beginning of the next word while(p < pend && !is_wspace_a(*p)) p++; while(p < pend && is_wspace_a(*p)) p++; } return NULL; } #include "Spider.h" static SafeBuf *s_wbuf = NULL; // . this is used by gbsort() above // . sorts TermInfos alphabetically by their TermInfo::m_term member int cmptp (const void *v1, const void *v2) { TermDebugInfo *t1 = *(TermDebugInfo **)v1; TermDebugInfo *t2 = *(TermDebugInfo **)v2; char *start = s_wbuf->getBufStart(); // prefix first char *ps1 = start + t1->m_prefixOff; char *ps2 = start + t2->m_prefixOff; if ( t1->m_prefixOff < 0 ) ps1 = NULL; if ( t2->m_prefixOff < 0 ) ps2 = NULL; int32_t plen1 = 0; if ( ps1 ) plen1 = gbstrlen(ps1); int32_t plen2 = 0; if ( ps2 ) plen2 = gbstrlen(ps2); int32_t pmin = plen1; if ( plen2 < pmin ) pmin = plen2; int32_t pn = strncmp ( ps1 , ps2 , pmin ); if ( pn ) return pn; if ( plen1 != plen2 ) return ( plen1 - plen2 ); // return if groups differ int32_t len1 = t1->m_termLen; int32_t len2 = t2->m_termLen; int32_t min = len1; if ( len2 < min ) min = len2; char *s1 = start + t1->m_termOff; char *s2 = start + t2->m_termOff; int32_t n = strncasecmp ( s1 , s2 , min ); if ( n ) return n; // . if length same, we are tied // . otherwise, prefer the int16_ter return ( len1 - len2 ); } // . this is used by gbsort() above // . sorts TermDebugInfos by their TermDebugInfo::m_wordPos member int cmptp2 (const void *v1, const void *v2) { TermDebugInfo *t1 = *(TermDebugInfo **)v1; TermDebugInfo *t2 = *(TermDebugInfo **)v2; // word position first int32_t d = t1->m_wordPos - t2->m_wordPos; if ( d ) return d; // secondly drop back to hashgroup i guess //d = t1->m_hashGroup - t2->m_hashGroup; d = t1->m_synSrc - t2->m_synSrc; if ( d ) return d; // word len d = t1->m_termLen - t2->m_termLen; if ( d ) return d; return 0; } bool printLangBits ( SafeBuf *sb , TermDebugInfo *tp ) { char printed = false; if ( tp->m_synSrc ) { sb->safePrintf(" "); printed = true; } int32_t j = 0; if ( printed ) j = MAX_LANGUAGES; for ( ; j < MAX_LANGUAGES ; j++ ) { int64_t mask = 1LL << j; //if ( j == tp->m_langId ) // sb->safePrintf("[%s]", // getLangAbbr(tp->m_langId)); if ( ! (tp->m_langBitVec64 & mask) ) continue; char langId = j+1; // match in langvec? that means even if the // word is in multiple languages we put it in // this language because we interesect its lang bit // vec with its neighbors in the sliding window // algo in setLangVector. if ( langId == tp->m_langId ) sb->safePrintf("<b>"); sb->safePrintf("%s ", getLangAbbr(langId) ); if ( langId == tp->m_langId ) sb->safePrintf("</b>"); printed = true; } if ( ! 
printed ) { sb->safePrintf("??"); } return true; } bool XmlDoc::printDoc ( SafeBuf *sb ) { if ( ! sb ) return true; Url *u = getFirstUrl(); // hash the url into 64 bits int64_t uh64 = hash64(u->getUrl(),u->getUrlLen()); // int16_tcut char *fu = ptr_firstUrl; char *allowed = "???"; if ( m_isAllowedValid && m_isAllowed ) allowed = "yes"; else if ( m_isAllowedValid ) allowed = "no"; int32_t ufn = -1; if ( m_urlFilterNumValid ) ufn = m_urlFilterNum; time_t spideredTime = getSpideredTime(); CollectionRec *cr = getCollRec(); if ( ! cr ) return false; sb->safePrintf ("<meta http-equiv=\"Content-Type\" " "content=\"text/html; charset=utf-8\">" "<table cellpadding=3 border=0>\n" "<tr>" "<td width=\"25%%\">docId</td>" "<td><a href=/get?c=%s&d=%"UINT64">%"UINT64"</a></td>" "</tr>\n" "<tr>" "<td width=\"25%%\">uh48</td>" "<td>%"UINT64"</td>" "</tr>\n" "<tr>" "<td width=\"25%%\">uh64</td>" "<td>%"UINT64"</td>" "</tr>\n" "<tr>" "<td>index error code</td>" "<td>%s</td>" "</tr>\n" "<tr>" "<td>url filter num</td>" "<td>%"INT32"</td>" "</tr>\n" "<tr>" "<td>other - errno</td>" "<td>%s</td>" "</tr>\n" "<tr>" "<td>robots.txt allows</td>" "<td>%s</td>" "</tr>\n" "<tr>" "<td>metalist size</td>" "<td>%"INT32"</td>" "</tr>\n" "<tr>" "<td>url</td>" "<td><a href=\"%s\">%s</a></td>" "</tr>\n" , cr->m_coll, m_docId , m_docId , getFirstUrlHash48(), // uh48 getFirstUrlHash64(), // uh48 mstrerror(m_indexCode), ufn, mstrerror(g_errno), allowed, m_metaListSize, fu, fu ); if ( ptr_redirUrl ) sb->safePrintf( "<tr>" "<td>redir url</td>" "<td><a href=\"%s\">%s</a></td>" "</tr>\n" ,ptr_redirUrl ,ptr_redirUrl ); else sb->safePrintf( "<tr>" "<td>redir url</td>" "<td>--</td>" "</tr>\n" ); sb->safePrintf("<tr><td>hostHash64</td><td>0x%"XINT64"</td></tr>", (uint64_t)getHostHash32a()); sb->safePrintf("<tr><td>site</td><td>"); sb->safeMemcpy(ptr_site,size_site-1); sb->safePrintf("</td></tr>\n"); if ( m_siteHash32Valid ) sb->safePrintf("<tr><td>siteHash32</td><td>0x%"XINT32"</td></tr>\n", m_siteHash32); if ( m_domHash32Valid ) sb->safePrintf("<tr><td>domainHash32</td><td>0x%"XINT32"</td></tr>\n", m_domHash32); sb->safePrintf ( "<tr>" "<td>domainHash8</td>" "<td>0x%"XINT32"</td>" "</tr>\n" , (int32_t)g_titledb.getDomHash8FromDocId(m_docId) ); sb->safePrintf( "<tr>" "<td>coll</td>" "<td>%s</td>" "</tr>\n" "<tr>" "<td>spidered date</td>" "<td>%s UTC</td>" "</tr>\n" , cr->m_coll, asctime(gmtime ( &spideredTime )) ); /* char *ms = "-1"; if ( m_minPubDate != -1 ) ms = asctime(gmtime ( &m_minPubDate )); sb->safePrintf ( "<tr>" "<td>min pub date</td>" "<td>%s UTC</td>" "</tr>\n" , ms ); ms = "-1"; if ( m_maxPubDate != -1 ) ms = asctime(gmtime ( &m_maxPubDate )); sb->safePrintf ( "<tr>" "<td>max pub date</td>" "<td>%s UTC</td>" "</tr>\n" , ms ); */ // our html template fingerprint sb->safePrintf ("<tr><td>tag pair hash 32</td><td>"); if ( m_tagPairHash32Valid )sb->safePrintf("%"UINT32"", (uint32_t)m_tagPairHash32); else sb->safePrintf("invalid"); sb->safePrintf("</td></tr>\n" ); // print list we added to delete stuff if ( m_indexCode && m_oldDocValid && m_oldDoc ) { // skip debug printing for now... //return true; sb->safePrintf("</table><br>\n"); sb->safePrintf("<h2>Delete Meta List</h2>"); printMetaList ( m_metaList , m_metaList + m_metaListSize ,sb); } if ( m_indexCode || g_errno ) { printMetaList ( m_metaList , m_metaList + m_metaListSize, sb ); } if ( m_indexCode ) return true; if ( g_errno ) return true; // sanity check //if ( ! 
m_sreqValid ) { char *xx=NULL;*xx=0; } /* sb->safePrintf("<tr><td>next spider date</td>" "<td>%s UTC</td></tr>\n" "<tr><td>next spider priority</td>" "<td>%"INT32"</td></tr>\n" , asctime(gmtime( &m_nextSpiderTime )) , (int32_t)m_nextSpiderPriority ); */ // must always start with http i guess! if ( strncmp ( fu , "http" , 4 ) ) { char *xx=NULL;*xx=0; } // show the host that should spider it //int32_t domLen ; char *dom = getDomFast ( fu , &domLen , true ); //int32_t hostId; if ( m_sreqValid ) { // must not block SpiderRequest *oldsr = &m_sreq; uint32_t shard = g_hostdb.getShardNum(RDB_SPIDERDB,oldsr); sb->safePrintf ("<tr><td><b>assigned spider shard</b>" "</td>\n" "<td><b>%"UINT32"</b></td></tr>\n",shard); } time_t ts = m_firstIndexedDate; sb->safePrintf("<tr><td>first indexed date</td>" "<td>%s UTC</td></tr>\n" , asctime(gmtime(&ts )) ); ts = m_outlinksAddedDate; sb->safePrintf("<tr><td>outlinks last added date</td>" "<td>%s UTC</td></tr>\n" , asctime(gmtime(&ts )) ); // hop count sb->safePrintf("<tr><td>hop count</td><td>%"INT32"</td></tr>\n", (int32_t)m_hopCount); // thumbnails ThumbnailArray *ta = (ThumbnailArray *) ptr_imageData; if ( ta ) { int32_t nt = ta->getNumThumbnails(); sb->safePrintf("<tr><td># thumbnails</td>" "<td>%"INT32"</td></tr>\n",nt); for ( int32_t i = 0 ; i < nt ; i++ ) { ThumbnailInfo *ti = ta->getThumbnailInfo(i); sb->safePrintf("<tr><td>thumb #%"INT32"</td>" "<td>%s (%"INT32"x%"INT32",%"INT32"x%"INT32") " , i , ti->getUrl() , ti->m_origDX , ti->m_origDY , ti->m_dx , ti->m_dy ); ti->printThumbnailInHtml ( sb , 100,100,true,NULL) ; // end the row for this thumbnail sb->safePrintf("</td></tr>\n"); } } char *ddd; time_t datedbDate = (time_t)m_pubDate; if ( datedbDate != -1 ) ddd = asctime ( gmtime(&datedbDate )); else ddd = "---"; char strLanguage[128]; languageToString(m_langId, strLanguage); // print tags //if ( ! m_tagRecValid ) { char *xx=NULL;*xx=0; } SafeBuf tb; TagRec *ogr = NULL; if ( m_tagRecValid ) ogr = &m_tagRec; if ( ogr ) ogr->printToBufAsHtml ( &tb , "old tag" ); SafeBuf *ntb = NULL; if ( m_newTagBufValid ) ntb = getNewTagBuf(); if ( ntb ) { // this is just a sequence of tags like an rdblist char *pt = ntb->getBufStart(); char *ptend = pt + ntb->length(); for ( ; pt < ptend ; ) { // skip rdbid pt++; // cast it Tag *tag = (Tag *)pt; // skip it pt += tag->getRecSize(); // print tag out tag->printToBufAsHtml ( &tb, "new tag"); } } // prevent (null) from being displayed tb.pushChar('\0'); //Tag *tag1 = gr->getTag ("sitenuminlinks"); //Tag *tag2 = gr->getTag ("sitepop"); //int32_t sni = 0; //int32_t spop = 0; //if ( tag1 ) sni = atol(tag1->m_data); //if ( tag2 ) spop = atol(tag2->m_data); int32_t sni = m_siteNumInlinks; //int32_t spop = m_sitePop; LinkInfo *info1 = ptr_linkInfo1; //LinkInfo *info2 = ptr_linkInfo2; //int32_t sni ; //int32_t extrapolated = 0; //if ( info1 ) extrapolated = info1->m_numInlinksExtrapolated; //if ( info1 ) sni = info1->m_siteNumInlinks; char *ipString = iptoa(m_ip); char *estimated = ""; if ( datedbDate & 0x01 ) // tr->datedbDateIsEstimated() ) estimated = "<nobr><b>[estimated from bisection]</b></nobr>"; //char *ls = getIsLinkSpam(); Links *links = getLinks(); // sanity check. should NEVER block! 
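// Illustrative note on the convention checked below and used throughout this
// file: these getter methods return NULL on error (with g_errno set), the
// special pointer value -1 if the call blocked and a callback will re-enter
// later, or a real pointer once the data is valid. A typical caller looks
// like this sketch; here in printDoc() everything is already computed, so the
// blocked case "should NEVER" happen. Kept inside "#if 0".
#if 0
        Links *lk = getLinks();
        if ( ! lk              ) return true;   // error, g_errno is set
        if ( lk == (Links *)-1 ) return false;  // blocked; callback fires later
        // ... otherwise lk is valid and safe to use ...
#endif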
if ( links == (void *)-1 ) { char *xx=NULL;*xx=0; } // this is all to get "note" //char *note = NULL; // make it a URL Url uu; uu.set ( ptr_firstUrl , false ); // sanity check Xml *xml = getXml(); // sanity check if ( xml == (void *)-1 ) { char *xx=NULL;*xx=0; } sb->safePrintf ( "<tr><td>datedb date</td><td>%s UTC (%"UINT32")%s" "</td></tr>\n" "<tr><td>compressed size</td><td>%"INT32" bytes</td></tr>\n" "<tr><td>original charset</td><td>%s</td></tr>\n" //"<tr><td>site num inlinks</td><td><b>%"INT32"%</b></td></tr>\n" //"<tr><td>total extrapolated linkers</td><td>%"INT32"</td></tr>\n" "<tr><td><b>title rec version</b></td><td><b>%"INT32"</b>" "</td></tr>\n" "<tr><td>adult bit</td><td>%"INT32"</td></tr>\n" //"<tr><td>is link spam?</td><td>%"INT32" <b>%s</b></td></tr>\n" "<tr><td>is permalink?</td><td>%"INT32"</td></tr>\n" "<tr><td>is RSS feed?</td><td>%"INT32"</td></tr>\n" //"<tr><td>index article only?</td><td>%"INT32"</td></tr>\n" "%s\n" "<tr><td>ip</td><td><a href=\"/search?q=ip%%3A%s&c=%s&n=100\">" "%s</td></tr>\n" "<tr><td>content len</td><td>%"INT32" bytes</td></tr>\n" "<tr><td>content truncated</td><td>%"INT32"</td></tr>\n" "<tr><td>content type</td><td>%"INT32" (%s)</td></tr>\n" "<tr><td>language</td><td>%"INT32" (%s)</td></tr>\n" "<tr><td>country</td><td>%"INT32" (%s)</td></tr>\n" "</td></tr>\n", ddd , (uint32_t)datedbDate , estimated , m_oldTitleRecSize, get_charset_str(m_charset), //sni , //ptr_linkInfo1->m_numInlinksExtrapolated, (int32_t)m_version , (int32_t)m_isAdult, //(int32_t)m_isLinkSpam, //m_note, (int32_t)m_isPermalink, (int32_t)m_isRSS, //(int32_t)m_eliminateMenus, // tag rec tb.getBufStart(), ipString, cr->m_coll, ipString, size_utf8Content - 1, (int32_t)m_isContentTruncated, (int32_t)m_contentType, g_contentTypeStrings[(int)m_contentType] , (int32_t)m_langId, strLanguage, (int32_t)m_countryId, g_countryCode.getName(m_countryId) ); /* int32_t boost1 = getBoostFromSiteNumInlinks ( sni ); sb->safePrintf ( "<tr><td><b>title weight</b></td>" "<td><b>%"UINT32"%%</b></td></tr>\n" "<tr><td>header weight</td>" "<td>%"UINT32"%%</td></tr>\n" "<tr><td>url path weight</td>" "<td>%"UINT32"%%</td></tr>\n" "<tr><td>external link text weight</td>" "<td>%"UINT32"%%</td></tr>\n" "<tr><td>internal link text weight</td>" "<td>%"UINT32"%%</td></tr>\n" "<tr><td>concept weight</td>" "<td>%"UINT32"%%</td></tr>\n" "<tr><td>score boost from site num inlinks</td>" "<td>%"INT32"%%</td>" "</tr>\n", (int32_t)m_titleWeight, (int32_t)m_headerWeight, (int32_t)m_urlPathWeight, (int32_t)m_externalLinkTextWeight, (int32_t)m_internalLinkTextWeight, (int32_t)m_conceptWeight , boost1 ); */ // print title //sb->safePrintf( "<tr><td>title</td><td>%s</td></tr>\n" , // ti->m_title ); // print the new, unstored, gigabit vector if ( size_gigabitHashes ) { // get gigabit vector int32_t *vec = ptr_gigabitHashes; // point to scores int32_t *ss = ptr_gigabitScores; int32_t count = 0; int32_t total = 0; sb->safePrintf ( "<tr><td>stored gigabit vector</td><td>"); while ( *vec ) { sb->safePrintf ( "%08"XINT32" ", *vec ); sb->safePrintf ( "(%05"INT32") ", *ss ); vec++; ss++; count++; total++; //if ( total >= GIGABITS_IN_VECTOR ) break; if ( count < 4 ) continue; count = 0; sb->safePrintf ( "<br>\n"); } sb->safePrintf ( "</tr>\n"); } // print dmoz stuff int32_t numCatIds = size_catIds/4; int32_t numIndCatIds = size_indCatIds/4; sb->safePrintf( "<tr><td>Number of Category IDs</td>" "<td>%"INT32"</td></tr>\n", numCatIds ); char *dtp = ptr_dmozTitles; char *dsp = ptr_dmozSumms; char *dap = ptr_dmozAnchors; for (int32_t i = 0; 
i < numCatIds; i++) { // print the ID sb->safePrintf( "<tr><td>ID #%"INT32"</td><td>%"INT32"</td></tr>\n", i, ptr_catIds[i]); // print the title if ( dtp ) { sb->safePrintf( "<tr><td>Title #%"INT32" </td><td>",i); sb->safeMemcpy( dtp,gbstrlen(dtp) ); sb->safePrintf( "</td></tr>\n"); dtp += gbstrlen(dtp) + 1; } // print the summary if ( dsp ) { sb->safePrintf( "<tr><td>Summary #%"INT32"</td><td>", i); sb->safeMemcpy( dsp , gbstrlen(dsp ) ) ; sb->safePrintf( "</td></tr>\n"); dsp += gbstrlen ( dsp ) + 1; } // print the anchor if ( dap ) { sb->safePrintf( "<tr><td>Anchor #%"INT32"</td><td>",i); sb->safeMemcpy( dap , gbstrlen(dap) ); sb->safePrintf( "</td></tr>\n"); dap += gbstrlen ( dap ) + 1; } } sb->safePrintf( "<tr><td>Number of Indirect Category IDs</td>" "<td>%"INT32"</td></tr>\n", numIndCatIds); for (int32_t i = 0; i < numIndCatIds; i++) { // print the ID sb->safePrintf( "<tr><td>Indirect ID #%"INT32"</td>" "<td>%"INT32"</td></tr>\n", i, ptr_indCatIds[i]); } if ( info1 ) { //sb->safePrintf("<tr><td>page pop</td><td>%"INT32"</td></tr>\n", // info1->m_pagePop ); //sb->safePrintf("<tr><td>whole site pop</td>" // "<td>%"INT32"</td></tr>\n", // spop ); sb->safePrintf("<tr><td>num GOOD links to whole site</td>" "<td>%"INT32"</td></tr>\n", sni ); } // close the table sb->safePrintf ( "</table></center><br>\n" ); // // convert document into json representing multiple documents // if it makes sense. sometimes a single url contains multiple // subdocuments that each should have their own url, but do not, // so we fix that here. // SafeBuf *dbr = getDiffbotReply(); if ( dbr->length() ) { sb->safePrintf("<b>START EXACT DIFFBOT REPLY</b><br>\n"); sb->safePrintf("<pre>"); sb->safeMemcpy ( dbr ); sb->safePrintf("</pre>"); sb->safePrintf("<b>END EXACT DIFFBOT REPLY</b><br><br>\n"); } // // PRINT ADDRESSES (prints streets first) // Addresses *aa = getAddresses (); if ( ! aa || aa == (Addresses *)-1 ) { char *xx=NULL;*xx=0;} aa->print(sb,uh64); // // PRINT PUB DATE CANDIDATES // // print stored pub date candidates which we indexed as clock // or not clock! Dates *dp = getDates() ; // should never block! if ( dp == (void *)-1 ) { char *xx=NULL;*xx=0; } // print it out if ( dp ) dp->printDates ( sb ); //return true; // // PRINT SECTIONS // Sections *sections = getSections(); if ( ! sections ||sections==(Sections *)-1) {char*xx=NULL;*xx=0;} //SectionVotingTable *nsvt = getNewSectionVotingTable(); //if ( ! nsvt || nsvt == (void *)-1 ) {char*xx=NULL;*xx=0;} //SectionVotingTable *osvt = getOldSectionVotingTable(); //if ( ! 
osvt || osvt == (void *)-1 ) {char*xx=NULL;*xx=0;} // these are nice //HashTableX *pt = dp->getPhoneTable(); //HashTableX *et = dp->getEmailTable(); //HashTableX *at = aa->getPlaceTable(); //HashTableX *tt = dp->getTODTable(); //HashTableX *rt = ev->getRegistrationTable(); //HashTableX *priceTable = dp->getPriceTable(); //sections->print ( sb , pt , et , NULL , at , tt , priceTable ); // try the new print function //sections->print2 ( sb , NULL, NULL , NULL , false ); printRainbowSections ( sb , NULL ); //nsvt->print ( sb , "NEW Sections Voting Table" ); //osvt->print ( sb , "OLD Sections Voting Table" ); // // PRINT LINKINFO // //if ( info1 ) // info1->print ( sb , cr->m_coll ); //if ( info2 ) { // sb->safePrintf ( "<tr><td><b>IMPORTED LINK INFO:" // "</b></td></tr>" ); // info2->print ( sb , cr->m_coll ); //} // cut it int16_t for debugging logf(LOG_DEBUG,"xmldoc: FIX ME remove return"); //return true; // // PRINT LINKINFO // char *p = m_pageLinkBuf.getBufStart(); int32_t plen = m_pageLinkBuf.length(); sb->safeMemcpy ( p , plen ); // // PRINT SITE LINKINFO // p = m_siteLinkBuf.getBufStart(); plen = m_siteLinkBuf.length(); sb->safeMemcpy ( p , plen ); // // BEGIN PRINT GIGABITS // // print out for PageParser.cpp const char *help = "The <i>Gigabits</i> are words extracted from the document " "that are deemed to best represent it. The <i>Pop</i> column " "is the popularity of the word and it ranges from 0 to 1000 " "and is how many documents out of a sample of 1000 that " "contained that word. The <i>Score</i> of each Gigabit is " "based on the popularity and how many times the word appeared " "in the document. Higher scores are deemed more " "representative of the document. The hashes of these Gigabits " "are stored with the cached copy of the document as numeric " "hashes for purposes of topic clustering. You can see these " "hashes by clicking on the <i>[info]</i> link next to " "any search result.<br><br>"; if ( m_numTop > 0 ) sb->safePrintf( "<table width=100%%>" "<td bgcolor=pink>\n" "%s" "<table>" "<tr><td>#</td><td>" "<b>%"INT32" Gigabits</b></td><td><b>Score</b>" "</td>" "<td><b>Pop</b></td>" "<td><b>Hash</b></td>" "</tr>\n", help,m_numTop); // . print out the top gigabits we harvested // . start with the highest scoring node first, the last node since // nodes are ranked by lowest to highest key int32_t total = 0; for ( int32_t i = 0 ; i < m_numTop ; i++ ) { // get the info GigabitInfo *gi = m_top[i]; // print row sb->safePrintf("<tr><td>%"INT32"</td><td>",i); // print gigabit sb->safeMemcpy(gi->m_ptr , gi->m_len ); // get 32 bit hash uint32_t h = gi->m_hash & 0xffffffff; // never allow 0 if ( h == 0 ) h = 1; // if unicode, pop's hi bit is set sb->safePrintf( "</td>" "<td>%"INT32"</td>" "<td>%"INT32"</td>" "<td>%08"XINT32"</td>" "</tr>\n", (int32_t)gi->m_pts, (int32_t)gi->m_pop, (int32_t)h ); // add up all scores total += gi->m_pts; } // close table if ( m_numTop > 0 ) { sb->safePrintf("<tr><td></td><td></td><td>" "<b>%"INT32"</b></td></tr>\n",total); sb->safePrintf("</table>\n"); } // // END PRINT GIGABITS // // note this sb->safePrintf("<h2>NEW Meta List</h2>"); printMetaList ( m_metaList , m_metaList + m_metaListSize , sb ); // all done if no term table to print out if ( ! 
m_wts ) return true; // print out the rules in Weights.cpp /* sb->safePrintf ("<br>" "<table border=1 cellpadding=0>" "<tr><td>Rule #3</td>" "<td>First 40 words in ()'s.</td></tr>\n" "<tr><td>Rule #4</td>" "<td>Adjacent to bad punct.</td></tr>\n" "<tr><td>Rule #5</td>" "<td>In a link.</td></tr>\n" "<tr><td>Rule #6</td>" "<td>First occurence in a section. Actual weight " "depends on section word count.</td></tr>\n" "<tr><td>Rule #7</td>" "<td>In a header tag. h1 is most weight.</td></tr>\n" "<tr><td>Rule #8</td>" "<td>In a \"ul\" list.</td></tr>\n" "<tr><td>Rule #9</td>" "<td>Repeated occurence in the same fragment or " "sentence.</td></tr>\n" "<tr><td>Rule #10</td>" "<td>In a comma-separated list.</td></tr>\n" "<tr><td>Rule #11</td>" "<td>Promoted isolated capitalized words, demote " "if it is in a capitalized phrase.</td></tr>\n" "<tr><td>Rule #13</td>" "<td>First occurence in document.</td></tr>\n" "<tr><td>Rule #15</td>" "<td>Word to phrase ratio weight.</td></tr>\n" "<tr><td>Rule #16</td>" "<td>At the beginning of a fragment or sentence." "</td></tr>\n" "<tr><td>Rule #17</td>" "<td>If immediately after a quote, iff not " "promoted by Rule #18.</td></tr>\n" "<tr><td>Rule #18</td>" "<td>Promote phrase if capitalized. Demote phrase " "if mixed case without hypehn.</td></tr>\n" "<tr><td>Rule #22</td>" "<td>Demote phrases containing bad punct.</td></tr>\n" "<tr><td>Rule #23</td>" "<td>In script, style, select or marquee tag. " "</td></tr>\n" "<tr><td>Rule #23</td>" "<td>Follows a number.</td></tr>\n" "<tr><td>Rule #25</td>" "<td>Demote non-hyphenated phrases that would split " "adjacent hyphenated phrases.</td></tr>\n" "<tr><td>Rule #26</td>" "<td>Demote if in a repeated fragment.</td></tr>\n" "<tr><td>Rule #27</td>" "<td>Demote if in a menu section.</td></tr>\n" "<tr><td>Rule #28</td>" "<td>Pattern spam detector.</td></tr>\n" "</table>\n" "<br>" ); */ // // BEGIN PRINT HASHES TERMS // // int16_tcut HashTableX *wt = m_wts; // use the keys to hold our list of ptrs to TermDebugInfos for sorting! TermDebugInfo **tp = NULL; // add them with this counter int32_t nt = 0; int32_t nwt = 0; if ( wt ) { nwt = wt->m_numSlots; tp = (TermDebugInfo **)wt->m_keys; } // now print the table we stored all we hashed into for ( int32_t i = 0 ; i < nwt ; i++ ) { // skip if empty if ( wt->m_flags[i] == 0 ) continue; // breathe //QUICKPOLL(m_niceness); // get its key, date=32bits termid=64bits //key96_t *k = (key96_t *)wt->getKey ( i ); // get the TermDebugInfo TermDebugInfo *ti = (TermDebugInfo *)wt->getValueFromSlot ( i ); // point to it for sorting tp[nt++] = ti; } // set this for cmptp s_wbuf = &m_wbuf; // sort them alphabetically by Term gbsort ( tp , nt , sizeof(TermDebugInfo *), cmptp , m_niceness ); // determine how many non 1.0 weight fields we got in the vectors /* int32_t count [ MAX_RULES ]; memset ( count , 0 , MAX_RULES * 4 ); for ( int32_t i = 0 ; i < nt ; i++ ) { TermDebugInfo *ti = tp[i]; for ( int32_t j = 0 ; j < MAX_RULES ; j++ ) if ( ti->m_rv[j] != 1.0 ) count[j]++; } // count the counts char fbuf[9024]; char *fp = fbuf; for ( int32_t j = 0 ; j < MAX_RULES ; j++ ) { if ( ! 
count[j] ) continue; fp += sprintf(fp ,"<td><b>R#%"INT32"</b></td>",j); } */ // print them out in a table char hdr[1000]; sprintf(hdr, "<table border=1 cellpadding=0>" "<tr>" // this messes up Test.cpp diff'ing //"<td><b>#</b></td>" "<td><b>Prefix</b></td>" "<td><b>WordNum</b></td>" "<td><b>Lang</b></td>" "<td><b>Term</b></td>" //"%s" //"<td><b>Weight</b></td>" //"<td><b>Spam</b></td>" "<td><b>Desc</b></td>" "<td><b>TermId/TermHash48</b></td>" "<td><b>ShardByTermId?</b></td>" "<td><b>Note</b></td>" "</tr>\n" //,fbuf ); sb->safePrintf("%s",hdr); char *start = m_wbuf.getBufStart(); int32_t rcount = 0; for ( int32_t i = 0 ; i < nt ; i++ ) { // see if one big table causes a browser slowdown if ( (++rcount % TABLE_ROWS) == 0 ) sb->safePrintf("<!--ignore--></table>%s",hdr); char *prefix = " "; if ( tp[i]->m_prefixOff >= 0 ) prefix = start + tp[i]->m_prefixOff; bool isFacet = false; if ( prefix && prefix[0]=='g' && strncmp(prefix,"gbfacet",7)== 0 ) isFacet = true; sb->safePrintf ( "<tr>" //"<td><b>%"INT32"</b></td>" "<td>%s</td>" //i , , prefix ); if ( isFacet ) sb->safePrintf("<td>--</td>"); else sb->safePrintf( "<td>%"INT32"</td>" , tp[i]->m_wordNum ); // print lang //char langId = tp[i]->m_langId; // print out all langs word is in if it's not clear // what language it is. we use a sliding window to // resolve some ambiguity, but not all, so print out // the possible langs here sb->safePrintf("<td>"); if ( isFacet ) sb->safePrintf("--"); else printLangBits ( sb , tp[i] ); sb->safePrintf("</td>"); // print the term sb->safePrintf("<td><nobr>"); if ( tp[i]->m_synSrc ) sb->pushChar('*'); char *term = start + tp[i]->m_termOff; int32_t termLen = tp[i]->m_termLen; sb->safeMemcpy ( term , termLen ); /* char *dateStr = " "; int32_t ddd = tp[i]->m_date; uint8_t *tddd = (uint8_t *)&ddd; char tbbb[32]; if ( ddd && tddd[2] == 0 && tddd[3] == 0 && tddd[0] && tddd[1] && tddd[1] <= tddd[0] ) { sprintf(tbbb,"evIds %"INT32"-%"INT32"", (int32_t)tddd[1],(int32_t)tddd[0]); dateStr = tbbb; } else if ( ddd ) dateStr = asctime ( gmtime(&ddd )); */ //char ss[30]; //if ( tp[i]->m_spam == -1.0 ) sprintf(ss," "); //else if ( tp[i]->m_spam == 0.0 ) sprintf(ss,"--"); //else sprintf ( ss , "%.03f",1.0-tp[i]->m_spam); sb->safePrintf ( "</nobr></td>" ); // print the weight vector before Weight and Spam /* float prod = 1.0; for ( int32_t j = 0 ; j < MAX_RULES ; j++ ) { if ( ! count[j] ) continue; if ( tp[i]->m_isSynonym ) sb->safePrintf("<td> </td>" ); else if ( tp[i]->m_rv[j] == 1.0 ) sb->safePrintf("<td> </td>" ); else sb->safePrintf("<td>%.02f</td>",tp[i]->m_rv[j] ); // product up prod *= tp[i]->m_rv[j]; } // sanity check // maybe look into this at some point, but not a big deal!! 
//float err = prod - tp[i]->m_weight; //if ( err > .05 ) // logf(LOG_DEBUG,"weights: prod was %.02f should be " // "%.02f",prod,tp[i]->m_weight); */ //char *desc = " "; //if ( tp[i]->m_descOff >= 0 ) // desc = start + tp[i]->m_descOff; /* // synonyms are always 1/4 weight of original if ( tp[i]->m_isSynonym ) sb->safePrintf("<td> </td>" ); else sb->safePrintf("<td>%.03f</td>", tp[i]->m_weight ); */ sb->safePrintf ( //"<td>%s</td>" //"<td><b>%"UINT32"</b></td>" //"<td><nobr>%s</nobr></td>" "<td><nobr>%s", getHashGroupString(tp[i]->m_hashGroup) ); //if ( tp[i]->m_synSrc ) { // char ss = tp[i]->m_synSrc; // sb->safePrintf(" - %s",g_synonyms.getSourceString(ss)); //} sb->safePrintf ( "</nobr></td>" ); sb->safePrintf ( "<td>%016"UINT64"</td>" , //ss , //(uint32_t)tp[i]->m_score32 , //dateStr , //desc, // start + tp[i]->m_descOff , (uint64_t)(tp[i]->m_termId & TERMID_MASK) ); if ( tp[i]->m_shardByTermId ) sb->safePrintf("<td><b>1</b></td>" ); else sb->safePrintf("<td>0</td>" ); sb->safePrintf("<td>"); // there is no prefix for such terms now // TODO: store actual key in there i guess?? or just this bit. int32_t val32 = 0; if ( strncmp(prefix,"gbfacet",7) == 0 ) val32 = g_posdb.getInt(&tp[i]->m_key); // . this is like gbxpathsitehash1234567 // . the number following it is the hash // . the value stored in the posdb key is the hash of the // inner html content of that xpath/site for this page if ( strncmp(term,"facetField=gbxpathsitehash",26)==0) sb->safePrintf("<b>Term</b> is a 32-bit hash of the " "X-path of " "a section XOR'ed with the 32-bit " "hash of this document's subdomain. " "[%"UINT32"] is the 32-bit hash of the " "Inner HTML of this section stored " "in the posdb key instead of " "the usual stuff. This is also " "sharded by termId!", (uint32_t)val32 //(int32_t)tp[i]->m_sentHash32 ); sb->safePrintf("</td>"); sb->safePrintf("</tr>\n"); } sb->safePrintf("</table><br>\n"); // // END PRINT HASHES TERMS // return true; } bool XmlDoc::printMenu ( SafeBuf *sb ) { // encode it SafeBuf ue; ue.urlEncode ( ptr_firstUrl ); // get sb->safePrintf ("<meta http-equiv=\"Content-Type\" " "content=\"text/html; charset=utf-8\">" ); CollectionRec *cr = getCollRec(); if ( ! cr ) return false; /* char *coll = cr->m_coll; int64_t d = m_docId; // print links at top sb->safePrintf( //"<a href=/print?c=%s&u=%s&page=1>general info</a> | " //"<a href=/print?c=%s&u=%s&page=2>page inlinks</a> | " //"<a href=/print?c=%s&u=%s&page=3>site inlinks</a> | " //"<a href=/print?c=%s&u=%s&page=4>sections</a> | " //"<a href=/print?c=%s&u=%s&page=5>indexed terms</a> | " // the breakdown of when it was spidered and when it // is due to be spidered again. and any errors // encountered when spidering //"<a href=/print?c=%s&u=%s&page=6>spider stats</a> | " //"<a href=/print?c=%s&u=%s&page=7>cached page</a>" "<a href=/print?c=%s&d=%"INT64"&page=1>general info</a> | " "<a href=/print?c=%s&d=%"INT64"&page=2&recompute=1>" "page inlinks</a> | " "<a href=/print?c=%s&d=%"INT64"&page=3>site inlinks</a> | " //"<a href=/print?c=%s&d=%"INT64"&page=4>sections</a> | " "<a href=/print?c=%s&d=%"INT64"&page=5>indexed terms</a>" // the breakdown of when it was spidered and when it // is due to be spidered again. 
and any errors // encountered when spidering //"<a href=/print?c=%s&d=%"INT64"&page=6>spider stats</a> |" //" <a href=/print?c=%s&d=%"INT64"&page=7>cached page</a>" "<br>" "<br>" ,coll,d//ue.getBufStart() ,coll,d//ue.getBufStart() ,coll,d//ue.getBufStart() //,coll,d//ue.getBufStart() ,coll,d//ue.getBufStart() //,coll,d//ue.getBufStart() //,coll,d//ue.getBufStart() ); */ return true; } // if printDocForProCog, an entry function, blocks, we gotta re-call it static void printDocForProCogWrapper ( void *state ) { XmlDoc *THIS = (XmlDoc *)state; // make sure has not been freed from under us! if ( THIS->m_freed ) { char *xx=NULL;*xx=0;} // note it THIS->setStatus ( "in print doc for pro cog wrapper" ); // get it bool status = THIS->printDocForProCog ( THIS->m_savedSb , THIS->m_savedHr ); // return if it blocked if ( ! status ) return; // otherwise, all done, call the caller callback if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state ); else THIS->m_callback2 ( THIS->m_state ); } // in PageRoot.cpp bool printFrontPageShell ( SafeBuf *sb , char *tabName , CollectionRec *cr , bool printGigablast ); // . returns false if blocked, true otherwise // . sets g_errno and returns true on error bool XmlDoc::printDocForProCog ( SafeBuf *sb , HttpRequest *hr ) { if ( ! sb ) return true; CollectionRec *cr = getCollRec(); if ( ! cr ) return true; m_masterLoop = printDocForProCogWrapper; m_masterState = this; m_savedSb = sb; m_savedHr = hr; // if we are generating site or page inlinks info for a // non docid based url, then store that info in the respective // safe bufs m_useSiteLinkBuf = true; m_usePageLinkBuf = true; int32_t page = hr->getLong("page",1); // for some reason sections page blocks forever in browser if ( page != 7 && ! m_printedMenu ) { // && page != 5 ) printFrontPageShell ( sb , "search" , cr , false ); m_printedMenu = true; //printMenu ( sb ); } if ( page == 1 ) return printGeneralInfo(sb,hr); if ( page == 2 ) return printPageInlinks(sb,hr); if ( page == 3 ) return printSiteInlinks(sb,hr); if ( page == 4 ) return printRainbowSections(sb,hr); if ( page == 5 ) return printTermList(sb,hr); if ( page == 6 ) return printSpiderStats(sb,hr); if ( page == 7 ) return printCachedPage(sb,hr); return true; } bool XmlDoc::printGeneralInfo ( SafeBuf *sb , HttpRequest *hr ) { // int16_tcut char *fu = ptr_firstUrl; // sanity check Xml *xml = getXml(); // blocked? if ( xml == (void *)-1 ) return false; // error? if ( ! xml ) return true; char *ict = getIsContentTruncated(); if ( ! ict ) return true; if ( ict == (char *)-1 ) return false; char *at = getIsAdult(); if ( ! at ) return true; if ( at == (void *)-1 ) return false; char *ls = getIsLinkSpam(); if ( ! ls ) return true; if ( ls == (void *)-1 ) return false; uint8_t *ct = getContentType(); if ( ! ct ) return true; if ( ct == (void *)-1 ) return false; uint16_t *cs = getCharset ( ); if ( ! cs ) return true; if ( cs == (uint16_t *)-1 ) return false; char *pl = getIsPermalink(); if ( ! pl ) return true; if ( pl == (char *)-1 ) return false; char *isRSS = getIsRSS(); if ( ! isRSS ) return true; if ( isRSS == (char *)-1 ) return false; int32_t *ip = getIp(); if ( ! ip ) return true; if ( ip == (int32_t *)-1 ) return false; uint8_t *li = getLangId(); if ( ! li ) return true; if ( li == (uint8_t *)-1 ) return false; uint16_t *cid = getCountryId(); if ( ! cid ) return true; if ( cid == (uint16_t *)-1 ) return false; LinkInfo *info1 = getLinkInfo1(); if ( ! 
info1 ) return true; if ( info1 == (void *)-1 ) return false; CollectionRec *cr = getCollRec(); if ( ! cr ) return true; //char *ls = getIsLinkSpam(); //Links *links = getLinks(); // blocked? //if ( links == (void *)-1 ) { char *xx=NULL;*xx=0;}//return false; // error? //if ( ! links ) return true; // make it a URL Url uu; uu.set ( fu , false ); char *allowed = "???"; int32_t allowedInt = 1; if ( m_isAllowedValid && m_isAllowed ) { allowed = "yes"; allowedInt = 1; } else if ( m_isAllowedValid ) { allowed = "no"; allowedInt = 0; } int32_t ufn = -1; if ( m_urlFilterNumValid ) ufn = m_urlFilterNum; char *es = mstrerror(m_indexCode); if ( ! m_indexCode ) es = mstrerror(g_errno); int32_t isXml = hr->getLong("xml",0); if ( ! isXml ) printMenu ( sb ); //int32_t groupId = g_hostdb.getGroupIdFromDocId(m_docId); //Host *group = g_hostdb.getGroup(groupId); int32_t shardNum = getShardNumFromDocId ( m_docId ); Host *hosts = g_hostdb.getShard ( shardNum ); Host *h = &hosts[0]; if ( ! isXml ) sb->safePrintf ( "<table cellpadding=3 border=0>\n" "<tr>" "<td width=\"25%%\">docId</td>" "<td><a href=/get?c=%s&d=%"UINT64">%"UINT64"</a></td>" "</tr>\n" "<tr>" "<td width=\"25%%\">on host #</td>" "<td>%"INT32"</td>" "</tr>\n" "<tr>" "<td>index error code</td>" "<td>%s</td>" "</tr>\n" "<tr>" "<td>robots.txt allows</td>" "<td>%s</td>" "</tr>\n" "<tr>" "<td>url</td>" "<td><a href=\"%s\">%s</a></td>" "</tr>\n" , cr->m_coll, m_docId , m_docId , h->m_hostId, es, allowed, fu, fu ); else sb->safePrintf ( "<?xml version=\"1.0\" " "encoding=\"UTF-8\" ?>\n" "<response>\n" "\t<coll><![CDATA[%s]]></coll>\n" "\t<docId>%"INT64"</docId>\n" "\t<indexError><![CDATA[%s]]></indexError>\n" "\t<robotsTxtAllows>%"INT32"" "</robotsTxtAllows>\n" "\t<url><![CDATA[%s]]></url>\n" , cr->m_coll, m_docId , es, allowedInt,//(int32_t)m_isAllowed, fu ); char *redir = ptr_redirUrl; if ( redir && ! isXml ) { sb->safePrintf( "<tr>" "<td>redir url</td>" "<td><a href=\"%s\">%s</a></td>" "</tr>\n" ,redir ,redir ); } else if ( redir ) { sb->safePrintf("\t<redirectUrl><![CDATA[%s]]>" "</redirectUrl>\n" ,redir ); } if ( m_indexCode || g_errno ) { if ( ! isXml ) sb->safePrintf("</table><br>\n"); else sb->safePrintf("</response>\n"); return true; } // must always start with http i guess! if ( strncmp ( fu , "http" , 4 ) ) { char *xx=NULL;*xx=0; } time_t ts = (time_t)m_firstIndexedDate; if ( ! isXml ) sb->safePrintf("<tr><td>first indexed date</td>" "<td>%s UTC</td></tr>\n" , asctime(gmtime(&ts)) ); else sb->safePrintf("\t<firstIndexedDateUTC>%"UINT32"" "</firstIndexedDateUTC>\n", (uint32_t)m_firstIndexedDate ); ts = m_spideredTime; if ( ! isXml ) sb->safePrintf("<tr><td>last indexed date</td>" "<td>%s UTC</td></tr>\n" , asctime(gmtime(&ts )) ); else sb->safePrintf("\t<lastIndexedDateUTC>%"UINT32"" "</lastIndexedDateUTC>\n", (uint32_t)m_spideredTime ); ts = m_outlinksAddedDate; if ( ! isXml ) sb->safePrintf("<tr><td>outlinks last added date</td>" "<td>%s UTC</td></tr>\n" , asctime(gmtime(&ts )) ); else sb->safePrintf("\t<outlinksLastAddedUTC>%"UINT32"" "</outlinksLastAddedUTC>\n", (uint32_t)m_outlinksAddedDate ); // hop count if ( ! 
isXml ) sb->safePrintf("<tr><td>hop count</td><td>%"INT32"</td>" "</tr>\n", (int32_t)m_hopCount); else sb->safePrintf("\t<hopCount>%"INT32"</hopCount>\n", (int32_t)m_hopCount); char strLanguage[128]; languageToString(m_langId, strLanguage); // print tags //SafeBuf tb; int32_t sni = m_siteNumInlinks; char *ipString = iptoa(m_ip); //int32_t sni = info1->getNumGoodInlinks(); time_t tlu = info1->getLastUpdated(); struct tm *timeStruct3 = gmtime ( &tlu );//info1->m_lastUpdated ); char tmp3[64]; strftime ( tmp3 , 64 , "%b-%d-%Y(%H:%M:%S)" , timeStruct3 ); if ( ! isXml ) sb->safePrintf ( "<tr><td>original charset</td><td>%s</td></tr>\n" "<tr><td>adult bit</td><td>%"INT32"</td></tr>\n" //"<tr><td>is link spam?</td><td>%"INT32" <b>%s</b></td></tr>\n" "<tr><td>is permalink?</td><td>%"INT32"</td></tr>\n" "<tr><td>is RSS feed?</td><td>%"INT32"</td></tr>\n" "<tr><td>ip</td><td><a href=\"/search?q=ip%%3A%s&c=%s&n=100\">" "%s</td></tr>\n" "<tr><td>content len</td><td>%"INT32" bytes</td></tr>\n" "<tr><td>content truncated</td><td>%"INT32"</td></tr>\n" "<tr><td>content type</td><td>%s</td></tr>\n" "<tr><td>language</td><td>%s</td></tr>\n" "<tr><td>country</td><td>%s</td></tr>\n" "<tr><td>good inlinks to site" "</td><td>%"INT32"</td></tr>\n" "<tr><td>unique IP inlinks to site" "</td><td>%"INT32"</td></tr>\n" "<tr><td>unique CBlock inlinks to site" "</td><td>%"INT32"</td></tr>\n" "<tr><td>site rank</td><td>%"INT32"</td></tr>\n" "<tr><td>good inlinks to page" "</td><td>%"INT32"</td></tr>\n" "<tr><td>unique IP inlinks to page" "</td><td>%"INT32"</td></tr>\n" "<tr><td>unique CBlock inlinks to page" "</td><td>%"INT32"</td></tr>\n" "<tr><td>total inlinks to page" "</td><td>%"INT32"</td></tr>\n" "<tr><td><nobr>page inlinks last computed</nobr></td>" "<td>%s</td></tr>\n" "</td></tr>\n", get_charset_str(m_charset), (int32_t)m_isAdult, //(int32_t)m_isLinkSpam, //m_note, (int32_t)m_isPermalink, (int32_t)m_isRSS, ipString, cr->m_coll, ipString, size_utf8Content - 1, (int32_t)m_isContentTruncated, g_contentTypeStrings[(int)m_contentType] , strLanguage, g_countryCode.getName(m_countryId) , sni, m_siteNumInlinksUniqueIp, m_siteNumInlinksUniqueCBlock, ::getSiteRank(sni), //info1->getNumTotalInlinks(), info1->getNumGoodInlinks(), info1->m_numUniqueIps, info1->m_numUniqueCBlocks, info1->m_totalInlinkingDocIds, tmp3 ); else { sb->safePrintf ( "\t<charset><![CDATA[%s]]></charset>\n" "\t<isAdult>%"INT32"</isAdult>\n" "\t<isLinkSpam>%"INT32"</isLinkSpam>\n" "\t<siteRank>%"INT32"</siteRank>\n" "\t<numGoodSiteInlinks>%"INT32"</numGoodSiteInlinks>\n" "\t<numTotalSiteInlinks>%"INT32"</numTotalSiteInlinks>\n" "\t<numUniqueIpsLinkingToSite>%"INT32"" "</numUniqueIpsLinkingToSite>\n" "\t<numUniqueCBlocksLinkingToSite>%"INT32"" "</numUniqueCBlocksLinkingToSite>\n" // how many inlinks, external and internal, we have // to this page not filtered in any way!!! 
"\t<numTotalPageInlinks>%"INT32"</numTotalPageInlinks>\n" // how many inlinking ips we got, including our own if // we link to ourself "\t<numUniqueIpsLinkingToPage>%"INT32"" "</numUniqueIpsLinkingToPage>\n" // how many inlinking cblocks we got, including our own // if we link to ourself "\t<numUniqueCBlocksLinkingToPage>%"INT32"" "</numUniqueCBlocksLinkingToPage>\n" "\t<numGoodPageInlinks>%"INT32"</numGoodPageInlinks>\n" "\t<pageInlinksLastComputed>%"INT32"" "</pageInlinksLastComputed>\n" ,get_charset_str(m_charset) ,(int32_t)m_isAdult ,(int32_t)m_isLinkSpam ,::getSiteRank(sni) ,sni ,m_siteNumInlinksTotal ,m_siteNumInlinksUniqueIp ,m_siteNumInlinksUniqueCBlock ,info1->m_totalInlinkingDocIds ,info1->m_numUniqueIps ,info1->m_numUniqueCBlocks ,info1->getNumGoodInlinks() //,tmp3 ,(int32_t)info1->m_lastUpdated ); //if ( m_note ) // sb->safePrintf("\t<isLinkSpamReason><![CDATA[%s]]>" // "</isLinkSpamReason>\n" // , m_note ); sb->safePrintf("\t<isPermalink>%"INT32"</isPermalink>\n" "\t<isRSSFeed>%"INT32"</isRSSFeed>\n" "\t<ipAddress><![CDATA[%s]]></ipAddress>\n" "\t<contentLenInBytes>%"INT32"" "</contentLenInBytes>\n" "\t<isContentTruncated>%"INT32"" "</isContentTruncated>\n" "\t<contentType><![CDATA[%s]]></contentType>\n" "\t<language><![CDATA[%s]]></language>\n" "\t<country><![CDATA[%s]]></country>\n", (int32_t)m_isPermalink, (int32_t)m_isRSS, ipString, size_utf8Content - 1, (int32_t)m_isContentTruncated, g_contentTypeStrings[(int)m_contentType] , strLanguage, g_countryCode.getName(m_countryId) ); } //sb->safePrintf("<tr><td>site</td><td>"); //sb->safeMemcpy(ptr_site,size_site-1); //sb->safePrintf("</td></tr>\n"); TagRec *ogr = NULL; if ( m_tagRecDataValid && m_version >= 118 ) { ogr = getTagRec(); // &m_tagRec; // sanity. should be set from titlerec, so no blocking! if ( ! ogr || ogr == (void *)-1 ) { char *xx=NULL;*xx=0; } } if ( ogr && ! isXml ) ogr->printToBufAsHtml ( sb , "tag" ); else if ( ogr ) ogr->printToBufAsXml ( sb ); // show the good inlinks we used when indexing this if ( ! isXml ) info1->print(sb,cr->m_coll); // close the table if ( ! isXml ) sb->safePrintf ( "</table></center><br>\n" ); else sb->safePrintf("</response>\n"); return true; } bool XmlDoc::printSiteInlinks ( SafeBuf *sb , HttpRequest *hr ) { // use msg25 to hit linkdb and give us a link info class i guess // but we need paging functionality so we can page through like // 100 links at a time. clustered by c-class ip. // do we need to mention how many from each ip c-class then? because // then we'd have to read the whole termlist, might be several // separate disk reads. // we need to re-get both if either is NULL LinkInfo *sinfo = getSiteLinkInfo(); // block or error? if ( ! sinfo ) return true; if ( sinfo == (LinkInfo *)-1) return false; int32_t isXml = hr->getLong("xml",0); if ( ! isXml ) printMenu ( sb ); if ( isXml ) sb->safePrintf ("<?xml version=\"1.0\" " "encoding=\"UTF-8\" ?>\n" "<response>\n" ); sb->safeMemcpy ( &m_siteLinkBuf ); if ( isXml ) sb->safePrintf ("</response>\n" ); // just print that //sinfo->print ( sb , cr->m_coll ); return true; } bool XmlDoc::printPageInlinks ( SafeBuf *sb , HttpRequest *hr ) { // we need to re-get both if either is NULL LinkInfo *info1 = getLinkInfo1(); // block or error? if ( ! info1 ) return true; if ( info1 == (LinkInfo *)-1) return false; int32_t isXml = hr->getLong("xml",0); if ( ! 
isXml ) printMenu ( sb ); if ( isXml ) sb->safePrintf ("<?xml version=\"1.0\" " "encoding=\"UTF-8\" ?>\n" "<response>\n" ); int32_t recompute = hr->getLong("recompute",0); CollectionRec *cr = getCollRec(); if ( ! cr ) return false; // i guess we need this if ( ! recompute ) // m_setFromTitleRec ) info1->print ( sb , cr->m_coll ); else sb->safeMemcpy ( &m_pageLinkBuf ); if ( isXml ) sb->safePrintf ("</response>\n" ); return true; } static void getInlineSectionVotingBufWrapper ( void *state ) { XmlDoc *xd = (XmlDoc *)state; SafeBuf *vb = xd->getInlineSectionVotingBuf(); // return if blocked if ( vb == (void *)-1 ) return; // error? if ( ! vb ) log("xmldoc: error getting inline section votes: %s", mstrerror(g_errno)); // all done then. call original entry callback log("xmldoc: returning control to original caller"); xd->m_callback1 ( xd->m_state ); } // . returns false if blocked, true otherwise // . returns true with g_errno set on error // . this actually returns the page content with inserted information // based on sectiondb data // . for example, <div id=poo> --> <div id=poo d=5 n=20> // means that the section is repeated on 20 pages from this site and 5 of // which have the same innerHtml as us SafeBuf *XmlDoc::getInlineSectionVotingBuf ( ) { CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // . if we block anywhere below we want to come back here until done // . this can be a main entry point, so set m_masterLoop if ( ! m_masterLoop ) { m_masterLoop = getInlineSectionVotingBufWrapper; m_masterState = this; log("xmldoc: getting section voting info from coll=%s", cr->m_coll); } if ( m_inlineSectionVotingBufValid ) return &m_inlineSectionVotingBuf; Sections *sections = getSectionsWithDupStats(); if ( ! sections || sections == (void *)-1 ) return (SafeBuf *)sections; Words *words = getWords(); if ( ! words || words == (void *)-1 ) return (SafeBuf *)words; HttpMime *mime = getMime(); if ( ! mime || mime == (void *)-1 ) return (SafeBuf *)mime; int32_t siteHash32 = *getSiteHash32(); //int32_t nw = words->getNumWords(); //int64_t *wids = words->getWordIds(); SafeBuf *sb = &m_inlineSectionVotingBuf; // store mime first then content if ( ! m_utf8ContentValid ) { char *xx=NULL;*xx=0; } // we no longer use this through a proxy, so take this out //sb->safeMemcpy ( m_httpReply , mime->getMimeLen() ); // but hack the Content-Length: field to something alien // because we markup the html and the lenght will be different... //sb->nullTerm(); // we no longer use this through a proxy so take this out //char *cl = strstr(sb->getBufStart(),"\nContent-Length:"); //if ( cl ) cl[1] = 'Z'; //sec_t mflags = SEC_SENTENCE | SEC_MENU; // just print out each word // map the word to a section. // if it s the first time we've printed the section then we // can inject the stuff // set a printed bit to indicate when we print out a section so // we do not re-print it... // these are 1-1 with words Section **sptrs = sections->m_sectionPtrs; int32_t nw = words->getNumWords(); char **wptrs = words->m_words; int32_t *wlens = words->m_wordLens; for ( int32_t i = 0 ; i < nw ; i++ ) { char *a = wptrs[i]; // skip if not a front tag if ( *a != '<' || a[1] == '/' ) { sb->safeMemcpy(a,wlens[i]); continue; } Section *sa = sptrs[i]; // straight copy if no stats if ( ! sa || ! sa->m_stats.m_totalEntries ) { sb->safeMemcpy ( a , wlens[i] ); continue; } // should be tag then char *e = a; for ( ; *e && *e != '>' && ! 
is_wspace_a(*e) ; e++); // copy that sb->safeMemcpy ( a , e-a); // the hash of the turktaghash and sitehash32 combined // so you can do gbfacetstr:gbxpathsitehash12345 // where the 12345 is this h32 value. uint32_t h32 = sa->m_turkTagHash32 ^ siteHash32; // insert our stuff into the tag //sb->safePrintf("<!--"); //sb->safePrintf("<font color=red>"); SectionStats *sx = &sa->m_stats; // # docs from our site had the same innerHTML? sb->safePrintf(" _s=M%"INT32"D%"INT32"n%"INT32"u%"INT32"h%"UINT32"", // total # of docs that had an xpath with // our same innerHtml (int32_t)sx->m_totalMatches, // # of of docids with this facet (int32_t)sx->m_totalDocIds, // . total # of times this xpath occurred // . can be multiple times per doc (int32_t)sx->m_totalEntries, // unique values in the xpath innerhtml (int32_t)sx->m_numUniqueVals, // xpathsitehash h32 ); // copy the rest of the tag sb->safeMemcpy( e, wlens[i]-(e-a) ); //sb->safePrintf("-->"); //sb->safePrintf("</font>"); // print it here } sb->nullTerm(); m_inlineSectionVotingBufValid = true; return &m_inlineSectionVotingBuf; } bool XmlDoc::printRainbowSections ( SafeBuf *sb , HttpRequest *hr ) { // what wordposition to scroll to and blink? int32_t hiPos = -1; if ( hr ) hiPos = hr->getLong("hipos",-1); // // PRINT SECTIONS // Sections *sections ; // hr is NULL if being called from page parser which does not have the // dup stats! and we core if we block here! if ( hr ) sections = getSectionsWithDupStats(); else sections = getSections(); if ( ! sections) return true;if (sections==(Sections *)-1)return false; //SectionVotingTable *nsvt = getNewSectionVotingTable(); //if ( ! nsvt || nsvt == (void *)-1 ) {char*xx=NULL;*xx=0;} //SectionVotingTable *osvt = getOldSectionVotingTable(); //if ( ! osvt || osvt == (void *)-1 ) {char*xx=NULL;*xx=0;} Words *words = getWords(); if ( ! words ) return true; if ( words == (Words *)-1 ) return false; Phrases *phrases = getPhrases(); if ( ! phrases ) return true; if (phrases == (void *)-1 ) return false; HashTableX *cnt = getCountTable(); if ( ! cnt ) return true; if ( cnt == (void *)-1 ) return false; int32_t nw = words->getNumWords(); //int32_t wordStart = 0; //int32_t wordEnd = nw; int64_t *wids = words->getWordIds(); int32_t isXml = false; if ( hr ) isXml = (bool)hr->getLong("xml",0); //if ( ! isXml ) printMenu ( sb ); // now complement, cuz bigger is better in the ranking world //int32_t densityRank = getDensityRank ( wids , 0 , nw , HASHGROUP_BODY ); SafeBuf densBuf; // returns false and sets g_errno on error if ( ! getDensityRanks((int64_t *)wids, nw, HASHGROUP_BODY,//hi->m_hashGroup, &densBuf, sections, m_niceness)) return true; // a handy ptr char *densityVec = (char *)densBuf.getBufStart(); /* if ( ! isXml ) sb->safePrintf("<br><b>density rank of body = %"INT32"</b> " "(out of %"INT32")" "<br>" "<br>" , densityRank , (int32_t)MAXDENSITYRANK ); */ char *wordSpamVec = getWordSpamVec(); char *fragVec = m_fragBuf.getBufStart(); SafeBuf dwbuf; if(!getDiversityVec(words,phrases,cnt,&dwbuf,m_niceness))return true; char *diversityVec = dwbuf.getBufStart(); // hack fack debug //m_bodyStartPos =2136; SafeBuf wpos; if ( ! getWordPosVec ( words , sections, //wordStart, //wordEnd, // we save this in the titlerec, when we // start hashing the body. we have the url // terms before the body, so this is necessary. m_bodyStartPos,//0, // hi->m_startDist, fragVec, m_niceness, &wpos) ) return true; // a handy ptr int32_t *wposVec = (int32_t *)wpos.getBufStart(); if ( ! 
isXml ) { // put url in for steve to parse out sb->safePrintf("%s\n", m_firstUrl.m_url); sb->safePrintf("<font color=black>w</font>" "/" "<font color=purple>x</font>" //"/" //"<font color=green>y</font>" "/" "<font color=red>z</font>" ": " "w=wordPosition " "x=densityRank " //"y=diversityRank " "z=wordSpamRank " "<br>" "<br>" "" ); } if ( ! isXml ) { // try the new print function sections->print2 ( sb , hiPos, wposVec, densityVec, diversityVec, wordSpamVec, fragVec, NULL, NULL , &m_addresses , true ); return true; } if ( isXml ) sb->safePrintf ("<?xml version=\"1.0\" " "encoding=\"UTF-8\" ?>\n" "<response>\n" ); Section *si = sections->m_rootSection; sec_t mflags = SEC_SENTENCE | SEC_MENU; for ( ; si ; si = si->m_next ) { // breathe QUICKPOLL(m_niceness); // print it out sb->safePrintf("\t<section>\n"); // get our offset in the array of sections int32_t num = si - sections->m_sections; sb->safePrintf("\t\t<id>%"INT32"</id>\n",num); Section *parent = si->m_parent; if ( parent ) { int32_t pnum = parent - sections->m_sections; sb->safePrintf("\t\t<parent>%"INT32"</parent>\n",pnum); } char *byte1 = words->m_words[si->m_a]; char *byte2 = words->m_words[si->m_b-1] + words->m_wordLens[si->m_b-1]; int32_t off1 = byte1 - words->m_words[0]; int32_t size = byte2 - byte1; sb->safePrintf("\t\t<byteOffset>%"INT32"</byteOffset>\n",off1); sb->safePrintf("\t\t<numBytes>%"INT32"</numBytes>\n",size); if ( si->m_flags & mflags ) { sb->safePrintf("\t\t<flags><![CDATA["); bool printed = false; if ( si->m_flags & SEC_SENTENCE ) { sb->safePrintf("sentence"); printed = true; } if ( si->m_flags & SEC_MENU ) { if ( printed ) sb->pushChar(' '); sb->safePrintf("ismenu"); printed = true; } sb->safePrintf("]]></flags>\n"); } int32_t bcolor = (int32_t)si->m_colorHash& 0x00ffffff; int32_t fcolor = 0x000000; //int32_t rcolor = 0x000000; uint8_t *bp = (uint8_t *)&bcolor; bool dark = false; if ( bp[0]<128 && bp[1]<128 && bp[2]<128 ) dark = true; // or if two are less than 50 if ( bp[0]<100 && bp[1]<100 ) dark = true; if ( bp[1]<100 && bp[2]<100 ) dark = true; if ( bp[0]<100 && bp[2]<100 ) dark = true; // if bg color is dark, make font color light if ( dark ) { fcolor = 0x00ffffff; //rcolor = 0x00ffffff; } sb->safePrintf("\t\t<bgColor>%06"XINT32"</bgColor>\n",bcolor); sb->safePrintf("\t\t<textColor>%06"XINT32"</textColor>\n",fcolor); // count stats uint64_t ch64 = (int32_t)si->m_sentenceContentHash64; if ( ! ch64 ) { sb->safePrintf("\t</section>\n"); continue; } /* take this out for now it is not quite right any more. we now use the xpath hash and site hash as the key and the "value" is the sentence/innerHtml hash sb->safePrintf("\t\t<numOnSitePagesThatDuplicateContent>%"INT32"" "</numOnSitePagesThatDuplicateContent>\n", (int32_t)si->m_stats.m_onSiteDocIds); sb->safePrintf("\t\t<numOffSitePagesThatDuplicateContent>%"INT32"" "</numOffSitePagesThatDuplicateContent>\n", (int32_t)si->m_stats.m_offSiteDocIds); sb->safePrintf("\t\t<numSitesThatDuplicateContent>%"INT32"" "</numSitesThatDuplicateContent>\n", (int32_t)si->m_stats.m_numUniqueSites); */ // you can do a sitehash:xxxxx this number to see who the // dups are! sb->safePrintf("\t\t<innerContentHash64>%"UINT64"" "</innerContentHash64>\n", si->m_sentenceContentHash64); sb->safePrintf("\t</section>\n"); } // now print out the entire page content so the offsets make sense! 
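// the byteOffset/numBytes printed for each <section> above index into
// this utf8Content CDATA, so a consumer can recover a section's raw
// bytes with something like (illustrative only, not real variables):
//   char    *sec    = utf8Content + byteOffset;
//   int32_t  secLen = numBytes;
// the <innerContentHash64> is the hash of the section's inner content,
// i.e. the "value" side of the xpath/site facet described in
// getInlineSectionVotingBuf() above, and can be used to find the other
// pages that duplicate that section's content.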
sb->safePrintf("\t<utf8Content><![CDATA["); if ( ptr_utf8Content ) sb->htmlEncode ( ptr_utf8Content ,size_utf8Content-1,false); sb->safePrintf("]]></utf8Content>\n"); // end xml response sb->safePrintf("</response>\n"); return true; } bool XmlDoc::printTermList ( SafeBuf *sb , HttpRequest *hr ) { // set debug buffer m_storeTermListInfo = true; // default to sorting by wordpos m_sortTermListBy = hr->getLong("sortby",1); // cores in getNewSpiderReply() if we do not have this and provide // the docid... m_useSpiderdb = false; char *metaList = getMetaList ( ); if ( ! metaList ) return true; if (metaList==(char *) -1) return false; CollectionRec *cr = getCollRec(); if ( ! cr ) return false; int32_t isXml = hr->getLong("xml",0); if ( isXml ) { sb->safePrintf ("<?xml version=\"1.0\" " "encoding=\"UTF-8\" ?>\n" "<response>\n" ); sb->safePrintf( "\t<maxDens>%"INT32"</maxDens>\n" //"\t<maxDiv>%"INT32"</maxDiv>\n" "\t<maxSpam>%"INT32"</maxSpam>\n" , (int32_t)MAXDENSITYRANK //, (int32_t)MAXDIVERSITYRANK , (int32_t)MAXWORDSPAMRANK ); } if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; } if ( ! isXml ) { //printMenu ( sb ); //sb->safePrintf("<i>* indicates word is a synonym or " // "alternative word form<br><br>"); sb->safePrintf("N column = DensityRank (0-%"INT32")<br>" //"V column = DiversityRank (0-%"INT32")<br>" "S column = WordSpamRank (0-%"INT32") " "[or linker " "siterank if its offsite link text]<br>" "Lang column = language used for purposes " "of detecting the document's primary language " "using a simple majority vote" "<br>" "</i>" "<br>" "Document Primary Language: <b>%s</b> (%s)" "<br>" "<br>" , (int32_t)MAXDENSITYRANK //, (int32_t)MAXDIVERSITYRANK , (int32_t)MAXWORDSPAMRANK , getLanguageString (m_langId) , getLangAbbr(m_langId) ); // encode it SafeBuf ue; ue.urlEncode ( ptr_firstUrl ); sb->safePrintf("Sort by: " ); if ( m_sortTermListBy == 0 ) sb->safePrintf("<b>Term</b>"); else sb->safePrintf("<a href=/print?c=%s&page=5&u=%s&" "sortby=0>" "Term</a>" , cr->m_coll , ue.getBufStart() ); sb->safePrintf(" | "); if ( m_sortTermListBy == 1 ) sb->safePrintf("<b>WordPos</b>"); else sb->safePrintf("<a href=/print?c=%s&page=5&u=%s&" "sortby=1>" "WordPos</a>" , cr->m_coll , ue.getBufStart() ); sb->safePrintf("<br>" "<br>" ); } // // BEGIN PRINT HASHES TERMS (JUST POSDB) // // int16_tcut HashTableX *wt = m_wts; // use the keys to hold our list of ptrs to TermDebugInfos for sorting! 
TermDebugInfo **tp = NULL; // add them with this counter int32_t nt = 0; int32_t nwt = 0; if ( wt ) { nwt = wt->m_numSlots; tp = (TermDebugInfo **)wt->m_keys; } // now print the table we stored all we hashed into for ( int32_t i = 0 ; i < nwt ; i++ ) { // skip if empty if ( wt->m_flags[i] == 0 ) continue; // breathe //QUICKPOLL(m_niceness); // get its key, date=32bits termid=64bits //key96_t *k = (key96_t *)wt->getKey ( i ); // get the TermDebugInfo TermDebugInfo *ti = (TermDebugInfo *)wt->getValueFromSlot ( i ); // point to it for sorting tp[nt++] = ti; } // set this for cmptp s_wbuf = &m_wbuf; if ( m_sortTermListBy == 0 ) // sort them alphabetically gbsort ( tp , nt , sizeof(TermDebugInfo *), cmptp , m_niceness ); else // sort by word pos gbsort ( tp , nt , sizeof(TermDebugInfo *), cmptp2 , m_niceness ); // print the weight tables //printLocationWeightsTable(sb,isXml); //printDiversityWeightsTable(sb,isXml); //printDensityWeightsTable(sb,isXml); //printWordSpamWeightsTable(sb,isXml); // print them out in a table char hdr[1000]; sprintf(hdr, "<table border=1 cellpadding=0>" "<tr>" // this messes up Test.cpp diff'ing //"<td><b>#</b></td>" "<td><b>Prefix</b></td>" "<td><b>WordPos</b></td>" "<td><b>Lang</b></td>" "<td><b>Term</b></td>" //"%s" //"<td><b>Weight</b></td>" //"<td><b>Spam</b></td>" "<td><b>Desc</b></td>" "<td><b>N</b></td>" //"<td><b>V</b></td>" // diversityRank "<td><b>S</b></td>" "<td><b>Score</b></td>" //"<td><b>Date</b></td>" //"<td><b>Desc</b></td>" //"<td><b>TermId</b></td>" "</tr>\n" //,fbuf ); if ( ! isXml ) sb->safePrintf("%s",hdr); char *start = m_wbuf.getBufStart(); int32_t rcount = 0; for ( int32_t i = 0 ; i < nt ; i++ ) { // see if one big table causes a browser slowdown if ( (++rcount % TABLE_ROWS) == 0 && ! isXml ) sb->safePrintf("<!--ignore--></table>%s",hdr); char *prefix = NULL;//" "; if ( tp[i]->m_prefixOff >= 0 ) prefix = start + tp[i]->m_prefixOff; if ( isXml ) sb->safePrintf("\t<term>\n"); if ( isXml && prefix ) sb->safePrintf("\t\t<prefix><![CDATA[%s]]>" "</prefix>\n",prefix); if ( ! isXml ) { sb->safePrintf ( "<tr>"); if ( prefix ) sb->safePrintf("<td>%s:</td>",prefix); else sb->safePrintf("<td> </td>"); } if ( ! isXml ) sb->safePrintf("<td>%"INT32"" "/%"INT32"" "</td>" , tp[i]->m_wordPos ,tp[i]->m_wordNum ); //char *abbr = getLangAbbr(tp[i]->m_langId); //if ( tp[i]->m_langId == langTranslingual ) abbr ="??"; //if ( tp[i]->m_langId == langUnknown ) abbr ="--"; //if ( tp[i]->m_synSrc ) abbr = ""; // print out all langs word is in if it's not clear // what language it is. we use a sliding window to // resolve some ambiguity, but not all, so print out // the possible langs here if ( ! isXml ) { sb->safePrintf("<td>"); printLangBits ( sb , tp[i] ); sb->safePrintf("</td>"); } //if ( ! isXml && abbr[0] ) // sb->safePrintf("<td>%s</td>", abbr ); //else if ( ! isXml ) // sb->safePrintf("<td> </td>" ); //else if ( abbr[0] ) // sb->safePrintf("\t\t<lang><![CDATA[" // "]]>%s</lang>\n", abbr ); if ( isXml ) sb->safePrintf("\t\t<s><![CDATA["); if ( ! 
isXml ) sb->safePrintf ("<td><nobr>" ); //if ( tp[i]->m_synSrc ) // sb->pushChar('*'); sb->safeMemcpy_nospaces ( start + tp[i]->m_termOff , tp[i]->m_termLen ); /* char *dateStr = " "; int32_t ddd = tp[i]->m_date; uint8_t *tddd = (uint8_t *)&ddd; char tbbb[32]; if ( ddd && tddd[2] == 0 && tddd[3] == 0 && tddd[0] && tddd[1] && tddd[1] <= tddd[0] ) { sprintf(tbbb,"evIds %"INT32"-%"INT32"", (int32_t)tddd[1],(int32_t)tddd[0]); dateStr = tbbb; } else if ( ddd ) dateStr = asctime ( gmtime(&ddd )); char tmp[20]; if ( tp[i]->m_noSplit ) sprintf ( tmp,"<b>1</b>" ); else sprintf ( tmp,"0" ); */ if ( isXml ) sb->safePrintf("]]></s>\n"); else sb->safePrintf ( "</nobr></td>" ); if ( isXml ) sb->safePrintf("\t\t<wordPos>%"INT32"</wordPos>\n", tp[i]->m_wordPos); char *desc = NULL; if ( tp[i]->m_descOff >= 0 ) desc = start + tp[i]->m_descOff; // use hashgroup int32_t hg = tp[i]->m_hashGroup; if ( ! desc || ! strcmp(desc,"body") ) desc = getHashGroupString(hg); if ( isXml && desc ) sb->safePrintf("\t\t<loc>%s</loc>\n", desc); else if ( ! isXml ) { if ( ! desc ) desc = " "; sb->safePrintf ( "<td>%s", desc ); char ss = tp[i]->m_synSrc; if ( ss ) sb->safePrintf(" - %s", getSourceString(ss)); sb->safePrintf("</td>"); } int32_t dn = (int32_t)tp[i]->m_densityRank; if ( isXml ) sb->safePrintf("\t\t<dens>%"INT32"</dens>\n",dn); if ( ! isXml && dn >= MAXDENSITYRANK ) sb->safePrintf("<td>%"INT32"</td>\n",dn); else if ( ! isXml ) sb->safePrintf("<td><font color=purple>%"INT32"</font>" "</td>",dn); // the diversityrank/wordspamrank /* int32_t ds = (int32_t)tp[i]->m_diversityRank; if ( isXml ) sb->safePrintf("\t\t<div>%"INT32"</div>\n",ds); if ( ! isXml && ds >= MAXDIVERSITYRANK ) sb->safePrintf("<td>%"INT32"</td>\n",ds); else if ( ! isXml ) sb->safePrintf("<td><font color=green>%"INT32"</font>" "</td>",ds); */ int32_t ws = (int32_t)tp[i]->m_wordSpamRank; if ( isXml && hg == HASHGROUP_INLINKTEXT ) sb->safePrintf("\t\t<linkerSiteRank>%"INT32"" "</linkerSiteRank>\n",ws); else if ( isXml ) sb->safePrintf("\t\t<spam>%"INT32"</spam>\n",ws); if ( ! isXml && ws >= MAXWORDSPAMRANK ) sb->safePrintf("<td>%"INT32"</td>",ws); else if ( ! isXml ) sb->safePrintf("<td><font color=red>%"INT32"</font></td>", ws); float score = 1.0; // square this like we do in the query ranking algo score *= getHashGroupWeight(hg) * getHashGroupWeight(hg); //score *= getDiversityWeight(tp[i]->m_diversityRank); score *= getDensityWeight(tp[i]->m_densityRank); if ( tp[i]->m_synSrc ) score *= SYNONYM_WEIGHT; if ( hg == HASHGROUP_INLINKTEXT ) score *= getLinkerWeight(ws); else score *= getWordSpamWeight(ws); if ( isXml ) sb->safePrintf("\t\t<score>%.02f</score>\n",score); else sb->safePrintf("<td>%.02f</td>\n",score); if ( isXml ) sb->safePrintf("\t</term>\n"); else sb->safePrintf("</tr>\n"); } if ( isXml ) sb->safePrintf ("</response>\n" ); else sb->safePrintf("</table><br>\n"); // // END PRINT HASHES TERMS // return true; } bool XmlDoc::printSpiderStats ( SafeBuf *sb , HttpRequest *hr ) { int32_t isXml = hr->getLong("xml",0); if ( ! isXml ) printMenu ( sb ); sb->safePrintf("<b>Coming Soon</b>"); return true; } bool XmlDoc::printCachedPage ( SafeBuf *sb , HttpRequest *hr ) { char **c = getUtf8Content(); if ( ! c ) return true; if ( c==(void *)-1) return false; int32_t isXml = hr->getLong("xml",0); int32_t raw = hr->getLong("raw",0); if ( ! isXml && ! raw ) printMenu ( sb ); if ( ! 
isXml ) { // just copy it otherwise if ( ptr_utf8Content ) sb->safeMemcpy ( ptr_utf8Content ,size_utf8Content -1); return true; } sb->safePrintf ("<?xml version=\"1.0\" " "encoding=\"UTF-8\" ?>\n" "<response>\n" ); sb->safePrintf("\t<utf8Content><![CDATA["); if ( ptr_utf8Content ) sb->htmlEncode ( ptr_utf8Content ,size_utf8Content-1, false); sb->safePrintf("]]></utf8Content>\n"); // end xml response sb->safePrintf("</response>\n"); return true; } // . get the possible titles of the root page // . includes the title tag text // . includes various inlink text // . used to match the VERIFIED place name 1 or 2 of addresses on this // site in order to set Address::m_flags's AF_VENUE_DEFAULT bit which // indicates the address is the address of the website (a venue website) char **XmlDoc::getRootTitleBuf ( ) { // return if valid if ( m_rootTitleBufValid ) return (char **)&m_rootTitleBuf; // get it from the tag rec first setStatus ( "getting root title buf"); // sanity check, root must have been indexed //if ( ! m_sreq.m_rootIndexed ) { char *xx=NULL;*xx=0; } // . update it first before reading it! // . do not update it here, just update it in getTitleRec() because // this makes doConsistencyCheck() block and core //bool *status2 = updateSiteTitleBuf(); //if ( ! status2 || status2 == (void *)-1 ) return (char **)status2; // get it from the tag rec if we can TagRec *gr = getTagRec (); if ( ! gr || gr == (void *)-1 ) return (char **)gr; // clear this if not set from title rec //if ( ! m_setFromTitleRec ) { // ptr_siteTitleBuf = NULL; // size_siteTitleBuf = 0; //} // PROBLEM: new title rec is the only thing which has sitetitles tag // sometimes and we do not store that in the title rec. in this case // we should maybe store ptr_siteTitleBuf/size_siteTitleBuf in the // title rec? Tag *tag = gr->getTag("roottitles"); char *src = NULL; int32_t srcSize = 0; if ( ptr_rootTitleBuf || m_setFromTitleRec ) { src = ptr_rootTitleBuf; srcSize = size_rootTitleBuf; } else if ( tag ) { src = tag->getTagData(); srcSize = tag->getTagDataSize(); // no need to add to title rec since already in the tag so // make sure we did not double add if ( ptr_rootTitleBuf ) { char *xx=NULL;*xx=0; } } else { // . get the root doc // . allow for a one hour cache of the titleRec XmlDoc **prd = getRootXmlDoc( 3600 ); if ( ! prd || prd == (void *)-1 ) return (char **)prd; // int16_tcut XmlDoc *rd = *prd; // . if no root doc, then assume no root title // . this happens if we are injecting because we do not want // to download the root page for speed purposes if ( ! rd ) { m_rootTitleBuf[0] = '\0'; m_rootTitleBufSize = 0; m_rootTitleBufValid = true; return (char **)&m_rootTitleBuf; } // . ONLY do this if root doc was NOT set from titleRec to // avoid that core in updateSiteTitleBuf(). this can happen // if the root doc had no title! (or no content) //if ( rd->m_setFromTitleRec ) { // // emptyt // m_siteTitleBuf[0] = '\0'; // // set the size of it // m_siteTitleBufSize = 0; // // validate it // m_siteTitleBufValid = true; // // return a ptr to it // return (char **)&m_siteTitleBuf; //} // a \0 separated list char **rtl = rd->getTitleBuf(); if ( ! rtl || rtl == (void *)-1 ) return (char **)rtl; // ptr src = rd->m_titleBuf; srcSize = rd->m_titleBufSize; } int32_t max = (int32_t)ROOT_TITLE_BUF_MAX - 5; // sanity if ( srcSize >= max ) { // truncate srcSize = max; // back up so we split on a space for ( ; srcSize>0 && ! 
is_wspace_a(src[srcSize]); srcSize--); // null term src[srcSize] = '\0'; // include it srcSize++; } // copy that over in case root is destroyed gbmemcpy ( m_rootTitleBuf , src , srcSize ); m_rootTitleBufSize = srcSize; // sanity check, must include the null ni the size if ( m_rootTitleBufSize > 0 && m_rootTitleBuf [ m_rootTitleBufSize - 1 ] ) { char *xx=NULL;*xx=0; //m_rootTitleBuf [ m_rootTitleBufSize - 1 ] = '\0'; //m_rootTitleBufSize++; } // sanity check - breach check if ( m_rootTitleBufSize > ROOT_TITLE_BUF_MAX ) { char *xx=NULL;*xx=0;} // serialize into our titlerec ptr_rootTitleBuf = m_rootTitleBuf; size_rootTitleBuf = m_rootTitleBufSize; m_rootTitleBufValid = true; return (char **)&m_rootTitleBuf; } char **XmlDoc::getFilteredRootTitleBuf ( ) { if ( m_filteredRootTitleBufValid ) return (char **)&m_filteredRootTitleBuf; // get unfiltered. m_rootTitleBuf should be set from this call. char **rtbp = getRootTitleBuf(); if ( ! rtbp || rtbp == (void *)-1 ) return (char **)rtbp; /* // assume none m_filteredRootTitleBuf[0] = '\0'; m_filteredRootTitleBufSize = 0; m_filteredRootTitleBufValid = true; return (char **)&m_filteredRootTitleBuf; */ // filter all the punct to \0 so that something like // "walmart.com : live better" is reduced to 3 potential // names, "walmart", "com" and "live better" char *src = m_rootTitleBuf; char *srcEnd = src + m_rootTitleBufSize; char *dst = m_filteredRootTitleBuf; // save some room to add a \0, so subtract 5 char *dstEnd = dst + ROOT_TITLE_BUF_MAX - 5; //char *src = tag->getTagData(); //char *srcEnd = src + tag->getTagDataSize(); int32_t size = 0; bool lastWasPunct = true; for ( ; src < srcEnd && dst < dstEnd ; src += size ) { // set the char size size = getUtf8CharSize(src); // space? if ( is_wspace_a (*src) || // allow periods too *src=='.' ) { // no back to back punct if ( lastWasPunct ) continue; // flag it lastWasPunct = true; // add it in *dst++ = '.'; // that's it continue; } // x'y or x-y if ( ( *src == '\'' || *src == '.' || *src == '-' ) && ! 
lastWasPunct && is_alnum_a(src[1]) ) { // add it in *dst++ = *src; // that's it continue; } // x & y is ok if ( *src == '&' ) { // assume not punct (stands for and) lastWasPunct = false; // add it in *dst++ = *src; // that's it continue; } // store alnums right in if ( is_alnum_a(*src) ) { // flag it lastWasPunct = false; // copy it over gbmemcpy ( dst , src , size ); // skip what we copied dst += size; continue; } // if punct and haven't stored anything, just skip it if ( lastWasPunct ) dst[-1] = '\0'; // store it else *dst++ = '\0'; } // make sure we end on a \0 if ( dst > m_filteredRootTitleBuf && dst[-1] != '\0' ) *dst++ = '\0'; // int16_tcut char *str = m_filteredRootTitleBuf; int32_t strSize = dst - m_filteredRootTitleBuf; // copy that over in case root is destroyed gbmemcpy ( m_filteredRootTitleBuf , str , strSize ); m_filteredRootTitleBufSize = strSize; // sanity check, must include the null ni the size if ( m_filteredRootTitleBufSize > 0 && m_filteredRootTitleBuf [ m_filteredRootTitleBufSize - 1 ] ) { char *xx=NULL;*xx=0; //m_filteredRootTitleBuf [ m_filteredRootTitleBufSize-1]='\0'; //m_filteredRootTitleBufSize++; } // sanity check - breach check if ( m_filteredRootTitleBufSize > ROOT_TITLE_BUF_MAX ) { char *xx=NULL;*xx=0;} m_filteredRootTitleBufValid = true; // make this static to avoid compiler warning static char *fp = m_filteredRootTitleBuf; return (char **)&fp; //return (char **)&m_filteredRootTitleBuf; } //static bool s_dummyBool = 1; class Binky { public: char *m_text; int32_t m_textLen; int32_t m_score; int64_t m_hash; }; int cmpbk ( const void *v1, const void *v2 ) { Binky *b1 = (Binky *)v1; Binky *b2 = (Binky *)v2; return b1->m_score - b2->m_score; } char **XmlDoc::getTitleBuf ( ) { if ( m_titleBufValid ) return (char **)&m_titleBuf; // recalc this everytime the root page is indexed setStatus ( "getting title buf on root"); // are we a root? char *isRoot = getIsSiteRoot(); if ( ! isRoot || isRoot == (char *)-1 ) return (char **)isRoot; // this should only be called on the root! // . if the site changed for us, but the title rec of what we // think is now the root thinks that it is not the root because // it is using the old site, then it cores here! // . i.e. if the new root is www.xyz.com/user/ted/ and the old root // is www.xyz.com then and the old root is stored in ptr_site for // the title rec for www.xyz.com/user/ted/ then we core here, // . so take this sanity check out // . but if the title rec does not think he is the site root yet // then just wait until he does so we can get his // ptr_rootTitleBuf below if ( ! *isRoot ) { m_titleBuf[0] = '\0'; m_titleBufSize = 0; m_titleBufValid = true; return (char **)&m_titleBuf; } // sanity check if ( m_setFromTitleRec ) { gbmemcpy(m_titleBuf, ptr_rootTitleBuf, size_rootTitleBuf ); m_titleBufSize = size_rootTitleBuf; m_titleBufValid = true; return (char **)&m_titleBuf; } char *mysite = getSite(); if ( ! mysite || mysite == (char *)-1 ) return (char **)mysite; // get link info first LinkInfo *info1 = getLinkInfo1(); // error or blocked if ( ! info1 || info1 == (LinkInfo *)-1 ) return (char **)info1; // sanity check Xml *xml = getXml(); // return -1 if it blocked if ( xml == (void *)-1 ) return (char **)-1; // set up for title int32_t tlen ; char *title ; // on error, ignore it to avoid hammering the root! 
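// if the root fetch failed we log it, clear g_errno and fall through
// with an empty title instead of failing the whole doc. when a <title>
// is found it is capped at 100 bytes, backing up with getPrevUtf8Char()
// so a multi-byte utf8 character is never split, roughly:
//   if ( tlen > 100 ) {
//           char *prev = getPrevUtf8Char ( title + 100 , title );
//           tlen = prev - title; // end on a character boundary
//   }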
if ( xml == (void *)NULL ) { // log it log("build: error downloading root xml: %s", mstrerror(g_errno)); // clear it g_errno = 0; // make it 0 tlen = 0; title = NULL; } else { // get the title title = m_xml.getTextForXmlTag ( 0, 999999 , "title" , &tlen , true ); // skip leading spaces } // truncate to 100 chars //for ( ; tlen>0 && (tlen > 100 || is_alnum_a(title[tlen])) ; tlen-- ) // if ( tlen == 0 ) break; if ( tlen > 100 ) { char *tpend = title + 100; char *prev = getPrevUtf8Char ( tpend , title ); // make that the end so we don't split a utf8 char tlen = prev - title; } // store tag in here char tmp[1024]; // point to it char *ptmp = tmp; // set this char *pend = tmp + 1024; // add that in gbmemcpy ( ptmp, title, tlen); ptmp += tlen; // null terminate it *ptmp++ = '\0'; // two votes per internal inlink int32_t internalCount = 0; // count inlinkers int32_t linkNum = 0; Binky bk[1000]; // init this //char stbuf[2000]; //HashTableX scoreTable; //scoreTable.set(8,4,64,stbuf,2000,false,m_niceness,"xmlscores"); // scan each link in the link info for ( Inlink *k = NULL; (k = info1->getNextInlink(k)) ; ) { // do not breach if ( linkNum >= 1000 ) break; // is this inlinker internal? bool internal=((m_ip&0x0000ffff)==(k->m_ip&0x0000ffff)); // get length of link text int32_t tlen = k->size_linkText; if ( tlen > 0 ) tlen--; // get the text char *txt = k->getLinkText(); // skip corrupted if ( ! verifyUtf8 ( txt , tlen ) ) { log("xmldoc: bad link text 4 from url=%s for %s", k->getUrl(),m_firstUrl.m_url); continue; } // store these // zero out hash bk[linkNum].m_hash = 0; bk[linkNum].m_text = txt; bk[linkNum].m_textLen = tlen; bk[linkNum].m_score = 0; // internal count if ( internal && ++internalCount >= 3 ) continue; // it's good bk[linkNum].m_score = 1; linkNum++; /* // set into words Words w; // return NULL on error with g_errno set if ( ! w.setx ( txt , tlen , m_niceness ) ) return NULL; // int16_tcut int64_t *wids = w.getWordIds(); // init hash int64_t h = 0LL; // hash all words together for ( int32_t i = 0 ; i < w.m_numWords ; i++ ) { // skip if not hashable if ( ! wids[i] ) continue; // mix it up h <<= 1LL; // xor it in h ^= wids[i]; } // update hash bk[linkNum].m_hash = h; // store in table, return NULL with g_errno set on error if ( ! scoreTable.addTerm ( &h ) ) return NULL; */ } // init this char dtbuf[1000]; HashTableX dupTable; dupTable.set(8,0,64,dtbuf,1000,false,m_niceness,"xmldup"); // now set the scores and isdup for ( int32_t i = 0 ; i < linkNum ; i++ ) { // skip if ignored if ( bk[i].m_score == 0 ) continue; // get hash int64_t h = bk[i].m_hash; // assume a dup bk[i].m_score = 0; // skip if zero'ed out if ( ! h ) continue; // only do each hash once! if ( dupTable.isInTable(&h) ) continue; // add to it. return NULL with g_errno set on error if ( ! dupTable.addKey(&h) ) return NULL; // is it in there? bk[i].m_score = 1; // scoreTable.getScore ( &h ); } // now sort the bk array by m_score //gbsort ( bk , linkNum , sizeof(Binky), cmpbk , m_niceness ); // sanity check - make sure sorted right //if ( linkNum >= 2 && bk[0].m_score < bk[1].m_score ) { // char *xx=NULL; *xx=0; } // . now add the winners to the buffer // . 
skip if score is 0 for ( int32_t i = 0 ; i < linkNum ; i++ ) { // skip if score is zero if ( bk[i].m_score == 0 ) continue; // skip if too big if ( bk[i].m_textLen + 1 > pend - ptmp ) continue; // store it gbmemcpy ( ptmp , bk[i].m_text , bk[i].m_textLen ); // advance ptmp += bk[i].m_textLen; // null terminate it *ptmp++ = '\0'; } // sanity int32_t size = ptmp - tmp; if ( size > ROOT_TITLE_BUF_MAX ) { char *xx=NULL;*xx=0; } gbmemcpy ( m_titleBuf , tmp , ptmp - tmp ); m_titleBufSize = size; m_titleBufValid = true; // ensure null terminated if ( size > 0 && m_titleBuf[size-1] ) { char *xx=NULL;*xx=0; } //ptr_siteTitleBuf = m_siteTitleBuf; //size_siteTitleBuf = m_siteTitleBufSize; return (char **)&m_titleBuf; } // . now we just get all the tagdb rdb recs to add using this function // . then we just use the metalist to update tagdb SafeBuf *XmlDoc::getNewTagBuf ( ) { if ( m_newTagBufValid ) return &m_newTagBuf; setStatus ( "getting new tags"); int32_t *ic = getIndexCode(); if ( ic == (void *)-1 ) { char *xx=NULL;*xx=0; } // get our ip int32_t *ip = getIp(); // this must not block to avoid re-computing "addme" above if ( ip == (void *)-1 ) { char *xx=NULL;*xx=0; } if ( ! ip || ip == (int32_t *)-1) return (SafeBuf *)ip; // . do not both if there is a problem // . otherwise if our ip is invalid (0 or 1) we core in // getNumSiteInlinks() which requires a valid ip // . if its robots.txt disallowed, then indexCode will be set, but we // still want to cache our sitenuminlinks in tagdb! delicious.com was // recomputing the sitelinkinfo each time because we were not storing // these tags in tagdb!! if ( ! *ip || *ip == -1 ) { // *ic ) { m_newTagBuf.reset(); m_newTagBufValid = true; return &m_newTagBuf; } // get the tags already in tagdb TagRec *gr = getTagRec ( ); if ( ! gr || gr == (void *)-1 ) return (SafeBuf *)gr; // get our site char *mysite = getSite(); // this must not block to avoid re-computing "addme" above if ( mysite == (void *)-1 ) { char *xx=NULL;*xx=0; } if ( ! mysite || mysite == (char *)-1 ) return (SafeBuf *)mysite; // age of tag in seconds int32_t timestamp; // always just use the primary tagdb so we can cache our sitenuminlinks char rdbId = RDB_TAGDB; //if ( m_useSecondaryRdbs ) rdbId = RDB2_TAGDB2; //else rdbId = RDB_TAGDB; // sitenuminlinks special for repair if ( m_useSecondaryRdbs && // and not rebuilding titledb ! m_useTitledb ) { m_newTagBuf.reset(); m_newTagBufValid = true; int32_t old1 = gr->getLong("sitenuminlinks",-1,NULL,×tamp); if ( old1 == m_siteNumInlinks && old1 != -1 && ! m_updatingSiteLinkInfoTags ) return &m_newTagBuf; int32_t now = getTimeGlobal(); if ( g_conf.m_logDebugLinkInfo ) log("xmldoc: adding tag site=%s sitenuminlinks=%"INT32"", mysite,m_siteNumInlinks); if ( ! m_newTagBuf.addTag2(mysite,"sitenuminlinks",now, "xmldoc", *ip,m_siteNumInlinks,rdbId) ) return NULL; return &m_newTagBuf; } // if doing consistency check, this buf is for adding to tagdb // so just ignore those. we use ptr_tagRecData in getTagRec() function // but this is really for updating tagdb. if ( m_doingConsistencyCheck ) { m_newTagBuf.reset(); m_newTagBufValid = true; return &m_newTagBuf; } Xml *xml = getXml(); if ( ! xml || xml == (Xml *)-1 ) return (SafeBuf *)xml; Words *ww = getWords(); if ( ! ww || ww == (Words *)-1 ) return (SafeBuf *)ww; char *isIndexed = getIsIndexed(); if ( !isIndexed || isIndexed==(char *)-1 ) return (SafeBuf *)isIndexed; char *isRoot = getIsSiteRoot(); if ( ! 
isRoot || isRoot == (char *)-1 ) return (SafeBuf *)isRoot; int32_t *siteNumInlinks = getSiteNumInlinks(); if ( ! siteNumInlinks ) return NULL; if ( siteNumInlinks == (int32_t *)-1) return (SafeBuf *)-1; // ok, get the sites of the external outlinks and they must // also be NEW outlinks, added to the page since the last time // we spidered it... Links *links = getLinks (); if ( ! links || links == (Links *)-1 ) return (SafeBuf *)links; // our next slated spider priority char *spiderLinks = getSpiderLinks(); if ( ! spiderLinks || spiderLinks == (char *)-1 ) return (SafeBuf *)spiderLinks; // . get ips of all outlinks. // . use m_msgeForIps class just for that // . it sucks if the outlink's ip is a dns timeout, then we never // end up being able to store it in tagdb, that is why when // rebuilding we need to skip adding firstip tags for the outlinks int32_t **ipv = NULL; TagRec ***grv = NULL; bool addLinkTags = true; if ( ! *spiderLinks ) addLinkTags = false; if ( ! m_useSpiderdb ) addLinkTags = false; if ( addLinkTags ) { ipv = getOutlinkFirstIpVector (); if ( ! ipv || ipv == (void *)-1 ) return (SafeBuf *)ipv; // . uses m_msgeForTagRecs for this one grv = getOutlinkTagRecVector(); if ( ! grv || grv == (void *)-1 ) return (SafeBuf *)grv; } // get root langid of root page uint8_t *rl = getRootLangId(); if ( ! rl || rl == (void *)-1 ) return (SafeBuf *)rl; char *hci = getHasContactInfo(); if ( ! hci || hci == (char *)-1 ) return (SafeBuf *)hci; // get the address class Addresses *aa = getAddresses (); if ( ! aa || aa == (Addresses *)-1 ) return (SafeBuf *)aa; // get comma separated list of email address on page char *emails = getEmailBuf ( ); if ( ! emails || emails == (void *)-1 ) return (SafeBuf *)emails; #ifdef _USETURKS_ //HashTableX *tvt = getTurkVotingTable (); //if ( ! tvt || tvt == (void *)-1 ) return (SafeBuf *)tvt; #endif // // init stuff // // . this gets the root doc and and parses titles out of it // . sets our m_rootTitleBuf/m_rootTitleBufSize char **rtbufp = getRootTitleBuf(); if ( ! rtbufp || rtbufp == (void *)-1) return (SafeBuf *)rtbufp; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // overwrite "getting root title buf" status setStatus ("computing new tags"); if ( g_conf.m_logDebugLinkInfo ) log("xmldoc: adding tags for mysite=%s",mysite); // int16_tcut //TagRec *tr = &m_newTagRec; // current time int32_t now = getTimeGlobal(); // actually, use spider download time if we can. that way // Test.cpp's injection runs will be more consistent! if ( ! strcmp(cr->m_coll,"qatest123") ) { //if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; } now = getSpideredTime();//m_spideredTime; } // store tags into here SafeBuf *tbuf = &m_newTagBuf; // allocate space to hold the tags we will add Tag *tag; int32_t need = 512; // add in root title buf in case we add it too need += m_rootTitleBufSize; // reserve it all now if ( ! tbuf->reserve(need) ) return NULL; // // add root langid if we need to // char *oldrl = gr->getString("rootlang",NULL,×tamp); // assume no valid id int32_t oldrlid = -99; // convert to id if ( oldrl ) oldrlid = getLangIdFromAbbr ( oldrl ); // if not in old tag, or changed from what was in tag, or it has // been 10 days or more, then update tagdb with this tag. bool addRootLang = false; if ( ! oldrl ) addRootLang = true; if ( oldrlid != *rl ) addRootLang = true; if ( now-timestamp > 10*86400 ) addRootLang = true; // injects do not download the root doc for speed reasons, so do not // bother for them unless the doc itself is the root. 
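// so the "rootlang" tag is refreshed when tagdb has no such tag, when
// the detected root language id differs from the stored one, or when
// the stored tag is more than 10 days old, i.e. roughly:
//   bool refresh = ! oldrl || oldrlid != *rl || now - timestamp > 10*86400;
// ("refresh" is just an illustrative local name for the addRootLang
// decision above.) the injection check below is the one case that can
// veto the update.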
if ( m_wasContentInjected && !*isRoot ) addRootLang = false; // . get the two letter (usually) language code from the id // . i think the two chinese languages are 5 letters char *newrl = NULL; if ( addRootLang ) // i've seen this return NULL because *rl is a corrupt 215 // for some reason newrl = getLanguageAbbr( *rl ); if ( newrl ) tbuf->addTag3(mysite,"rootlang",now,"xmldoc",*ip,newrl,rdbId); // // add hascontactinfo if we need to // int32_t oldhci = gr->getLong("hascontactinfo",-1,NULL,×tamp); if ( oldhci == -1 || oldhci != *hci || now-timestamp > 10 *86400 ) { char *val = "0"; if ( m_hasContactInfo ) val = "1"; tbuf->addTag3 (mysite,"hascontactinfo",now,"xmldoc",*ip,val, rdbId); } // // add "site" tag // char *oldsite = gr->getString("site",NULL); if ( ! oldsite || strcmp(oldsite,mysite) || now-timestamp > 10*86400) tbuf->addTag3(mysite,"site",now,"xmldoc",*ip,mysite,rdbId); // // add firstip if not there at all // char *oldfip = gr->getString("firstip",NULL); // convert it int32_t ip3 = 0; if ( oldfip ) ip3 = atoip(oldfip); // if not there or if bogus, add it!! should override bogus firstips if ( ! ip3 || ip3 == -1 ) { char *ipstr = iptoa(m_ip); //if ( m_ip == 0 || m_ip == -1 ) { char *xx=NULL;*xx=0; } //int32_t iplen = gbstrlen(ipstr); //if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; } tbuf->addTag3(mysite,"firstip",now,"xmldoc",*ip,ipstr, rdbId); } //if ( strncmp(m_firstUrl.m_url,"http://delicious.com/",21)==0 ) // log("boo"); // sitenuminlinks int32_t old1 = gr->getLong("sitenuminlinks",-1,NULL,×tamp); if ( old1 == -1 || old1 != m_siteNumInlinks || m_updatingSiteLinkInfoTags ) { if ( g_conf.m_logDebugLinkInfo ) log("xmldoc: adding tag site=%s sitenuminlinks=%"INT32"", mysite,m_siteNumInlinks); if ( ! tbuf->addTag2(mysite,"sitenuminlinks",now,"xmldoc", *ip,m_siteNumInlinks,rdbId) ) return NULL; } int32_t old2, old3, old4; // if running for diffbot crawlbot then isCustomCrawl is true // so do not update the siteinlink info already in tagdb since i // imported it from my main collection. we do not want to overwrite it. // NO, because for single site crawls we bottlenech on msg25 // when there are millions of urls. we only skip this // for the global-index and if already in tagdb! // No, let's just not invalidate the sitenuminlinks* tags // in XmlDoc::getSiteNumInlinks() //if ( strcmp(cr->m_coll,"GLOBAL-INDEX") == 0 ) ) goto skipSiteInlinks; // sitenuminlinksfresh old2 = gr->getLong("sitenuminlinksuniqueip",-1,NULL,×tamp); if ( old2 == -1 || old2 != m_siteNumInlinksUniqueIp || m_updatingSiteLinkInfoTags ) if ( ! tbuf->addTag2(mysite,"sitenuminlinksuniqueip", now,"xmldoc", *ip,m_siteNumInlinksUniqueIp,rdbId)) return NULL; // sitepop old3 = gr->getLong("sitenuminlinksuniquecblock",-1,NULL, ×tamp); if ( old3 == -1 || old3 != m_siteNumInlinksUniqueCBlock || m_updatingSiteLinkInfoTags ) if ( ! tbuf->addTag2(mysite,"sitenuminlinksuniquecblock", now,"xmldoc", *ip,m_siteNumInlinksUniqueCBlock,rdbId)) return NULL; // total site inlinks old4 = gr->getLong("sitenuminlinkstotal",-1,NULL, ×tamp); if ( old4 == -1 || old4 != m_siteNumInlinksTotal || m_updatingSiteLinkInfoTags ) if ( ! tbuf->addTag2(mysite,"sitenuminlinkstotal", now,"xmldoc", *ip,m_siteNumInlinksTotal,rdbId)) return NULL; // skipSiteInlinks: // get root title buf from old tag char *data = NULL; int32_t dsize = 0; Tag *rt = gr->getTag("roottitles"); if ( rt ) { data = rt->getTagData(); dsize = rt->getTagDataSize(); } bool addRootTitle = false; // store the root title buf if we need to. if we had no tag yet... if ( ! 
rt ) addRootTitle = true; // or if differs in size else if ( dsize != m_rootTitleBufSize ) addRootTitle = true; // or if differs in content else if ( memcmp(data,m_rootTitleBuf,m_rootTitleBufSize)) addRootTitle =true; // or if it is 10 days old or more if ( now-timestamp > 10*86400 ) addRootTitle = true; // but not if injected if ( m_wasContentInjected && ! *isRoot ) addRootTitle = false; // add it then if ( addRootTitle && ! tbuf->addTag(mysite,"roottitles",now,"xmldoc", *ip,m_rootTitleBuf,m_rootTitleBufSize, rdbId,true) ) return NULL; // // add the VENUEADDRESS tags // // init the dedup table so we do not add the same address many times char dtbuf[1000]; HashTableX dt; dt.set(8,0,32,dtbuf,1000,false,m_niceness,"xmldt"); // reset counts int32_t numContactAddressTags = 0; int32_t numContactEmailTags = 0; int32_t tagType2 = getTagTypeFromStr ( "contactaddress" ); int32_t tagType3 = getTagTypeFromStr ( "contactemails" ); // before we add the sitevenue to the tagrec let's make sure it is // not a dedup.. i.e. that we do not already have this address // in there. int32_t tagType = getTagTypeFromStr ( "venueaddress" ); // start at the first tag tag = gr->getFirstTag(); // loop over all tags in the buf, see if we got a dup for ( ; tag ; tag = gr->getNextTag ( tag ) ) { // count current contact addresses we have if ( tag->m_type == tagType2 ) numContactAddressTags++; if ( tag->m_type == tagType3 ) numContactEmailTags++; // skip if not a venueaddress tag if ( tag->m_type != tagType ) continue; // point to the serialized address char *data = tag->getTagData(); // get that address hash i guess uint64_t ah = getHashFromAddr ( data ); // add to dedup table - return NULL with g_errno set on error if ( ! dt.addKey ( &ah ) ) return NULL; } int32_t na = aa->getNumAddresses(); // add up to 10 for now for ( int32_t i = 0 ; i < na ; i++ ) { // get it Address *a = (Address *)aa->m_am.getPtr(i); // check if venue if ( ! ( a->m_flags & AF_VENUE_DEFAULT ) ) continue; // must have street on the page, not pointing into a tagrec // from tagdb... otherwise we keep re-adding if ( a->m_street->m_a < 0 ) continue; // dedup! dedup against // addresses in tagdb for venueaddress tag. can we use // the dc[] array from Address.cpp... we need another // set of bit flags for address class: if ( dt.isInTable ( &a->m_hash ) ) continue; // sanity if ( a->m_hash == 0 ) { char *xx=NULL;*xx=0; } // . serialize it // . TODO: get rid of Address::addToTagRec() functions char abuf[5000]; a->serialize ( abuf , 5000, m_firstUrl.getUrl(),false,true); // store in safebuf of tags if ( ! tbuf->addTag3 (mysite,"venueaddress",now,"xmldoc", *ip,abuf,rdbId) ) return NULL; // only add once if ( ! dt.addKey (&a->m_hash) ) return NULL; } // // // contact info stuff // // // ensure m_numContactAddresses etc. are valid Address **ca = getContactAddresses(); // blocked? if ( ! ca || ca == (void *)-1 ) return (SafeBuf *)ca; // do not do this for root if multiple addresses. this // fixes http://obits.abqjournal.com/ if ( *isRoot && aa->m_uniqueStreetHashes > 1 ) na = 0; // do not store more than 2 contact addresses, or 2 contact emails // to avoid tagdb bloat. and also because we do not need that many. // . store contact address if we had one // . this is a buffer of Address ptrs for ( int32_t i = 0 ; i < m_numContactAddresses ; i++ ) { // stop on breach if ( numContactAddressTags >= 2 ) break; // inc it numContactAddressTags++; // breathe QUICKPOLL(m_niceness); // get it Address *a = ca[i]; // . serialize it // . 
TODO: get rid of Address::addToTagRec() functions char abuf[5000]; a->serialize ( abuf , 5000, m_firstUrl.getUrl(),false,true); // store in safebuf of tags if ( ! tbuf->addTag3 (mysite,"contactaddress",now,"xmldoc", *ip,abuf,rdbId) ) return NULL; } // . add email addresses and submission forms to tag // . this does not block, so make sure only called once! // . contact emails. comma separated list if ( emails && numContactEmailTags <= 1 ) { numContactEmailTags++; if ( ! tbuf->addTag3 (mysite,"contactemails",now,"xmldoc", *ip,emails,rdbId) ) return NULL; } // // // NOW add tags for our outlinks // // bool oldHighQualityRoot = true; // if we are new, do not add anything, because we only add a tagdb // rec entry for "new" outlinks that were added to the page since // the last time we spidered it if ( ! *isIndexed ) oldHighQualityRoot = false; // special tags for google search results pages for scraping char inGoogle = false; if ( strstr(mysite,"google.com") ) inGoogle = true; // no updating if we are not root if ( ! inGoogle && ! *isRoot ) oldHighQualityRoot = false; // must be high quality, too if ( ! inGoogle && *siteNumInlinks < 500 ) oldHighQualityRoot = false; // . if we are a google url then add tags for each outlink! // . more google special tags to replace Scraper.cpp char *fu = m_firstUrl.getUrl(); //char *name = NULL; bool inGoogleBlogs = false; bool inGoogleNews = false; if ( ! strncmp ( fu , "http://www.google.com/blogsearch?", 33 ) ) inGoogleBlogs = true; if ( ! strncmp ( fu , "http://blogsearch.google.com/blogsearch?", 40 )) inGoogleBlogs = true; if ( ! strncmp ( fu , "http://news.google.com/", 23 )) inGoogleNews = true; // only do once per site char buf[1000]; HashTableX ht; ht.set (4,0,-1 , buf , 1000 ,false,m_niceness,"sg-tab"); // get site of outlink SiteGetter siteGetter; // . must be from an EXTERNAL DOMAIN and must be new // . we should already have its tag rec, if any, since we have msge int32_t n = links->getNumLinks(); // not if not spidering links if ( ! addLinkTags ) n = 0; // get the flags linkflags_t *flags = links->m_linkFlags; // scan all outlinks we have on this page for ( int32_t i = 0 ; i < n ; i++ ) { // get its tag rec TagRec *gr = (*grv)[i]; // does this hostname have a "firstIp" tag? char *ips = gr->getString("firstip",NULL); bool skip = false; // skip if we are not "old" high quality root if ( ! oldHighQualityRoot ) skip = true; // . skip if not external domain // . we added this above, so just "continue" if ( flags[i] & LF_SAMEDOM ) continue;//skip = true; // skip links in the old title rec if ( flags[i] & LF_OLDLINK ) skip = true; // skip if determined to be link spam! should help us // with the text ads we hate so much if ( links->m_spamNotes[i] ) skip = true; // if we should skip, and they have firstip already... if ( skip && ips ) continue; // get the normalized url char *url = links->getLinkPtr(i); // get the site. this will not block or have an error. siteGetter.getSite(url,gr,timestamp,cr->m_collnum,m_niceness); // these are now valid and should reference into // Links::m_buf[] char *site = siteGetter.m_site; int32_t siteLen = siteGetter.m_siteLen; int32_t linkIp = (*ipv)[i]; // get site hash uint32_t sh = hash32 ( site , siteLen ); // ensure site is unique if ( ht.getSlot ( &sh ) >= 0 ) continue; // add it. returns false and sets g_errno on error if ( ! ht.addKey ( &sh ) ) return NULL; // . need to add firstip tag for this link's subdomain? // . this was in Msge1.cpp but now we do it here if ( ! 
ips && linkIp && linkIp != -1 ) { // make it char *ips = iptoa(linkIp); if (!tbuf->addTag3(site,"firstip",now,"xmldoc",*ip,ips, rdbId)) return NULL; } if ( skip ) continue; // if outlink is a .gov or .edu site, do not bother, because // getIsSpam() always returns false for those // TODO: verify this //if ( flags[i] & LF_EDUTLD ) continue; //if ( flags[i] & LF_GOVTLD ) continue; // this must be valid //if ( ! m_spideredTimeValid ) { char *xx=NULL;*xx=0; } //int32_t timestamp = m_spideredTime; // how much avail for adding tags? int32_t avail = tbuf->getAvail(); // reserve space int32_t need = 512; // make sure enough if ( need > avail && ! tbuf->reserve ( need ) ) return NULL; // add tag for this outlink if ( inGoogle ) {// && ! gr->getTag("ingoogle") ) { if ( ! tbuf->addTag(site,"ingoogle",now,"xmldoc", *ip,"1",2,rdbId,true) ) return NULL; } if ( inGoogleBlogs && //! gr->getTag("ingoogleblogs") && !tbuf->addTag(site,"ingoogleblogs",now,"xmldoc",*ip,"1",2, rdbId,true)) return NULL; if ( inGoogleNews && //! gr->getTag("ingooglenews") && !tbuf->addTag(site,"ingooglenews",now,"xmldoc",*ip,"1",2, rdbId,true)) return NULL; // link is linked to by a high quality site! 500+ inlinks. if ( gr->getNumTagTypes("authorityinlink") < 5 && ! tbuf->addTag(site,"authorityinlink",now,"xmldoc", *ip,"1",2,rdbId,true) ) return NULL; } m_newTagBufValid = true; return &m_newTagBuf; } // // // BEGIN OLD SPAM.CPP class // // #define WTMPBUFSIZE (MAX_WORDS *21*3) // . RULE #28, repetitive word/phrase spam detector // . set's the "spam" member of each word from 0(no spam) to 100(100% spam) // . "bits" describe each word in phrasing terminology // . if more than maxPercent of the words are spammed to some degree then we // consider all of the words to be spammed, and give each word the minimum // score possible when indexing the document. // . returns false and sets g_errno on error char *XmlDoc::getWordSpamVec ( ) { if ( m_wordSpamBufValid ) { char *wbuf = m_wordSpamBuf.getBufStart(); if ( ! wbuf ) return (char *)0x01; return wbuf; } setStatus("getting word spam vec"); // assume not the repeat spammer m_isRepeatSpammer = false; Words *words = getWords(); if ( ! words || words == (Words *)-1 ) return (char *)words; m_wordSpamBuf.purge(); int32_t nw = words->getNumWords(); if ( nw <= 0 ) { m_wordSpamBufValid = true; return (char *)0x01; } Phrases *phrases = getPhrases (); if ( ! phrases || phrases == (void *)-1 ) return (char *)phrases; Bits *bits = getBits(); if ( ! bits ) return (char *)NULL; m_wordSpamBufValid = true; //if ( m_isLinkText ) return true; //if ( m_isCountTable ) return true; // int16_tcuts //Words *words = m_words; //Bits *bits = m_bits; // if 20 words totally spammed, call it all spam? m_numRepeatSpam = 20; // int16_tcut int32_t sni = m_siteNumInlinks; if ( ! m_siteNumInlinksValid ) { char *xx=NULL;*xx=0; } // set "m_maxPercent" int32_t maxPercent = 6; if ( sni > 10 ) maxPercent = 8; if ( sni > 30 ) maxPercent = 10; if ( sni > 100 ) maxPercent = 20; if ( sni > 500 ) maxPercent = 30; // fix this a bit so we're not always totally spammed maxPercent = 25; // assume not totally spammed m_totallySpammed = false; // get # of words we have to set spam for int32_t numWords = words->getNumWords(); // set up the size of the hash table (number of buckets) int32_t size = numWords * 3; // . add a tmp buf as a scratch pad -- will be freed right after // . allocate this second to avoid mem fragmentation more // . 
* 2 for double the buckets char tmpBuf [ WTMPBUFSIZE ]; char *tmp = tmpBuf; int32_t need = (numWords * 21) * 3 + numWords; if ( need > WTMPBUFSIZE ) { tmp = (char *) mmalloc ( need , "Spam" ); if ( ! tmp ) { log("build: Failed to allocate %"INT32" more " "bytes for spam detection: %s.", need,mstrerror(g_errno)); return NULL; } } QUICKPOLL(m_niceness); // set up ptrs char *p = tmp; // first this unsigned char *spam = (unsigned char *)p; p += numWords ; // . this allows us to make linked lists of indices of words // . i.e. next[13] = 23--> word #23 FOLLOWS word #13 in the linked list int32_t *next = (int32_t *)p; p += size * 4; // hash of this word's stem (or word itself if useStem if false) int64_t *bucketHash = (int64_t *)p; p += size * 8; // that word's position in document int32_t *bucketWordPos = (int32_t *)p; p += size * 4; // profile of a word int32_t *profile = (int32_t *)p; p += size * 4; // is it a common word? char *commonWords = (char *)p; p += size * 1; // sanity check if ( p - tmp > need ) { char *xx=NULL;*xx=0; } // clear all our spam percentages for these words memset ( spam , 0 , numWords ); int32_t np; // clear the hash table int32_t i; for ( i = 0 ; i < size ; i++ ) { bucketHash [i] = 0; bucketWordPos[i] = -1; commonWords [i] = 0; } // count position since Words class can now have tags in it // //int32_t pos = 0; //bool usePos = false; //if ( words->m_tagIds ) usePos = true; int64_t *wids = words->getWordIds(); // . loop through each word // . hash their stems and place in linked list // . if no stemming then don't do stemming for ( i = 0 ; i < numWords ; i++ ) { // . skip punctuation // . this includes tags now , too i guess //if ( words->isPunct(i) ) continue; if ( wids[i] == 0 ) continue; // skip if will not be indexed cuz score is too low //if ( wscores && wscores[i] <= 0 ) continue; QUICKPOLL(m_niceness); // TODO: get phrase stem if stemming is on // store the phrase stem this word int32_to the buffer // blen = words->getPhraseStem(i,buf,100); // if (blen<=0) continue; // get the hash of the ith word int64_t h = words->getWordId(i); // use secondary wordId if available //if ( words->getStripWordId(i) ) // h = words->getStripWordId(i); // "j" is the bucket index int32_t j = (uint64_t)h % size; // make sure j points to the right bucket while (bucketHash[j]) { if ( h == bucketHash[j] ) break; if (++j == size) j = 0; } // if this bucket is occupied by a word then replace it but // make sure it adds onto the "linked list" if (bucketHash[j]) { // if Words class contain tags as words, do this //if ( usePos ) { // next [pos] = bucketWordPos[j]; // bucketWordPos[ j] = pos++; //} //else { // add onto linked list for the ith word next[i] = bucketWordPos[j]; // replace bucket with index to this word bucketWordPos[j] = i; //} } // otherwise, we have a new occurence of this word else { bucketHash [j] = h; // if Words class contain tags as words, do this //if ( usePos ) { // bucketWordPos[ j] = pos++; // next [pos] = -1; //} //else { // store our position # (i) in bucket bucketWordPos[j] = i; // no next occurence of the ith word yet next[i] = -1; //} } // if stop word or number then mark it if ( bits->isStopWord(i) ) commonWords[j] = 1; if ( words->isNum ( i ) ) commonWords[j] = 1; } // count distinct candidates that had spam and did not have spam int32_t spamWords = 0; int32_t goodWords = 0; // . now cruise down the hash table looking for filled buckets // . 
	//   grab the linked list of indices and make a "profile"
	for ( i = 0 ; i < size ; i++ ) {
		// skip empty buckets
		if ( bucketHash[i] == 0 ) continue;
		np = 0;
		// word #j is in bucket #i
		int32_t j = bucketWordPos[i];
		// . cruise down the linked list for this word
		while ( j != -1 ) {
			// store position of this occurrence of the word
			// in the profile
			profile [ np++ ] = j;
			// get the position of the next occurrence
			j = next[ j ];
		}
		// if 2 or fewer occurrences of this word, don't check
		// for spam
		if ( np < 3 ) { goodWords++; continue; }

		//
		// set m_isRepeatSpammer
		//
		// look for a word repeated in phrases, in a big list,
		// where each phrase is different
		//
		int32_t max   = 0;
		int32_t count = 0;
		int32_t knp   = np;
		// must be 3+ letters, not a stop word, not a number
		if ( words->m_wordLens[profile[0]] <= 2 || commonWords[i] )
			knp = 0;
		// scan to see if they are a tight list
		for ( int32_t k = 1 ; k < knp ; k++ ) {
			// breathe
			QUICKPOLL(m_niceness);
			// are they close together? if not, bail
			if ( profile[k-1] - profile[k] >= 25 ) {
				count = 0;
				continue;
			}
			// otherwise inc it
			count++;
			// must have another word or tag in between
			int32_t a = profile[k];
			int32_t b = profile[k-1];
			bool gotSep = false;
			bool inLink = false;
			for ( int32_t j = a+1 ; j < b ; j++ ) {
				// if in a link do not count it; the repeat
				// spammer does not put his crap in links
				if ( words->m_words[j][0] == '<' &&
				     words->m_wordLens[j] >= 3 ) {
					// get the next char after the <
					char nc;
					nc = to_lower_a(words->m_words[j][1]);
					// now check it for anchor tag
					if ( nc == 'a' ) {
						inLink = true;
						break;
					}
				}
				if ( words->m_words[j][0] == '<' )
					gotSep = true;
				if ( is_alnum_a(words->m_words[j][0]) )
					gotSep = true;
			}
			// . the repeat spammer always has a separator,
			//   usually another tag
			// . and fix "BOW BOW BOW..." which has no separators
			if      ( ! gotSep ) count--;
			else if ( inLink   ) count--;
			// get the max
			if ( count > max ) max = count;
		}
		// a count of 50 such runs indicates the repeat spammer
		if ( max >= 50 )
			m_isRepeatSpammer = true;
		//
		// end m_isRepeatSpammer detection
		//

		// . determine the probability this word was spammed by
		//   looking at the distribution of its positions in the
		//   document
		// . sets "spam" member of each word in this profile
		// . don't check if the word occurred 2 or fewer times
		// . TODO: what about TORA! TORA! TORA!
		// . returns true if 1+ occurrences were considered spam
		QUICKPOLL(m_niceness);
		bool isSpam = setSpam ( profile , np , numWords , spam );
		// don't count stop words or numbers towards this threshold
		if ( commonWords[i] ) continue;
		// tally them up
		if ( isSpam ) spamWords++;
		else          goodWords++;
	}

	// what percent of distinct candidate words were spammed?
	int32_t totalWords = spamWords + goodWords;
	// if there are no or very few candidate words, skip the penalty
	int32_t percent;
	if ( totalWords <= 10 ) goto done;
	percent = ( spamWords * 100 ) / totalWords;
	// if too many of the words were spammed, punish everybody now
	// to 100% spam
	// if we had < 100 candidates and < 20% spam, don't bother
	//if ( percent < 5 ) goto done;
	if ( percent <= maxPercent ) goto done;
	// set flag so linkspam.cpp can see that all is spam and will not
	// allow this page to vote
	m_totallySpammed = true;
	// now only set to 99 so each singleton usually gets hashed
	for ( i = 0 ; i < numWords ; i++ )
		if ( words->getWordId(i) && spam[i] < 99 )
			spam[i] = 99;
 done:
	// update the weights for the words
	//for ( i = 0 ; i < numWords ; i++ ) {
	//	m_ww[i] = ( m_ww[i] * (100 - spam[i]) ) / 100;
	//}
	// TODO: use the min word spam algo as in Phrases.cpp for this!
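	// The conversion loop below maps each word's spam percentage
	// (0 = clean, 100 = fully spammed) onto a rank from 0 to
	// MAXWORDSPAMRANK via (MAXWORDSPAMRANK * (100 - spam)) / 100,
	// so a clean word keeps the maximum rank and a fully spammed
	// word drops to rank 0. For example, assuming MAXWORDSPAMRANK
	// is 10 as the comment below suggests: spam=0 maps to rank 10,
	// spam=50 maps to rank 5, and spam=99 maps to rank 0.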
//for ( i = 0 ; i < numWords ; i++ ) { // m_pw[i] = ( m_pw[i] * (100 - spam[i]) ) / 100; //} // convert from percent spammed into rank.. from 0 to 10 i guess for ( i = 0 ; i < numWords ; i++ ) spam[i] = (MAXWORDSPAMRANK * (100 - spam[i])) / 100; // copy into our buffer if ( ! m_wordSpamBuf.safeMemcpy ( (char *)spam , numWords ) ) return NULL; // free our temporary table stuff if ( tmp != tmpBuf ) mfree ( tmp , need , "Spam" ); return m_wordSpamBuf.getBufStart(); } // . a "profile" is an array of all the positions of a word in the document // . a "position" is just the word #, like first word, word #8, etc... // . we map "each" subProfile to a probability of spam (from 0 to 100) // . if the profile is really big we get really slow (O(n^2)) iterating through // many subProfiles // . so after the first 25 words, it's automatically considered spam // . return true if one word was spammed w/ probability > 20% bool XmlDoc::setSpam ( int32_t *profile, int32_t plen , int32_t numWords , unsigned char *spam ) { // don't bother detecting spam if 2 or less occurences of the word if ( plen < 3 ) return false; int32_t i; // if we have more than 10 words and this word is 20% or more of // them then all but the first occurence is spammed //log(LOG_INFO,"setSpam numRepeatSpam = %f", m_numRepeatSpam); if (numWords > 10 && (plen*100)/numWords >= m_numRepeatSpam) { for (i=1; i<plen; i++) spam[profile[i]] = 100; return true ; } // . over 50 repeated words is ludicrous // . set all past 50 to spam and continue detecting // . no, our doc length based weight takes care of that kind of thing //if (plen > 50 && m_version < 93 ) { // // TODO: remember, profile[i] is in reverse order!! we should // // really do i=0;i<plen-50, but this is obsolete anyway... // for (i=50; i<plen;i++) m_spam[profile[i]] = 100; // plen = 50; //} // we have to do this otherwise it takes FOREVER to do for plens in // the thousands, like i saw a plen of 8338! if ( plen > 50 ) { // && m_version >= 93 ) { // . set all but the last 50 to a spam of 100% // . the last 50 actually occur as the first 50 in the doc for (i=0; i<plen-50;i++) spam[profile[i]] = 100; // we now have only 50 occurences plen = 50; // we want to skip the first plen-50 because they actually // occur at the END of the document profile += plen - 50; } QUICKPOLL(m_niceness); // higher quality docs allow more "freebies", but only starting with // version 93... (see Titledb.h) // profile[i] is actually in reverse order so we subtract off from wlen //int32_t off ; //if ( m_version >= 93 ) { // off = (m_docQuality - 30) / 3; // if ( off < 0 ) off = 0; //} // just use 40% "quality" int32_t off = 3; // . now the nitty-gritty part // . compute all sub sequences of the profile // . similar to a compression scheme (wavelets?) // . TODO: word positions should count by two's since punctuation is // not included so start step @ 2 instead of 1 // . if "step" is 1 we look at every word position in the profile // . if "step" is 2 we look at every other word position // . if "step" is 3 we look at every 3rd word position, etc... int32_t maxStep = plen / 4; if ( maxStep > 4 ) maxStep = 4; // . loop through all possible tuples int32_t window, wlen, step, prob; for ( step = 1 ; step <= maxStep ; step++ ) { for ( window = 0 ; window + 3 < plen ; window+=1) { for (wlen = 3; window+wlen <= plen ; wlen+=1) { // continue if step isn't aligned with window // length if (wlen % step != 0) continue; // . get probability that this tuple is spam // . 
returns 0 to 100 prob = getProbSpam ( profile + window , wlen , step); // printf("(%i,%i,%i)=%i\n",step,window, // wlen,prob); // . if the probability is too low continue // . was == 100 if ( prob <= 20 ) continue; // set the spammed words spam to "prob" // only if it's bigger than their current spam for (i=window; i<window+wlen;i++) { // first occurences can have immunity // due to doc quality being high if ( i >= plen - off ) break; if (spam[profile[i]] < prob) spam[profile[i]] = prob; } QUICKPOLL(m_niceness); } } } // was this word spammed at all? bool hadSpam = false; for (i=0;i<plen;i++) if ( spam[profile[i]] > 20 ) hadSpam = true; // make sure at least one word survives for (i=0;i<plen;i++) if ( spam[profile[i]] == 0) return hadSpam; // clear the spam level on this guy spam[profile[0]] = 0; // return true if we had spam, false if not return hadSpam; } bool getWordPosVec ( Words *words , Sections *sections, //int32_t wordStart, //int32_t wordEnd, int32_t startDist, // m_dist char *fragVec, int32_t niceness , SafeBuf *wpos ) { int32_t dist = startDist; // 0; Section *lastsx = NULL; int32_t tagDist = 0; Section **sp = NULL; if ( sections ) sp = sections->m_sectionPtrs; nodeid_t *tids = words->m_tagIds; int64_t *wids = words->m_wordIds; int32_t *wlens = words->getWordLens(); char **wptrs = words->getWords(); int32_t nw = words->getNumWords(); if ( ! wpos->reserve ( nw * 4 ) ) return false; int32_t *wposvec = (int32_t *)wpos->getBufStart(); for ( int32_t i = 0 ; i < nw ; i++ ) { // breathe QUICKPOLL(niceness); // save it wposvec[i] = dist; // tags affect the distance/wordposition cursor if ( tids && tids[i] ) { // tag distance affects nodeid_t tid = tids[i] & BACKBITCOMP; if ( isBreakingTagId ( tid ) ) tagDist += SENT_UNITS; dist++; continue; } // . and so do sequences of punct // . must duplicate this code in Query.cpp for setting // QueryWord::m_posNum if ( ! wids[i] ) { // simple space or sequence of just white space if ( words->isSpaces(i) ) dist++; // 'cd-rom' else if ( wptrs[i][0]=='-' && wlens[i]==1 ) dist++; // 'mr. x' else if ( wptrs[i][0]=='.' && words->isSpaces2(i,1)) dist++; // animal (dog) else dist += 2; continue; } // ignore if in repeated fragment if ( fragVec && i<MAXFRAGWORDS && fragVec[i] == 0 ) { dist++; continue; } Section *sx = NULL; if ( sp ) { sx = sp[i]; // ignore if in style tag, etc. and do not // increment the distance if ( sx->m_flags & NOINDEXFLAGS ) continue; } // different sentence? if ( sx && ( ! lastsx || sx->m_sentenceSection != lastsx->m_sentenceSection ) ) { // separate different sentences with 30 units dist += SENT_UNITS; // 30; // limit this! if ( tagDist > 120 ) tagDist = 120; // and add in tag distances as well here, otherwise // we do not want "<br>" to really increase the // distance if the separated words are in the same // sentence! dist += tagDist; // new last then lastsx = sx; // store the vector AGAIN wposvec[i] = dist; } tagDist = 0; dist++; } return true; } bool getDensityRanks ( int64_t *wids , int32_t nw , int32_t hashGroup , SafeBuf *densBuf , Sections *sections , int32_t niceness ) { //int32_t nw = wordEnd - wordStart; // make the vector if ( ! 
densBuf->reserve ( nw ) ) return false; // convenience char *densVec = densBuf->getBufStart(); // clear i guess memset ( densVec , 0 , nw ); if ( hashGroup != HASHGROUP_BODY && hashGroup != HASHGROUP_HEADING ) sections = NULL; // scan the sentences if we got those Section *ss = NULL; if ( sections ) ss = sections->m_firstSent; // sanity //if ( sections && wordStart != 0 ) { char *xx=NULL;*xx=0; } for ( ; ss ; ss = ss->m_nextSent ) { // breathe QUICKPOLL(niceness); // count of the alnum words in sentence int32_t count = ss->m_alnumPosB - ss->m_alnumPosA; // start with one word! count--; // how can it be less than one alnum word if ( count < 0 ) continue; // . base density rank on that // . count is 0 for one alnum word now int32_t dr = MAXDENSITYRANK - count; // ensure not negative. make it at least 1. zero means un-set. if ( dr < 1 ) dr = 1; // mark all in sentence then for ( int32_t i = ss->m_senta ; i < ss->m_sentb ; i++ ) { // breathe QUICKPOLL(niceness); // assign densVec[i] = dr; } } // all done if using sections if ( sections ) return true; // count # of alphanumeric words in this string int32_t na = 0; for ( int32_t i = 0 ; i < nw ; i++ ) if ( wids[i] ) na++; // a single alnum should map to 0 "na" na--; // wtf? if ( na < 0 ) return true; // compute density rank int32_t dr = MAXDENSITYRANK - na ; // at least 1 to not be confused with 0 which means un-set if ( dr < 1 ) dr = 1; // assign for ( int32_t i = 0 ; i < nw ; i++ ) { // breathe QUICKPOLL(niceness); // assign densVec[i] = dr; } return true; } // . called by hashString() for hashing purposes, i.e. creating posdb keys // . string is usually the document body or inlink text of an inlinker or // perhaps meta keywords. it could be anything. so we need to create this // vector based on that string, which is represented by words/phrases here. bool getDiversityVec ( Words *words , Phrases *phrases , HashTableX *countTable , SafeBuf *sbWordVec , //SafeBuf *sbPhraseVec , int32_t niceness ) { int64_t *wids = words->getWordIds (); //nodeid_t *tids = words->getTagIds (); int32_t nw = words->getNumWords(); int64_t *pids = phrases->getPhraseIds2(); // . make the vector // . it will be diversity ranks, so one float per word for now // cuz we convert to rank below though, one byte rank if ( ! sbWordVec ->reserve ( nw*4 ) ) return false; //if ( ! sbPhraseVec->reserve ( nw*4 ) ) return false; // get it float *ww = (float *)sbWordVec ->getBufStart(); //float *pw = (float *)sbPhraseVec->getBufStart(); int32_t nexti = -10; int64_t pidLast = 0; // . now consider ourselves the last word in a phrase // . adjust the score of the first word in the phrase to be for ( int32_t i = 0 ; i < nw ; i++ ) { // yield QUICKPOLL ( niceness ); // skip if not alnum word if ( ! wids[i] ) { ww[i] = 0.0; continue; } // try to inline this int64_t nextWid = 0; int64_t lastPid = 0; // how many words in the bigram? int32_t nwp = phrases->getNumWordsInPhrase2(i); if ( nwp > 0 ) nextWid = wids [i + nwp - 1] ; if ( i == nexti ) lastPid = pidLast; // get current pid int64_t pid = pids[i]; // get the word and phrase weights for term #i float ww2; //float pw2; getWordToPhraseRatioWeights ( lastPid , // pids[i-1], wids[i] , pid , nextWid , // wids[i+1] , &ww2 , //&pw2 , countTable , 1);//m_version ); // 0 to 1.0 if ( ww2 < 0 || ww2 > 1.0 ) { char *xx=NULL;*xx=0; } // save the last phrase id if ( nwp > 0 ) { nexti = i + nwp - 1; pidLast = pid; // pids[i] ; } // . apply the weights // . do not hit all the way down to zero though... // . Words.cpp::hash() will not index it then... 
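	// . e.g. in the text "new mexico new mexico new mexico" the word
	//   "mexico" always ends the same phrase, so its phrase count is
	//   about equal to its word count and ww2 comes back small: the
	//   word is demoted in favor of the phrase "new mexico". if
	//   "mexico" instead ended many different phrases, ww2 would stay
	//   close to 1.0 and the word would keep most of its weight.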
//if ( ww[i] > 0 ) { ww[i] = ww2; //} /* //if ( pw[i] > 0 ) { pw[i] = (int32_t)(pw[i] * pw2); if ( pw[i] <= 0 ) pw[i] = 1; //} // MDW: why was this here? //if ( isLinkText ) continue; // do not demote all the way to 0 //if ( ww[i] <= 0 ) ww[i] = 2; // skip if phrase score is 0 if ( ! pw[i] ) continue; if ( pid == 0 ) { pw[i] = 0; continue; } // skip if does not start phrase if ( nwp <= 0 ) continue; // sanity check if ( nwp == 99 ) { char *xx = NULL; *xx = 0; } // now mod the score float avg = pw[i]; // weight by punct in between //for ( int32_t j = i+1 ; j < i+nwp ; j++ ) { // if ( wids[j] ) continue; // avg = (avg * (int64_t)pw[j]) / DW; //} // do not demote all the way to zero, we still want to index it // and when normalized on a 100 point scale, like when printed // out by PageParser.cpp, a score of 1 here gets normalized to // 0, so make sure it is at least 2. if ( avg < 2 ) avg = 2; // set that as our new score pw[i] = avg; */ } // overwrite the array of floats with an array of chars (ranks) char *nww = (char *)ww; //char *npw = (char *)pw; // convert from float into a rank from 0-15 for ( int32_t i = 0 ; i < nw ; i++ ) { if ( ! ww[i] ) { nww[i] = 0; continue; } // 2.50 is max in getWordToPhraseRatioWeights() function char wrank = (char) ((ww[i] * ((float)MAXDIVERSITYRANK))/.55); // sanity if ( wrank > MAXDIVERSITYRANK ) wrank = MAXDIVERSITYRANK; if ( wrank < 0 ) { char *xx=NULL;*xx=0; } //char prank = (char) ((pw[i] * 15.0) / 2.50); // assign now nww[i] = wrank; //npw[i] = prank; } return true; } // match word sequences of NUMWORDS or more words #define NUMWORDS 5 // . repeated sentence frags // . 1-1 with words in body of doc char *XmlDoc::getFragVec ( ) { if ( m_fragBufValid ) { char *fb = m_fragBuf.getBufStart(); if ( ! fb ) return (char *)0x01; return fb; } setStatus("getting frag vec"); Words *words = getWords(); if ( ! words || words == (Words *)-1 ) return (char *)words; Bits *bits = getBits(); if ( ! bits ) return NULL; m_fragBuf.purge(); // ez vars int64_t *wids = words->getWordIds (); int32_t nw = words->getNumWords(); // if no words, nothing to do if ( nw == 0 ) { m_fragBufValid = true; return (char *)0x01;//true; } // truncate for performance reasons. i've seen this be over 4M // and it was VERY VERY SLOW... over 10 minutes... // - i saw this tak over 200MB for an alloc for // WeightsSet3 below, so lower from 200k to 50k. this will probably // make parsing inconsistencies for really large docs... if ( nw > MAXFRAGWORDS ) nw = MAXFRAGWORDS; int64_t ringWids [ NUMWORDS ]; int32_t ringPos [ NUMWORDS ]; int32_t ringi = 0; int32_t count = 0; uint64_t h = 0; // . make the hash table // . make it big enough so there are gaps, so chains are not too long int32_t minBuckets = (int32_t)(nw * 1.5); uint32_t nb = 2 * getHighestLitBitValue ( minBuckets ) ; int32_t need = nb * (8+4+4); char *buf = NULL; char tmpBuf[50000]; if ( need < 50000 ) buf = tmpBuf; else buf = (char *)mmalloc ( need , "WeightsSet3" ); char *ptr = buf; uint64_t *hashes = (uint64_t *)ptr; ptr += nb * 8; int32_t *vals = (int32_t *)ptr; ptr += nb * 4; float *ww = (float *)ptr; ptr += nb * 4; if ( ! buf ) return NULL; for ( int32_t i = 0 ; i < nw ; i++ ) ww[i] = 1.0; if ( ptr != buf + need ) { char *xx=NULL;*xx=0; } // make the mask uint32_t mask = nb - 1; // clear the hash table memset ( hashes , 0 , nb * 8 ); // clear ring of hashes memset ( ringWids , 0 , NUMWORDS * 8 ); // for sanity check int32_t lastStart = -1; // . hash EVERY NUMWORDS-word sequence in the document // . 
if we get a match look and see what sequences it matches // . we allow multiple instances of the same hash to be stored in // the hash table, so keep checking for a matching hash until you // chain to a 0 hash, indicating the chain ends // . check each matching hash to see if more than NUMWORDS words match // . get the max words that matched from all of the candidates // . demote the word and phrase weights based on the total/max // number of words matching for ( int32_t i = 0 ; i < nw ; i++ ) { // skip if not alnum word if ( ! wids[i] ) continue; // yield QUICKPOLL ( m_niceness ); // add new to the 5 word hash h ^= wids[i]; // . remove old from 5 word hash before adding new... // . initial ring wids are 0, so should be benign at startup h ^= ringWids[ringi]; // add to ring ringWids[ringi] = wids[i]; // save our position ringPos[ringi] = i; // wrap the ring ptr if we need to, that is why we are a ring if ( ++ringi >= NUMWORDS ) ringi = 0; // this 5-word sequence starts with word # "start" int32_t start = ringPos[ringi]; // need at least NUMWORDS words in ring buffer to do analysis if ( ++count < NUMWORDS ) continue; // . skip if it starts with a word which can not start phrases // . that way "a new car" being repeated a lot will not // decrease the weight of the phrase term "new car" // . setCountTable() calls set3() with this set to NULL //if ( bits && ! bits->canStartPhrase(start) ) continue; // sanity check if ( start <= lastStart ) { char *xx = NULL; *xx = 0; } // reset max matched int32_t max = 0; // look up in the hash table uint32_t n = h & mask; // sanity breach check if ( n >= nb ) { char *xx=NULL;*xx=0; } loop: // all done if empty if ( ! hashes[n] ) { // sanity check //if ( n >= nb ) { char *xx = NULL; *xx = 0; } // add ourselves to the hash table now hashes[n] = h; // sanity check //if ( wids[start] == 0 ) { char *xx = NULL; *xx = 0; } // this is where the 5-word sequence starts vals [n] = start; // save it lastStart = start; // debug point //if ( start == 7948 ) // log("heystart"); // do not demote words if less than NUMWORDS matched if ( max < NUMWORDS ) continue; // . how much we should we demote // . 10 matching words pretty much means 0 weights float demote = 1.0 - ((max-5)*.10); if ( demote >= 1.0 ) continue; if ( demote < 0.0 ) demote = 0.0; // . RULE #26 ("long" phrases) // . if we got 3, 4 or 5 in our matching sequence // . basically divide by the # of *phrase* terms // . multiply by 1/(N-1) // . HOWEVER, should we also look at HOW MANY other // sequences matches this too!??? //float demote = 1.0 / ((float)max-1.0); // set3() is still called from setCountTable() to // discount the effects of repeated fragments, and // the count table only understands score or no score //if ( max >= 15 ) demote = 0.0; // demote the next "max" words int32_t mc = 0; int32_t j; for ( j = start ; mc < max ; j++ ) { // sanity if ( j >= nw ) { char *xx=NULL;*xx=0; } if ( j < 0 ) { char *xx=NULL;*xx=0; } // skip if not an alnum word if ( ! wids[j] ) continue; // count it mc++; // demote it ww[j] = (int32_t)(ww[j] * demote); if ( ww[j] <= 0 ) ww[j] = 2; } // save the original i int32_t mini = i; // advance i, it will be incremented by 1 immediately // after hitting the "continue" statement i = j - 1; // must be at least the original i, we are monotinic // otherwise ringPos[] will not be monotonic and core // dump ultimately cuz j and k will be equal below // and we increment matched++ forever. 
if ( i < mini ) i = mini; // get next word continue; } // get next in chain if hash does not match if ( hashes[n] != h ) { // wrap around the hash table if we hit the end if ( ++n >= nb ) n = 0; // check out bucket #n now goto loop; } // how many words match so far int32_t matched = 0; // . we have to check starting at the beginning of each word // sequence since the XOR compositional hash is order // independent // . see what word offset this guy has int32_t j = vals[n] ; // k becomes the start of the current 5-word sequence int32_t k = start; // sanity check if ( j == k ) { char *xx = NULL; *xx = 0; } // skip to next in chain to check later if ( ++n >= nb ) n = 0; // keep advancing k and j as int32_t as the words match matchLoop: // get next wid for k and j while ( k < nw && ! wids[k] ) k++; while ( j < nw && ! wids[j] ) j++; if ( k < nw && wids[k] == wids[j] ) { matched++; k++; j++; goto matchLoop; } // keep track of the max matched for i0 if ( matched > max ) max = matched; // get another matching string of words, if possible goto loop; } if ( nw <= 0 ) { char *xx=NULL;*xx=0;} // make space if ( ! m_fragBuf.reserve ( nw ) ) { // save it int32_t saved = g_errno; if ( buf != tmpBuf ) mfree ( buf , need , "WeightsSet3" ); // reinstate it g_errno = saved; return NULL; } // validate m_fragBufValid = true; // handy ptr char *ff = m_fragBuf.getBufStart(); // convert from floats into frag score, 0 or 1 really for ( int32_t i = 0 ; i < nw ; i++ ) { if ( ww[i] <= 0.0 ) ff[i] = 0; else ff[i] = 1; } if ( buf != tmpBuf ) mfree ( buf , need , "WeightsSet3" ); // wtf? if ( ! ff ) { char *xx=NULL;*xx=0; } return ff; } float g_wtab[30][30]; // . inline this for speed // . if a word repeats in different phrases, promote the word // and demote the phrase // . if a word repeats in pretty much the same phrase, promote // the phrase and demote the word // . if you have the window of text "new mexico good times" // and word #i is mexico, then: // pid1 is "new mexico" // wid1 is "mexico" // pid2 is "mexico good" // wid2 is "good" // . we store sliderParm in titleRec so we can update it along // with title and header weights on the fly from the spider controls void getWordToPhraseRatioWeights ( int64_t pid1 , // pre phrase int64_t wid1 , int64_t pid2 , int64_t wid2 , // post word float *retww , //float *retpw , HashTableX *tt1 , int32_t titleRecVersion ) { static float s_fsp; // from 0 to 100 char sliderParm = g_conf.m_sliderParm; // i'm not too keen on putting this as a parm in the CollectionRec // because it is so cryptic... //static char sliderParm = 25; // . to support RULE #15 (word to phrase ratio) // . these weights are based on the ratio of word to phrase count // for a particular word static char s_sp = -1; if ( s_sp != sliderParm ) { // . set it to the newly updated value // . should range from 0 up to 100 s_sp = sliderParm; // the float version s_fsp = (float)sliderParm / 100.0; // sanity test if ( s_fsp < 0.0 || s_fsp > 1.0 ) { char *xx = NULL; *xx = 0; } // i is the word count, how many times a particular word // occurs in the document for ( int32_t i = 0 ; i < 30 ; i++ ) { // . k is the phrase count, how many times a particular phrase // occurs in the document // . 
k can be GREATER than i because we index only phrase terms // sometimes when indexing neighborhoods, and not the // single words that compose them for ( int32_t k = 0 ; k < 30 ; k++ ) { // do not allow phrase count to be greater than // word count, even though it can happen since we // add imported neighborhood pwids to the count table int32_t j = k; if ( k > i ) j = i; // get ratio //float ratio = (float)phrcount / (float)wrdcount; float ratio = (float)j/(float)i; // it should be impossible that this can be over 1.0 // but might happen due to hash collisions if ( ratio > 1.0 ) ratio = 1.0; // restrict the range we can weight a word or phrase // based on the word count //float r = 1.0; //if ( i >= 20 ) r = 2.1; //else if ( i >= 10 ) r = 1.8; //else if ( i >= 4 ) r = 1.5; //else r = 1.3; //g_ptab[i][k] = 1.00; g_wtab[i][k] = 1.00; if ( i <= 1 ) continue; // . we used to have a sliding bar between 0.0 and 1.0. // word is weighted (1.0 - x) and phrase is weighted // by (x). however, x could go all the way to 1.0 // even when i = 2, so we need to restrict x. // . x is actually "ratio" // . when we have 8 or less word occurences, do not // remove more than 80% of its score, a 1/5 penalty // is good enough for now. but for words that occur // a lot in the link text or pwids, go to town... if ( i <= 2 && ratio >= .50 ) ratio = .50; else if ( i <= 4 && ratio >= .60 ) ratio = .60; else if ( i <= 8 && ratio >= .80 ) ratio = .80; else if ( i <= 12 && ratio >= .95 ) ratio = .95; // round up, so many "new mexico" phrases but only // make it up to 95%... if ( ratio >= .95 ) ratio = 1.00; // if word's phrase is repeated 3 times or more then // is a pretty good indication that we should weight // the phrase more and the word itself less //if ( k >= 3 && ratio < .90 ) ratio = .90; // compute the weights //float pw = 2.0 * ratio; //float ww = 2.0 * (1.0 - ratio); float ww = (1.0 - ratio); // . punish words a little more // . if we got 50% ratio, words should not get as much // weight as the phrase //ww *= .45; // do not weight to 0, no less than .15 if ( ww < 0.0001 ) ww = 0.0001; //if ( pw < 0.0001 ) pw = 0.0001; // do not overpromote either //if ( ww > 2.50 ) ww = 2.50; //if ( pw > 2.50 ) pw = 2.50; // . do a sliding weight of the weight // . a "ww" of 1.0 means to do no weight // . can't do this for ww cuz we use "mod" below //float newWW = s_fsp*ww + (1.0-s_fsp)*1.00; //float newPW = s_fsp*pw + (1.0-s_fsp)*1.00; // limit how much we promote a word because it // may occur 30 times total, but have a phrase count // of only 1. however, the other 29 times it occurs it // is in the same phrase, just not this particular // phrase. //if ( ww > 2.0 ) ww = 2.0; g_wtab[i][k] = ww; //g_ptab[i][k] = newPW; //logf(LOG_DEBUG,"build: wc=%"INT32" pc=%"INT32" ww=%.2f " //"pw=%.2f",i,k,g_wtab[i][k],g_ptab[i][k]); } } } int32_t phrcount1 = 0; int32_t phrcount2 = 0; int32_t wrdcount1 = 0; int32_t wrdcount2 = 0; if ( tt1->m_numSlotsUsed > 0 ) { if (pid1) phrcount1 = tt1->getScore(&pid1); if (pid2) phrcount2 = tt1->getScore(&pid2); if (wid1) wrdcount1 = tt1->getScore(&wid1); if (wid2) wrdcount2 = tt1->getScore(&wid2); } // if we are always ending the same phrase, like "Mexico" // in "New Mexico"... get the most popular phrase this word is // in... int32_t phrcountMax = phrcount1; int32_t wrdcountMin = wrdcount1; // these must actually exist to be part of the selection if ( pid2 && phrcount2 > phrcountMax ) phrcountMax = phrcount2; if ( wid2 && wrdcount2 < wrdcountMin ) wrdcountMin = wrdcount2; // . 
but if we are 'beds' and in a popular phrase like 'dog beds' // there maybe a lot of other phrases mentioned that have 'beds' // in them like 'pillow beds', 'pet beds', but we need to assume // that is phrcountMax is high enough, do not give much weight to // the word... otherwise you can subvert this algorithm by just // adding other random phrases with the word 'bed' in them. // . BUT, if a page has 'X beds' with a lot of different X's then you // still want to index 'beds' with a high score!!! we are trying to // balance those 2 things. // . do this up here before you truncate phrcountMax below!! float mod = 1.0; if ( phrcountMax <= 6 ) mod = 0.50; else if ( phrcountMax <= 8 ) mod = 0.20; else if ( phrcountMax <= 10 ) mod = 0.05; else if ( phrcountMax <= 15 ) mod = 0.03; else mod = 0.01; // scale wrdcount1/phrcountMax down for the g_wtab table if ( wrdcount1 > 29 ) { float ratio = (float)phrcountMax / (float)wrdcount1; phrcountMax = (int32_t)((29.0 * ratio) + 0.5); wrdcount1 = 29; } if ( phrcountMax > 29 ) { float ratio = (float)wrdcount1 / (float)phrcountMax; wrdcount1 = (int32_t)((29.0 * ratio) + 0.5); phrcountMax = 29; } // . sanity check // . neighborhood.cpp does not always have wid/pid pairs // that match up right for some reason... so we can't do this //if ( phrcount1 > wrdcount1 ) { char *xx = NULL; *xx = 0; } //if ( phrcount2 > wrdcount2 ) { char *xx = NULL; *xx = 0; } // apply the weights from the table we computed above *retww = mod * g_wtab[wrdcount1][phrcountMax]; // slide it *retww = s_fsp*(*retww) + (1.0-s_fsp)*1.00; // ensure we do not punish too hard if ( *retww <= 0.0 ) *retww = 0.01; if ( *retww > 1.0 ) { char *xx=NULL;*xx=0; } /* if ( phrcountMax >= 0 ) { int64_t sh = getPrefixHash ( (char *)NULL , 0 , NULL , 0 ); int64_t tid = g_indexdb.getTermId ( sh , wid1 ); logf(LOG_DEBUG,"build: phrcountMax=%"INT32" wrdCount1=%"INT32" " "*ww=%.4f for word with tid=%"UINT64"", phrcountMax,wrdcount1,(float)*ww,tid); //if ( phrcountMax < 10 && tid == 16944700235015LL ) // log("hey"); } */ // sanity check //if ( *ww == 0.0 ) { char *xx = NULL; *xx = 0; } /* // scale wrdcountMin/phrcount down for the g_ptab table if ( wrdcountMin > 29 ) { float ratio = (float)phrcount2 / (float)wrdcountMin; phrcount2 = (int32_t)((29.0 * ratio) + 0.5); wrdcountMin = 29; } if ( phrcount2 > 29 ) { float ratio = (float)wrdcountMin / (float)phrcount2; wrdcountMin = (int32_t)((29.0 * ratio) + 0.5); phrcount2 = 29; } */ // . if the word is Mexico in 'New Mexico good times' then // phrase term #i which is, say, "Mexico good" needs to // get the min word count when doings its word to phrase // ratio. // . it has two choices, it can use the word count of // "Mexico" or it can use the word count of "good". // . say, each is pretty high in the document so the phrase // ends up getting penalized heavily, which is good because // it is a nonsense phrase. // . if we had "united socialist soviet republic" repeated // a lot, the phrase "socialist soviet" would score high // and the individual words would score low. that is good. // . try to seek the highest weight possible for this phrase // by choosing the lowest word count possible // . NO LONGER AFFECT phrase weights because just because the // words occur a lot in the document and this may be the only // occurence of this phrase, does not mean we should punish // the phrase. -- MDW //*retpw = 1.0; return; // do it the old way... 
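	// A quick worked example of the slider blend computed above,
	// *retww = s_fsp*(*retww) + (1.0-s_fsp)*1.00, where s_fsp is
	// g_conf.m_sliderParm/100: with sliderParm=100 the g_wtab weight
	// is used as-is; with sliderParm=0 the weight is forced to 1.0
	// and the word-to-phrase ratio has no effect; with sliderParm=50
	// a raw weight of 0.20 is softened to 0.5*0.20 + 0.5*1.0 = 0.60.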
//*pw = g_ptab[wrdcountMin][phrcount2]; // sanity check //if ( *pw == 0.0 ) { char *xx = NULL; *xx = 0; } } // for registerSleepCallback static void clockSyncWaitWrapper ( int fd , void *state ) { XmlDoc *THIS = (XmlDoc *)state; THIS->m_masterLoop ( THIS->m_masterState ); } // . a special call // . returns -1 if blocked, 1 otherwise, 0 on error char XmlDoc::waitForTimeSync ( ) { // unregister? if ( isClockInSync() && m_alreadyRegistered ) { // note it log("build: clock now synced for %s",m_firstUrl.m_url); g_loop.unregisterSleepCallback(m_masterState, clockSyncWaitWrapper); } // return 1 if synced! if ( isClockInSync() ) return 1; // already registered? wait another 1000ms if ( m_alreadyRegistered ) return -1; // flag it m_alreadyRegistered = true; // note it log("build: waiting for clock to sync for %s",m_firstUrl.m_url); // this should mean it is re-called later if ( g_loop.registerSleepCallback ( 1000 , // 1000 ms m_masterState , clockSyncWaitWrapper , m_niceness )) // wait for it, return -1 since we blocked return -1; // if was not able to register, ignore delay log("doc: failed to register clock wait callback"); return 0; } //////////////////////////// // // SCRAPING TOOLS // //////////////////////////// void doInjectLoopWrapper ( void *state ) { XmlDoc *XD = (XmlDoc *)state; // if it blocked, wait if ( ! XD->doInjectLoop ( ) ) return; // . if we did not inject any links, i guess we are done! // . this happens if the ahrefs.com doc had the same outlinks // as the ahrefs.com doc for another search result, they are all // deduped and it does not block. XD->m_finalCallback ( XD->m_finalState ); } // . return false if blocks, true otherwise // . return true and set error on error, with no blocks outstanding // . TODO: make this word for ahrefs.com list of links in xml feed bool XmlDoc::injectLinks (HashTableX *linkDedupTablePtr , HashTableX *domDedupTablePtr, void *finalState , void (* finalCallback)(void *)) { // INJECT 10 at a time. xmldoc is 1MB. int32_t i; for ( i = 0 ; i < MAX_XML_DOCS ; i++ ) { XmlDoc *nd; try { nd = new ( XmlDoc ); } catch ( ... ) { g_errno = ENOMEM; break; } mnew ( nd , sizeof(XmlDoc),"xmldocarr"); m_xmlDocs[i] = nd; } // all null? if ( i < (int32_t)MAX_XML_DOCS ) { log("scrape: one xmldoc alloc failed"); return true; } m_masterLoop = doInjectLoopWrapper; m_masterState = this; m_finalState = finalState; m_finalCallback = finalCallback; // note it //log("xmldoc: injecting outlinks of %s",m_firstUrl.getUrl()); m_linkDedupTablePtr = linkDedupTablePtr; m_domDedupTablePtr = domDedupTablePtr; // loop over all links m_i = 0; m_blocked = 0; memset ( m_used , 0 , (int32_t)MAX_XML_DOCS ); return doInjectLoop(); } void doneInjectingWrapper ( void *state ) { XmlDoc *xd = (XmlDoc *)state; XmlDoc *XD = (XmlDoc *)xd->m_hack; XD->doneInjecting ( xd ); } // . return false if blocks, true otherwise // . return true and set error on error, with no blocks outstanding bool XmlDoc::doInjectLoop ( ) { setStatus("inject outlinks"); //Links *links = getLinks(); //if ( ! links ) return (m_blocked == 0); //if ( links == (void *)-1 ) return false; Sections *sections = getSections(); if ( ! sections ) return (m_blocked == 0); if ( sections == (void *)-1 ) return false; Links *links = getLinks(); if ( ! links ) return (m_blocked == 0); if ( links == (void *)-1 ) return false; Words *words = getWords(); if ( ! words ) return (m_blocked == 0); if ( words == (void *)-1 ) return false; Bits *bp = getBits(); if ( ! 
bp ) return (m_blocked == 0); if ( bp == (void *)-1 ) return false; CollectionRec *cr = getCollRec(); if ( ! cr ) return true; int32_t n = links->getNumLinks(); Url tmpUrl; Section *prev = NULL; // scan the links now for ( ; m_i < n ; ) { // get xml doc then int32_t j; for ( j = 0 ; j < MAX_XML_DOCS ; j++ ) if ( ! m_used[j] ) break; // none? return false if blocked. if ( j >= MAX_XML_DOCS ) return false; // get the m_ith link char *link = links->getLink ( m_i ); int32_t linkLen = links->getLinkLen ( m_i ); // temp term if ( link[linkLen] ) { char *xx=NULL;*xx=0; } // skip to next link to index m_i++; // skip injecting if its an internal bing/google outlink if ( strncmp(link,"http://www.bing.com/",20) == 0 ) continue; // skip youtube query links. they contain our exact // query!! so almost always come up #1 if ( strstr(link,".youtube.com/") && strstr(link,"&q=")) continue; if ( strstr(link,".msn.com/") ) continue; if ( strstr(link,".microsoft.com/") ) continue; if ( strstr(link,".discoverbing.com/") ) continue; if ( strstr(link,".googleusercontent.com/") ) continue; //if(!strncmp(link,"http://webcache.googleusercontent.com/",38) if(!strncmp(link,"http://www.google.com/url?q=http",32)){ // grab the real url from that char *embed = strstr(link,"url?q=http"); if ( ! embed ) continue; link = embed+6; char *end = embed; for ( ; *end && *end != '&' ; end++) { // google appends query to url.. strange //if ( end[0] == '%' && // end[1] == '2' && // to_lower_a(end[2]) == 'b' ) // break; } SafeBuf mbuf; mbuf.reserve ( end - link + 100 ); int32_t dlen; char *bs = mbuf.getBufStart(); dlen=urlDecode(bs,link , end - link ); bs[dlen] = '\0'; tmpUrl.set ( bs ); link = tmpUrl.getUrl(); linkLen = tmpUrl.getUrlLen(); } // skip maps.google.com etc. if ( strstr(link,".google.com/") ) continue; // ok, point to title and summary for this result! // go up to prev node for first non-clickable text which // should be summary //Section **sp = sections->m_sectionPtrs; // get the section int32_t ln = links->getNodeNum(m_i-1); // get node ptr XmlNode *node = m_xml.getNodePtr(ln); char *ptr = node->m_node; // find section that contains it i guess Section *sx = sections->m_rootSection; Section *last = NULL; char **wptrs = words->getWords(); //nodeid_t *tids = words->getTagIds(); for ( ; sx ; sx = sx->m_next ) { // get section ptr char *sw = wptrs[sx->m_b-1]; if ( sw < ptr ) continue; // over? sw = wptrs[sx->m_a]; if ( sw > ptr ) break; last = sx; } // assign sx = last; // telescope section up one i guess //sx = sx->m_parent; // int16_tcut wbit_t *bits = bp->m_bits; // if still same first alnum, go another //for ( ; sx ; sx = sx->m_parent ) { // // skip if same word starts this section // //if ( sx->m_firstWordPos == fa ) continue; // // must have alnum // if ( sx->m_firstWordPos <= 1 ) continue; // // must be in link! should be the result TITLE // if ( bits[sx->m_firstWordPos] & D_IN_LINK ) break; // // word must not be "cached" or whatever... //} // if in bold tag, should telescope up some more //if ( sx && sx->m_tagId == TAG_B ) sx = sx->m_parent; //if ( sx && sx->m_tagId == TAG_STRONG ) sx = sx->m_parent; // save //int32_t fa = sx->m_firstWordPos; // that's the title so telescope up as int32_t as that is the // first alnum!!! for ( ; sx ; sx = sx->m_parent ) { //Section *ps = sx->m_parent; // do we have a next brother? stop then! that means // we are in a list! //if ( sx->m_nextBrother ) break; //if ( ps->m_firstWordPos != fa ) break; // stop when we hit a result delimeter!! if ( sx->m_tagId == TAG_LI ) { // bing... 
if ( strncmp(wptrs[sx->m_a], "<li class=\"sa_wr\">", 17) == 0 ) { break; } // google... if ( strncmp(wptrs[sx->m_a], "<li class=\"g\">", 13) == 0 ) { break; } } } // if no indicator, bail if ( ! sx ) continue; // skip link if contained in prev section if ( prev == sx ) continue; // save it prev = sx; // record search result details Section *title = NULL; Section *cite = NULL; Section *summary = NULL; // . that is probably the full result then... // . title is first sentence for ( ; sx ; sx = sx->m_next ) { // only sentences if ( ! ( sx->m_flags & SEC_SENTENCE ) ) continue; // grab it if ( ! title ) { title = sx; continue; } // skip section if in link if ( bits[sx->m_firstWordPos] & D_IN_LINK ) continue; // we are sentence section so fix it so we are one // above! Section *rs = sx; // ->m_parent; // telescope up to a div or whatever... //for ( ; rs ; rs = rs->m_parent ) { // if ( rs->m_tagId == TAG_DIV ) break; // if ( rs->m_tagId == TAG_P ) break; //} // and out of bold if ( rs && rs->m_tagId == TAG_B ) rs = rs->m_parent; if ( rs && rs->m_tagId == TAG_STRONG) rs=rs->m_parent; // bail if no good! if ( ! rs ) continue; // then site if google if ( ! cite ) { cite = rs; continue; } // then summary summary = rs; break; } m_serpBuf.safePrintf("\t\t<result>\n"); // print <title> tag if ( title ) printSerpFiltered(title,"title"); // print <sum> tag if ( summary ) printSerpFiltered(summary,"sum"); m_serpBuf.safePrintf("\t\t\t<url>"); m_serpBuf.safeMemcpy ( link , linkLen ); m_serpBuf.safePrintf("</url>\n"); m_serpBuf.safePrintf("\t\t</result>\n"); // if not injecting, skip //continue; if ( ! m_reallyInjectLinks ) continue; // dedup int32_t linkHash32 = hash32 ( link , linkLen ); if ( m_linkDedupTablePtr && m_linkDedupTablePtr->isInTable (&linkHash32) ) continue; // add it otherwise if ( m_linkDedupTablePtr ) m_linkDedupTablePtr->addKey ( &linkHash32 ); // we use this when injecting ahrefs links if ( m_domDedupTablePtr ) { int32_t domLen; char *dom = getDomFast ( link , &domLen ); int32_t dh32 = hash32 ( dom , domLen ); if ( m_domDedupTablePtr->isInTable (&dh32) ) continue; m_domDedupTablePtr->addKey ( &dh32 ); } // get it XmlDoc *xd = m_xmlDocs[j]; if ( ! xd ) { char *xx=NULL;*xx=0; } // add www to it Url lu; lu.set ( link , linkLen , true ); char *wwwLink = lu.getUrl(); // this can go on the stack since set4() copies it SpiderRequest sreq; sreq.reset(); // index this link! strcpy(sreq.m_url,wwwLink); // parentdocid of 0 int32_t firstIp = hash32n(wwwLink); if ( firstIp == -1 || firstIp == 0 ) firstIp = 1; sreq.setKey( firstIp,0LL, false ); sreq.m_isInjecting = 1; sreq.m_isPageInject = 1; sreq.m_hopCount = 0;//hopCount; sreq.m_hopCountValid = 1; sreq.m_fakeFirstIp = 1; sreq.m_firstIp = firstIp; setStatus("injecting an outlink"); // . use the enormous power of our new XmlDoc class // . this returns false with g_errno set on error if ( ! xd->set4 ( &sreq , NULL , cr->m_coll , NULL , // pbuf // give it a niceness of 1, we have to be // careful since we are a niceness of 0!!!! m_niceness, // 1 , // inject this content NULL, // content , false, // deleteFromIndex , 0, // forcedIp , 0, // contentType , 0, // lastSpidered , false )) { // hasMime // . g_errno should be set if that returned false // . return true if does not need to block log("xmldoc: outlink inject: %s",mstrerror(g_errno)); break; } xd->m_hack = this; // make this our callback in case something blocks xd->setCallback ( xd , doneInjectingWrapper ); // . set xd from the old title rec if recycle is true // . 
can also use XmlDoc::m_loadFromOldTitleRec flag xd->m_recycleContent = false;//true; // avoid looking up ip of each outlink to add "firstip" tag to // tagdb because that can be slow!!!!!!! xd->m_spiderLinks = false; xd->m_spiderLinks2 = false; xd->m_spiderLinksValid = true; // . newOnly is true --> do not inject if document is already // indexed! // . maybe just set indexCode xd->m_newOnly = true;//false;//newOnly; // need to refresh it!! //xd->m_newOnly = false;//newOnly; // turn off robots.txt lookups xd->m_isAllowed = true; xd->m_isAllowedValid = true; xd->m_crawlDelay = -1; // unknown xd->m_crawlDelayValid = true; // log it now log("inject: indexing outlink %s (hash=%"UINT32")",wwwLink, (uint32_t)linkHash32); // costs one API unit, which is one cent. but if we do // top 50 on google, top 50 on procog, it can be like // $1 every time we do this. //xd->injectAhrefsLinks(); bool status = true; // this will tell it to index ahrefs first before indexing // the doc. but do NOT do this if we are from ahrefs.com // ourselves to avoid recursive explosion!! xd->m_downloadLevel = m_downloadLevel + 1; xd->m_useAhrefs = m_useAhrefs; // inherit dedup tables as well! xd->m_linkDedupTablePtr = m_linkDedupTablePtr; // . now tell it to index // . this returns false if blocked status = xd->indexDoc ( ); // log it. i guess only for errors when it does not block? // because xmldoc.cpp::indexDoc calls logIt() if ( status ) xd->logIt(); // otherwise, it blocks else { m_blocked++; log("xmldoc: blockedout=%"INT32" slotj=%"INT32" " "(this=0x%"PTRFMT",xd=0x%"PTRFMT")", m_blocked,j,(PTRTYPE)this,(PTRTYPE)xd); m_used[j] = true; } } // return true if all done return (m_blocked == 0); } void XmlDoc::doneInjecting ( XmlDoc *xd ) { // find it in our list int32_t i; for ( i = 0 ; i < MAX_XML_DOCS ; i++ ) { if ( ! m_used[i] ) continue; if ( m_xmlDocs[i] != xd ) continue; break; } // core if not found in our list, it must be there if ( i >= MAX_XML_DOCS ) { char *xx=NULL;*xx=0; } // free it up now! m_used[i] = 0; // free it up //mdelete ( m_xmlDocs[i] , sizeof(XmlDoc), "xdarr" ); //delete ( m_xmlDocs[i] ); //m_xmlDocs[i] = NULL; m_xmlDocs[i]->reset(); // uncount it as being outstanding m_blocked--; // log debug log("xmldoc: blockedin=%"INT32" (this=0x%"PTRFMT")", m_blocked,(PTRTYPE)this); // return if still blocked if ( ! doInjectLoop() ) return; // log debug log("xmldoc: final callback"); // ok, all have been indexed m_finalCallback ( m_finalState ); } bool XmlDoc::injectAhrefsLinks ( ) { setStatus("get inlinks from ahrefs.com"); // skip for now //return true; CollectionRec *cr = getCollRec(); if ( ! cr ) return true; // make the ahrefs urls try { m_ahrefsDoc = new ( XmlDoc ); } catch ( ... 
) { g_errno = ENOMEM; return true; } mnew ( m_ahrefsDoc , sizeof(XmlDoc),"xmldocah"); // make the url SafeBuf ubuf; // turn count down to 10 for now ubuf.safePrintf("http://api.ahrefs.com/get_backlinks.php?count=350&mode=exact&output=xml&AhrefsKey=0452f27fd5a7fec5e9702e23ba4af223&target="); //ubuf.safePrintf("http://www.gigablast.com/?q=poo&u="); ubuf.urlEncode (m_firstUrl.getUrl() ); Url url; url.set ( ubuf.getBufStart() ); char *up = url.getUrl(); // set by url i guess SpiderRequest sreq; sreq.reset(); strcpy(sreq.m_url,up); // parentdocid of 0 int32_t firstIp = hash32n(up); if ( firstIp == -1 || firstIp == 0 ) firstIp = 1; sreq.setKey( firstIp,0LL, false ); sreq.m_isInjecting = 1; sreq.m_isPageInject = 1; sreq.m_hopCount = 0;//hopCount; sreq.m_hopCountValid = 1; sreq.m_fakeFirstIp = 1; sreq.m_firstIp = firstIp; // int16_tcut XmlDoc *ah = m_ahrefsDoc; // . use the enormous power of our new XmlDoc class // . this returns false with g_errno set on error if ( ! ah->set4 ( &sreq , NULL , cr->m_coll , NULL , // pbuf // give it a niceness of 1, we have to be // careful since we are a niceness of 0!!!! m_niceness, // 1 , // inject this content NULL, // content , false, // deleteFromIndex , 0, // forcedIp , 0, // contentType , 0, // lastSpidered , false )) { // hasMime log("xmldoc: ahref doc error %s",mstrerror(g_errno)); // g_errno should be set if that returned false return true; } // do not re-call the set //m_needsSet = false; // make this our callback in case something blocks //ah->setCallback ( state , callback ); // do not re-lookup the robots.txt ah->m_isAllowed = true; ah->m_isAllowedValid = true; ah->m_crawlDelay = -1; // unknown ah->m_crawlDelayValid = true; ah->m_downloadLevel = m_downloadLevel + 1; // reset domain table for deduping ahref's links by domain // before injecting them... only inject one per domain if ( ! m_domDedupTablePtr ) { m_domDedupTable.set(4,0,512,NULL,0,false,m_niceness,"dmtab2"); m_domDedupTablePtr = &m_domDedupTable; } // log it now //log("inject: indexing injected doc %s",url); // if we are a url like api.ahrefs.com/get_backlinks... then // our links can use our table for deduping based on domain, AND // they can use our link dedup table in case one outlink is also // a search result on google's page... if ( ! ah->injectLinks ( m_linkDedupTablePtr, m_domDedupTablePtr, m_masterState , m_masterLoop ) ) return false; return true; } bool XmlDoc::printSerpFiltered ( Section *sx , char *tagName ) { //int64_t *wids = m_words.getWordIds(); char **wptrs = m_words.getWords(); int32_t *wlens = m_words.getWordLens(); int32_t fa = sx->m_firstWordPos; nodeid_t *tids = m_words.getTagIds(); if ( fa > 0 && tids[fa-1] == TAG_B ) fa--; if ( fa > 0 && tids[fa-1] == TAG_STRONG ) fa--; int32_t la = sx->m_b; int32_t nw = m_words.getNumWords(); if ( la+1 < nw && tids[la+1] == (TAG_B|BACKBIT) ) la++; if ( la+1 < nw && tids[la+1] == (TAG_STRONG|BACKBIT) ) la++; // advance la even more if regular words or br tags or b or strong tags for ( ; la < nw ; la++ ) { if ( ! tids[la] ) continue; if ( (tids[la]&BACKBITCOMP) == TAG_BR ) continue; if ( (tids[la]&BACKBITCOMP) == TAG_STRONG ) continue; if ( tids[la] == TAG_BR ) continue; break; } m_serpBuf.safePrintf("\t\t\t<%s>",tagName); // cdata! m_serpBuf.safePrintf("<![CDATA["); // subtract 1 from sx->m_b to avoid ending tag for ( int32_t i = fa ; i < la ; i++ ) { // skip if br if ( tids[i] == TAG_BR ) continue; m_serpBuf.cdataEncode ( wptrs[i] , wlens[i] ); } // cdata! 
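// ------------------------------------------------------------------
// Editor's aside (illustrative only, not part of the build): the words
// above are written inside a <![CDATA[ ... ]]> wrapper, and a CDATA
// section may not contain the literal terminator "]]>". An encoder in
// this position typically splits that sequence across two CDATA
// sections ("]]]]><![CDATA[>"). The sketch below shows that standard
// escape on a plain std::string; it is an assumption about what a
// routine like SafeBuf::cdataEncode() has to guard against, not a
// description of its actual implementation, and cdataEncodeSketch()
// is a hypothetical name.
#if 0
#include <string>

static std::string cdataEncodeSketch ( const std::string &in ) {
	std::string out;
	out.reserve ( in.size() );
	for ( size_t i = 0 ; i < in.size() ; i++ ) {
		// break "]]>" apart so the surrounding CDATA stays legal
		if ( i + 2 < in.size() &&
		     in[i] == ']' && in[i+1] == ']' && in[i+2] == '>' ) {
			out += "]]]]><![CDATA[>";
			i += 2; // loop increment skips the '>' as well
			continue;
		}
		out += in[i];
	}
	return out;
}
#endif
// ------------------------------------------------------------------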
m_serpBuf.safePrintf("]]>"); m_serpBuf.safePrintf("</%s>\n",tagName); return true; } ////////// // // BEGIN NEW SEO MATCHING QUERIES TOOL CODE // ////////// static void loadTitleRecFromDiskOrSpiderWrapper ( void *state ) { XmlDoc *THIS = (XmlDoc *)state; if ( ! THIS->loadTitleRecFromDiskOrSpider() ) return; THIS->m_callback1 ( THIS->m_state ); } // . if we can't load titlerec from titledb, spider it, index it and // use that new titlerec // . returns false if blocks // . returns true and sets g_errno on error bool XmlDoc::loadTitleRecFromDiskOrSpider() { if ( ! m_masterLoop ) { m_masterState = this; m_masterLoop = loadTitleRecFromDiskOrSpiderWrapper; } // fix a core when getTermListBuf() calls getMetaList() // which calls getNewSpiderReply() which calls // getDownloadEndTime() and tries to download the page // even though we have a valid titlerec! if ( ! m_downloadEndTimeValid ) { m_downloadEndTimeValid = true; m_downloadEndTime = 0; } // . try to recycle the content first // . try to load it from title rec first // . we have to do this otherwise our ptr_linkInfo link texts // will be somewhat random and cause us to get different scores // for the queries we match!! // . so do this not just for speed, but to be consistent. if ( ! loadFromOldTitleRec() ) return false; // did that fail? i.e. not found!?!?! ignore and just indexx it if ( m_oldTitleRecValid && m_oldTitleRec ) return true; // ok, we gotta index it if ( ! m_loggedMsg3 ) { m_loggedMsg3 = true; log("xmldoc: url %s not in titledb, spidering and indexing", m_firstUrl.m_url); } // clear that g_errno = 0; // turn off recycling i guess since we don't have it m_recycleContent = false; // first index it, but only if not already indexed // did it block? // eror indexing doc? indexCode should be set then if ( ! indexDoc() ) return false; // no blocking return true; } /* void getSEOQueryInfoWrapper ( void *state ) { XmlDoc *THIS = (XmlDoc *)state; // note it THIS->setStatus ( "seoqueryinfowrapper" ); // make sure has not been freed from under us! if ( THIS->m_freed ) { char *xx=NULL;*xx=0;} // note it THIS->setStatus ( "in seo query info wrapper" ); // return if it blocked if ( THIS->getSEOQueryInfo( ) == (void *)-1 ) return; // print any error if ( g_errno ) log("seopipe: getSeoQueryInfo error: %s",mstrerror(g_errno)); // all done else log("seopipe: getSeoQueryInfo is done"); // show timing info int64_t now = gettimeofdayInMilliseconds(); int64_t took = now - THIS->m_beginSEOTime; log("seopipe: time: getSeoQueryInfo took %"INT64"ms",took); // otherwise, all done, call the caller callback if ( THIS->m_callback1 ) THIS->m_callback1 ( THIS->m_state ); else THIS->m_callback2 ( THIS->m_state ); return; } void getSEOQueryInfoWrapper2 ( int fd , void *state ) { // just pump! otherwise we might re-launch a msg3a request while // one is outstanding causing a core in Multicast::reset() XmlDoc *THIS = (XmlDoc *)state; // debug log THIS->setStatus ("getseoqueryinfowrapper2"); // if we are waiting just on the pump i guess we are all done! if ( ! THIS->m_socketWriteBufValid ) { log("seopipe: pumping socket"); THIS->pumpSocketWriteBuf(); return; } // not pumping? log("seopipe: pumping socket ready wrapper"); // otherwise, let it call the callback getSEOQueryInfoWrapper ( state ); } // . return safebuf of xml containing matching and related queries and // related urls/titles // . this transmits the xml as it generates it to "m_seoSocket" if non-null // . returns -1 if blocked, returns NULL and sets g_errno on error // . 
stores the xml in the m_socketWriteBuf SafeBuf // . will keep blocking (returning -1) until the xml is delivered to socket // if it is non-NULL SafeBuf *XmlDoc::getSEOQueryInfo ( ) { setStatus ( "seo query info" ); // only set to valid once it has been all written out!! if ( m_socketWriteBufValid ) { // all done? if ( ! m_seoSocket ) return &m_socketWriteBuf; // pump pumpSocketWriteBuf(); // if socket not done being pumped... we block. it's // ready wrappers should re-call our wrapper. if ( m_socketWriteBufSent >= m_socketWriteBuf.length() ) return &m_socketWriteBuf; // wait for write to finish return (SafeBuf *)-1; } // the g_errno could be a title rec not found reply coming back // so do not process that here! it needs to be processed // by the function whose request resulted in an error reply. // for instances, the getTitle() call below needs to set g_errno // when we call it now, responding to its msg22 reply. //if ( g_errno ) return NULL; // a good place to init stuff we need here if ( ! m_masterState ) { m_printedQueries = false; m_printedRelatedDocIds = false; m_printedRelatedQueries = false; m_printedRecommendedLinks = false; m_printedScoredInsertableTerms = false; //m_docIndexed = false; // time it m_beginSEOTime = gettimeofdayInMilliseconds(); // for our m_masterLoop function, it uses this as the state m_masterState = this; // this is a main entry point function so anything that blocks // should re-call this function m_masterLoop = getSEOQueryInfoWrapper; // assume indexed m_docIndexed = true; // fix a core when getTermListBuf() calls getMetaList() // which calls getNewSpiderReply() which calls // getDownloadEndTime() and tries to download the page // even though we have a valid titlerec! if ( ! m_downloadEndTimeValid ) { m_downloadEndTimeValid = true; m_downloadEndTime = 0; } } // . try to load it from title rec first // . we have to do this otherwise our ptr_linkInfo link texts // will be somewhat random and cause us to get different scores // for the queries we match!! // . so do this not just for speed, but to be consistent. if ( m_recycleContent && ! loadFromOldTitleRec()) return (SafeBuf *)-1; // did that fail? i.e. not found!?!?! ignore and just indexx it if ( m_oldTitleRecValid && ! m_oldTitleRec && m_recycleContent ) { // just skip this asshole then log("xmldoc: url %s load3 failed",m_firstUrl.m_url); // clear that g_errno = 0; // need to index it m_docIndexed = false; } // first index it, but only if not already indexed if ( ! m_docIndexed ) { // turn off recycling i guess since we don't have it m_recycleContent = false; // did it block? // eror indexing doc? indexCode should be set then if ( ! indexDoc() ) return (SafeBuf *)-1; // do not re-call m_docIndexed = true; } // was indexing successful? int32_t *indexCode = getIndexCode(); if ( ! indexCode || indexCode == (void *)-1 ) return (SafeBuf *)indexCode; // if not successfully indexed send back error msg if ( *indexCode && m_seoSocket ) { m_socketWriteBuf.safePrintf( "\t<errorMsg><![CDATA[%s]]>" "</errorMsg>\n" "</response>" , mstrerror(*indexCode) ); // send on socket pumpSocketWriteBuf(); // if socket not done being pumped... we block if ( m_socketWriteBufSent < m_socketWriteBuf.length() ) return (SafeBuf *)-1; // otherwise, we are done sending return &m_socketWriteBuf; } // seo.cpp needs this in printDupSentences Sections *sections = getSectionsWithDupStats(); if ( ! 
sections || sections == (void *)-1) return (SafeBuf *)sections; // seo.cpp needs this now when it calls getSiteRank() int32_t *sni = getSiteNumInlinks(); if ( ! sni || sni == (void *)-1 ) return (SafeBuf *)sni; // . find all logged queries that this document matches // . this will launch msg99 requests to each host in the network // . then it scores them // . don't worry about sending back in real-time for this since it // should be fast SafeBuf *qpbuf = getMatchingQueriesScored(); if ( ! qpbuf || qpbuf == (void *)-1 ) return qpbuf; // . how many queries do we have that match this url? // . they should be sorted by our url's score int32_t numQueryPtrs = qpbuf->length() / sizeof(Msg99Reply *); // int16_tcut SafeBuf *sb = &m_socketWriteBuf; // cast the msg99 reply ptrs, i.e. query ptrs Msg99Reply **queryPtrs = (Msg99Reply **)qpbuf->getBufStart(); // store each one as xml then into m_headerBuf99 if ( ! m_printedQueries && m_seoSocket ) { m_printedQueries = true; // do not flood the socket! so limit to 1000 queries // they should be sorted by queryImportance! // cheatcodes.com has like 50,000 matching queries. int32_t max = numQueryPtrs; if ( max > 1000 ) max = 1000; for ( int32_t i = 0 ; i < max ; i++ ) { // int16_tcut Msg99Reply *qp = queryPtrs[i]; // sometimes queries like 'gallery-view' are // hard-phrased and do not show up for us, so skip. // they should be at the very end so we should be // trimming the tail for them, so don't worry about // <queryNum> having holes in it. if ( qp->m_myDocId == 0LL && qp->m_myScore == 0.0 ) continue; // int16_tcut QueryLogEntry *qe = &qp->m_queryLogEntry; sb->safePrintf("\t<seoQuery>\n" "\t\t<queryNum>%"INT32"</queryNum>\n" "\t\t<query><![CDATA[%s]]></query>\n" "\t\t<queryTrafficPerDay>%"INT32"" "</queryTrafficPerDay>\n" // our url's score "\t\t<myDocId>%"INT64"</myDocId>\n" "\t\t<myScore>%f</myScore>\n" //"\t\t<mySiteHash32>%"UINT32"" //"</mySiteHash32>\n" "\t\t<queryImportance>%f" "</queryImportance>\n" "\t</seoQuery>\n" , i , qp->m_queryStr // x 10 to estimate google? , qe->m_gigablastTraffic * GB_TRAFFIC_MODIFIER , qp->m_myDocId , qp->m_myScore //, qp->m_mySiteHash32 , qp->m_queryImportance //,qp->m_queryInfo.m_numUniqueWordForms //,qp->m_queryInfo.m_numRepeatWordForms //qp->m_queryInfo.m_smallestNormTermFreq ); } } // pump it some. i.e. send m_socketWriteBuf contents back to // m_seoSocket if it is non-NULL pumpSocketWriteBuf(); // . now instead try getting the top "imax" queries scored on the // whole index // . transmit them back on m_seoSocket AS WE GET THEM by calling // pumpSocketWriteBuf() function and storing into m_socketWriteBuf //qpbuf = getMatchingQueriesScoredForFullQuery ( ); //if ( ! qpbuf || qpbuf == (void *)-1 ) return qpbuf; SafeBuf *rdbuf = getRelatedDocIdsWithTitles(); if ( ! rdbuf || rdbuf == (void *)-1 ) return rdbuf; RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart(); // how many related docids do we have? int32_t nr = rdbuf->length() / sizeof(RelatedDocId); // // print out the related urls // if ( ! m_printedRelatedDocIds && nr && m_seoSocket ) { m_printedRelatedDocIds = true; int32_t max = 200; // m_maxRelatedUrls; if ( max == -1 ) max = nr; if ( nr < max ) max = nr; sb->safePrintf("\t<relatedUrls>\n"); for ( int32_t i = 0 ; i < max ; i++ ) { RelatedDocId *rd = &rds[i]; // fix for titlerec not found errors char *title = rd->ptr_rd_title; char *url = rd->ptr_rd_url; if ( ! title ) title = ""; if ( ! 
url ) url = ""; // print it out sb->safePrintf("\t\t<relatedUrl>\n" "\t\t\t<urlNum>%"INT32"</urlNum>\n" "\t\t\t<url><![CDATA[%s]]></url>\n" "\t\t\t<docId>%"INT64"</docId>\n" "\t\t\t<siteHash32>%"UINT32"</siteHash32>\n" "\t\t\t<title><![CDATA[" , i , url , rd->m_docId , rd->m_siteHash32 ); // encode CDATA stuff in title sb->cdataEncode(title); sb->safePrintf("]]>\n" "\t\t\t%"INT32"" "\n" "\t\t\t%f" "\n" , rd->m_numCommonQueries , rd->m_dotProduct // similarityScore ); // print the actualy querynums in common int32_t firstOff = rd->m_firstCommonQueryNumOff; int32_t offset = firstOff; sb->safePrintf("\t\t\t\n"); for ( ; offset >= 0 ; ) { // get that node char *buf = m_commonQueryNumBuf.getBufStart(); // and offset buf += offset; // then cast QueryNumLinkedNode *qn; qn = (QueryNumLinkedNode *)buf; // print that sb->safePrintf("\t\t\t\t%"INT32"" "\n" , qn->m_queryNum ); // advance. will be -1 when done offset = qn->m_nextOff; } sb->safePrintf("\t\t\t\n"); sb->safePrintf("\t\t\n"); } sb->safePrintf("\t\n"); } // // recommended inlinks! // // pump it some. i.e. send m_socketWriteBuf contents back to // m_seoSocket if it is non-NULL pumpSocketWriteBuf(); SafeBuf *kbuf = getRecommendedLinksBuf(); if ( ! kbuf || kbuf == (void *)-1 ) return kbuf; // print out the recommended links in xml if ( ! m_printedRecommendedLinks && m_seoSocket ) { sb->safePrintf("\t\n"); char *p = kbuf->getBufStart(); char *pend = kbuf->getBuf(); for ( ; p < pend ; ) { // cast it RecommendedLink *ri = (RecommendedLink *)p; // skip it p += ri->getSize(); // print it out sb->safePrintf("\t\t\n" "\t\t\t\n" "\t\t\t<![CDATA[%s]]>\n" "\t\t\t%f\n" "\t\t\t%"INT32"\n" ,ri->getUrl(kbuf) ,ri->getTitle(kbuf) ,ri->m_totalRecommendedScore ,(int32_t)ri->m_siteRank ); } sb->safePrintf("\t\n"); m_printedRecommendedLinks = true; } // // related queries // // write out pumpSocketWriteBuf(); SafeBuf *relBuf = getRelatedQueryBuf(); if ( ! relBuf || relBuf == (void *)-1 ) return relBuf; QueryRel **rels = (QueryRel **)relBuf->getBufStart(); int32_t numRels = relBuf->length() / sizeof(QueryRel *); // // print out the related queries // if ( ! m_printedRelatedQueries && numRels && m_seoSocket ) { sb->safePrintf("\t\n"); int32_t max = 200; // m_maxRelatedQueries; if ( max == -1 ) max = numRels; if ( numRels < max ) max = numRels; for ( int32_t i = 0 ; i < max ; i++ ) { QueryRel *rel = rels[i]; // must be a first! if ( ! rel->m_isFirst ) { char *xx=NULL;*xx=0; } // int16_tcut //QueryInfo *qi = &rel->m_queryInfo; // print it out sb->safePrintf("\t\t\n" "\t\t\t\n" "\t\t\t%"INT32"" "\n" "\t\t\t%f" "\n" //"\t\n" , rel->m_queryStr , rel->m_docIdVotes //, qi->m_numUniqueWordForms //, qi->m_numRepeatWordForms //, qi->m_smallestNormTermFreq , rel->m_totalRelatedQueryImportance //, qi->m_myScoreRelated ); // print details! sb->safePrintf("\t\t\t\n"); // linked list of Msg99Replies for the related queries. // all in linked list are for the same query but // restricted to a different docid! 
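// ------------------------------------------------------------------
// Editor's aside (illustrative only; note this whole function sits
// inside a disabled comment block): the QueryNumLinkedNode walk above
// and the QueryRel chain below use the same pattern: nodes are packed
// into a flat SafeBuf and chained either by a byte offset with a -1
// sentinel or by a raw m_next pointer. A minimal, self-contained
// sketch of the offset-linked variant, using hypothetical
// NodeSketch / walkChain names:
#if 0
#include <cstdint>
#include <cstdio>

struct NodeSketch {
	int32_t m_value;   // payload, e.g. a query number
	int32_t m_nextOff; // byte offset of next node in buf, -1 = end
};

static void walkChain ( const char *buf , int32_t firstOff ) {
	// follow offsets through the flat buffer until the -1 sentinel
	for ( int32_t off = firstOff ; off >= 0 ; ) {
		const NodeSketch *n = (const NodeSketch *)(buf + off);
		printf ( "value=%d\n" , (int)n->m_value );
		off = n->m_nextOff;
	}
}
#endif
// ------------------------------------------------------------------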
for ( ; rel ; rel = rel->m_next ) { // get his related docid RelatedDocId *rd = rel->m_relatedDocId; // print that sb->safePrintf("\t\t\t\t\n" "\t\t\t\t\t%"INT64"" "\n" "\t\t\t\t\t%"UINT32"" "\n" //"\t\t\t\t\t" //"%f" //"\n" "\t\t\t\t\t%f" "\n" "\t\t\t\t\t%f" "\n" "\t\t\t\t\n" , rd->m_docId , rd->m_siteHash32 //, rd->m_similarityScore , rd->m_dotProduct , rel->m_myScore ); } sb->safePrintf("\t\t\t\n"); sb->safePrintf("\t\t\n"); } sb->safePrintf("\t\n"); m_printedRelatedQueries = true; } // write out pumpSocketWriteBuf(); // this is the Keyword Insertion Tool data (KIT data) SafeBuf *sits = getScoredInsertableTerms(); if ( ! sits || sits == (void *)-1 ) return sits; // try to store into cachedb in case user clicks a different // insertable term and we have to update the wordposinfo::m_rankChange // stuff in the html src display //if ( ! storeIntoCachedb() ) // // return -1 if it blocked and wait for store to complete // return (SafeBuf *)-1; // print out query changes if ( ! m_printedScoredInsertableTerms && m_seoSocket ) { // dump out each insertable term and it's corresponding // QueryChanges if ( ! printScoredInsertableTerms ( sb ) ) return NULL; m_printedScoredInsertableTerms = true; // end of xml response? sb->safePrintf("\n"); } // even if not fully pumped, set it to valid here m_socketWriteBufValid = true; if ( ! m_seoSocket ) return &m_socketWriteBuf; // write out pumpSocketWriteBuf(); // if socket not done being pumped... we block if ( m_socketWriteBufSent < m_socketWriteBuf.length() ) return (SafeBuf *)-1; // ok, we are done return &m_socketWriteBuf; } */ // have the smallest twids on top! int twidcmp ( const void *a, const void *b ) { TermInfo *ua = (TermInfo *)a; TermInfo *ub = (TermInfo *)b; //uint32_t ua = *(uint32_t *)a; //uint32_t ub = *(uint32_t *)b; // HACKY: sort by lower 32 bits of the 64 bit termids so // seo.cpp can use them with its QueryLogEntries which use 32 bit // termids to save mem. uint32_t ta = (uint32_t)ua->m_termId64; uint32_t tb = (uint32_t)ub->m_termId64; // lower first if ( ta > tb ) return 1; // swap if ( ta < tb ) return -1; return 0; } // . 1. make a vector of the words in the title, headers, page-inlink-text, // and site-inlink-text // // . 2. pass that word vector to every machine in network to see what queries // in the query logs we match. use Msg99.cpp. it should initialize // on startup and load in it's share of the query logs. query log file // should be sorted then sorted by filtered query then split. should also // remove queries from the most aggressive IPs (bots). we would need // a program, filterquerylog.cpp to do all that on gk37, our query log // storage server. it needs to store # of times query was done, too. // all queries should have back to back spaces removed and made lowercase. // remove queries that have double quotes or colon operators in them. // index each query term in the query log into HashTableX, which will // point to the query in the buffer. then we just store the termlist // in a SafeBuf that we save on disk. 40GB of queries split 256 ways // is still like 175MB per server! (if one server is dead, skip it) // // . 3. merge all queries received from all hosts and sort by traffic. // // . 4. perform the queries on procog and cache the scores of the top 10 // results for each query. should be cached on machine that houses the // query. try a 60-day cache max age. // // . 5. now redo the queries but with a "url:thisurl |" to get this page's // score for each query. 
if the min score of the query on procog is // well beyond our grasp, we could just skip it. // // . 6. then determine the # of inlinks we need to add to get more traffic // for each query. assume siterank of 0 per inlink. if that would be // impossible then increment the siterank until it gets us in the top 10. // // just use getTopTermsVector HashTableX *XmlDoc::getTermIdBufDedupTable32 ( ) { SafeBuf *tiBuf = getTermInfoBuf(); if ( ! tiBuf || tiBuf == (void *)-1 ) return (HashTableX *)tiBuf; return &m_tidTable32; } // . used by handleRequest8e() which uses msg20::getSummary() with // m_getTermListBuf to call this in the local host msg20 handler. // . this buf is used to determine what queries this document matches SafeBuf *XmlDoc::getTermId32Buf() { if ( m_termId32BufValid ) return &m_termId32Buf; SafeBuf *tiBuf = getTermInfoBuf (); if ( ! tiBuf || tiBuf == (void *) -1 ) return tiBuf; int32_t need = 4 * (tiBuf->length() / sizeof(TermInfo)); if ( ! m_termId32Buf.reserve(need) ) return NULL; // scan those char *p = tiBuf->getBufStart(); char *pend = tiBuf->getBuf(); uint32_t last = 0; for ( ; p < pend ; ) { TermInfo *ti = (TermInfo *)p; p += sizeof(TermInfo); uint32_t tid32 = (uint32_t)(ti->m_termId64); m_termId32Buf.pushLong(tid32); // sanity if ( last && tid32 <= last ) { char *xx=NULL;*xx=0; } last = tid32; } m_termId32BufValid = true; return &m_termId32Buf; } // . used by getTermId32Buf() for getting this document's matching queries // . serialize the words in the title and inlink text into a vector // . SafeBuf is filled with class TermInfos! defined in seo.h. currently // just a int64_t m_termId64 though! // . get synonyms of each word too! // . we sort them by the 32-bit termid so handleRequest8e() can do its fast // compare algo to find matching queries which are also sorted by the lower // 32 bits of terms in the query. SafeBuf *XmlDoc::getTermInfoBuf ( ) { setStatus ( "getterminfobuf" ); if ( m_termInfoBufValid ) return &m_termInfoBuf; bool includeSynonyms = true; Words *ww = getWords(); if ( ! ww || ww == (Words *)-1 ) return (SafeBuf *)ww; LinkInfo *info1 = getLinkInfo1(); if ( ! info1 || info1 == (LinkInfo *)-1 ) return (SafeBuf *)info1; uint8_t *langId = getLangId(); if ( ! langId || langId == (uint8_t *)-1 ) return (SafeBuf *)langId; if (!m_tidTable32.set(4,0,16384,NULL,0,false,m_niceness,"twidtabl")) return NULL; // // add document body words now to m_twbuf // if ( ! addUniqueWordsToBuf ( &m_termInfoBuf , &m_tidTable32 , // dedup table NULL, // filter table NULL, // mincounttable false , ww , includeSynonyms) ) return NULL; // // store count of each term we hash after this into "TMP" // HashTableX TMP; if(!TMP.set(4,4,4096,NULL,0,false,m_niceness,"tmttt") ) return NULL; // // hash meta desc into TMP table // int32_t mdlen; char *md = getMetaDescription( &mdlen ); if ( md ) { Words ww3; ww3.setx ( md , mdlen , m_niceness ); if (!addUniqueWordsToBuf(NULL, NULL , // dedup table NULL, // filter table &TMP, // mincounttable true, // store counts? &ww3, includeSynonyms)) return NULL; } // // hash meta keywords into TMP table // int32_t mklen; char *mk = getMetaKeywords( &mklen ); if ( mk ) { Words ww4; ww4.setx ( mk , mklen , m_niceness ); if (!addUniqueWordsToBuf(NULL, NULL, // dedup table NULL, // filter table &TMP, // mincounttable true, // store counts? 
&ww4, includeSynonyms)) return NULL; } // // hash each link text into TMP table // // loop over every link text to this page for ( Inlink *k = NULL; info1 && (k = info1->getNextInlink(k)) ; ) { // breathe QUICKPOLL(m_niceness); // get the link text if ( k->size_linkText <= 1 ) continue; // set Url Url u; u.set ( k->getUrl() , k->size_urlBuf ); // do not allow anomalous link text to match query //if ( k->m_isAnomaly ) continue; char *p = k-> getLinkText(); int32_t plen = k->size_linkText - 1; if ( ! verifyUtf8 ( p , plen ) ) { log("title: set4 bad link text from url=%s", k->getUrl()); continue; } // debug //log("seo: counttable for link text '%s'",k->getLinkText()); // now the words. Words ww2; if ( ! ww2.set ( k->getLinkText() , k->size_linkText-1, // len TITLEREC_CURRENT_VERSION , true , // computeIds m_niceness ))// niceness // g_errno set on error, return NULL return NULL; // int16_tcuts on link text if ( ! addUniqueWordsToBuf( NULL, NULL, // dedup table NULL, // filter table &TMP, // mincounttable true, // store counts? &ww2, includeSynonyms)) return NULL; } // // now only add link texts to main table and buffer if it occurs // already in the body, or occurs TWICE in "TMP" // // loop over every link text to this page for ( Inlink *k = NULL; info1 && (k = info1->getNextInlink(k)) ; ) { // breathe QUICKPOLL(m_niceness); // get the link text if ( k->size_linkText <= 1 ) continue; // set Url Url u; u.set ( k->getUrl() , k->size_urlBuf ); // do not allow anomalous link text to match query //if ( k->m_isAnomaly ) continue; char *p = k-> getLinkText(); int32_t plen = k->size_linkText - 1; if ( ! verifyUtf8 ( p , plen ) ) { log("title: set4 bad link text from url=%s", k->getUrl()); continue; } // now the words. Words ww2; if ( ! ww2.set ( k->getLinkText() , k->size_linkText-1, // len TITLEREC_CURRENT_VERSION , true , // computeIds m_niceness ))// niceness // g_errno set on error, return NULL return NULL; if ( !addUniqueWordsToBuf( &m_termInfoBuf, &m_tidTable32, // dedup table NULL, // filter table &TMP, // mincounttable, >=2 counts false, // store counts? &ww2, includeSynonyms)) return NULL; } // how many 32-bit twids do we got? //m_numTwids = m_twbuf.length() / 4; //m_twids = (int32_t *)m_twbuf.getBufStart(); QUICKPOLL(m_niceness); // . sort that buf now // . HACK: only sorts by last 32 bits of termid!!!! qsort ( m_termInfoBuf.getBufStart(), m_termInfoBuf.length() / sizeof(TermInfo), sizeof(TermInfo), // 32-bit twids = 4 bytes twidcmp ); QUICKPOLL(m_niceness); // if no twids then return a -2 ptr, not NULL, that means error // not -1 that means blocked! //if ( m_numTwids == 0 ) m_twids = (int32_t *)-2; // do not repeat this logic //m_twidsValid = true; m_termInfoBufValid = true; // return the vector return &m_termInfoBuf; } // . just like getTermInfoBuf but also includes terms from related queries // that our document does not have! // . we do it this way because for seo.cpp::handleRequest95() it finds // matching queries locally based on getNewTermInfoBuf()'s m_newTermInfoBuf. SafeBuf *XmlDoc::getNewTermInfoBuf ( ) { setStatus ( "getnewterminfobuf" ); if ( m_newTermInfoBufValid ) return &m_newTermInfoBuf; SafeBuf *oldBuf = getTermInfoBuf (); if ( ! oldBuf || oldBuf == (void *) -1 ) return oldBuf; SafeBuf *itBuf = getInsertableTerms(); if ( ! itBuf || itBuf == (void *)-1 ) return itBuf; // this should be valid automatically HashTableX *oldDedupTable = getTermIdBufDedupTable32 ( ); // get old guy if ( ! 
m_newTermInfoBuf.safeMemcpy ( oldBuf ) ) return NULL; // a dedup table on stack HashTableX newDedup32; if (! newDedup32.set(4,0,16384,NULL,0,false,m_niceness,"newdtabl")) return NULL; // now scan the insertable terms buf char *p = itBuf->getBufStart(); char*pend = itBuf->getBuf(); // scan each "term" which might be one or more words for ( ; p < pend ; ) { QUICKPOLL(m_niceness); // cast it InsertableTerm *it = (InsertableTerm *)p; p += it->getSize(); char *term = it->getTerm(); Words ww; ww.set9 ( term , m_niceness ); // we add entries to the dedup table, "newDedup32", // but only filter and not add to "oldDedupTable" if ( ! addUniqueWordsToBuf ( &m_newTermInfoBuf, &newDedup32 , // dedup table oldDedupTable, // filter table NULL, // mincounttable false, &ww , true ) ) return NULL; } QUICKPOLL(m_niceness); // . sort that buf now. // . HACK: only sorts by last 32 bits of termid!!!! qsort ( m_newTermInfoBuf.getBufStart(), m_newTermInfoBuf.length() / sizeof(TermInfo), sizeof(TermInfo), // 32-bit twids = 4 bytes twidcmp ); QUICKPOLL(m_niceness); /* // set the term freq of each one p = m_newTermInfoBuf.getBufStart(); pend = m_newTermInfoBuf.getBuf(); for ( ; p < pend ; ) { QUICKPOLL(m_niceness); TermInfo *ti = (TermInfo *)p; p += sizeof(TermInfo); // look it up int64_t tf = g_posdb.getTermFreq (cr->m_coll,ti->m_termId64); // store it ti->m_termFreq64 = tf; } */ // do not repeat this logic m_newTermInfoBufValid = true; // return the vector return &m_newTermInfoBuf; } bool XmlDoc::addUniqueWordsToBuf ( SafeBuf *termInfoBuf , HashTableX *dedupTable , HashTableX *filterTable , HashTableX *minCountTable , bool storeCounts, Words *ww , bool getSynonyms ) { int32_t nw = ww->getNumWords (); uint64_t *wids = (uint64_t *)ww->getWordIds (); //nodeid_t *tids = ww->getTagIds (); uint8_t *langId = getLangId(); // this should have been set by parent caller if ( ! langId || langId == (uint8_t *)-1 ) {char *xx=NULL;*xx=0; } // store the langId here uint8_t useLangId = *langId; // default that to english i guess if unknown if ( useLangId == langUnknown ) { static XmlDoc *s_lastPrint = NULL; if ( s_lastPrint != this ) { log("seopipe: langid of page is unknown for twid " "synonyms. assuming english."); s_lastPrint = this; } useLangId = langEnglish; } Synonyms syn; //bool inTitle = false; // scan for title for ( int32_t i = 0 ; i < nw ; i++ ) { // breathe QUICKPOLL(m_niceness); // out of a link //if(tids && tids[i] == TAG_TITLE ) inTitle = true; //if(tids && tids[i] == (TAG_TITLE | BACKBIT)) inTitle = false; // count it, limit to 30 //if ( inTitle ) tw++; // skip if not alnumword if ( ! wids[i] ) continue; // make it 32 bit uint32_t wid32 = (uint32_t)wids[i]; // filter table if ( filterTable && filterTable->isInTable(&wid32) ) continue; /* // debug if ( minCountTable && storeCounts ) { int32_t wlen = ww->m_wordLens[i]; char *wptr = ww->m_words[i]; char c= wptr[wlen]; wptr[wlen] = '\0'; log("seo: storecount wid=%"UINT32" word=%s", (uint32_t)((uint64_t)wids[i]),wptr); wptr[wlen] = c; } */ // to avoid link text anomalies, the word must have been // repeated in another link text or a meta tag. should // fix ibm.com from getting 'lincoln' or 'unc' as high-scoring // matching queries. should fix artdaily.com from getting // that foreign language phrase in danish. (bedste pa nettet) // (best of the web) if ( minCountTable && ! storeCounts && minCountTable->getScore32(&wid32) <= 1 ) continue; // get slot if ( dedupTable && dedupTable->isInTable(&wid32) ) continue; // count it! if ( storeCounts && ! 
minCountTable->addTerm32(&wid32) ) return false; // show it //if ( wid32 == 1174583722 && storeCounts ) { // log("seo: storing occurence. current count=%"INT32"", // (int32_t)minCountTable->getScore32(&wid32) ); //} // add it to vector TermInfo ti; ti.m_termId64 = wids[i]; //ti.m_termFreq64 = -1; if ( termInfoBuf && ! termInfoBuf->safeMemcpy(&ti,sizeof(TermInfo)) ) return false; // add it then if ( dedupTable && ! dedupTable->addKey ( &wid32 ) ) return false; // do synonyms now? if ( ! getSynonyms ) continue; // get its synonyms into tmpBuf char tmpBuf[TMPSYNBUFSIZE]; int32_t naids = syn.getSynonyms(ww,i,useLangId,tmpBuf,m_niceness); for ( int32_t j = 0 ; j < naids ; j++ ) { // get it uint32_t aid32 = (uint32_t)syn.m_aids[j]; // get slot if ( dedupTable && dedupTable->isInTable(&aid32) ) continue; // add it to vector TermInfo ti; ti.m_termId64 = syn.m_aids[j]; // 64 bit version //ti.m_termFreq64 = -1; if ( termInfoBuf && ! termInfoBuf->safeMemcpy(&ti,sizeof(TermInfo)) ) return false; // add it then if ( dedupTable && ! dedupTable->addKey(&aid32) ) return false; // count it! if ( storeCounts && ! minCountTable->addTerm32(&aid32)) return false; } } return true; } /* static void gotMsg99ReplyWrapper ( void *state , UdpSlot *slot ) { XmlDoc *THIS = (XmlDoc *)state; THIS->gotMsg99Reply ( slot ); } void XmlDoc::gotMsg99Reply ( UdpSlot *slot ) { // get replying hostid int32_t hostId = slot->m_hostId; // log setStatus ( "gotmsg99reply" ); // sanity if ( hostId < 0 || hostId >= g_hostdb.m_numHosts) {char*xx=NULL;*xx=0;} // save it int32_t i = m_numMsg99Replies; m_msg99ReplyPtrs [i] = slot->m_readBuf; m_msg99ReplySizes[i] = slot->m_readBufSize; m_msg99ReplyAlloc[i] = slot->m_readBufMaxSize; m_msg99HostIds [i] = hostId; // steal it so it doesn't free it slot->m_readBuf = NULL; // note it //log("seopipe: got msg99 reply from host #%"INT32" i=%"INT32" alloc=%"INT32"", // hostId,i,slot->m_readBufMaxSize); // inc the counter m_numMsg99Replies++; // sanity! if ( m_numMsg99Replies > m_numMsg99Requests ) { char *xx=NULL;*xx=0; } if ( m_numMsg99Replies > g_hostdb.m_numHosts ) { char *xx=NULL;*xx=0; } // don't free the sendbuf, it is shared between all hosts UNLESS // we are the last reply received!!! if ( m_numMsg99Replies < g_hostdb.m_numHosts ) slot->m_sendBufAlloc = NULL; // return control to transmit function. it will call m_callback1 // if the function is done. but if a different parent function than // transmit called us then we call that. it just depends on the // intial entry function that called getMatchingQueries() m_masterLoop ( m_masterState ); } */ /* float getQueryImportance2 ( QueryInfo *qi , float myScore ) { // now divide by the top score (or 50th score) for the query // so we can see how high we score relatively speaking... // although, if all search results for this query have the // same score this method kinda sux... float imp = myScore / qe->m_minTop50Score; return imp; // mod because one word query terms get higher scores than // multi-word queries because they are divided by distance in // the search algo. // this hurts 'gigablast' query. if ( qi->m_numUniqueWordForms <= 1 ) score /= 10.0; // multiply by it? score *= qi->m_numUniqueWordForms; // until we have the code to fix things like 'coast to coast' // where the term is repeated, we have to punish... if ( qi->m_numRepeatWordForms >= 1 ) score /= 30.0; // kill 'search+engine+search+engine' if ( qi->m_numRepeatWordForms >= 2 ) score /= 30.0; // if every word in query is repeated... 
push it down // try to fix 'bot+bot' and 'search+search' 'http+http' if ( qi->m_numUniqueWordForms == qi->m_numRepeatWordForms ) score /= 2000.0; // fix 'web search search' if ( qi->m_numRepeatWordForms > 0 && qi->m_numUniqueWordForms == qi->m_numRepeatWordForms + 1 ) score /= 200.0; // try to kill those queries that are just a single stop word // or forms of stop words. // this hurts 'gigablast' query, so make it > .9. no, then crap like // 'web' and 'http' come up too high... if ( qi->m_numUniqueWordForms == 1 ) { score *= (1.1 - qi->m_smallestNormTermFreq); score *= (1.1 - qi->m_smallestNormTermFreq); } // http is very common! so make the 'http' or 'http+http' queries // very low importance if ( qi->m_numControlWordForms == qi->m_numUniqueWordForms ) score /= 1000000.0; // TODO: if query is a single term and it's exact syn min // hash is that for 'and' then kill it. fix 'anding' // boost it for more accuracy since we gotta make it into anint //score *= 1000; return score; } // set Msg99Reply::m_queryImportance for all msg99replies void setQueryImportance ( Msg99Reply **qptrs , int32_t numQueryPtrs ) { } void setQueryImportanceRelated ( QueryRel **qptrs , int32_t numQueryPtrs ) { for ( int32_t i = 0 ; i < numQueryPtrs ; i++ ) { QueryRel *qrel = qptrs[i]; float score = qrel->m_queryInfo.m_myScoreRelated; QueryInfo *qi = &qrel->m_queryInfo; float imp = getQueryImportance2 ( qi , score ); qi->m_queryImportance = imp; } } */ /* int qp99cmp ( const void *a, const void *b ) { Msg99Reply *qa = *(Msg99Reply **)a; Msg99Reply *qb = *(Msg99Reply **)b; // make sure manually added queries are on top if ( qa->m_isManuallyAdded && ! qb->m_isManuallyAdded ) return 1; if ( qb->m_isManuallyAdded && ! qa->m_isManuallyAdded ) return -1; //QueryInfo *qia = &qa->m_queryInfo; //QueryInfo *qib = &qb->m_queryInfo; // get scores float scorea = qa->m_queryImportance; float scoreb = qb->m_queryImportance; if ( scorea < scoreb ) return 1; if ( scorea > scoreb ) return -1; // fallback to traffic otherwise i guess int32_t traffica = qa->m_queryLogEntry.m_gigablastTraffic; int32_t trafficb = qb->m_queryLogEntry.m_gigablastTraffic; if ( qa->m_queryLogEntry.m_googleTraffic != -1 ) traffica = qa->m_queryLogEntry.m_googleTraffic; if ( qb->m_queryLogEntry.m_googleTraffic != -1 ) trafficb = qb->m_queryLogEntry.m_googleTraffic; if ( traffica < trafficb ) return 1; if ( traffica > trafficb ) return -1; // fallback alphabetical otherwise? char *qsa = qa->m_queryStr; char *qsb = qb->m_queryStr; if ( ! qsa ) return 0; if ( ! qsb ) return 0; return strcmp( qsa , qsb ); //return 0; } */ #include "Cachedb.h" // . only check cachedb once per url // . return false if blocked, true otherwise // . returns true and sets g_errno on error bool XmlDoc::checkCachedb ( ) { if ( ! m_readFromCachedb ) return true; CollectionRec *cr = getCollRec(); if ( ! cr ) return true; // already set? //if ( m_seoInfoSetFromCache ) // return true; // return -1 if this blocked if ( ! m_checkedCachedb ) { // we now use the contenthash as part of the key because the // data we cache is dependent on the content. i guess we don't // need to use the user id then... int32_t *ch32p = getContentHash32(); if ( ! ch32p ) return true; if ( ch32p == (void *)-1 ) return false; int32_t ch32 = *ch32p; // include spider date now in case indexed copy changes // site rank, tags, etc. if ( m_spideredTimeValid ) ch32 ^= m_spideredTime; // first check cachedb. 
enum type cr_MatchingQueries int32_t uh32 ; uh32 =(uint32_t)((uint64_t)getFirstUrlHash64()); key_t sk = g_cachedb.makeStartKey ( uh32 , ch32 ); key_t ek = g_cachedb.makeEndKey ( uh32 , ch32 ); // debug log("seo: checking cachedb uh32=%"UINT32" ch32=%"UINT32"", (uint32_t)uh32, (uint32_t)ch32); // do not repeat m_checkedCachedb = true; // . get it from the appropriate host // . get cachedb rec for all types of safebufs for this // url/content // . then we will set safebufs based on what recs we find // in the returned list if ( ! m_msg0.getList ( -1, // hostid 0 , // ip 0 , // port 0 , // maxcacheage false, // addtocache? RDB_CACHEDB, cr->m_collnum , &m_cacheList, (char *)&sk , (char *)&ek , 30000000, // minrecsizes 30MB m_masterState, m_masterLoop, m_niceness ) ) // return FALSE if this blocks return false; } if ( m_processedCachedbReply ) return true; // only scan list once m_processedCachedbReply = true; // if empty, that was easy if ( m_cacheList.isEmpty() ) return true; // we might have one rec set from cache and another not, and we // still want to cache the one that is not in storeIntoCachedb()! //m_seoInfoSetFromCache = true; // otherwise, parse out the cache recs for ( ; ! m_cacheList.isExhausted() ; m_cacheList.skipCurrentRec() ) { // breathe QUICKPOLL(m_niceness); // get it char *rec = m_cacheList.getCurrentRec(); // . get type of cached rec // . enum types cr_MatchingQueries etc. as in Cachedb.h char recType = g_cachedb.getTypeFromKey(rec); int32_t dataSize = m_cacheList.getCurrentDataSize(); // sanity. must at least have the cached date if ( dataSize < 4 ) { char *xx=NULL;*xx=0; } char *data = m_cacheList.getCurrentData (); // in data, first int32_t is the cached time in utc //int32_t cachedDate = *(int32_t *)data; // skip the TIMESTAMP! //int32_t timestamp = *(int32_t *)data; data += 4; dataSize -= 4; // and version data += 4; dataSize -= 4; // . 1 // . is it a cached rec for matching queries? // . getSeoQueryInfo() needs this if (recType == cr_MatchingQueries && !m_matchingQueryBufValid){ // debug log("seo: found matching queries"); // total size of the msg99replies (totalMsg99ReplySize) int32_t size1 = *(int32_t *)data; data += 4; // just point into the list itself. we will // free m_cacheList on reset then. m_matchingQueryBuf.setBuf ( data , size1 , // size size1 , // allocated false , // owndata? 0 ); // encoding none data += size1; // now the m_queryLinkStringBuf size1 = *(int32_t *)data; data += 4; m_matchingQueryStringBuf.setBuf ( data , size1 , // size size1 , // allocated false , // owndata? 0 ); // encoding data += size1; m_matchingQueryBufValid = true; continue; } // . 2 // . is it a cached rec for related docis with titles? // . getSeoQueryInfo() calls getRelatedDocIdsWithTitles() // . m_relatedDocIds SafeBuf is buf if RelatedDocId classes // . m_relatedTitleBuf is buf of titles and urls referenced // by those classes if ( recType == cr_RelatedDocIds && ! m_relatedDocIdsWithTitlesValid ) { // debug log("seo: found related docids"); // first is the safebuf of RelatedDocId classes int32_t size1 = *(int32_t *)data; data += 4; // point into it //char *p = data; //char *pend = data + size1; // just point into the list itself. we will // free m_cacheList on reset then. m_relatedDocIdBuf.setBuf ( data , size1 , // size size1 , // allocated false , // owndata? 
0 ); // encoding none // skip that data += size1; size1 = *(int32_t *)data; data += 4; // save this //char *rtbuf = data; // now the string buffer m_relatedTitleBuf.setBuf ( data , size1 , size1 , false , 0 ); // skip that data += size1; size1 = *(int32_t *)data; data += 4; // now the string buffer m_commonQueryNumBuf.setBuf ( data , size1 , size1 , false , 0 ); // now the RelatedDocId::ptr_url/ptr_rd_title members // were hacked to be offsets into this for storage // into the cache! /* for ( ; p < pend ; p += sizeof(RelatedDocId) ) { QUICKPOLL(m_niceness); // cast it RelatedDocId *rd = (RelatedDocId *)p; // get offsets int32_t off1 = (int32_t)rd->ptr_rd_title; int32_t off2 = (int32_t)rd->ptr_rd_url; int32_t off3 = (int32_t)rd->ptr_rd_site; // normalize/store back rd->ptr_rd_title = rtbuf + off1; rd->ptr_rd_url = rtbuf + off2; rd->ptr_rd_site = rtbuf + off3; } */ m_relatedDocIdsWithTitlesValid = true; m_relatedTitleBufValid = true; m_relatedDocIdBufValid = true; continue; } // . 3 // . is it a cached rec for related docis with titles? // . getSeoQueryInfo() calls getRelatedQueryBuf() if ( recType == cr_RelatedQueries && ! m_queryLinkBufValid ) { // we changed the format of relatedquerystringbuf // to be a bunch of QueryLogEntries now. so ignore // if old format. //if ( timestamp <= 1367704324 ) continue; // debug log("seo: found related queries"); int32_t size1; // first is the safebuf m_queryLinkBuf of QueryLinks size1 = *(int32_t *)data; data += 4; m_relatedQueryBuf.setBuf ( data , size1 , // size size1 , // allocated false , // owndata? 0 ); // encoding none data += size1; // now the m_queryLinkStringBuf size1 = *(int32_t *)data; data += 4; m_relatedQueryStringBuf.setBuf ( data , size1 , // size size1 , // allocated false , // owndata? 0 ); // encoding data += size1; /* // now the ptrs, sorted size1 = *(int32_t *)data; data += 4; m_relPtrs.setBuf ( data , size1 , // size size1 , // allocated false , // owndata? 0 ); // encoding none // test sorting char *p = m_relPtrs.getBufStart(); char *pend = m_relPtrs.getBuf(); char *base = m_queryLinkBuf.getBufStart(); QueryLink *lastqr = NULL; for ( ; p < pend ; p += 4 ) { QUICKPOLL(m_niceness); int32_t qkOff = *(int32_t *)p; QueryLink *qr = (QueryRel *)(base+qkOff); // no, longer, it is more complicated because // if m_uniqueRound scoring addition //if ( lastqr && // lastqr->m_totalRelatedQueryImportance < // qr ->m_totalRelatedQueryImportance ) { // char *xx=NULL;*xx=0;} lastqr = qr; } */ // validate //m_relPtrsValid = true; //m_queryLinkStringBufValid = true; m_relatedQueryBufValid = true; continue; } // if it is debug and we are not, skip it!! //if(recType == cr_ScoredInsertableTermsDebug && ! m_seoDebug ) // continue; // or if we are debug and it is not, skip it! //if (recType == cr_ScoredInsertableTerms && m_seoDebug ) // continue; /* if ( (recType == cr_MissingTermBuf ) && ! m_missingTermBufValid ) { // debug log("seo: found missingtermbuf"); int32_t size1; size1 = *(int32_t *)data; data += 4; m_missingTermBuf.setBuf ( data , size1 , // size size1 , // allocated false , // owndata? 0 ); // encoding none m_missingTermBufValid = true; } */ // 3b if ( (recType == cr_WordPosInfoBuf ) && ! m_wordPosInfoBufValid ) { // debug log("seo: found wordposinfo"); int32_t size1; size1 = *(int32_t *)data; data += 4; m_wordPosInfoBuf.setBuf ( data , size1 , // size size1 , // allocated false , // owndata? 
0 ); // encoding none // WordPosInfo::m_term relative to ptr_utf8Content char *p = m_wordPosInfoBuf.getBufStart(); char *pend = m_wordPosInfoBuf.getBuf(); for ( ; p < pend ; p += sizeof(WordPosInfo) ) { QUICKPOLL(m_niceness); WordPosInfo *wp = (WordPosInfo *)p; int64_t off = (int64_t)wp->m_wordPtr; char *ptr = ptr_utf8Content + off; if ( off == -1 ) ptr = NULL; wp->m_wordPtr = ptr; } m_wordPosInfoBufValid = true; } // . 4 // . and the insertable terms buffer with its querychanges // linked lists! if ( recType == cr_ScoredInsertableTerms && ! m_scoredInsertableTermsBufValid ) { // debug log("seo: found scored insertable terms"); int32_t size1; // first is the safebuf m_queryLinkBuf of QueryLinks size1 = *(int32_t *)data; data += 4; // just point into the list itself. we will // free m_cacheList on reset then. m_insertableTermsBuf.setBuf ( data , size1 , // size size1 , // allocated false , // owndata? 0 ); // encoding none // skip that data += size1; size1 = *(int32_t *)data; data += 4; // now the buffer of query changes // these are normally just referenced by // InsertableTerm and in the linked list directly // into the Msg95Reply::ptr_queryChanges, but for // caching we have to use a new safebuf m_queryChangeBuf.setBuf ( data , size1 , // size size1 , // allocated false , // owndata? 0 ); // encoding none // skip that data += size1; size1 = *(int32_t *)data; data += 4; m_queryLogBuf.setBuf ( data , size1 , // size size1 , // allocated false , // owndata? 0 ); // encoding none /* // skip that data += size1; size1 = *(int32_t *)data; data += 4; m_itStrBuf.setBuf ( data , size1 , // size size1 , // allocated false , // owndata? 0 ); // encoding none */ /* // debug scoring. QueryChange::m_debugScoreInfoOffset data += size1; size1 = *(int32_t *)data; data += 4; m_debugScoreInfoBuf.setBuf ( data , size1 , // size size1 , // allocated false , // owndata? 0 ); // encoding none // debug scoring. QueryChange::m_origScoreInfoOffset data += size1; size1 = *(int32_t *)data; data += 4; m_origScoreInfoBuf.setBuf ( data , size1 , // size size1 , // allocated false , // owndata? 0 ); // encoding none */ // insertable terms deserialization logic char *p = m_insertableTermsBuf.getBufStart(); char *pend = m_insertableTermsBuf.getBuf(); for ( ; p < pend ; ) { QUICKPOLL(m_niceness); // cast it InsertableTerm *it = (InsertableTerm *)p; p += it->getSize(); // normalize m_firstQueryChange int64_t off =(int64_t)(it->m_firstQueryChange); // fix this char *buf = m_queryChangeBuf.getBufStart(); // int16_tcut QueryChange *fqc = (QueryChange *)(buf+off); // -1 means NULL if ( off == -1 ) fqc = NULL; // put back it->m_firstQueryChange = fqc; // terms //off = (int32_t)it->m_termStr; // to this //buf = m_itStrBuf.getBufStart(); // cast it //it->m_termStr = (char *)(buf+off); } // . now we set QueryChange::m_next and // InsertableTerm::m_firstQueryChange to be offsets // into the new m_queryChangeBuf before we stored // into the cache.... p = m_queryChangeBuf.getBufStart(); pend = m_queryChangeBuf.getBuf(); for ( ; p < pend ; p += sizeof(QueryChange) ) { QUICKPOLL(m_niceness); // cast it QueryChange *qc = (QueryChange *)p; // normalize m_next int64_t off = (int64_t)qc->m_next; // offset into this char *buf = m_queryChangeBuf.getBufStart(); // put back qc->m_next = (QueryChange *)(buf + off); // -1 means NULL if ( off == -1 ) qc->m_next = NULL; } // now all ptrs should be set correctly m_scoredInsertableTermsBufValid = true; m_insertableTermsBufValid = true; continue; } // . 2 // . 
is it a cached rec for related docis with titles? // . getSeoQueryInfo() calls getRelatedDocIdsWithTitles() // . m_relatedDocIds SafeBuf is buf if RelatedDocId classes // . m_relatedTitleBuf is buf of titles and urls referenced // by those classes if ( recType == cr_RecommendedLinks && ! m_recommendedLinksBufValid ) { // debug log("seo: found recommended links buf"); // first is the safebuf of RelatedDocId classes int32_t size1 = *(int32_t *)data; data += 4; // now the string buffer m_recommendedLinksBuf.setBuf ( data , size1 , size1 , false , 0 ); m_recommendedLinksBufValid = true; continue; } } return true; } #define CACHEDB_CURRENT_VERSION 1 // . returns false if blocked, true otherwise // . returns true and sets g_errno on error // . flush the msg4 until it completes i guess bool XmlDoc::storeMatchingQueriesIntoCachedb ( ) { if ( ! m_writeToCachedb ) return true; int32_t *ch32p = getContentHash32(); if ( ! ch32p ) return true; if ( ch32p == (void *)-1 ) return false; int32_t ch32 = *ch32p; // include spider date now in case indexed copy changes // site rank, tags, etc. if ( m_spideredTimeValid ) ch32 ^= m_spideredTime; CollectionRec *cr = getCollRec(); if ( ! cr ) return true; // all these things should already be validated so they should // not block or have errors //SafeBuf *qpbuf = getMatchingQueriesScored(); //SafeBuf *qpbuf = &m_queryPtrs; if ( ! m_matchingQueryBufValid ) { char *xx=NULL;*xx=0; } int32_t now = getTimeGlobal(); // calc how much space we need //int32_t totalMsg99ReplySize = 0; //int32_t numQueryPtrs = 0; //Msg99Reply **qptrs = NULL; // 1. msg99replies for matchingQueries int32_t need = 0; need += sizeof(key_t) + 4 + 4+4; // key + dataSize+cacheDate(now)+ver need += 4 + m_matchingQueryBuf.length(); need += 4 + m_matchingQueryStringBuf.length(); // sanity if ( need > 20000000 ) { log("cachedb: mq listsize %"INT32" too big for cachedb",need); return true; } SafeBuf listBuf; // add 1 byte padding to ensure copying a 0 byte buf to listBuf // does not trigger a reserve if ( ! listBuf.reserve ( need + 4 ) ) return true; // ensure no reallocating - that would screw logic below up char *orig = listBuf.getBufStart(); int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64()); key_t k; int32_t dataSize = 0; // // 1. first add the matching queries, msg99 replies // k = g_cachedb.makeKey ( uh32, ch32 , cr_MatchingQueries ); // note it log("seo: cachedb storing matchingqueries " "uh32=%"UINT32" ch32=%"UINT32"" ,(uint32_t)uh32,(uint32_t)ch32); listBuf.safeMemcpy ( &k , sizeof(key_t) ); dataSize = 0; dataSize += 4; // timestamp dataSize += 4; // version dataSize += 4 + m_matchingQueryBuf.length(); dataSize += 4 + m_matchingQueryStringBuf.length(); listBuf.pushLong ( dataSize ); listBuf.pushLong ( now ); // cached date listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION ); listBuf.pushLong ( m_matchingQueryBuf.length() ); listBuf.safeMemcpy ( &m_matchingQueryBuf ); listBuf.pushLong ( m_matchingQueryStringBuf.length() ); listBuf.safeMemcpy ( &m_matchingQueryStringBuf ); if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; } // ensure list did not realloc, that would screw up everything! if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; } key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 ); key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 ); // . list is ready now // . 
this only returns when each record has been added m_storeList.set ( listBuf.getBufStart() , listBuf.length() , listBuf.getBufStart() , // alloc listBuf.getCapacity(), // allocsize startKey, endKey, -1, // fixeddatasize true, // owndata? false ); // use half keys? // disconnect it from safebuf so it doesn't get freed listBuf.detachBuf(); m_storeList.printList(); QUICKPOLL(m_niceness); log("xmldoc: adding matching query list of %"INT32" bytes to cachedb", m_storeList.m_listSize); // returns false if it blocks, true otherwise if ( ! m_msg1.addList ( &m_storeList, RDB_CACHEDB , cr->m_collnum, m_masterState, m_masterLoop, false, // forcelocal? m_niceness ) ) return false; return true; } bool XmlDoc::storeRelatedDocIdsIntoCachedb ( ) { if ( ! m_writeToCachedb ) return true; if ( ! m_queryPtrsWholeValid ) { char *xx=NULL;*xx=0; } CollectionRec *cr = getCollRec(); if ( ! cr ) return true; int32_t *ch32p = getContentHash32(); if ( ! ch32p ) return true; if ( ch32p == (void *)-1 ) return false; int32_t ch32 = *ch32p; // include spider date now in case indexed copy changes // site rank, tags, etc. if ( m_spideredTimeValid ) ch32 ^= m_spideredTime; if ( ! m_relatedDocIdsWithTitlesValid ) { char *xx=NULL;*xx=0;} if ( ! m_relatedTitleBufValid ) { char *xx=NULL;*xx=0;} // 2. related docids int32_t need = 0; need += sizeof(key_t) + 4 + 4+4; // key + dataSize + timestamp + ver need += 4 + m_relatedDocIdBuf.length(); need += 4 + m_relatedTitleBuf.length(); need += 4 + m_commonQueryNumBuf.length(); // sanity if ( need > 20000000 ) { log("cachedb: rd listsize %"INT32" too big for cachedb",need); return true; } SafeBuf listBuf; // add 1 byte padding to ensure copying a 0 byte buf to listBuf // does not trigger a reserve if ( ! listBuf.reserve ( need + 4 ) ) return true; // ensure no reallocating - that would screw logic below up char *orig = listBuf.getBufStart(); int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64()); key_t k; int32_t dataSize = 0; char *p1; char *p2; int32_t now = getTimeGlobal(); // 2. then add related docids k = g_cachedb.makeKey ( uh32 , ch32, cr_RelatedDocIds ); // note it log("seo: cachedb storing relateddocids " "uh32=%"UINT32" ch32=%"UINT32"" ,(uint32_t)uh32,(uint32_t)ch32); listBuf.safeMemcpy ( &k , sizeof(key_t) ); dataSize = 0; dataSize += 4; // timestamp dataSize += 4; // version dataSize += 4 + m_relatedDocIdBuf.length(); dataSize += 4 + m_relatedTitleBuf.length(); dataSize += 4 + m_commonQueryNumBuf.length(); listBuf.pushLong ( dataSize ); listBuf.pushLong ( now ); // cached date listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION ); listBuf.pushLong ( m_relatedDocIdBuf.length() ); p1 = listBuf.getBuf(); listBuf.safeMemcpy ( &m_relatedDocIdBuf ); p2 = listBuf.getBuf(); listBuf.pushLong ( m_relatedTitleBuf.length() ); listBuf.safeMemcpy ( &m_relatedTitleBuf ); //char *tbuf = m_relatedTitleBuf.getBufStart(); listBuf.pushLong ( m_commonQueryNumBuf.length() ); listBuf.safeMemcpy ( &m_commonQueryNumBuf ); // make ptrs into offsets into m_relatedTitleBuf /* for ( ; p1 < p2 ; p1 += sizeof(RelatedDocId )) { QUICKPOLL(m_niceness); RelatedDocId *rd = (RelatedDocId *)p1; int32_t off; off = rd->ptr_rd_url - tbuf; rd->ptr_rd_url = (char *)off; off = rd->ptr_rd_title - tbuf; rd->ptr_rd_title = (char *)off; off = rd->ptr_rd_site - tbuf; rd->ptr_rd_site = (char *)off; } */ if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; } // ensure list did not realloc, that would screw up everything! 
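// ------------------------------------------------------------------
// Editor's aside (illustrative only, not part of the build): every
// record stored into cachedb here uses the same layout, a key_t key,
// a 4-byte dataSize, a 4-byte timestamp, a 4-byte version, and then a
// run of length-prefixed buffers, which is the layout checkCachedb()
// parses back. Because m_storeList.set() and the pointers captured
// during assembly (orig, p1, p2) reference listBuf's memory directly,
// the buffer is reserve()d for the full "need" up front and any
// reallocation is treated as fatal below. A minimal sketch of the
// length-prefixed portion, using hypothetical appendChunk / readChunk
// helpers on a std::string instead of a SafeBuf:
#if 0
#include <cstdint>
#include <cstring>
#include <string>

// append one [int32_t length][payload] chunk to a flat record
static void appendChunk ( std::string &rec , const void *p , int32_t len ) {
	rec.append ( (const char *)&len , 4 );
	rec.append ( (const char *)p , len );
}

// read the next chunk; returns a pointer just past it
static const char *readChunk ( const char *p ,
			       const char **payload , int32_t *len ) {
	memcpy ( len , p , 4 );
	*payload = p + 4;
	return p + 4 + *len;
}
#endif
// ------------------------------------------------------------------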
if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; } key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 ); key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 ); // . list is ready now // . this only returns when each record has been added m_storeList.set ( listBuf.getBufStart() , listBuf.length() , listBuf.getBufStart() , // alloc listBuf.getCapacity(), // allocsize startKey, endKey, -1, // fixeddatasize true, // owndata? false ); // use half keys? // disconnect it from safebuf so it doesn't get freed listBuf.detachBuf(); m_storeList.printList(); QUICKPOLL(m_niceness); log("xmldoc: adding related docids list of %"INT32" bytes to cachedb", m_storeList.m_listSize); // returns false if it blocks, true otherwise if ( ! m_msg1.addList ( &m_storeList, RDB_CACHEDB , cr->m_collnum, m_masterState, m_masterLoop, false, // forcelocal? m_niceness ) ) return false; return true; } // . returns false if blocked, true otherwise // . returns true and sets g_errno on error bool XmlDoc::storeRecommendedLinksBuf ( ) { if ( ! m_writeToCachedb ) return true; int32_t *ch32p = getContentHash32(); if ( ! ch32p ) return true; if ( ch32p == (void *)-1 ) return false; int32_t ch32 = *ch32p; // include spider date now in case indexed copy changes // site rank, tags, etc. if ( m_spideredTimeValid ) ch32 ^= m_spideredTime; CollectionRec *cr = getCollRec(); if ( ! cr ) return true; if ( ! m_recommendedLinksBufValid ) { char *xx=NULL;*xx=0;} int32_t need = 0; need += sizeof(key_t) + 4 + 4+4; // key + dataSize + timestamp + ver need += 4 + m_recommendedLinksBuf.length(); // sanity if ( need > 20000000 ) { log("cachedb: reclnx listsize %"INT32" too big for cachedb",need); return true; } SafeBuf listBuf; // add 1 byte padding to ensure copying a 0 byte buf to listBuf // does not trigger a reserve if ( ! listBuf.reserve ( need + 4 ) ) return true; // ensure no reallocating - that would screw logic below up char *orig = listBuf.getBufStart(); int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64()); key_t k; int32_t dataSize = 0; int32_t now = getTimeGlobal(); // 2. then add related docids k = g_cachedb.makeKey ( uh32 , ch32, cr_RecommendedLinks ); // note it log("seo: cachedb storing recommendedlinksbuf " "uh32=%"UINT32" ch32=%"UINT32"" ,(uint32_t)uh32,(uint32_t)ch32); listBuf.safeMemcpy ( &k , sizeof(key_t) ); dataSize = 0; dataSize += 4; // timestamp dataSize += 4; // version dataSize += 4 + m_recommendedLinksBuf.length(); listBuf.pushLong ( dataSize ); listBuf.pushLong ( now ); // cached date listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION ); listBuf.pushLong ( m_recommendedLinksBuf.length() ); listBuf.safeMemcpy ( &m_recommendedLinksBuf ); if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; } // ensure list did not realloc, that would screw up everything! if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; } key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 ); key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 ); // . list is ready now // . this only returns when each record has been added m_storeList.set ( listBuf.getBufStart() , listBuf.length() , listBuf.getBufStart() , // alloc listBuf.getCapacity(), // allocsize startKey, endKey, -1, // fixeddatasize true, // owndata? false ); // use half keys? // disconnect it from safebuf so it doesn't get freed listBuf.detachBuf(); m_storeList.printList(); QUICKPOLL(m_niceness); log("xmldoc: adding recommendedlinksbuf list of %"INT32" bytes to cachedb", m_storeList.m_listSize); // returns false if it blocks, true otherwise if ( ! 
m_msg1.addList ( &m_storeList, RDB_CACHEDB , cr->m_collnum, m_masterState, m_masterLoop, false, // forcelocal? m_niceness ) ) return false; return true; } // . returns false if blocked, true otherwise // . returns true and sets g_errno on error bool XmlDoc::storeRelatedQueriesIntoCachedb ( ) { if ( ! m_writeToCachedb ) return true; if ( ! m_relatedQueryBufValid ) { char *xx=NULL;*xx=0; } int32_t *ch32p = getContentHash32(); if ( ! ch32p ) return true; if ( ch32p == (void *)-1 ) return false; int32_t ch32 = *ch32p; // include spider date now in case indexed copy changes // site rank, tags, etc. if ( m_spideredTimeValid ) ch32 ^= m_spideredTime; CollectionRec *cr = getCollRec(); if ( ! cr ) return true; //SafeBuf *relBuf = NULL; //if ( m_relPtrsValid ) relBuf = &m_relPtrs; int32_t now = getTimeGlobal(); // calc how much space we need int32_t need = 0; // 3. related queries. buf of QueryLinks need += sizeof(key_t) + 4 + 4 +4; // key + dataSize + timestamp + ver need += 4 + m_relatedQueryBuf.length(); need += 4 + m_relatedQueryStringBuf.length(); //need += 4 + m_relPtrs.length(); // sanity if ( need > 20000000 ) { log("cachedb: rq listsize %"INT32" too big for cachedb",need); return true; } SafeBuf listBuf; // add 1 byte padding to ensure copying a 0 byte buf to listBuf // does not trigger a reserve if ( ! listBuf.reserve ( need + 4 ) ) return true; // ensure no reallocating - that would screw logic below up char *orig = listBuf.getBufStart(); int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64()); key_t k; int32_t dataSize = 0; // // 3. then related queries (STORED by m_queryImportanceRelated) // //int32_t sizeRels = (m_relPtrs.length() / 4) * sizeof(QueryLink); k = g_cachedb.makeKey ( uh32 , ch32, cr_RelatedQueries ); // note it log("seo: cachedb storing relatedqueries " "uh32=%"UINT32" ch32=%"UINT32"" ,(uint32_t)uh32,(uint32_t)ch32); listBuf.safeMemcpy ( &k , sizeof(key_t) ); dataSize = 0; dataSize += 4; // timestamp dataSize += 4; // version dataSize += 4 + m_relatedQueryBuf.length(); // sizeRels; dataSize += 4 + m_relatedQueryStringBuf.length(); //dataSize += 4 + m_relPtrs.length(); listBuf.pushLong ( dataSize ); listBuf.pushLong ( now ); // cached date listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION ); listBuf.pushLong ( m_relatedQueryBuf.length() ); //char *p3 = listBuf.getBuf(); listBuf.safeMemcpy ( &m_relatedQueryBuf ); //char *p4 = listBuf.getBuf(); listBuf.pushLong ( m_relatedQueryStringBuf.length() ); listBuf.safeMemcpy ( &m_relatedQueryStringBuf ); //listBuf.pushLong ( m_relPtrs.length() ); //char *p5 = listBuf.getBuf(); //listBuf.safeMemcpy ( &m_relPtrs ); // sanity tests if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; } if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; } // ensure list did not realloc, that would screw up everything! if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; } key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 ); key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 ); // . list is ready now // . this only returns when each record has been added m_storeList.set ( listBuf.getBufStart() , listBuf.length() , listBuf.getBufStart() , // alloc listBuf.getCapacity(), // allocsize startKey, endKey, -1, // fixeddatasize true, // owndata? false ); // use half keys? // disconnect it from safebuf so it doesn't get freed listBuf.detachBuf(); m_storeList.printList(); QUICKPOLL(m_niceness); log("xmldoc: adding related queries list of %"INT32" bytes to cachedb", m_storeList.m_listSize); // returns false if it blocks, true otherwise if ( ! 
m_msg1.addList ( &m_storeList, RDB_CACHEDB , cr->m_collnum, m_masterState, m_masterLoop, false, // forcelocal? m_niceness ) ) return false; return true; } bool XmlDoc::storeWordPosInfoBufIntoCachedb ( ) { if ( ! m_writeToCachedb ) return true; if ( ! m_wordPosInfoBufValid ) { char *xx=NULL;*xx=0; } int32_t *ch32p = getContentHash32(); if ( ! ch32p ) return true; if ( ch32p == (void *)-1 ) return false; int32_t ch32 = *ch32p; // include spider date now in case indexed copy changes // site rank, tags, etc. if ( m_spideredTimeValid ) ch32 ^= m_spideredTime; CollectionRec *cr = getCollRec(); if ( ! cr ) return true; int32_t now = getTimeGlobal(); // calc how much space we need int32_t need = 0; need += sizeof(key_t) + 4 + 4+4; // key + dataSize + timestamp + ver need += 4 + m_wordPosInfoBuf.length(); // sanity if ( need > 20000000 ) { log("cachedb: wpi listsize %"INT32" too big for cachedb",need); return true; } SafeBuf listBuf; // add 1 byte padding to ensure copying a 0 byte buf to listBuf // does not trigger a reserve if ( ! listBuf.reserve ( need + 4 ) ) return true; // ensure no reallocating - that would screw logic below up char *orig = listBuf.getBufStart(); int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64()); key_t k; int32_t dataSize = 0; // 4. then the insertable terms and their query changes and log buf // mangle key a little if in debug mode because that is the only // time we compute and store m_debugScoreInfoBuf and m_origScoreInfoBuf uint8_t cr8 = cr_WordPosInfoBuf; k = g_cachedb.makeKey ( uh32 , ch32, cr8 ); // note it log("seo: cachedb storing wordposinfobuf " "uh32=%"UINT32" ch32=%"UINT32"" ,(uint32_t)uh32,(uint32_t)ch32); listBuf.safeMemcpy ( &k , sizeof(key_t) ); dataSize = 0; dataSize += 4; // timestamp dataSize += 4; // version dataSize += 4 + m_wordPosInfoBuf.length(); listBuf.pushLong ( dataSize ); listBuf.pushLong ( now ); // cached date listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION ); listBuf.pushLong ( m_wordPosInfoBuf.length() ); char *p8 = listBuf.getBuf(); listBuf.safeMemcpy ( &m_wordPosInfoBuf ); char *p9 = listBuf.getBuf(); if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; } // WordPosInfo::m_term relative to html ptr_utf8Content! for ( ; p8 < p9 ; p8 += sizeof(WordPosInfo) ) { QUICKPOLL(m_niceness); WordPosInfo *wp = (WordPosInfo *)p8; int64_t off = wp->m_wordPtr - ptr_utf8Content; // if its a tag or fielded term it won't be in the // html like ext:html or filetype:html if ( wp->m_wordPtr< ptr_utf8Content ) off = -1; if ( wp->m_wordPtr>=ptr_utf8Content +size_utf8Content) off = -1; wp->m_wordPtr = (char *)off; } if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; } // ensure list did not realloc, that would screw up everything! if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; } key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 ); key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 ); // . list is ready now // . this only returns when each record has been added m_storeList.set ( listBuf.getBufStart() , listBuf.length() , listBuf.getBufStart() , // alloc listBuf.getCapacity(), // allocsize startKey, endKey, -1, // fixeddatasize true, // owndata? false ); // use half keys? // disconnect it from safebuf so it doesn't get freed listBuf.detachBuf(); m_storeList.printList(); QUICKPOLL(m_niceness); log("xmldoc: adding wordposinfobuf list of %"INT32" bytes to cachedb", m_storeList.m_listSize); // returns false if it blocks, true otherwise if ( ! 
m_msg1.addList ( &m_storeList, RDB_CACHEDB , cr->m_collnum, m_masterState, m_masterLoop, false, // forcelocal? m_niceness ) ) return false; return true; } /* bool XmlDoc::storeMissingTermBufIntoCachedb ( ) { if ( ! m_writeToCachedb ) return true; if ( ! m_missingTermBufValid ) { char *xx=NULL;*xx=0; } int32_t *ch32p = getContentHash32(); if ( ! ch32p ) return true; if ( ch32p == (void *)-1 ) return false; int32_t ch32 = *ch32p; // include spider date now in case indexed copy changes // site rank, tags, etc. if ( m_spideredTimeValid ) ch32 ^= m_spideredTime; int32_t now = getTimeGlobal(); // calc how much space we need int32_t need = 0; need += sizeof(key_t) + 4 + 4+4; // key + dataSize + timestamp + ver need += 4 + m_missingTermBuf.length(); // sanity if ( need > 20000000 ) { log("cachedb: wpi listsize %"INT32" too big for cachedb",need); return true; } SafeBuf listBuf; // add 1 byte padding to ensure copying a 0 byte buf to listBuf // does not trigger a reserve if ( ! listBuf.reserve ( need + 4 ) ) return true; // ensure no reallocating - that would screw logic below up char *orig = listBuf.getBufStart(); int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64()); key_t k; int32_t dataSize = 0; // 4. then the insertable terms and their query changes and log buf // mangle key a little if in debug mode because that is the only // time we compute and store m_debugScoreInfoBuf and m_origScoreInfoBuf uint8_t cr = cr_MissingTermBuf; k = g_cachedb.makeKey ( uh32 , ch32, cr ); // note it log("seo: cachedb storing missingtermbuf " "uh32=%"UINT32" ch32=%"UINT32"",uh32,ch32); listBuf.safeMemcpy ( &k , sizeof(key_t) ); dataSize = 0; dataSize += 4; // timestamp dataSize += 4; // version dataSize += 4 + m_missingTermBuf.length(); listBuf.pushLong ( dataSize ); listBuf.pushLong ( now ); // cached date listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION ); listBuf.pushLong ( m_missingTermBuf.length() ); listBuf.safeMemcpy ( &m_missingTermBuf ); if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; } if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; } // ensure list did not realloc, that would screw up everything! if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; } key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 ); key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 ); // . list is ready now // . this only returns when each record has been added m_storeList.set ( listBuf.getBufStart() , listBuf.length() , listBuf.getBufStart() , // alloc listBuf.getCapacity(), // allocsize startKey, endKey, -1, // fixeddatasize true, // owndata? false ); // use half keys? // disconnect it from safebuf so it doesn't get freed listBuf.detachBuf(); m_storeList.printList(); QUICKPOLL(m_niceness); log("xmldoc: adding missingtermbuf list of %"INT32" bytes to cachedb", m_storeList.m_listSize); // returns false if it blocks, true otherwise if ( ! m_msg1.addList ( &m_storeList, RDB_CACHEDB , cr->m_collnum, m_masterState, m_masterLoop, false, // forcelocal? m_niceness ) ) return false; return true; } */ // . returns false if blocked, true otherwise // . returns true and sets g_errno on error // . flush the msg4 until it completes i guess bool XmlDoc::storeScoredInsertableTermsIntoCachedb ( ) { if ( ! m_writeToCachedb ) return true; if ( ! m_scoredInsertableTermsBufValid ) return true; int32_t *ch32p = getContentHash32(); if ( ! ch32p ) return true; if ( ch32p == (void *)-1 ) return false; int32_t ch32 = *ch32p; // include spider date now in case indexed copy changes // site rank, tags, etc. 
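	// . (editor note) every store*IntoCachedb() routine derives its cachedb
	//   key the same way: uh32 is the low 32 bits of the first-url hash and
	//   ch32 is the content hash, optionally xor'd with the spider time so a
	//   re-spidered copy gets a fresh record. condensed sketch only -- the
	//   real code above/below also checks getContentHash32() for errors and
	//   blocking before dereferencing it:
	//     int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64());
	//     int32_t ch32 = *getContentHash32();
	//     if ( m_spideredTimeValid ) ch32 ^= m_spideredTime;
	//     key_t k = g_cachedb.makeKey ( uh32, ch32, cr_ScoredInsertableTerms );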
if ( m_spideredTimeValid ) ch32 ^= m_spideredTime; CollectionRec *cr = getCollRec(); if ( ! cr ) return true; int32_t now = getTimeGlobal(); // calc how much space we need int32_t need = 0; need += sizeof(key_t) + 4 + 4 +4; // key + dataSize + timestamp + ver need += 4 + m_insertableTermsBuf.length(); // InsertableTerm::m_firstQryChange: need += 4 + m_queryChangeBuf.length(); //4 QueryChange::m_replyQueryOffset : need += 4 + m_queryLogBuf.length(); //InsertableTerm::m_termStr reference //need += 4 + m_itStrBuf.length(); //need += 4 + m_wordPosInfoBuf.length(); // TOO BIG to score into cachedb! //need += 4 + m_debugScoreInfoBuf.length(); // debug only //need += 4 + m_origScoreInfoBuf.length(); // debug only // sanity if ( need > 20000000 ) { log("cachedb: listsize %"INT32" too big for cachedb",need); return true; } SafeBuf listBuf; // add 1 byte padding to ensure copying a 0 byte buf to listBuf // does not trigger a reserve if ( ! listBuf.reserve ( need + 4 ) ) return true; // ensure no reallocating - that would screw logic below up char *orig = listBuf.getBufStart(); int32_t uh32 = (uint32_t)((uint64_t)getFirstUrlHash64()); key_t k; int32_t dataSize = 0; char *p1; char *p2; // 4. then the insertable terms and their query changes and log buf // mangle key a little if in debug mode because that is the only // time we compute and store m_debugScoreInfoBuf and m_origScoreInfoBuf uint8_t cr8 = cr_ScoredInsertableTerms; //if ( m_seoDebug ) cr = cr_ScoredInsertableTermsDebug; k = g_cachedb.makeKey ( uh32 , ch32, cr8 ); // note it log("seo: cachedb storing scoredinsertableterms " "uh32=%"UINT32" ch32=%"UINT32"" ,(uint32_t)uh32,(uint32_t)ch32); listBuf.safeMemcpy ( &k , sizeof(key_t) ); dataSize = 0; dataSize += 4; // timestamp dataSize += 4; // version dataSize += 4 + m_insertableTermsBuf.length(); dataSize += 4 + m_queryChangeBuf.length(); dataSize += 4 + m_queryLogBuf.length(); //dataSize += 4 + m_itStrBuf.length(); //dataSize += 4 + m_wordPosInfoBuf.length(); //dataSize += 4 + m_debugScoreInfoBuf.length(); // debug only //dataSize += 4 + m_origScoreInfoBuf .length(); // debug only listBuf.pushLong ( dataSize ); listBuf.pushLong ( now ); // cached date listBuf.pushLong ( (int32_t)CACHEDB_CURRENT_VERSION ); // m_insertableTermsBuf listBuf.pushLong ( m_insertableTermsBuf.length() ); p1 = listBuf.getBuf(); listBuf.safeMemcpy ( &m_insertableTermsBuf ); char *p1End = listBuf.getBuf(); if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; } // m_queryChangeBuf listBuf.pushLong ( m_queryChangeBuf.length() ); p2 = listBuf.getBuf(); listBuf.safeMemcpy ( &m_queryChangeBuf ); char *p2End = listBuf.getBuf(); if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; } // m_queryLogBuf listBuf.pushLong ( m_queryLogBuf.length() ); listBuf.safeMemcpy ( &m_queryLogBuf ); if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; } // m_itStrBuf referenced by InsertableTerm::m_termStr //listBuf.pushLong ( m_itStrBuf.length() ); //listBuf.safeMemcpy ( &m_itStrBuf ); //if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; } // m_itStrBuf referenced by InsertableTerm::m_termStr //listBuf.pushLong ( m_wordPosInfoBuf.length() ); //char *p8 = listBuf.getBuf(); //listBuf.safeMemcpy ( &m_wordPosInfoBuf ); //char *p9 = listBuf.getBuf(); //if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; } // debug buffers, QueryChange::m_*Offset parms ref them if // m_seoDebug is true. TOO BIG TO STORE INTO CACHEDB! 
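	// . (editor note) the two fixup loops farther below convert in-memory
	//   pointers into offsets so this record is position-independent when
	//   read back from cachedb:
	//     InsertableTerm::m_firstQueryChange -> offset into m_queryChangeBuf
	//                                           (-1 if NULL)
	//     QueryChange::m_next                -> offset into m_queryChangeBuf
	//                                           (-1 if NULL)
	//   this mirrors the WordPosInfo::m_wordPtr -> offset-into-ptr_utf8Content
	//   fixup done in storeWordPosInfoBufIntoCachedb() above. a reader of
	//   this record would presumably undo it with something like
	//   (illustrative only):
	//     QueryChange *qc = (off == -1) ? NULL : (QueryChange *)(base + off);
	//   where "base" is the start of the deserialized query change buffer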
//listBuf.pushLong ( m_debugScoreInfoBuf.length() ); //listBuf.safeMemcpy ( &m_debugScoreInfoBuf ); //listBuf.pushLong ( m_origScoreInfoBuf.length() ); //listBuf.safeMemcpy ( &m_origScoreInfoBuf ); // make the InsertableTerm::m_firstQueryChange parms into // offsets for ( ; p1 < p1End ; ) { // p1 += sizeof(InsertableTerm) ) { QUICKPOLL(m_niceness); InsertableTerm *it = (InsertableTerm *)p1; p1 += it->getSize(); QueryChange *qc = it->m_firstQueryChange; int64_t qoff =(char *)qc - m_queryChangeBuf.getBufStart(); if ( qc == NULL ) qoff = -1; it->m_firstQueryChange = (QueryChange *)qoff; // and m_termStr //int32_t off = it->m_termStr - m_itStrBuf.getBufStart(); //it->m_termStr = (char *)off; } // make QueryChange::m_next ptrs into offsets as well for ( ; p2 < p2End ; p2 += sizeof(QueryChange) ) { QUICKPOLL(m_niceness); QueryChange *qc = (QueryChange *)p2; QueryChange *next = qc->m_next; int64_t noff =(char *)next-m_queryChangeBuf.getBufStart(); if ( next == NULL ) noff = -1; qc->m_next = (QueryChange *)noff; } // WordPosInfo::m_term relative to html ptr_utf8Content! /* for ( ; p8 < p9 ; p8 += sizeof(WordPosInfo) ) { QUICKPOLL(m_niceness); WordPosInfo *wp = (WordPosInfo *)p8; int32_t off = wp->m_wordPtr - ptr_utf8Content; // if its a tag or fielded term it won't be in the // html like ext:html or filetype:html if ( wp->m_wordPtr< ptr_utf8Content ) off = -1; if ( wp->m_wordPtr>=ptr_utf8Content +size_utf8Content) off = -1; wp->m_wordPtr = (char *)off; } */ if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; } // ensure list did not realloc, that would screw up everything! if ( listBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; } key_t startKey = g_cachedb.makeStartKey ( uh32, ch32 ); key_t endKey = g_cachedb.makeEndKey ( uh32, ch32 ); // . list is ready now // . this only returns when each record has been added m_storeList.set ( listBuf.getBufStart() , listBuf.length() , listBuf.getBufStart() , // alloc listBuf.getCapacity(), // allocsize startKey, endKey, -1, // fixeddatasize true, // owndata? false ); // use half keys? // disconnect it from safebuf so it doesn't get freed listBuf.detachBuf(); m_storeList.printList(); QUICKPOLL(m_niceness); log("xmldoc: adding insertable terms list of %"INT32" bytes to cachedb", m_storeList.m_listSize); // returns false if it blocks, true otherwise if ( ! m_msg1.addList ( &m_storeList, RDB_CACHEDB , cr->m_collnum, m_masterState, m_masterLoop, false, // forcelocal? m_niceness ) ) return false; return true; } #define MAX_TOP_MATCHING_QUERIES 300 /* // returns -1 if blocked, NULL with g_errno set on error SafeBuf *XmlDoc::getMatchingQueriesScored ( ) { setStatus ( "getmatchingqueriesscored" ); // try to set m_queryPtrs from cachedb record if ( ! checkCachedb() ) return (SafeBuf *)-1; // just re-use the same m_queryPtrs SafeBuf we used above but we // set the Msg99Reply::m_myScore here and sort them by that if ( m_queryPtrsSortedValid ) return &m_queryPtrs; // get the queries from msg99 replies first SafeBuf *mq = getMatchingQueries(false,-1); if ( mq == NULL || mq == (void *)-1 ) return mq; // time it if ( ! m_beginTimeMatchUrl ) m_beginTimeMatchUrl = gettimeofdayInMilliseconds(); // i'm assuming this is quer ptrs!?!?! int32_t numQueryPtrs = mq->length() / sizeof(Msg99Reply *); // get the qptrs Msg99Reply **qptrs = (Msg99Reply **)mq->getBufStart(); // score them in parallel over all hosts in network if ( ! scoreDocIdRestrictedQueries ( qptrs,NULL,numQueryPtrs) ) return (SafeBuf *)-1; // error? if ( g_errno ) return NULL; // total pages indexed! 
int64_t numPagesIndexed = g_titledb.getGlobalNumDocs(); // take 25% of that. i think 'the', the most common term, is in about // 25% of those pages numPagesIndexed /= 4; // // SET QUERY IMPORTANCE // // . set the m_queryImportance float and sort by that // . how important is the matching query for the main url? // . just divide the main url's score by the // QueryLogEntry::m_mintop50Score for the query to normalize it // . however, when we compute RelatedDocId::m_dotProduct we normalize // using the score of the #1 result because we executed the full // query, so keep that in mind. we can't mix the two. for ( int32_t i = 0 ; i < numQueryPtrs ; i++ ) { Msg99Reply *qp = qptrs[i]; // int16_tcut QueryLogEntry *qe = &qp->m_queryLogEntry; // get # results int64_t numResults = qe->m_numTotalResultsInSlice; // fix it to be global numResults *= (int64_t)g_hostdb.getNumGroups(); // big indexes did the "slice logic" restricting docid // range to MAX_DOCID * .10 when setting this! if ( numPagesIndexed > 10000000 ) numResults *= 10; // point to query char *qstr = qp->m_queryStr; // if not processed assume like 1M? if ( numResults < 0 ) { log("seo: guessing query importance for '%s' from " "hostid #%"INT32"", qstr,(int32_t)qp->m_replyingHostId); qp->m_queryImportance = 0.0; continue; } // zero means make it 1 to avoid div by zero below if ( numResults == 0 ) numResults = 1; // and also weight by traffic! the more traffic the // more important perhaps... // NO! with this we get 'www' 'view' etc for // jezebelgallery.com coming up in the top 50 matching // queries by importance. crap, but it hurts cheatcodes.com // then. // fix //if ( strcmp(qstr,"search engine") == 0 ) // log("poo"); // adjust since numPagesIndexed is actually a quarter of // the # of pages indexed since 'the' is only in about // 1/4 of the pages and it is the most common term if ( numResults > numPagesIndexed ) numResults = numPagesIndexed; // try doubling this to get rid of www problem for // jezebelgallery.com. it put www and view down some more. float popRatio = (float)numResults / (float)numPagesIndexed; // stuff like 'www' and 'view' will be near 1.0 float weight = 1.0 - popRatio;//(popRatio * popRatio); // go crazy weight *= weight; weight *= weight; weight *= weight; weight *= weight; // do not let this be 1.0 because 'web page searching' is // getting 1.0 for it and getting a weight of 0.0 and making // it the same as the ignored matching queries for // gigablast.com, so we end up using the ignored common // word matching queries for getting competitor pages and it // is bad! we need to fix that to not use such queries if // their importance is 0! if ( weight < .01 ) weight = .01; // because you are in the top 50 //numResults = (int32_t)powf ( (float)numResults , .4 ); //if ( numResults == 0 ) // imp /= 1; // otherwise, normalize by division //else // imp /= numResults; // boost it! //imp *= 10000; //QueryInfo *qi = &qp->m_queryInfo; //float imp = getQueryImportance2 ( qi , score ); // just try sorting by your serp score, hopefully we remove // shit like 'www' becaise isCommonQueryWordInEnglish() // takes care of it below. // consider *= weight on this // the idea is to ignore the top serp score because // you do not want terms that you may be able to be #1 // for but are not really relevant for your doc. so for this // let's focus on just getting the queries that best represent // your doc... double imp = qp->m_myScore * weight; qp->m_queryImportance = (float)imp; // just use this!!! 
//qp->m_queryImportance = qp->m_myScore / // (float)(numResults*numResults); // set importance to 0 for queries with minus sign in them // that indicates negative terms... for ( char *p = qstr; *p ; p++ ) { if ( *p != ' ' ) continue; if ( p[1] != '-' ) continue; // 'a - b' is ok if ( p[2] == ' ' ) continue; qp->m_queryImportance = 0.00; log("seo: ignoring query '%s' with minus sign", qstr); break; } // avoid common queries with just common words in them: // http web www com org us we 1 2 3 by on i https one page Words ww; ww.set3 ( qstr ); int32_t i; for ( i = 0 ; i < ww.m_numWords ; i++ ) { int64_t wid = ww.m_wordIds[i]; if ( wid == 0 ) continue; if ( ! isCommonQueryWordInEnglish ( wid ) ) break; } if ( i >= ww.m_numWords ) { qp->m_queryImportance = 0.00; log("seo: ignoring common query '%s'", qstr); } // skip debug for now if ( ! m_seoDebug ) continue; // note it log("seo: " "imp=%f " "numresults=%"INT64" " "numpagesindexed=%"INT64" " "popweight=%f " "myscore=%f " "topscore=%f " "qstr=%s", qp->m_queryImportance, numResults, numPagesIndexed, weight, qp->m_myScore, qe->m_topSERPScore, qstr); } // let's sort them first qsort ( qptrs , numQueryPtrs , sizeof(Msg99Reply *), qp99cmp ); // log for debug int32_t maxk = numQueryPtrs; // limit to logging 300 to avoid log spam if ( maxk > MAX_TOP_MATCHING_QUERIES ) maxk = MAX_TOP_MATCHING_QUERIES; // 300; // limit to top 300 dammit, otherwise we can't store all // into cachedb!!! int32_t newLen = maxk * sizeof(Msg99Reply *); m_queryPtrs.setLength ( newLen ); for ( int32_t k = 0 ; k < maxk ; k++ ) { Msg99Reply *kp = qptrs[k]; log("seopipe: newquery=\"%s\" myscore=%f imp=%f", kp->m_queryStr, kp->m_myScore, kp->m_queryImportance); } // time it int64_t now = gettimeofdayInMilliseconds(); int64_t took = now - m_beginTimeMatchUrl; log("seopipe: time: matchingscoredqueries took %"INT64" ms",took); m_queryPtrsSortedValid = true; if ( ! storeMatchingQueriesIntoCachedb() ) // return -1 if it blocked and wait for store to complete return (SafeBuf *)-1; return mq; } */ static void gotMsg3aReplyForFullQueryWrapper ( void *state ) { XmlDoc *THIS = (XmlDoc *)state; THIS->setStatus ( "gotmsg3areplyforfullquerywrapper" ); THIS->gotMsg3aReplyForFullQuery(); // . go back to the main entry function // . 
make sure g_errno is clear from a msg3a g_errno before calling // this lest it abandon the loop THIS->m_masterLoop ( THIS->m_masterState ); } /* void XmlDoc::gotMsg3aReplyForFullQueryCached ( char *cachedRec , Msg99Reply *qp ) { // try again for next guy m_triedCache = false; char *p = cachedRec; // # docids int32_t numDocIds = *(int32_t *)p; p += 4; // total # results int32_t numTotalResults = *(int32_t *)p; p += 4; // docids int64_t *docIds = (int64_t *)p; p += 8 * numDocIds; // scores float *scores = (float *)p; p += sizeof(float) * numDocIds; // site hashes int32_t *siteHashes = (int32_t *)p; p += 4 * numDocIds; // store score info into this class TopDocIds *td = qp->m_topDocIds; // store reply info, like # docids, in the query ptr int32_t max = numDocIds; if ( max > (int32_t)NUM_TOP_RESULTS ) max = (int32_t)NUM_TOP_RESULTS; td->m_numDocIds = max; // count replies m_numMsg3aReplies++; // log to log as well char tmp[50000]; p = tmp; p += sprintf(p, "seopipe: got full results CACHED " "qrynum=%"INT32"of%"INT32" docids=%"INT32" " "query=\"%s\" ", m_numMsg3aReplies,//m_msg3a->m_hackQNum, m_maxFullQueries , td->m_numDocIds, qp->m_queryStr ); // log each docid for ( int32_t i = 0 ; i < max ; i++ ) { //float score = m_msg3a->getScores()[i]; int64_t d = docIds[i]; //int32_t sh32 = m_msg3a->getSiteHash32(i); p += sprintf(p,"d%"INT32"=%"INT64" ",i,d); } log(tmp); // int16_tcut. pumpSocket() sends the contents of this to m_seoSocket SafeBuf *sb = &m_socketWriteBuf; sb->safePrintf( "\t\n" "\t\t%"INT32"\n" "\t\t%"INT32"" "\n" "\t\t%"INT32"\n" , m_msg3a->m_hackQNum , numTotalResults , numDocIds ); // print the top 50 scores for ( int32_t i = 0 ; i < max ; i++ ) { float score = scores[i]; int64_t d = docIds[i]; int32_t sh32 = siteHashes[i]; sb->safePrintf("\t\t\n"); sb->safePrintf("\t\t\t%"INT32"\n",i+1); sb->safePrintf("\t\t\t%f\n",score); sb->safePrintf("\t\t\t%"INT64"\n",d); sb->safePrintf("\t\t\t%"UINT32"\n",sh32); sb->safePrintf("\t\t\n"); // store results for this Msg99Reply td->m_topDocIds[i] = d; td->m_topScores[i] = score; td->m_topSiteHashes[i] = sh32; } // reset rest so it prints pretty on gdb debug print cmd for ( int32_t i = max ; i < (int32_t)NUM_TOP_RESULTS ; i++ ) { td->m_topDocIds[i] = 0LL; td->m_topScores[i] = 0.0; td->m_topSiteHashes[i] = 0; } sb->safePrintf("\t\n"); // pump m_socketWriteBuf to m_seoSocket pumpSocketWriteBuf ( ); } */ // . this is the msg3a reply for related docids only // . the full replies we get for determining ranks from scores for the // HTML simulator, are handled in seo.cpp using State95::m_msg3a. void XmlDoc::gotMsg3aReplyForFullQuery ( ) { int32_t err = g_errno; // save it so we know related docid generation had an error... if ( g_errno && ! m_msg3aErrno ) m_msg3aErrno = g_errno; setStatus ( "gotmsg3areplyforfullquery" ); if ( g_errno ) { log("seopipe: got msg3a reply error: %s",mstrerror(g_errno)); g_errno = 0; } // try again for next guy //m_triedCache = false; // how many docids in the search results were returned to us? int32_t numDocIds = m_msg3a->getNumDocIds(); // total # search results estimated //int32_t numTotalResults = m_msg3a->getNumTotalEstimatedHits(); // get the query as we received it in the msg99 reply //Msg99Reply *qp = (Msg99Reply *)m_msg3a->m_hackQPtr; int32_t queryNum = (int32_t)m_msg3a->m_hackQNum; // . point to the empty class we reserved in the buf // . 
store score info into this class //TopDocIds *tds = (TopDocIds *)m_topDocIdsBuf.getBuf();//Start(); // ensure enough room //if ( m_topDocIdsBuf.getAvail() < sizeof(TopDocIds) ) // m_topDocIdsBuf.reserve(sizeof(TopDocIds) ) // get next available spot to store this TopDocIds *td = (TopDocIds *)m_topDocIdsBuf.getBuf(); int32_t tdnum = m_topDocIdsBuf.length() / sizeof(TopDocIds); m_topDocIdsBuf.incrementLength(sizeof(TopDocIds)); if ( m_topDocIdsBuf.length() > m_topDocIdsBuf.m_capacity ) { char *xx=NULL;*xx=0; } QueryLink *qks = (QueryLink *)m_matchingQueryBuf.getBufStart(); QueryLink *qk = &qks[queryNum]; // the relateddocidnum hack if ( tdnum > 32000 ) { char *xx=NULL;*xx=0; } qk->m_relatedDocIdNum = tdnum; // store reply info, like # docids, in the query ptr int32_t max = numDocIds; if ( max > (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS ) max = (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS; td->m_numDocIds = max; // QueryLink # in the m_matchingQueryBuf buffer we represent td->m_queryNum = queryNum; // keep it clean //qp->m_docIdVotes = 0; // get the query base hash and use that to // dedup. the query base hash ignores common // words and converts words to their synonym // with the smallest hash //int64_t qbh = getQueryBaseHash(qstr); //m_msg3a->m_hackQNum = m_queryNum; //m_msg3a->m_hackQPtr = (char *)qp; // count replies m_numMsg3aReplies++; // log to log as well //char tmp[50000]; SafeBuf tmp; //char *p = tmp; tmp.safePrintf( "seopipe: got list of %"INT32" related docids for " "qrynum=%"INT32" " //"of%"INT32"" "numDocids=%"INT32" " "query=\"", numDocIds, m_numMsg3aReplies,//m_msg3a->m_hackQNum, //m_maxFullQueries , td->m_numDocIds); char *qqq = qk->getQueryString(&m_matchingQueryStringBuf); tmp.safeStrcpy(qqq); tmp.safePrintf("\" (err=%s)", mstrerror(err)); // log each docid //for ( int32_t i = 0 ; i < max ; i++ ) { // //float score = m_msg3a->getScores()[i]; // int64_t d = m_msg3a->m_docIds[i];//getDocIds()[i]; // //int32_t sh32 = m_msg3a->getSiteHash32(i); // p += sprintf(p,"d%"INT32"=%"INT64" ",i,d); //} char *msg = tmp.getBufStart(); log("%s",msg); /* // int16_tcut. 
pumpSocket() sends the contents of this to m_seoSocket SafeBuf *sb = &m_socketWriteBuf; sb->safePrintf( "\t\n" "\t\t%"INT32"\n" "\t\t%"INT32"" "\n" "\t\t%"INT32"\n" , m_msg3a->m_hackQNum , numTotalResults , numDocIds ); */ // print the top 50 scores for ( int32_t i = 0 ; i < max ; i++ ) { float score = m_msg3a->m_scores[i];//getScores()[i]; int64_t d = m_msg3a->m_docIds[i];//getDocIds()[i]; int32_t sh26 = m_msg3a->getSiteHash26(i); /* sb->safePrintf("\t\t\n"); sb->safePrintf("\t\t\t%"INT32"\n",i+1); sb->safePrintf("\t\t\t%f\n",score); sb->safePrintf("\t\t\t%"INT64"\n",d); sb->safePrintf("\t\t\t%"UINT32"\n",sh32); sb->safePrintf("\t\t\n"); */ // store results for this Msg99Reply td->m_topDocIds[i] = d; td->m_topScores[i] = score; td->m_topSiteHashes26[i] = sh26; } // reset rest so it prints pretty on gdb debug print cmd for ( int32_t i = max ; i < (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS; i++ ) { td->m_topDocIds[i] = 0LL; td->m_topScores[i] = 0.0; td->m_topSiteHashes26[i] = 0; } /* sb->safePrintf("\t\n"); */ // give front-end the progress bar info if ( m_seoSocket && m_progressBar ) { // tmp buf char tmp[16]; float percent = (float)m_numMsg3aReplies ; //percent /= (float)m_maxFullQueries; percent *= 100.0; // these are 80% of the pipeline if getting competitor // backlinks if ( m_progressBar == 2 ) percent *= .80; int32_t percentLong = (int32_t)percent; if ( percentLong >= 100 ) percentLong = 99; int32_t tmpLen = sprintf(tmp,"%02"INT32"%%",percentLong); if ( tmpLen !=3)log("seo: bad progress bar output %"INT32"",tmpLen); // try a send on non-blocking socket int32_t n = ::send ( m_seoSocket->m_sd , tmp,tmpLen , 0 ); if ( n != tmpLen ) log("seo: bad progress bar send %"INT32"",n); // forget error errno = 0; } } bool XmlDoc::clientClosedConnection ( ) { if ( ! m_seoSocket ) return false; if ( m_clientClosed ) return true; if ( g_now - m_lastCheckTime < 50 ) return m_clientClosed; m_lastCheckTime = g_now; char buffer[100]; if ( recv(m_seoSocket->m_sd,buffer,99,MSG_PEEK|MSG_DONTWAIT) == 0 ) { m_clientClosed = true; log("xmldoc: CLIENT CLOSED CONNECTION!!"); } return m_clientClosed; } // . returns -1 if blocked, NULL with g_errno set on error // . we do this to get related docids SafeBuf *XmlDoc::getMatchingQueriesScoredForFullQuery ( ) { setStatus ( "getmatchingqueriesscoredforfullquery" ); // just re-use the same m_queryPtrs SafeBuf we used above but we // set the Msg99Reply::m_myScore here and sort them by that if ( m_queryPtrsWholeValid ) return &m_matchingQueryBuf; // get the queries sorted by the url: | scores for our main url SafeBuf *mq = getMatchingQueryBuf(); if ( mq == NULL || mq == (void *)-1 ) return mq; // setup timer CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; if ( ! m_beginTimeFullQueries ) m_beginTimeFullQueries = gettimeofdayInMilliseconds(); // this buffer holds a ptr to each query in each msg99 reply we // received from all hosts in the network QueryLink *qks = (QueryLink *)mq->getBufStart(); int32_t nks = mq->length()/sizeof(QueryLink); int32_t maxFullQueries = 50; int32_t tneed = maxFullQueries * sizeof(TopDocIds); if ( m_topDocIdsBuf.length() == 0 && ! m_topDocIdsBuf.reserve(tneed) ) return NULL; // . now launch msg3as at them // . this is 60k so new it here if ( ! m_msg3a ) { // reset the query # we are processing m_queryNum = 0; m_numMsg3aRequests = 0; m_numMsg3aReplies = 0; if ( ! m_fullQueryDedup.set(8,0,256,NULL,0, false,m_niceness,"fqdd")) return NULL; try { m_msg3a = new ( Msg3a ); } catch ( ... 
) { g_errno = ENOMEM; return NULL; } mnew ( m_msg3a, sizeof(Msg3a),"xdmsg3a"); // need this too now i guess since it is 65k try { m_query3a = new ( Query ); } catch ( ... ) { g_errno = ENOMEM; return NULL; } mnew ( m_query3a, sizeof(Query),"xdqry3a"); } loop: // breath in case we hit all cache QUICKPOLL(m_niceness); // have we launched all the requests we need to bool exhausted = false; if ( m_queryNum >= nks ) exhausted = true; if ( m_numMsg3aRequests >= maxFullQueries ) exhausted = true; // if client closed browser connection by hitting the stop sign // then stop here! if ( clientClosedConnection() ) m_hadMatchError = ESOCKETCLOSED; if ( m_hadMatchError ) exhausted = true; // if nothing to launch if ( exhausted && // and all replies received m_numMsg3aReplies >= m_numMsg3aRequests ) { // nuke the msg3a to save mem mdelete ( m_msg3a, sizeof(Msg3a) , "msg3a" ); delete ( m_msg3a ); m_msg3a = NULL; mdelete ( m_query3a , sizeof(Query), "qry3a" ); delete ( m_query3a ); m_query3a = NULL; // time it int64_t now = gettimeofdayInMilliseconds(); int64_t took = now - m_beginTimeFullQueries; log("seopipe: time: fullqueries took %"INT64" ms",took); // force closed? if ( m_hadMatchError ) return NULL; // we are done! m_queryPtrsWholeValid = true; return &m_matchingQueryBuf;//queryPtrs; } // if nothing to launch wait for all replies if ( exhausted ) return (SafeBuf *)-1; // get the current query to process //Msg99Reply *qp = queryPtrs[m_queryNum]; QueryLink *qk = &qks[m_queryNum]; int32_t savedQueryNum = m_queryNum; QueryLogEntry *qe = qk->getQueryLogEntry(&m_matchingQueryStringBuf); // int16_tcut //int64_t h64 = qk->m_querySynBaseHash64; int64_t h64 = getSynBaseHash64 ( qe->getQueryString(),qe->m_langId); // . if we already did a similar query, then skip it // . Msg99Reply::m_topDocIds will be NULL so getRelatedDocIds() will // know we skipped this query and to ignore it if ( m_fullQueryDedup.isInTable(&h64) ) { m_queryNum++; goto loop; } // or if importance is 0, which means to ignore! if ( qk->m_queryImportance <= 0.0 ) { m_queryNum++; goto loop; } // int16_tcut char *qstr = qk->getQueryString(&m_matchingQueryStringBuf); // sanity if ( ! cr->m_coll || ! cr->m_coll[0] ) { char *xx=NULL;*xx=0; } // this is required for synonyms! // TODO: use whatever language the query is!!! uint8_t langId = langEnglish; // int16_tcut int32_t qlen = gbstrlen(qstr); //int32_t collLen = gbstrlen(cr->m_coll); // set the request m_mr2.reset(); m_mr2.ptr_query = qstr; m_mr2.size_query = qlen+1; //m_mr2.ptr_coll = cr->m_coll; //m_mr2.size_coll = collLen+1; m_mr2.m_collnum = cr->m_collnum; m_mr2.m_queryExpansion = 1; m_mr2.m_language = langId; m_mr2.m_niceness = m_niceness; // . get top 50 results now // . then related docids will have to be in there m_mr2.m_docsToGet = (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS; m_mr2.m_useSeoResultsCache = true; // we do not need this, we just want the related docids/scores m_mr2.m_getDocIdScoringInfo = false; // use cache for 7 days since it is just for getting related docids // right now. make sure that that cache saves to disk. // MDW: why is this not working? //m_mr2.m_maxAge = 86400 * 7; //m_mr2.m_addToCache = true; //m_mr2.m_debug = 1; // prepend to the query? 
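	// . (editor note) at this point m_mr2 describes one full query
	//   execution: query expansion on, NUM_RESULTS_FOR_RELATED_DOCIDS
	//   docids requested, per-docid scoring info off, and the seo results
	//   cache enabled. the length check below skips any query whose string
	//   would exceed MAX_QUERY_LEN once the url length is accounted for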
int32_t ulen = m_firstUrl.m_ulen; // go to next guy if this query is too big already if ( ulen + qlen + 10 > MAX_QUERY_LEN ) { m_queryNum++; goto loop; } // support for the new TopDocIds class which holds detailed search // results for selected matching queries QueryLinks //int32_t maxt = numQueryPtrs; //if ( maxt > m_maxQueries ) maxt = m_maxQueries; //if ( ! maxt ) { char *xx=NULL;*xx=0; } // we also need the top docids //if ( ! m_topDocIdsBuf.m_capacity ) { // int32_t need = sizeof(TopDocIds) * (int32_t)MAX_MATCHING_QUERIES; // if ( ! m_topDocIdsBuf.reserve ( need ,"tdbuf" ) ) return NULL; // //m_nextAvailTopDocIdsOffset = 0;// = m_topDocIdsBuf; //} // make matching query, "qk", point to the topdocids that we // will fill in when we execute this query in full below // sanity! //int32_t off3 = m_nextAvailTopDocIdsOffset ; //if ( off3/(int32_t)sizeof(TopDocIds)>=maxt){char *xx=NULL;*xx=0;} // seo.cpp's handleRequest99() should have set it to -1 //if ( qp->m_topDocIdsBufOffset != -1 ) { char *xx=NULL;*xx=0; } // assign this TopDocIds class to this query ptr now //qp->m_topDocIdsBufOffset = m_nextAvailTopDocIdsOffset; // get that ptr to reset its count to 0 //TopDocIds *ttt = qp->getTopDocIds(&m_topDocIdsBuf); //ttt->m_numDocIds = 0; // inc it //m_nextAvailTopDocIdsOffset += sizeof(TopDocIds); // update length since we store topdocids buf based on its m_length //m_topDocIdsBuf.setLength ( m_nextAvailTopDocIdsOffset ); // advance for next guy m_queryNum++; // add it to dedup table if ( ! m_fullQueryDedup.addKey(&h64) ) { m_hadMatchError = g_errno; goto loop; } // mark it out m_numMsg3aRequests++; // . set the query class for msg3a // . queryExpansion = true m_query3a->set2 ( qstr , langId , true ); // a debug thing m_query3a->m_containingParent = (void *)this; // secret variable latchon m_msg3a->m_hack = this; m_msg3a->m_hackQNum = savedQueryNum; m_msg3a->m_hackQPtr = NULL;//(char *)qp; // note it setStatus("launching msg3a"); // . get the docIds // . this sets m_msg3a.m_clusterLevels[] for us // . it sends a msg39 request to each alive host in the network bool status = m_msg3a->getDocIds ( &m_mr2, m_query3a, this,//m_msg3a , // this , gotMsg3aReplyForFullQueryWrapper); // return false if msg3a blocked if ( ! status ) return (SafeBuf *)-1; // error? if ( g_errno ) { m_hadMatchError = g_errno; m_numMsg3aReplies++; goto loop; } // i guess did not block... can this happen? cached? //log("xmldoc: msg3a did not block"); // not supported yet. we need to process reply. 
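	// . (editor note) if getDocIds() returns true it completed without
	//   blocking (e.g. the seo results cache inside msg17 satisfied it),
	//   so no callback will fire; the reply is therefore processed inline
	//   here and the loop continues with the next matching query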
//char *xx=NULL;*xx=0; // yeah, msg17 in there can cache in seoresults cache now gotMsg3aReplyForFullQuery(); // try looping goto loop; } static int rdCmp ( const void *a, const void *b ) { RelatedDocId *da = (RelatedDocId *)a; RelatedDocId *db = (RelatedDocId *)b; // get scores float scorea = da->m_relatedWeight;//dotProduct;//similarityScore; float scoreb = db->m_relatedWeight;//dotProduct;//similarityScore; if ( scorea < scoreb ) return 1; if ( scorea > scoreb ) return -1; return 0; } static int lkCmp ( const void *a, const void *b ) { QueryNumLinkedNode *ka = *(QueryNumLinkedNode **)a; QueryNumLinkedNode *kb = *(QueryNumLinkedNode **)b; // get scores int32_t ra = ka->m_relatedDocIdRank; int32_t rb = kb->m_relatedDocIdRank; if ( ra >= 0 && rb >= 0 ) { if ( ra < rb ) return -1; if ( ra > rb ) return 1; // swap } if ( ra >= 0 ) return -1; if ( rb >= 0 ) return 1; // swap // if neither ranked, go by serp score i guess float sa = ka->m_relatedDocIdSerpScore; float sb = kb->m_relatedDocIdSerpScore; if ( sa > sb ) return -1; if ( sa < sb ) return 1; // swap return 0; } // buf is an array of RelatedDocId members SafeBuf *XmlDoc::getRelatedDocIds ( ) { setStatus ( "getrelateddocids" ); if ( m_relatedDocIdBufValid ) return &m_relatedDocIdBuf; // get the full replies with the top 50 docids and scores listed // for each query. should be sorted by m_myScore. SafeBuf *mq = getMatchingQueriesScoredForFullQuery ( ); if ( ! mq || mq == (void *)-1 ) return mq; // . how many queries do we have that match this url? // . they should be sorted by our url's score //QueryLink *qks = (QueryLink *)mq->getBufStart(); //int32_t nks = mq->length()/sizeof(QueryLink); int32_t *sh32 = getSiteHash32(); if ( ! sh32 || sh32 == (int32_t *)-1 ) return (SafeBuf *)sh32; int32_t dh32 = getDomHash32(); //if ( ! m_siteHash32Valid ) { char *xx=NULL;*xx=0; } //if ( ! m_domHash32Valid ) { char *xx=NULL;*xx=0; } int32_t ourSiteHash26 = *sh32 & 0x03ffffff; int32_t ourDomHash26 = dh32 & 0x03ffffff; // for deduping queries with the same "base hash" we do not want // them to count twice for RelatedDocId::m_numCommonQueries //HashTableX dedup; //if ( ! dedup.set(8,0,1024,NULL,0,false,0,"dddtab")) // return NULL; // scan the top docids TopDocIds *tds = (TopDocIds *)m_topDocIdsBuf.getBufStart(); int32_t ntds = m_topDocIdsBuf.length() / sizeof(TopDocIds); for ( int32_t i = 0 ; i < ntds ; i++ ) { TopDocIds *td = &tds[i]; int32_t queryNum = td->m_queryNum; //QueryLink *qk = &qks[queryNum]; // sanity int32_t nd = td->m_numDocIds; if( nd < 0) { char *xx=NULL;*xx=0; } if( nd > (int32_t)NUM_RESULTS_FOR_RELATED_DOCIDS){ char *xx=NULL;*xx=0;} // get main url score for query //float ourScore = qp->m_myScore; // and the score of the top result //float normScore = td->m_topScores[0]; // norm main url score //ourScore /= normScore; // scan the top 50 (or more) docids for this query for ( int32_t j = 0 ; j < nd ; j++ ) { // . do not allow related docid (aka competitor page) // to be from our site! will make sure we exclude // our url itself, too. otherwise competitor // backlinks mentions when a link links to us, and // we don't care about that, we already have the // link. we just want to see recommneded backlinks // we do not yet have, so we can get them. // . 
skip it if from our same sitehash26 if ( td->m_topSiteHashes26[j] == ourSiteHash26 ) continue; // fix cheatcodes.com being a competitor page when // our main url is www.cheatcodes.com if ( td->m_topSiteHashes26[j] == ourDomHash26 ) continue; // skip twitter facebook, etc int64_t docId = td->m_topDocIds[j]; if ( docId == 114607849462LL || // https://www.twitter docId == 273941610476LL || // twitter.com docId == 1628437294LL || // facebook.com docId == 146394931444LL ) // cnn.com/video/ continue; // add RelatedDocId into m_relatedDocIdBuf and/or // augment its linked list of query/score pairs addRelatedDocIdInfo ( td->m_topDocIds[j], queryNum , td->m_topScores[j], // score j , // rank td->m_topSiteHashes26[j] ); } } QUICKPOLL(m_niceness); // this is now in getRelatedDocIdsScored()!!!!!!! /* char *rdbuf = m_relatedDocIdBuf.getBufStart(); int32_t numDocIds = m_relatedDocIdBuf.length()/sizeof(RelatedDocId); // now sort by RelatedDocId::m_relatedWeight qsort ( rdbuf , numDocIds, sizeof(RelatedDocId),qp99docIdCmp ); QUICKPOLL(m_niceness); // limit to top MAX_RELATED_DOCIDS related docids // will take longer to get titles/urls and related queries the // higher this number is, but we will have more competitor backlinks // and terms etc. int32_t maxLen = sizeof(RelatedDocId) * MAX_RELATED_DOCIDS; int32_t currentLen = m_relatedDocIdBuf.length(); if ( currentLen > maxLen ) currentLen = maxLen; m_relatedDocIdBuf.setLength(currentLen); numDocIds = currentLen / sizeof(RelatedDocId); */ int32_t numDocIds = m_relatedDocIdBuf.length() / sizeof(RelatedDocId); /* // log out for debug char *rdbuf = m_relatedDocIdBuf.getBufStart(); RelatedDocId *rds = (RelatedDocId *)rdbuf; for ( int32_t i = 0 ; g_conf.m_logDebugSEO && i < numDocIds ; i++ ) { log("seopipe: related docId #%"INT32" docid=%"INT64" " "score=?? common=%"INT32"", i, rds[i].m_docId, //rds[i].m_relatedWeight,//dotProduct, // similarityScore, rds[i].m_numCommonQueries); } */ log("seo: got %"INT32" related docids in buf",numDocIds); m_relatedDocIdBufValid = true; return &m_relatedDocIdBuf; } // used as part of the msg4f request SafeBuf *XmlDoc::getTopMatchingQueryBuf ( ) { if ( m_topMatchingQueryBufValid ) return &m_topMatchingQueryBuf; // scan matching queries that we evaluated fully using msg3a SafeBuf *qkbuf = getMatchingQueriesScoredForFullQuery ( ); if ( ! qkbuf || qkbuf == (void *)-1 ) return qkbuf; //Msg99Reply **qptrs = (Msg99Reply **)qpbuf->getBufStart(); //int32_t numQueryPtrs = qpbuf->length() / sizeof(Msg99Reply *); QueryLink *qks = (QueryLink *)qkbuf->getBufStart(); //int32_t nks = qkbuf->length()/sizeof(QueryLink); TopDocIds *tds = (TopDocIds *)m_topDocIdsBuf.getBufStart(); int32_t ntds = m_topDocIdsBuf.length() / sizeof(TopDocIds); for ( int32_t i = 0 ; i < ntds ; i++ ) { TopDocIds *td = &tds[i]; int32_t queryNum = td->m_queryNum; QueryLink *qk = &qks[queryNum]; // ok, get it char *qstr = qk->getQueryString(&m_matchingQueryStringBuf); int32_t qlen = gbstrlen(qstr); // store query # if ( ! m_topMatchingQueryBuf.pushLong(queryNum) ) return NULL; // then query if ( ! m_topMatchingQueryBuf.safeMemcpy(qstr,qlen+1)) return NULL; } m_topMatchingQueryBufValid = true; return &m_topMatchingQueryBuf; } static void gotMsg4fReplyWrapper ( void *state , UdpSlot *slot ) { XmlDoc *THIS = (XmlDoc *)state; // a bit of a hack THIS->m_savedSlot = slot; // ultimately, getRelatedDocIdsScored() will be called from this THIS->m_masterLoop ( THIS->m_masterState ); } // . lets just put everything in this one function // . 
launch a msg4f request for each relateddocid // . get the msg4f reply back and add the positive scoring queries to the // related docids linked list of QueryNumLinkedNodes in the // m_commonQueryNumBuf, avoid dups. // . then score each related docid by calling setRelatedDocIdScores() SafeBuf *XmlDoc::getRelatedDocIdsScored ( ) { setStatus ( "getrelateddocidsscored"); if ( m_relatedDocIdsScoredBufValid ) { // and return the buf of RelatedDocIds return &m_relatedDocIdBuf; } // what docids share our TOP-scoring matching queries? SafeBuf *rdbuf = getRelatedDocIds(); if ( ! rdbuf || rdbuf == (void *)-1) return (SafeBuf *) rdbuf; SafeBuf *tmq = getTopMatchingQueryBuf(); if ( ! tmq || tmq == (void *)-1) return (SafeBuf *) tmq; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // the top 50 or so matching queries will each be scored for // every related docid we have in m_relatedDocIdBuf. these are // the same queries we got the full results for above!!! // we have to score them for each related docid here because we only // get the top 300 or so results above for each one. so if the // related docid matched the query but was not in the top 300 results, // it would have appeared to NOT match the query. bad. that was // causing google to come up high in related docids because it // ranked high for so many generic queries. and the other good // related docids did not rank in the top 300 for those same // generic queries. so at least this logic will show that the // related docids do indeed match those generic queries, too. // and they will get higher scores (RelatedDocId::m_relatedWeight) // we must be an incoming reply if we already sent out all the requests if ( m_numMsg4fRequests > 0 ) { // increment our reply counter m_numMsg4fReplies++; // . m_savedSlot is a hack // . now parse the reply and add QueryNumLinkedNode // into m_commonQueryNumBuf. char *p = m_savedSlot->m_readBuf; char *pend = m_savedSlot->m_readBufSize + p; // now scan the reply for ( ; p < pend ; ) { // breathe QUICKPOLL(m_niceness); // the queryNum is relative to the m_queryPtrs array // which has all the matching queries of this document, // not just the "top" 50 matching queries by score. int32_t queryNum = *(int32_t *)p; // sanity if ( queryNum<0 ) {char *xx=NULL;*xx=0; } p += 4; // then docid of related docid that had this score int64_t docId = *(int64_t *)p; p += 8; // then score float score = *(float *)p; p += 4; // this will add the query/score pair into the // related docid buf. it will not add dups if already // ranked! addRelatedDocIdInfo ( docId , queryNum , score , -1 , // rank unknown -1 ); // sitehash26 unknown } // return if awaiting more replies if ( m_numMsg4fReplies < m_numMsg4fRequests ) return (SafeBuf *)-1; // point to buffer of related docids char *rdbuf = m_relatedDocIdBuf.getBufStart(); RelatedDocId *rds = (RelatedDocId *)rdbuf; int32_t nr = m_relatedDocIdBuf.length() / sizeof(RelatedDocId); for ( int32_t i = 0 ; i < nr ; i++ ) { // int16_tcut RelatedDocId *rd = &rds[i]; // now score it since we have all the serpscores for // all top matching queries. setRelatedDocIdWeightAndRank(rd); } // breathe QUICKPOLL(m_niceness); // now sort by RelatedDocId::m_relatedWeight qsort ( rdbuf , nr , sizeof(RelatedDocId),rdCmp ); // breathe QUICKPOLL(m_niceness); // limit to top MAX_RELATED_DOCIDS related docids // will take longer to get titles/urls and related queries the // higher this number is, but we will have more competitor // backlinks and terms etc. less space in cachedb too! 
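	// . (editor note) after the truncation just below, the compaction loop
	//   rebuilds m_commonQueryNumBuf: for each related docid it walks the
	//   old linked list of QueryNumLinkedNodes, sorts the nodes by
	//   related-docid rank (lkCmp), and rewrites them contiguously into
	//   tmpBuf while patching m_nextOff as it goes; tmpBuf is then swapped
	//   in with stealBuf() to replace the old buffer with the rank-sorted,
	//   compacted version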
int32_t maxLen = MAX_RELATED_DOCIDS * sizeof(RelatedDocId); int32_t newLen = m_relatedDocIdBuf.length(); if ( newLen > maxLen ) newLen = maxLen; m_relatedDocIdBuf.setLength(newLen); // // make a new buffer for m_commonQueryNumBuf just for the // related docids we picked, and sort them by rel docid rank. // so it will be smaller and sorted. // SafeBuf tmpBuf; if ( ! tmpBuf.reserve ( m_commonQueryNumBuf.length() ) ) return NULL; // scan each related docid in the top 300 or so for ( int32_t i = 0 ; i < nr ; i++ ) { // int16_tcut RelatedDocId *rd = &rds[i]; // store ptrs to query nums so we can sort them QueryNumLinkedNode *links[1024]; int32_t nn = 0; int32_t fo = rd->m_firstCommonQueryNumOff; char *base = m_commonQueryNumBuf.getBufStart(); // scan down the linked list and store ptrs to links[] for ( ; fo >= 0 ; ) { // cast it QueryNumLinkedNode *qn; qn = (QueryNumLinkedNode *)(base + fo); // point to next fo = qn->m_nextOff; // store this guy for sorting links[nn] = qn; nn++; if ( nn >= 1024 ) break; } // now sort them by m_relatedDocIdRank qsort( links, nn,sizeof(QueryNumLinkedNode *),lkCmp); // point to our new linked list in tmpBuf, we will // store them here. rd->m_firstCommonQueryNumOff = tmpBuf.length(); QueryNumLinkedNode *prev = NULL; // now store into tmpbuf for ( int32_t k = 0 ; k < nn ; k++ ) { QueryNumLinkedNode *qn = links[k]; int32_t size = sizeof(QueryNumLinkedNode); if ( !tmpBuf.reserve(size) ) return NULL; QueryNumLinkedNode *nn ; nn = (QueryNumLinkedNode *)tmpBuf.getBuf(); int32_t clen = tmpBuf.length(); tmpBuf.safeMemcpy(qn,size); // we are the previous guy's next node if ( prev ) prev->m_nextOff = clen; // assume nobody follows us nn->m_nextOff = -1; // we are now next guy's prev prev = nn; } } // now steal tmpbuf, and free our old stuff m_commonQueryNumBuf.stealBuf ( &tmpBuf ); // i guess we are done now! m_relatedDocIdsScoredBufValid = true; return &m_relatedDocIdBuf; } int32_t numRelated = rdbuf->length() / sizeof(RelatedDocId); RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart(); // . there's a massive # of related docids at this point // . possibly 50 x 300 = 15,000 // . so launch one msg4f for each host in our network // . just specify all the related docids in the msg4f request and have // the handleRequest4f() function in seo.cpp get the title rec. // . make sure all docids are local to that host // . dispatch the msg4f request to the machine that has that docid // local so it can just hit disk // . handleRequest4f() can follow the same logic as in // getRelatedQueryLinks() which make a new xmldoc. then it can // call newxd->getTermListBuf() instead of us passing it in. // . so each host has a bin, a host bin #ifdef __APPLE__ SafeBuf hostBin[MAX_HOSTS]; #else SafeBuf hostBin[g_hostdb.m_numHosts]; #endif // scan the related docids and send the requests if we have not already for ( int32_t i = 0 ; ! 
m_sentMsg4fRequests && i < numRelated ; i++ ) { RelatedDocId *rd = &rds[i]; //uint32_t gid=g_hostdb.getGroupIdFromDocId (rd->m_docId); // pick host in that group //Host *group = g_hostdb.getGroup ( gid ); int32_t shardNum = getShardNumFromDocId ( rd->m_docId ); Host *group = g_hostdb.getShard ( shardNum ); int32_t nh = g_hostdb.m_numHostsPerShard; int32_t hostNum = rd->m_docId % nh; Host *h = &group[hostNum]; int32_t hostId = h->m_hostId; // skip if dead int32_t count = 0; if ( g_hostdb.isDead(hostId) && h->m_wasEverAlive ) { // increment hostnum if that one is dead if ( ++hostNum >= nh ) hostNum = 0; // set these again h = &group[hostNum]; hostId = h->m_hostId; // if all dead, just pick this one i guess if ( ++count >= nh ) break; } // int16_tcut SafeBuf *hbin = &hostBin[hostId]; // if bin is empty initialize if ( hbin->length() == 0 ) { // provide only collection to handleRequest4f() if ( ! hbin->safeMemcpy(cr->m_coll, gbstrlen(cr->m_coll)+1) ) return NULL; // . store the queries we want it to evaluate // . these are null-terminated query strings preceeded // by their corresponding query number in our // m_queryPtrs[] array which pts to a Msg99Reply if ( ! hbin->pushLong(tmq->length())) return NULL; if ( ! hbin->safeMemcpy(tmq)) return NULL; } // store this new docid, which is local to this host if ( ! hbin->pushLongLong(rd->m_docId) ) return NULL; } // shotgun out the msg4f requests now for ( int32_t i = 0 ; ! m_sentMsg4fRequests && i < g_hostdb.getNumHosts() ; i++ ) { // int16_tcut SafeBuf *hbin = &hostBin[i]; // get that host Host *host = g_hostdb.getHost(i); // make a copy for sending out SafeBuf copy; if ( ! copy.safeMemcpy ( hbin ) ) continue; // get the bin copy char *req = copy.getBufStart(); int32_t reqSize = copy.length(); // detach it so udpserver can free it when done transmitting copy.detachBuf (); // free this guy now i guess hbin->purge(); // count as launched m_numMsg4fRequests++; // launch it if ( ! g_udpServer.sendRequest ( req , reqSize, 0x4f , // msgtype host->m_ip , // ip host->m_port , // port host->m_hostId, NULL, // retslot this, gotMsg4fReplyWrapper, 10000 , // timeout -1 , // backoff -1 , // maxwait NULL, // replybuf 0, // replybufmaxsize m_niceness // niceness )) { // let admin know about error log("seopipe: sendRequest 4f had error: %s", mstrerror(g_errno)); // count it as replied then m_numMsg4fReplies++; continue; } } // do not re-send the requests m_sentMsg4fRequests = true; // wait for all replies to come in if ( m_numMsg4fRequests > m_numMsg4fReplies ) return (SafeBuf *)-1; // how can they all be done? all errors! if ( ! g_errno ) { char *xx=NULL;*xx=0; } return NULL; } // remote host will alloc an xmldoc, about 1MB each... #define MAX_OUT_MSG20S 30 // . like getRelatedDocIds() but with titles, etc. // . return a list of competiting docids/titles/etc. SafeBuf *XmlDoc::getRelatedDocIdsWithTitles ( ) { setStatus ( "getrelateddocidswithtitles" ); // try to set from cachedb record if ( ! checkCachedb() ) return (SafeBuf *)-1; if ( m_relatedDocIdsWithTitlesValid ) return &m_relatedDocIdBuf; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; SafeBuf *rdbuf = getRelatedDocIdsScored(); if ( ! rdbuf || rdbuf == (void *)-1 ) return rdbuf; int32_t numRelated = rdbuf->length() / sizeof(RelatedDocId); // now look up each docid in titledb and store the url title // into m_relatedTitleBuf safebuf and set the RelatedDocId:: // rd_title_off and rd_url_off into that when done. 
store offsets for // now and make into full out ptrs when done in case the // m_relatedTitleBuf reallocs. if ( ! m_msg20Buf.length() ) { int32_t need = sizeof(Msg20) * MAX_OUT_MSG20S ; if ( ! m_msg20Buf.reserve ( need,"m20buf" ) ) return NULL; // mark it all in use m_msg20Buf.setLength(need); // init them Msg20 *mp = (Msg20 *)m_msg20Buf.getBufStart(); int32_t numMsg20s = m_msg20Buf.length()/sizeof(Msg20); for ( int32_t i = 0 ; i < numMsg20s ; i++ ) mp[i].constructor(); // reset cursor to start with first related docid m_rdCursor = 0; m_relatedDocIdError = 0; m_numMsg20Replies = 0; } // point to buffer of related docids RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart();; Msg20 *mp = (Msg20 *)m_msg20Buf.getBufStart(); int32_t numMsg20s = m_msg20Buf.length()/sizeof(Msg20); // scan the msg20s we allocated to see if any got a reply for ( int32_t i = 0 ; i < numMsg20s ; i++ ) { // int16_tcut Msg20 *msg20 = &mp[i]; // skip if never launched if ( ! msg20->m_launched ) continue; // skip if it is in progress, awaiting its reply if ( msg20->m_inProgress ) continue; // get the reply from it (might be NULL iff g_errno is set) Msg20Reply *reply = msg20->getReply(); // m_r // get the corresponding related docid int32_t hisCursor = msg20->m_hack2; // int16_tcut RelatedDocId *rd = &rds[hisCursor]; // ok, it has a reply. could be NULL if g_errno was set. if ( ! setRelatedDocIdInfoFromMsg20Reply ( rd , reply ) ) m_relatedDocIdError = g_errno; // reset it for later us... or not... msg20->reset(); // count reply as back now m_numMsg20Replies++; } // launch more if we can. one launch per msg20. for ( int32_t i = 0 ; i < numMsg20s ; i++ ) { // no more related docids left to launch? if ( m_rdCursor >= numRelated ) break; // int16_tcut Msg20 *msg20 = &mp[i]; // skip if already launched/inuse if ( msg20->m_inProgress ) continue; // get current related docid RelatedDocId *rd = &rds[m_rdCursor]; // make the request Msg20Request req; //req.ptr_coll = cr->m_coll; //req.size_coll = gbstrlen(cr->m_coll)+1; req.m_collnum = cr->m_collnum; req.m_docId = rd->m_docId; req.m_expected = true; req.m_niceness = m_niceness; req.m_state = m_masterState; req.m_callback2 = m_masterLoop; // do not get summary stuff. too slow. req.m_numSummaryLines = 0; // if it has an outlink to our site/domain set // Msg20Reply::m_hasLinkToOurDomOrHost req.m_ourHostHash32 = getHostHash32a(); req.m_ourDomHash32 = getDomHash32(); // store cursor in msg20 itself so we know what rd it's using msg20->m_hack2 = m_rdCursor; // advance cursor!!! m_rdCursor++; // launch it if ( ! msg20->getSummary ( &req ) ) continue; // it did not block... wtf? g_errno might be set. ENOMEM? if ( ! setRelatedDocIdInfoFromMsg20Reply ( rd , NULL ) ) m_relatedDocIdError = g_errno; // reset it msg20->reset(); // count reply as back now m_numMsg20Replies++; // it is not launched i--; } // wait for one reply per related docid if ( m_numMsg20Replies < numRelated ) return (SafeBuf *)-1; // call msg20 destructor for ( int32_t i = 0 ; i < numMsg20s ; i++ ) { Msg20 *msg20 = &mp[i]; msg20->destructor(); } // purge the mem they used m_msg20Buf.purge(); // now we are done m_relatedDocIdsWithTitlesValid = true; m_relatedTitleBufValid = true; // store it in cachedb if ( ! storeRelatedDocIdsIntoCachedb( )) return (SafeBuf *)-1; return &m_relatedDocIdBuf; } bool XmlDoc::setRelatedDocIdInfoFromMsg20Reply ( RelatedDocId *rd , Msg20Reply *reply ) { // get error. g_errno can be ENOTFOUND if titlerec not found int32_t error = g_errno; // . or could be EDOCBANNED/EDOCFILTERED etc. // . 
if reply is NULL then g_errno MUST be set if ( ! error ) error = reply->m_errno; // int16_tcuts char *urlStr = NULL; char *titleStr = NULL; char *siteStr = NULL; if ( reply ) { urlStr = reply->ptr_ubuf; titleStr = reply->ptr_tbuf; siteStr = reply->ptr_site; } // did that fail? i.e. docid not found!?!?! if ( error ) { // . just skip this asshole then // . might be EDOCBANNED or EDOCFILTERED! // . some are filtered because they are domain-only urls // which should not be in the index because we force // a "www." prepend on all urls now. log("seo: msg20 reply for docid=%"INT64" url=%s had " "error: %s", rd->m_docId,urlStr,mstrerror(error)); // clear that g_errno = 0; ignoreRelatedDocId: // mark them offsets as not-founds rd->rd_title_off = -1; rd->rd_url_off = -1; rd->rd_site_off = -1; return true; } // bar facebook.com and twitter.com roots... too popular for all! // was coming up for jezebelgallery.com if ( strcmp(urlStr,"http://www.twitter.com/") == 0 ) goto ignoreRelatedDocId; if ( strcmp(urlStr,"https://www.twitter.com/") == 0 ) goto ignoreRelatedDocId; if ( strcmp(urlStr,"http://www.facebook.com/") == 0 ) goto ignoreRelatedDocId; // "/home.php?" or "home.*" if ( strncmp(urlStr,"http://www.facebook.com/home.",29) == 0 ) goto ignoreRelatedDocId; if ( strcmp(urlStr,"https://www.facebook.com/") == 0 ) goto ignoreRelatedDocId; if ( strcmp(urlStr,"http://www.cnn.com/video/") == 0 ) goto ignoreRelatedDocId; // fix robothits.com competitor pages if ( strcmp(urlStr,"http://www.google.com/") == 0 ) goto ignoreRelatedDocId; if ( strcmp(urlStr,"http://www.msn.com/") == 0 ) goto ignoreRelatedDocId; // null means no title i guess if ( ! titleStr ) titleStr = ""; // or if he links to us if ( reply->m_hasLinkToOurDomOrHost ) { log("seo: related docid=%"INT64" url=%s links to our domain", reply->m_docId, urlStr); goto ignoreRelatedDocId; } // store title int32_t titleOffset = m_relatedTitleBuf.length(); if ( ! m_relatedTitleBuf.safeStrcpy ( titleStr ) ) return false; m_relatedTitleBuf.pushChar('\0'); // then url int32_t urlOffset = m_relatedTitleBuf.length(); if ( ! m_relatedTitleBuf.safeStrcpy ( urlStr ) ) return false; m_relatedTitleBuf.pushChar('\0'); // then site int32_t siteOffset = m_relatedTitleBuf.length(); if ( ! m_relatedTitleBuf.safeStrcpy ( siteStr ) ) return false; m_relatedTitleBuf.pushChar('\0'); // then linkinfo //int32_t linkInfo1Offset = m_relatedTitleBuf.length(); //if(!m_relatedTitleBuf.safeMemcpy(info1,info1->getSize()))return NULL; // store as offset for easy serialization for storage into cachedb //rd->m_linkInfo1Offset = linkInfo1Offset; rd->m_relatedFirstIp = reply->m_firstIp; rd->m_relatedCurrentIp = reply->m_ip; rd->m_rd_siteRank = reply->m_siteRank; rd->m_rd_langId = reply->m_language; rd->m_rd_siteHash32 = 0; if ( reply->ptr_site ) rd->m_rd_siteHash32 = hash32n ( reply->ptr_site ); // record the offsets of title/url/site in the m_relatedTitleBuf rd->rd_title_off = titleOffset; rd->rd_url_off = urlOffset; rd->rd_site_off = siteOffset; SafeBuf *rdbuf = getRelatedDocIds(); int32_t numRelated = rdbuf->length() / sizeof(RelatedDocId); // log out for debug log(LOG_DEBUG, "seopipe: related docid (%"INT32"of%"INT32") docid=%"INT64" score=%f " "title=\"%s\" url=\"%s\"", m_numMsg20Replies, numRelated-1, rd->m_docId, rd->m_relatedWeight, titleStr, urlStr); return true; } /* HashTableX *XmlDoc::getMatchingQueryHashTable ( ) { setStatus ( "getmatchingqueryhashtable" ); if ( m_queryHashTableValid ) return &m_queryHashTable; SafeBuf *qpbuf = getMatchingQueries(false); if ( ! 
qpbuf || qpbuf == (void *)-1) return (HashTableX *)qpbuf; // how many queries do we have that match this url? Msg99Reply **qptrs = (Msg99Reply **)qpbuf->getBufStart(); int32_t numQueryPtrs = qpbuf->length() / sizeof(Msg99Reply *); // init it if ( ! m_queryHashTable.set(8, 0, numQueryPtrs*4, NULL, 0, false, m_niceness, "qdht") ) return NULL; for ( int32_t i = 0 ; i < numQueryPtrs ; i++ ) { // cast it Msg99Reply *qp = qptrs[i]; // int16_tcut int64_t eh64 = qp->m_queryInfo.m_queryExactHash64; // hash it up if ( ! m_queryHashTable.addKey ( &eh64 ) ) return NULL; } // all done m_queryHashTableValid = true; return &m_queryHashTable; } */ /* HashTableX *XmlDoc::getMatchingQueryOffsetTable ( ) { setStatus ( "getmatchingqueryoffsettable" ); if ( m_queryOffsetTableValid ) return &m_queryOffsetTable; SafeBuf *qkbuf = getMatchingQueryBuf(); if ( ! qkbuf || qkbuf == (void *)-1) return (HashTableX *)qkbuf; // how many queries do we have that match this url? //Msg99Reply **qptrs = (Msg99Reply **)qpbuf->getBufStart(); //int32_t numQueryPtrs = qpbuf->length() / sizeof(Msg99Reply *); QueryLink *qks = (QueryLink *)qkbuf->getBufStart(); int32_t nks = qkbuf->length()/sizeof(QueryLink); // init it if ( ! m_queryOffsetTable.set(8, 0, nks*4, NULL, 0, false, m_niceness, "qdot") ) return NULL; for ( int32_t i = 0 ; i < nks ; i++ ) { // cast it QueryLink *qk = &qks[i]; // int16_tcut //int64_t eh64 = qp->m_queryInfo.m_queryExactHash64; int64_t eh64 = qp->m_replyingHostId; eh64 <<= 32; eh64 |= qp->m_qbufOffset; // hash it up if ( ! m_queryOffsetTable.addKey ( &eh64 ) ) return NULL; } // all done m_queryOffsetTableValid = true; return &m_queryOffsetTable; } //static char *s_base = NULL; // related QUERY compate int qp99relatedCmp ( const void *a, const void *b ) { // these are offsets //int32_t offa = *(int32_t *)a; //int32_t offb = *(int32_t *)b; QueryLink *qa = *(QueryLink **)a; QueryLink *qb = *(QueryLink **)b; // make sure manually added queries are on top //if ( qa->m_isManuallyAdded && ! qb->m_isManuallyAdded ) return 1; //if ( qb->m_isManuallyAdded && ! qa->m_isManuallyAdded ) return -1; //QueryInfo *qia = &qa->m_queryInfo; //QueryInfo *qib = &qb->m_queryInfo; // get scores float scorea = qa->m_rq_totalScore; float scoreb = qb->m_rq_totalScore; if ( scorea < scoreb ) return 1; if ( scorea > scoreb ) return -1; //return 0; // let docidsincommon break ties return qb->m_docIdVotes - qa->m_docIdVotes; } */ /* static int qlCmp ( const void *a, const void *b ) { QueryLink *qa = (QueryLink *)a; QueryLink *qb = (QueryLink *)b; // let docid break ties int64_t da = qa->getRelatedDocId(s_rdBuf)->m_docId; int64_t db = qb->getRelatedDocId(s_rdBuf)->m_docId; //int64_t da = qa->m_relatedDocId->m_docId; //int64_t db = qb->m_relatedDocId->m_docId; // always niceness 1 i guess QUICKPOLL(1); if ( da > db ) return 1; // 1 means to swap! 
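// (qsort() comparator convention, for clarity: a positive return value
//  sorts 'a' after 'b', a negative one sorts it before)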
if ( da < db ) return -1; return 0; } */ #include <math.h> // sqrtf() // now we can do square roots in gdb by calling this float gbsqrt ( float x ) { return sqrtf(x); } /* // sort the related query links intersected buf by docid QueryLink *ptrs; ptrs = (QueryLink *)m_relatedQueryLinksIntersected.getBufStart(); int32_t nk = m_relatedQueryLinksIntersected.length() / sizeof(QueryLink); qsort ( ptrs , nk, sizeof(QueryLink), qlCmp ); // show time int64_t now = gettimeofdayInMilliseconds(); int64_t took = now - start; log("seopipe: time: relatedqueryintersection took %"INT64" ms",took); */ /* void XmlDoc::gotMsg98Reply ( UdpSlot *slot ) { // get replying hostid int32_t hostId = slot->m_hostId; // log setStatus ( "gotmsg98reply" ); // sanity if ( hostId < 0 || hostId >= g_hostdb.m_numHosts) {char*xx=NULL;*xx=0;} // point to it char *p = slot->m_readBuf; char *pend = p + slot->m_readBufSize; // shortcuts QueryLink *qks = (QueryLink *)m_tmpBuf5.getBufStart(); // sanity, i guess if oom int32_t maxLinkOff = m_tmpBuf5.length() ; maxLinkOff /= sizeof(QueryLink); // make some space int32_t need = slot->m_readBufSize; if ( ! m_tmpStringBuf5.reserve(need,"rqdbuf") ) { m_msg98ReplyError = g_errno; // do not bother scanning the reply p = pend; } // init table if ( m_qstringTable.m_numSlots == 0 ) { // 1M slots! if ( ! m_qstringTable.set(4,4,1000000,NULL,0,false, m_niceness,"qstrtbl") ) { m_msg98ReplyError = g_errno; // do not bother scanning the reply p = pend; } } //int32_t numQueryLinks = m_relatedQueryLinksIntersected.length() ; //numQueryLinks /= sizeof(QueryLink); // put strings into m_tmpStringBuf5 // parse these strings // maybe index so we can assign to QueryLinks::m_queryStringOffset // maybe include querylink # so we can assign quickly! QueryLink *qk; for ( ; p < pend ; ) { // breathe QUICKPOLL(m_niceness); // offset of query link int32_t queryLinkOff = *(int32_t *)p; p += 4; // crazy? maybe we went oom on m_relatedQueryLinksIntersected if ( queryLinkOff >= maxLinkOff ) { log("seopipe: msg98 reply link off breach %"INT32">=%"INT32"", queryLinkOff,maxLinkOff); m_msg98ReplyError = ENOMEM; break; } // get that QueryLogEntry *qe = (QueryLogEntry *)p; // skip it p += qe->getSize(); // point to it qk = &qks[queryLinkOff]; // do not duplicate query strings! int32_t qh32 = hash32n ( qe->getQueryString() ); int32_t slot = m_qstringTable.getSlot ( &qh32 ); if ( slot >= 0 ) { int32_t qeOff; qeOff =*(int32_t *)m_qstringTable.getValueFromSlot(slot); qk->m_queryStringOffset = qeOff; qk->m_queryHostId = -1; continue; } // get offset of string in string buf int32_t stringOff = m_tmpStringBuf5.length(); // store good serp score if ( ! m_tmpStringBuf5.safeMemcpy(qe,qe->getSize() ) ) { m_msg98ReplyError = g_errno; break; } // add to table if ( ! m_qstringTable.addKey(&qh32,&stringOff) ) { m_msg98ReplyError = g_errno; break; } // show it //log("seopipe: DEBUG. mapped remote off %"INT32" (hostid%"INT32") to " // "local off %"INT32" (%s)" // ,qk->m_queryStringOffset,qk->m_queryHostId,stringOff,qstr); // . save string offset // . THIS OVERWRITES the g_qbuf offset that was in there!!! qk->m_queryStringOffset = stringOff; // to indicate that this QueryLink::m_queryStringOffset is now // an offset into m_relatedQueryStringBuf and no longer an // offset into g_qbuf of the specific hostid, we set hostid // to -1 qk->m_queryHostId = -1; } // steal it so it doesn't free it //slot->m_readBuf = NULL; // inc the counter m_numMsg98Replies++; // return control to transmit function. it will call m_callback1 // if the function is done.
but if a different parent function than // transmit called us then we call that. it just depends on the // intial entry function that called getMatchingQueries() m_masterLoop ( m_masterState ); } static void gotMsg3fReplyWrapper ( void *state , void *state2 ) { XmlDoc *THIS = (XmlDoc *)state; //Multicast *m = (Multicast *)state2; Bin *bin = (Bin *)state2; THIS->gotMsg3fReply ( bin ); // m } */ static int mtCmp ( const void *a, const void *b ) { MissingTerm *wa = *(MissingTerm **)a; MissingTerm *wb = *(MissingTerm **)b; if ( wb->m_importance > wa->m_importance ) return 1; // swap if ( wb->m_importance < wa->m_importance ) return -1; if ( wb->m_votes > wa->m_votes ) return 1; // swap if ( wb->m_votes < wa->m_votes ) return -1; if ( (int64_t)b < (int64_t)a ) return 1; // swap if ( (int64_t)b > (int64_t)a ) return -1; return 0; } // . called by getMissingTermBuf() and getMatchingTermBuf() // . returns false and sets g_errno on error bool XmlDoc::addTermsFromQuery ( char *qstr, uint8_t queryLangId, int32_t gigablastTraffic, int32_t googleTraffic2, //QueryLogEntry *qe , int32_t hackqoff, SafeBuf *tmpBuf , HashTableX *scoreTable , HashTableX *topTermsTable , float imp, // importance bool isRelatedQuery ) { // sanity if ( hackqoff < 0 ) { char *xx=NULL;*xx=0; } // print query but bold-face the terms our doc has not Query qq; //SafeBuf *rqsb = &m_relatedQueryStringBuf; // doQueryExpansion = false //char *qstr = qe->getQueryString ( ); qq.set2 ( qstr , queryLangId , false ); int32_t lastStart = -1; for ( int32_t k = 0 ; k < qq.m_numWords ; k++ ) { QUICKPOLL(m_niceness); QueryWord *qw = &qq.m_qwords[k]; int32_t tid32 = qw->m_wordId & 0xffffffff; // is it not contained by our doc if ( ! tid32 ) continue; // skip if we contain it already if ( isRelatedQuery && topTermsTable->isInTable ( &tid32 ) ) continue; // skip if common word like "on" "at" etc. if ( isCommonQueryWordInEnglish(tid32) ) continue; // get start of wikipedia phrase it is in int32_t start = qw->m_wikiPhraseStart; int32_t nwk = qw->m_numWordsInWikiPhrase; // if not in wiki phrase at all, just use single word if ( qw->m_wikiPhraseId == 0 ) { start = k; nwk = 1; } // do not re-do any words in here if ( start == lastStart ) continue; lastStart = start; // hash each wordid in the term into the th64 hash int64_t th64 = 0LL; //int32_t alnumWordCount = 0; for ( int32_t j = start ; j < start + nwk ; j++ ) { // int16_tcut QueryWord *qw = &qq.m_qwords[j]; // skip punct if ( qw->m_wordId == 0 ) continue; // hash otherwise th64 ^= qw->m_wordId; // count it //alnumWordCount++; } // get traffic of related query int32_t traffic = gigablastTraffic; // make gb traffic into google monthly traffic traffic *= GB_TRAFFIC_MODIFIER; // ues google numbers if we have them, more accurate int32_t googleTraffic = googleTraffic2; if ( googleTraffic >= 0 ) traffic = googleTraffic; // now score that term int32_t slot = scoreTable->getSlot ( &th64 ); if ( slot >= 0 ) { int32_t off; off=*(int32_t *)scoreTable->getValueFromSlot(slot); char *base = tmpBuf->getBufStart(); MissingTerm *pt=(MissingTerm *)(base + off); pt->m_importance += imp; pt->m_votes++; pt->m_traffic += traffic; // store first 10 related query strings // we got this term from for ( int32_t x = 1 ; x < 10 ; x++ ) { if ( pt->m_hackQueryOffsets[x] != -1 ) continue; // grab it. querylogentry ptr!! 
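// (each term remembers up to 10 source queries as byte offsets of their
//  QueryLink in the caller's query buffer; -1 marks an unused slot, so we
//  fill the first free slot and stop)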
pt->m_hackQueryOffsets[x] = hackqoff; break; } continue; } // set a class to store in safebuf MissingTerm mt; mt.m_importance = imp; //mt.m_numAlnumWords = alnumWordCount; mt.m_synOf = NULL; mt.m_votes = 1; mt.m_traffic = traffic; mt.m_hackQueryOffsets[0] = hackqoff; // if not a missing term, we are a MATCHING term mt.m_isMissingTerm = isRelatedQuery; // invalidate the remaining 9 query offsets for ( int32_t x = 1 ; x < 10 ; x++ ) mt.m_hackQueryOffsets[x] = -1; int32_t offset = tmpBuf->length(); int32_t toCopy = sizeof(MissingTerm); if ( ! tmpBuf->safeMemcpy(&mt,toCopy)) return false; // for calculating length of stored term string int32_t startLen = tmpBuf->length(); // . if first time in scoretable, add stuff // . store the string, each word separately for ( int32_t j = start ; j < start + nwk ; j++ ) { // int16_tcut QueryWord *qw = &qq.m_qwords[j]; // point to word as string char *str = qw->m_word; int32_t len = qw->m_wordLen; // make all punct a space if ( qw->m_wordId == 0 ) { str = " "; len = 1; } // store term string after MissingTerm class if ( ! tmpBuf->safeMemcpy(str,len) ) return false; } tmpBuf->pushChar('\0'); // record MissingTerm::m_termSize int32_t delta = tmpBuf->length() - startLen; char *base = tmpBuf->getBufStart(); MissingTerm *pmt = (MissingTerm *)(base + offset); pmt->m_termSize = delta; // now score table entry if ( ! scoreTable->addKey ( &th64 , &offset ) ) return false; } return true; } // this is used to sort the MissingTerm instances in a safeBuf, // missingTermBuf. it is also used to sort the Matching terms from // getMatchingTermBuf() as well now! bool XmlDoc::sortTermsIntoBuf ( HashTableX *scoreTable , SafeBuf *tmpBuf , SafeBuf *missingTermBuf ) { // make ptrs for sorting int32_t numTerms = scoreTable->getNumUsedSlots(); int32_t need = numTerms * 4; SafeBuf ptrBuf; if ( ! ptrBuf.reserve ( need ,"srtbuf") ) return false; char *p = tmpBuf->getBufStart(); char *pend = tmpBuf->getBuf(); for ( ; p < pend ; ) { MissingTerm *mt = (MissingTerm *)p; p += mt->getSize(); ptrBuf.pushPtr ( mt ); } gbqsort ( ptrBuf.getBufStart(), numTerms, sizeof(MissingTerm *), mtCmp, m_niceness); // now write the missingTerm instances into m_missingTermBuf int32_t need2 = tmpBuf->length(); if ( ! missingTermBuf->reserve ( need2 ,"mtbuf") ) return false; // now write back into the real buf MissingTerm **pp = (MissingTerm **)ptrBuf.getBufStart(); for ( int32_t i = 0 ; i < numTerms ; i++ ) { MissingTerm *mt = pp[i]; missingTermBuf->safeMemcpy ( mt , mt->getSize() ); } return true; } // . now this uses the related queries // . use logic from getInsertableTerms()!!! SafeBuf *XmlDoc::getMissingTermBuf ( ) { // try to set from cachedb record if ( ! checkCachedb() ) return (SafeBuf *)-1; if ( m_missingTermBufValid ) return &m_missingTermBuf; SafeBuf *qkbuf = getRelatedQueryBuf (); if ( ! qkbuf || qkbuf == (void *)-1 ) return qkbuf; HashTableX *topTermsTable = getTermIdBufDedupTable32(); if ( ! topTermsTable || topTermsTable == (void *)-1 ) return (SafeBuf *)topTermsTable; SafeBuf tmpBuf; if ( ! tmpBuf.reserve ( 100000 ,"t3buf" ) ) return NULL; // maps 64-bit term hash (can be multiple words in a term) to // an offset into tmpBuf. HashTableX scoreTable; if ( ! 
scoreTable.set(8,4,1024,NULL,0,false,m_niceness,"mttst") ) return NULL; // // taken from seo.cpp's printRelatedQueries() function // //int32_t *qrOffs = (int32_t *)relBuf->getBufStart(); //int32_t numRels = relBuf->length() / sizeof(int32_t); //char *base = m_queryRelBuf.getBufStart(); //SafeBuf *rqsb = &m_relatedQueryStringBuf; int32_t nks = qkbuf->length() / sizeof(QueryLink); QueryLink *qks = (QueryLink *)qkbuf->getBufStart(); int32_t i; for ( i = 0 ; i < nks ; i++ ) { QUICKPOLL(m_niceness); // stop at 300? //if ( i >= 300 ) break; QueryLink *qk = &qks[i]; int32_t qkOff = (char *)qk - qkbuf->getBufStart(); //int32_t relOff = qrOffs[i]; //QueryRel *rel = (QueryRel *)(base+relOff); // skip if not head of a linked list if ( ! qk->m_isFirst ) continue; QueryLogEntry *qe ; qe = qk->getQueryLogEntry(&m_relatedQueryStringBuf); // relative to rqsb! m_relatedQueryStringBuf float imp = qk->m_totalQueryImportance; // modify by unique round? not yet... //imp -= rel->m_uniqueRound * 1000; // now use this function if ( ! addTermsFromQuery ( qe->getQueryString() , qe->m_langId, qe->m_gigablastTraffic, qe->m_googleTraffic, qkOff, // hackqoff &tmpBuf , &scoreTable , topTermsTable , imp , true ) ) // is related query? return NULL; } // sort MissingTerms from tmpBuf into m_missingTermBuf by // MissingTerm::m_importance if ( ! sortTermsIntoBuf ( &scoreTable, &tmpBuf, &m_missingTermBuf ) ) return NULL; m_missingTermBufValid = true; //m_numMissingTerms = i; // store it //if ( ! storeMissingTermBufIntoCachedb() ) // return (SafeBuf *)-1; return &m_missingTermBuf; } // . now get the best terms from our matching queries // . basically the exact same algo as getMissingTermBuf SafeBuf *XmlDoc::getMatchingTermBuf ( ) { // try to set from cachedb record if ( ! checkCachedb() ) return (SafeBuf *)-1; if ( m_matchingTermBufValid ) return &m_matchingTermBuf; SafeBuf *mq = getMatchingQueryBuf(); if ( mq == NULL || mq == (void *)-1 ) return mq; HashTableX *topTermsTable = getTermIdBufDedupTable32(); if ( ! topTermsTable || topTermsTable == (void *)-1 ) return (SafeBuf *)topTermsTable; // tmpBuf will hold the MissingTerms we add. SafeBuf tmpBuf; if ( ! tmpBuf.reserve ( 100000 ,"t4buf") ) return NULL; // maps 64-bit term hash (can be multiple words in a term) to // an offset into tmpBuf. tmpBuf holds the missing terms, so we // use scoreTable to accumulate MissingTerm::m_importance for // the same term in different queries. HashTableX scoreTable; if ( ! scoreTable.set(8,4,1024,NULL,0,false,m_niceness,"mttst") ) return NULL; // scan the queries this doc matches and add MissingTerms for them // into tmpBuf int32_t nks = mq->length() / sizeof(QueryLink); QueryLink *qks = (QueryLink *)mq->getBufStart(); int32_t i; for ( i = 0 ; i < nks ; i++ ) { QUICKPOLL(m_niceness); QueryLink *qk = &qks[i]; // stop at 300? if ( i >= 300 ) break; // "matching terms" have different hackqoff than missing terms int32_t qkOff = (char *)qk - mq->getBufStart(); // relative to rqsb! m_relatedQueryStringBuf float imp = qk->m_queryImportance; // querylogentry does not have string info here! it is // just the basic class QueryLogEntry *qe ; qe = qk->getQueryLogEntry(&m_matchingQueryStringBuf); // . now use this function if ( ! addTermsFromQuery ( qe->getQueryString(), qe->m_langId, qe->m_gigablastTraffic, qe->m_googleTraffic, qkOff, // hackqoff &tmpBuf , &scoreTable , topTermsTable , imp , false ) ) // is related query? return NULL; } // sort MatchingTerms from tmpBuf into m_matchingTermBuf by // MatchingTerm::m_importance if ( ! 
sortTermsIntoBuf ( &scoreTable, &tmpBuf, &m_matchingTermBuf ) ) return NULL; m_matchingTermBufValid = true; //m_numMatchingTerms = i; // store it //if ( ! storeMatchingTermBufIntoCachedb() ) // return (SafeBuf *)-1; return &m_matchingTermBuf; } /* // . max # of outstanding msg3f requests we can send to one host // . now just make it 1 since it is msg3f NOT msg39 #define MAXOUT 1 //#define BINSIZE 100000 class Bin { public: // the current position for adding queries into m_buf int32_t m_cursor; int32_t m_maxCursor; int32_t m_allocSize; // some hack storage Host *m_hackHost; bool m_hackIsMsg99ReplyPtr; // for sending the m_buf to its host Multicast m_mcast; // allocates size of BINSIZE bytes char m_buf[0]; }; // . returns false and sets g_errno on error // . returns true on successful launch of request, it will block always bool XmlDoc::sendBin ( int32_t i ) { Bin *bin = m_currentBinPtrs[i]; // get host Host *h = g_hostdb.getHost(i); // copy it //int32_t reqSize = p - tmpBuf; //char *req = mdup ( tmpBuf , reqSize , "3freq" ); //if ( ! req ) return true; // increment outstanding requests he has h->m_numOutstandingRequests++; // this could be a ptr to a msg99reply or a querylink Multicast *mcast = &bin->m_mcast; //bin->m_hackxd = this; //bin->m_hackPtrCursor = firstPtrCursor; bin->m_hackHost = h; // get his group id uint32_t groupId = h->m_groupId; char *req = bin->m_buf; int32_t reqSize = bin->m_cursor; // disown it so mcast can free it when its udpslot is destroyed m_currentBinPtrs[i] = NULL; // note that setStatus("launching msg3f"); // log it too //log("seopipe: launching msg3f request of %"INT32" gbdocid queries to " // "score to host %"INT32"", queryCount,h->m_hostId); // get the docIds for this query using msg3f.cpp's handleRequest3f() bool status = mcast->send ( req , reqSize, 0x3f , false, // mcast frees request? no!!! groupId, // group to send to false, // send to whole group? 0 , // query hash for host in group select this , // state1 bin,//mcast, // state2 gotMsg3fReplyWrapper, 86401, // timeout in seconds. LONG TIME! m_niceness, false, // realtime? h->m_hostId // firsthostid to try ); // mark it out m_numMsg3fRequests++; // if this is true then it was a success and we BLOCKED if ( status ) { // must BE IN USE! if ( ! mcast->m_inUse ) { char *xx=NULL;*xx=0; } // success return true; } // it came back? m_numMsg3fReplies++; // undo this h->m_numOutstandingRequests--; // errno should be set if ( ! g_errno ) { char *xx=NULL;*xx=0; } // set error m_binError = g_errno; // note it log("seopipe: mcast had error: %s", mstrerror(g_errno)); // free that bin i guess mfree ( bin , bin->m_allocSize, "delbin" ); // return false on error return false; } // . this is called from two places: // 1. getMatchingQueriesScored() (Msg99Reply ptrs) // 2. getRelatedQueryBuf() (QueryLink ptrs) // . this can take Msg99Reply ptrs or it can take QueryLink ptrs // . it will glean the docid from either of these two ptrs types as well // as glean the pointer to the query string. // . THEN it can create a 'gbdocid:xxxx | ' query which // it will send to a host in the network. // . it will try to keep each host in the network answering 5 such queries // at any one time. bins are no longer used. // . we need to implement heavy termlist caching remotely and locally to // ensure optimal speed // . returns false if blocked, true otherwise // . 
returns true with g_errno set on error bool XmlDoc::scoreDocIdRestrictedQueries ( Msg99Reply **replyPtrs , QueryLink *linkPtrs , int32_t numPtrs ) { //log("debug: entered scoredocidrestrictedqueries"); if ( numPtrs == 0 ) return true; // . sanity check // . you can only score your Msg99Replies or your QueryLinks // . score your Msg99Replies for queries that match the main url // . score your QueryLinks for queries that match a related docid if ( ! replyPtrs && ! linkPtrs ) { char *xx=NULL;*xx=0; } if ( replyPtrs && m_setForReplyPtrs ) return true; if ( linkPtrs && m_setForLinkPtrs ) return true; // we now send the termlistbuf to each host receiving a msg3f // request so when it performs the msg39 on a query we provide it // will set QueryTerm::m_posdbListPtr to point to the termlists we // provided only, just for this docid SafeBuf *termListBuf = NULL; if ( ! linkPtrs ) { termListBuf = getTermListBuf(); if ( ! termListBuf ) return true; if ( termListBuf==(void *)-1 ) return false; } // force to ten for debug //numPtrs = 20; sendLoop: // // cleanup if got all replies we can // if ( m_numMsg3fReplies == m_numMsg3fRequests && ((m_qcursor >= numPtrs) || m_binError) ) { //log("debug: cleanup"); // there might be remnant bins if we stopped trying to // call sendBin because we hit m_binError for ( int32_t i = 0 ; i < g_hostdb.getNumHosts() ; i++ ) { // see if that bin is still around Bin *bin = m_currentBinPtrs[i]; if ( ! bin ) continue; // this will core if the multicast is in use bin->m_mcast.destructor(); // now nuke it then mfree ( bin , bin->m_allocSize, "delbin" ); // now make it null m_currentBinPtrs[i] = NULL; } // nuke this too! if ( m_newxd2 ) { mdelete ( m_newxd2 , sizeof(XmlDoc) , "newxd2" ); delete ( m_newxd2 ); m_newxd2 = NULL; } // free table's mem if used m_tmpDupTable.reset(); // do not repeat this logic! if ( replyPtrs ) { m_setForReplyPtrs = true; m_binErrorForReplyPtrs = m_binError; } if ( linkPtrs ) { m_setForLinkPtrs = true; m_binErrorForLinkPtrs = m_binError; } // inherit error? pass it on to caller //if ( m_binError ) g_errno = m_binError; // reset for another call to this function since we call // if from two different places above m_numMsg3fRequests = 0; m_numMsg3fReplies = 0; m_qcursor = 0; m_binError = 0; // all done! g_errno = 0; return true; } // int16_tcut char *base = m_tmpStringBuf5.getBufStart(); if ( ! m_langIdValid ) { char *xx=NULL;*xx=0; } // store the queries in our buffer into the various bins and send // a bin off when it gets full queryLoop: // breathe QUICKPOLL(m_niceness); // nothing left to do except wait for replies? if ( m_qcursor >= numPtrs ) return false; // assume ptr is good bool good = true; // set these int64_t docId; // the query as a string char *qstr = NULL; // for passing to mcast::m_hackQPtrs void *vptr; // get the ith QueryLink? if ( linkPtrs ) { QueryLink *qk = &linkPtrs[m_qcursor]; // skip if was not successfully processed above // because it's hostid was dead perhaps? if ( qk->m_queryHostId != -1 ) good = false; // get from related docid in this case SafeBuf *rdbuf = &m_relatedDocIdBuf; if ( ! m_relatedDocIdBufValid ) { char *xx=NULL;*xx=0; } RelatedDocId *rd = qk->getRelatedDocId(rdbuf); docId = rd->m_docId; // get it QueryLogEntry *qe ; qe = (QueryLogEntry *)(qk->m_queryStringOffset + base); // and this. skip over goodserpscore, gigablastTraffic and // googleTraffic qstr = qe->getQueryString(); // save it vptr = qk; } // make a new one for the first time if ( linkPtrs && ! m_newxd2 ) { try { m_newxd2 = new ( XmlDoc ); } catch ( ... 
) { g_errno = ENOMEM; m_binError = g_errno; goto sendLoop; } mnew ( m_newxd2, sizeof(XmlDoc),"newxd2"); } // set the xmldoc to this new docid, if it is new... if ( linkPtrs && m_newxd2->m_docId != docId ) { // a good stopping point? if ( clientClosedConnection() ) { m_binError = ESOCKETCLOSED; goto sendLoop; } // set it from related doc's docid if ( ! m_newxd2->set3 ( docId ,cr->m_coll, m_niceness ) ) { m_binError = g_errno; goto sendLoop; } // alloc space for tablen if ( m_tmpDupTable.getNumSlots() <= 0 && ! m_tmpDupTable.set ( 8,0,1024,NULL,0,false,m_niceness, "tdtbl") ) { m_binError = g_errno; goto sendLoop; } // must not be in there already! if ( m_tmpDupTable.isInTable ( &docId ) ) { char *xx=NULL;*xx=0; } // add it if ( ! m_tmpDupTable.addKey ( &docId ) ) { m_binError = g_errno; goto sendLoop; } // ensure content is recycled from title rec m_newxd2->m_recycleContent = true; // newxd2 needs to use our master functions. so // anytime one of its internal functions blocks, then // our m_masterLoop will be called // and we'll end up right here again! m_newxd2->m_masterLoop = m_masterLoop; m_newxd2->m_masterState = m_masterState; // only get posdb keys really for this stuff m_newxd2->m_useTitledb = false; m_newxd2->m_useTagdb = false; m_newxd2->m_useClusterdb = false; m_newxd2->m_useSpiderdb = false; m_newxd2->m_useLinkdb = false; // debug log("seopipe: setting newxd2 docid=%"INT64"",docId); } // pump this if ( linkPtrs && ! m_newxd2->m_loaded ) { // . CRAP, blocking here sucks because when this function // is re-entered it can also be from a Msg3f reply // not because this document is back from msg22a... //log("debug: loading newxd2"); // try to set from title rec first. return false if blocks. if ( ! m_newxd2->loadFromOldTitleRec() ) { m_newxd2Blocked = true; //log("debug: newxd2 blocked"); return false; } } // i guess no longer out if ( linkPtrs && m_newxd2->m_loaded ) m_newxd2Blocked = false; //if ( linkPtrs ) // log("debug: newxd2 loaded=%"INT32"",(int32_t)m_newxd2->m_loaded); // sanity check if ( linkPtrs && ! m_newxd2->m_oldTitleRecValid ) { char *xx=NULL;*xx=0; } // . did that fail? i.e. docid not found!?!?! // . do not increment m_qcursor if m_binError is set if ( linkPtrs && ! m_newxd2->m_oldTitleRec && ! m_binError ) { // just skip this asshole then if ( m_lastPrintedDocId != docId ) { log("seopipe: related docid %"INT64" titlerec " "load failed99", docId); } m_lastPrintedDocId = docId; // clear that g_errno = 0; // skip it m_qcursor++; // try the next one goto queryLoop; } if ( linkPtrs ) { // . CRAP, blocking here sucks because when this function // is re-entered it can also be from a Msg3f reply // not because it has the termlistbuf ready // . use termlist buf of related docid // . we need to ENSURE that the QueryLinks are clustered // by related docid so this logic is efficient here termListBuf = m_newxd2->getTermListBuf(); // return false if it blocked if ( termListBuf == (void *)-1 ) { //log("debug: newxd2 blocked in termlistbuf"); m_newxd2Blocked = true; return false; } // this sucks. error! if ( ! termListBuf ) { m_binError = g_errno; goto sendLoop; } } // i guess no longer out if ( linkPtrs ) { //log("debug: newxd2 UNblocked in termlistbuf"); m_newxd2Blocked = false; } // wait for replies to come in so we can stop even if m_qcursor // did not complete its scan! // shit, but what if we are a msg22 coming in for m_newxd2? 
that // is why i moved this check down here so we can set m_newxd2Blocked // to false and allow the msg3f replies to come back in and free // all the bins. this is kinda fucked up because everything is // asynchronous. if ( m_binError ) return false; // otherwise the Msg99Reply if ( ! linkPtrs ) { Msg99Reply *qp = replyPtrs[m_qcursor]; // tis us! docId = m_docId; // sanity if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; } // and query string qstr = qp->m_queryStr; // save it vptr = qp; } int32_t qlen = gbstrlen(qstr); // mark as bad if this query is too big already if ( m_firstUrl.m_ulen + qlen + 10 > MAX_QUERY_LEN ) good = false; // if ptr was bad, do not evaluate at all if ( ! good ) { m_qcursor++; goto queryLoop; } // sanity if ( ! cr->m_coll || ! cr->m_coll[0] ) { char *xx=NULL;*xx=0; } // . get hash of query to determine bin // . this keeps our term freqs consistent since every query goes // back TO THE SAME HOST!!! thus our scores remain consistent. // each host has a slightly different TermFreq/Weight for the // exact same query because the termfreq is based on the termlist // length for that termid. and each host has a different set of // docids in its index for the most part. uint32_t h32 = hash32n ( qstr ); int32_t numHosts = g_hostdb.getNumHosts(); // do not send to host #0 if we got a lot of hosts if ( g_hostdb.getNumHosts() >= 8 ) numHosts--; int32_t hostNum = h32 % numHosts; // skip host #0 which is us i guess! if ( g_hostdb.getNumHosts() >= 8 ) hostNum++; // sanity for that if ( g_hostdb.m_myHost->m_hostId != 0 ) { char *xx=NULL;*xx=0; } // get the current bin for that host Bin *bin = m_currentBinPtrs [ hostNum ]; // alloc on demand if ( ! bin ) { // how big is the termlistbuf? int32_t tsize = termListBuf->length(); int32_t collLen = gbstrlen(cr->m_coll); // how much space do we need for a good bin? int32_t alloc = sizeof(Bin) + 8 +1+ collLen + 1 + tsize + 100000; // make that char *mem = (char *)mmalloc ( alloc ,"binreq" ); if ( ! mem ) { m_binError = g_errno; goto sendLoop; } // cast it bin = (Bin *)mem; // store it m_currentBinPtrs [ hostNum ] = bin; // this includes a Multicast in the Bin bin->m_mcast.constructor(); // for freeing bin->m_allocSize = alloc; // the end of it char *memEnd = mem + alloc; // reset offset into Bin::m_buf bin->m_cursor = 0; // is it to a msg99reply? so the reply handler knows how to // handle mcast::m_hackQPtr and what action to take. it is // slightly different. if ( linkPtrs ) bin->m_hackIsMsg99ReplyPtr = 0; else bin->m_hackIsMsg99ReplyPtr = 1; // . before we add any queries, store langid of QUERY // . crap just use doc langid for now char *bp = bin->m_buf; // first is docid. if doing QueryLinks this is the docid // of the related docid, otherwise, it is that of our main doc *(int64_t *)bp = docId; bp += 8; // then langid *bp = m_langId; bp++; // then the coll gbmemcpy ( bp , cr->m_coll , collLen ); bp += collLen; *bp++ = '\0'; // sanity! if ( bp >= memEnd ) { char *xx=NULL;*xx=0; } // the size of the termlist buf *(int32_t *)bp = tsize; bp += 4; // then the termlistbuf that has all the termlists forour docid gbmemcpy ( bp , termListBuf->getBufStart(), tsize ); bp += tsize; // update bin's cursor bin->m_cursor = bp - bin->m_buf; // for breach detection. send off Bin when breach happens. bin->m_maxCursor = alloc - sizeof(Bin); } // can we store the current query into this bin? bool storeInBin = true; // is there enough room for this query in the bin? 
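// the ~40 bytes of slack beyond the query length covers what gets written
// per query further below: the 4-byte m_qcursor index, the
// "gbdocid:<docid> | " prefix (the docid can run to ~20 digits), and the
// terminating NUL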
int32_t need = qlen + 40; if ( bin->m_cursor + need >= bin->m_maxCursor ) storeInBin = false; // does docid of bin match? int64_t binDocId = *(int64_t *)(bin->m_buf); if ( docId != binDocId ) storeInBin = false; // if we can't store this query into the bin, send it off now if ( ! storeInBin ) { // use its multicast to send this bin off if too full if ( ! sendBin ( hostNum ) ) { m_binError = g_errno; goto sendLoop; } // . now the current bin should have been emptied // . go back to top to realloc Bin::m_buf to hold this query goto queryLoop; } char *p = bin->m_buf + bin->m_cursor; // first store the offset from the buf so we can return it // in the reply which is a list of scores basically and we know // what score goes with what m_qcursor *(int32_t *)p = m_qcursor; p += 4; // now store queries in the request buf for the msg3f p += sprintf(p,"gbdocid:%"UINT64" | %s",docId,qstr); *p++ = '\0'; // update cursor bin->m_cursor = p - bin->m_buf; // skip to next query/docid to evaluate m_qcursor++; // if we have more queries left, add them to bins now if ( m_qcursor < numPtrs ) goto queryLoop; // now send every bin, we have no queries left. for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) { // breathe QUICKPOLL(m_niceness); // skip if empty if ( ! m_currentBinPtrs[i] ) continue; // this will transfer the request buffer over to mcast // so it will be freed when mcast returns sendBin ( i ); } goto sendLoop; } // we got back the score for each query link in // the bin that we sent out for the docid specified in the bin header request void XmlDoc::gotMsg3fReply ( Bin *bin ) { // Multicast *mcast ) { setStatus ( "gotmsg3freply" ); // do some housekeeping Host *h = bin->m_hackHost; h->m_numOutstandingRequests--; m_numMsg3fReplies++; // sanity Multicast *mcast = &bin->m_mcast; if ( mcast->m_inUse ) { char *xx=NULL;*xx=0; } // get the reply bool freeIt = false; int32_t replySize = 0; int32_t replyMaxSize; char *rbuf = mcast->getBestReply ( &replySize , &replyMaxSize , &freeIt , true ); // steal it? // log it too //log("seopipe: got msg3f reply from host %"INT32" size=%"INT32" bytes", // h->m_hostId,replySize); // cast it //Msg3fReply *mr = (Msg3fReply *)rbuf; // in case of mem-leak this helps //if ( rbuf ) relabel(rbuf,replyMaxSize,"xx-rb"); // . we must be able to free it... we must own it // . this is true if we should free it, but we should not have // to free it since it is owned by the slot? if ( freeIt ) { log(LOG_LOGIC,"query: msg3f: Steal failed."); char *xx = NULL; *xx=0; } // if it failed for some reason i guess just bail if ( ! rbuf ) { // clean up the bin and the multicast and the request buffer mfree ( bin , bin->m_allocSize, "delbin" ); g_errno = EBADREPLYSIZE; log(LOG_LOGIC,"seopipe: bad msg3f empty reply"); return; } // reply is just sequence of docid/score pairs char *rp = rbuf; char *rpEnd = rbuf + replySize; //int32_t firstCursor = bin->m_hackPtrCursor; // scan the msg99 replies and insert the scores we got for each // query from the msg3f reply in "rbuf" for ( ; rp < rpEnd ; ) { // breathe QUICKPOLL(m_niceness); // . first is index, what query # in the request are we // processing now, might not be in order because we launch // a bunch of msg39s in parallel in handleRequest3f()'s call // to processQueries() // . but the corresponding msg99reply is reply # "qcursor" int32_t qcursor = *(int32_t *)rp; rp += 4; int64_t docId = *(int64_t *)rp; rp += 8; float score = *(float *)rp; rp += 4; // . 
if this is true that means qcursor is referencing a // msg99reply and we should set the score of that msg99 // reply to what the handlerequest3f provided // . so store the docid and score for our url for this query if ( bin->m_hackIsMsg99ReplyPtr ) { SafeBuf *mqbuf = getMatchingQueries(false,-1); Msg99Reply **qptrs=(Msg99Reply **)mqbuf->getBufStart(); Msg99Reply *qr = qptrs[qcursor]; qr->m_myScore = score; qr->m_myDocId = docId; int32_t numQueryPtrs=mqbuf->length()/sizeof(Msg99Reply *); // if too many skip some if ( numQueryPtrs > 1000 && (qcursor%1000)!=0)continue; // if too many skip some if ( numQueryPtrs > 400 && (qcursor%100) !=0)continue; char *qstr = qr->m_queryStr; log("seopipe: got query #%"INT32"of%"INT32" score=%f qstr=%s" ,qcursor+1 ,numQueryPtrs ,score ,qstr ); continue; } // might be storing in a QueryLink (doing related docids) //SafeBuf *ibuf = getRelatedQueryLinksWithStrings(); QueryLink *qks =(QueryLink *)m_tmpBuf5.getBufStart(); //int32_t numQueryLinks = ibuf->length() / sizeof(QueryLink); QueryLink *qk = &qks[qcursor]; // sanity. make sure qk->m_queryStringOffset is related to our // local m_tmpStringBuf5 and not relative to the // g_qbuf of the hostid that sent back the msg99 reply. if ( qk->m_queryHostId != -1 ) { char *xx=NULL;*xx=0; } // how many related query links do we got? for logging. int32_t nks = m_tmpBuf5.length()/sizeof(QueryLink); // int16_tcuts char *base = m_tmpStringBuf5.getBufStart(); // skip over gigablastTraffic and googleTraffic QueryLogEntry *qe; qe = (QueryLogEntry *)(base + qk->m_queryStringOffset); SafeBuf *rdbuf = &m_relatedDocIdBuf; if ( ! m_relatedDocIdBufValid ) { char *xx=NULL;*xx=0; } RelatedDocId *rd = qk->getRelatedDocId(rdbuf); // note it if ( (qcursor % 1000) == 0 ) // || qcursor < 100 ) log("seopipe: got msg3f reply for related query " "#%"INT32"of%"INT32" " "query \"gbdocid:%"INT64" | %s\" gigablasttraffic=%"INT32" " "googletraffic=%"INT32" serpscore=%f goodscore=%f" ,qcursor+1 ,nks ,rd->m_docId ,qe->getQueryStr() ,qe->m_gigablastTraffic ,qe->m_googleTraffic ,score ,qe->m_topSERPScore // of a docid slice on 1 host ); // // no longer used queryrel! // // if we are scoring QueryLinks then we add a QueryRel //QueryRel qr; // clear that mem to zero //memset ( &qr , 0 , sizeof(QueryRel)); // then add the info we know //qr.m_relatedDocId = qk->m_relatedDocId; //char *base2 = m_relatedDocIdBuf.getBufStart(); //int32_t rdOff = (char *)qk->m_relatedDocId - base2; //qr.m_relatedDocIdOff = rdOff; //qr.m_offsetIntoRelQStrBuf = qk->m_queryStringOffset; //qr.m_myScore = score; //qr.m_nextOff = -1; //qr.m_tailOff = -1; qk->m_serpScore = score; // save that. WHAT IF THIS ERRORS?!?!?! //if ( ! m_queryRelBuf.safeMemcpy(&qr,sizeof(QueryRel)) ) { // m_binError = g_errno; // log("xmldoc: panic. failed to store query rel"); // break; //} // debug test //m_binError = EBADENGINEER; //log("xmldoc: panic2. failed to store query rel"); //break; } // ok, we got the docid and score, now free it mfree ( rbuf , replyMaxSize , "fmsg3f" ); // clean up the bin and the multicast and the request buffer mfree ( bin , bin->m_allocSize, "delbin" ); //if ( m_newxd2Blocked ) // log("debug: got reply, but returning because newxd2 " // "had blocked"); // prevent double entry bug from entering scoreDocIdRestrictedQueries() // from a newxd2 function blocking and coming in through msg22 // callback or whatever, vs. 
coming in from here if ( m_newxd2Blocked ) return; //log("debug: got reply and calling masterloop"); // go back to the transmit function m_masterLoop ( m_masterState ); // if not done, just return... otherwise we double enter // scoreDocIdRestrictedQueries() along with it's call to // getTermListBuf()... and all hell breaks loose return; } */ /* // send contents of m_socketWriteBuf to m_seoSocket void XmlDoc::pumpSocketWriteBuf ( ) { if ( ! m_seoSocket ) return; setStatus ( "pumpsocketwritebuf" ); SafeBuf *sb = &m_socketWriteBuf; // insert http header into m_socketWriteBuf if not there char *wbuf = sb->getBufStart(); bool insertIt = false; if ( ! wbuf ) insertIt = true; if ( wbuf && strncmp(wbuf,"HTTP/1.0 ",9 ) ) insertIt = true; // add http header first if ( insertIt ) { // reset # bytes sent m_socketWriteBufSent = 0; m_registeredSocketCallback = false; // xml-itize each query reply without scoring info sb->insert("HTTP/1.0 200 OK\r\n" "Content-Type: text/xml ; " "charset=utf-8\r\n" "\r\n" "\n",0); } // come back here to do another send sendLoop: // try sending out our xml buffer on the socket // the very first things we do is send the queries over without // the ranking info which we compute by calling msg39 on each query, // so at least we can display something quite quickly. if ( m_socketWriteBufSent < sb->length() ) { int32_t sd = m_seoSocket->m_sd; // just in case if ( m_registeredSocketCallback ) { g_loop.unregisterWriteCallback(sd,this, getSEOQueryInfoWrapper2); m_registeredSocketCallback = false; } // send that off int32_t sendLen = sb->length(); char *sendStr = sb->getBufStart(); char *sendEnd = sendStr + sendLen; // if we sent SOME last time, skip over that sendStr += m_socketWriteBufSent; // how much left? int32_t remaining = sendEnd - sendStr; // wtf? if ( remaining <= 0 ) { char *xx=NULL;*xx=0; } // try a send on non-blocking socket int32_t n = ::send ( sd , sendStr , remaining , 0 ); // did we send something? if ( n > 0 ) { m_socketWriteBufSent += n; goto sendLoop; } // maybe it sent 0 because it was waiting for something // so set our callback for when the socket is ready for // writing again. try sending more later. g_loop.registerWriteCallback ( sd , this , getSEOQueryInfoWrapper2, 0 ); // niceness = 0 // flag it so we don't leak these m_registeredSocketCallback = true; } } */ bool XmlDoc::getIsInjecting ( ) { bool isInjecting = false; //if ( g_inPageInject ) isInjecting = true; if ( m_sreqValid && m_sreq.m_isInjecting ) isInjecting = true; if ( m_isInjecting && m_isInjectingValid ) isInjecting = true; return isInjecting; } int posdbKeyCmp ( const void *a, const void *b ) { char *ka = (char *)a; char *kb = (char *)b; //int64_t tid64a = g_posdb.getTermId(ka); //int64_t tid64b = g_posdb.getTermId(kb); // a bit of a hack so handleRequest8e already has these // guys sorted by their lower 32-bits of termids so it can // match this doc to queries without having to sort first. //uint32_t tid32a = (uint32_t)tid64a; //uint32_t tid32b = (uint32_t)tid64b; //if ( tid32a < tid32b ) return -1; //if ( tid32a > tid32b ) return 1; // swap //if ( tid64a < tid64b ) return -1; //if ( tid64a > tid64b ) return 1; // swap char val = KEYCMP(ka,kb,sizeof(POSDBKEY)); if ( val > 0 ) return 1; if ( val < 0 ) return -1; return 0; } // . used by XmlDoc::getTermListBuf() below // . sorted by posdb key straight up SafeBuf *XmlDoc::getTermIdSortedPosdbListBuf ( ) { if ( m_sortedPosdbListBufValid ) return &m_sortedPosdbListBuf; // get the lists. forDelete = false. 
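// . the meta list fetched below is a packed sequence of rdb records; the
//   parsing loop that follows assumes this per-record layout (inferred
//   from that loop, not from a separate spec):
//     [1 byte]    rdbId, with the high bit used as a flag (masked off)
//     [ks bytes]  the key, ks = getKeySizeFromRdbId(rdbId); a negative key
//                 (low bit of key[0] clear) carries no data
//     [4 bytes]   dataSize, present only when getDataSizeFromRdbId() == -1
//     [dataSize]  the data bytes themselves
//   only positive RDB_POSDB keys get copied into m_sortedPosdbListBuf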
char *metaList = getMetaList ( false ); if ( ! metaList || metaList==(void *)-1 ) return (SafeBuf *)metaList; // sanity if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; } // make a tmp buf to hold posdb keys //SafeBuf tmp; if ( ! m_sortedPosdbListBuf.reserve(m_metaListSize,"spbuf")) return NULL; // point into it char *dst = m_sortedPosdbListBuf.getBufStart(); // debug test //verifyMetaList ( m_metaList , // m_metaList + m_metaListSize , // false ); // scan the meta list for posdb keys char *p = metaList; char *pend = p + m_metaListSize; // stole this loop from getMetaList() for ( ; p < pend ; ) { // breathe QUICKPOLL(m_niceness); // save it with the flag char byte = *p; // get rdbId char rdbId = byte & 0x7f; // skip that p++; // key size int32_t ks = getKeySizeFromRdbId(rdbId); // get key char *key = p; // skip that p += ks; // get datasize int32_t ds = getDataSizeFromRdbId(rdbId); // assume we do not store the datasize //bool neg = false; // . if key is negative, no data is present // . the doledb key is negative for us here if ( (key[0] & 0x01) == 0x00 ) ds = 0; // if datasize variable, read it in if ( ds == -1 ) { // get data size ds = *(int32_t *)p; // skip data size int32_t p += 4; } // point to data //char *data = p; // skip data if not zero p += ds; // if not posdb skip rec if ( rdbId != RDB_POSDB ) continue; // skip negative keys if ( (key[0] & 0x01) == 0x00 ) continue; // add to new buf now gbmemcpy ( dst , key , sizeof(POSDBKEY) ); // advance dst += sizeof(POSDBKEY); } char *start = m_sortedPosdbListBuf.getBufStart(); // update tmp m_sortedPosdbListBuf.incrementLength ( dst - start ); // sanity if ( m_sortedPosdbListBuf.length() > m_metaListSize ) { char *xx=NULL;*xx=0; } // point char *pbuf = m_sortedPosdbListBuf.getBufStart(); int32_t numKeys = m_sortedPosdbListBuf.length()/sizeof(POSDBKEY); // sort keys by termid gbqsort ( pbuf , numKeys, sizeof(POSDBKEY), posdbKeyCmp, m_niceness ); m_sortedPosdbListBufValid = true; return &m_sortedPosdbListBuf; } #define TLBUFSIZE 5000 // . used by the seo pipeline // . this is a list of posdb termlists, one termlist per termid. // . we store each termlist in this termlistbuf into g_termListCache // . we use g_termListCache for evaluating gbdocid:xxx| restricted queries // very quickly without having to hit disk because all the posdb termlists // for that docid should be in g_termListCache SafeBuf *XmlDoc::getTermListBuf ( ) { if ( m_termListBufValid ) return &m_termListBuf; // . ensure content is recycled from title rec // . no, because if we had to download the doc fresh for the first // time, this caused us headaches around line 30657 and we ended // up setting m_docIndexed to false there and calling logIt() twice! //m_recycleContent = true; //m_recycleLinkInfo = true; // try to set from title rec first. return false if it blocked. //if ( ! loadFromOldTitleRec() ) return (SafeBuf *)-1; // did that fail? i.e. docid not found!?!?! //if ( m_oldTitleRecValid && ! m_oldTitleRec ) { // g_errno = ENOTFOUND; // return NULL; //} // only get posdb keys in getMetaList() /* m_useTitledb = false; m_useTagdb = false; m_useClusterdb = false; m_useSpiderdb = false; m_useLinkdb = false; */ // . these are FULL 18-byte keys, no compression // . sorted by posdbkeys straight up, so by termid SafeBuf *posdbBuf = getTermIdSortedPosdbListBuf (); if ( ! posdbBuf || posdbBuf == (void *)-1 ) return posdbBuf; int32_t numKeys = posdbBuf->length() / sizeof(POSDBKEY); // . reserve mem for new termlistbuf // . include 4 bytes for listsize // . 
this buffer will be a list of lists int32_t need = numKeys * (sizeof(POSDBKEY) + 4); if ( ! m_termListBuf.reserve ( need ,"tlstbuf" ) ) return NULL; int64_t lastTermId = -1LL; /* char tmpBuf[TLBUFSIZE]; // build termlists from the posdb records RdbList termList; // stolen from RdbList::set termList.m_list = tmpBuf; termList.m_listSize = 0; termList.m_listEnd = tmpBuf; termList.m_alloc = tmpBuf; termList.m_allocSize = TLBUFSIZE; termList.m_ownData = false; termList.m_ks = sizeof(POSDBKEY); termList.m_fixedDataSize = 0; termList.m_ownData = false; termList.m_useHalfKeys = true; termList.resetListPtr(); bool breakOut = false; */ // start a size bookmark int32_t *bookmark = NULL; // scan all the sorted posdb keys and build posdb termlists and // store the termlists into "m_termListBuf" char *p = posdbBuf->getBufStart(); char *pend = p + posdbBuf->length(); for ( ; p < pend ; ) { // get the key char *key = p; // must be full 18 byte keys! if ( p[0] & 0x06 ) { char *xx=NULL;*xx=0; } // skip it p += sizeof(POSDBKEY); // get key termid int64_t termId = g_posdb.getTermId ( key ); // sanity int64_t docId = g_posdb.getDocId ( key ); if ( docId != m_docId ) { char *xx=NULL;*xx=0; } // sanity. is it sorted by termid? if ( termId < lastTermId && lastTermId == -1 ) { char *xx=NULL;*xx=0; } // log it for debug //if ( docId == 192304365235LL ) // log("tlist: docid=%"INT64" termId=%"INT64" wpos=%"INT32"", // docId, // termId, // g_posdb.getWordPos(key)); // . store size of keys following that have same termid // . assume just one for now! if ( termId != lastTermId ) { bookmark = (int32_t *)m_termListBuf.getBuf(); m_termListBuf.pushLong(sizeof(POSDBKEY)); } // store the key m_termListBuf.safeMemcpy ( key , sizeof(POSDBKEY) ); // if not first in the list, update size if ( termId == lastTermId ) *bookmark += sizeof(POSDBKEY); // . cache currently made list then // . set startkey/endkey //char startKey[sizeof(POSDBKEY)]; //char endKey [sizeof(POSDBKEY)]; //g_posdb.makeStartKey(startKey,lastTermId,m_docId); //g_posdb.makeEndKey (endKey,lastTermId,m_docId); // update it for next list lastTermId = termId; // . add to ongoing list? will use compression bit. // . return true with g_errno set on error // . use g_termListCache in Msg0.cpp //if(!addToTermListCache(cr->m_coll,startKey,endKey,&termList)) // return true; // first store the lits size //m_termListBuf.pushLong(termList.m_listSize); // then the list data itself //m_termListBuf.safeMemcpy(termList.m_list,termList.m_listSize) // now reset //termList.m_listSize = 0; //termList.m_list = tmpBuf; //termList.m_listEnd = tmpBuf;//ermList.m_list; //termList.resetListPtr(); // if we are a loopback, bail //if ( breakOut ) break; // are we the last record? //if ( p >= pend ) breakOut = true; // add fresh to the new termlist //goto addIt; } // sanity if ( m_termListBuf.length() && g_posdb.getDocId(m_termListBuf.getBufStart()+4) != m_docId ) { char *xx=NULL;*xx=0; } m_termListBufValid = true; return &m_termListBuf; // print timing //int64_t now = gettimeofdayInMilliseconds(); //int64_t took = now - m_cacheStartTime; //log("seopipe: took %"INT64" ms to parse docid %"INT64"",took,m_docId); // . flag it as being completely cached now // . returns false and sets g_errno on error //return addDocIdToTermListCache ( m_docId , cr->m_coll ); } //int32_t XmlDoc::getNumInsertableTerms ( ) { // // make sure they called getInsertableTerms() first! // if ( ! m_insertableTermsBufValid ) { char *xx=NULL;*xx=0;} // return m_insertableTermsBuf.length() / sizeof(InsertableTerm); //} // . 
return a list of InsertableTerms // . these are just terms we will try to insert into the document in every // possible place to see how they affect ranking of this document for // all the applicable queries // . then when we call getScoredInsertableTerms() it will fill in the // m_queryChangeBuf array SafeBuf *XmlDoc::getInsertableTerms ( ) { if ( m_insertableTermsBufValid ) return &m_insertableTermsBuf; // make sure related query string buf is valid //SafeBuf *rrr = getRelatedQueryLinksWithStrings(); //if ( ! rrr || rrr == (void *)-1 ) return rrr; // just use this now SafeBuf *mtBuf = getMissingTermBuf(); if ( ! mtBuf || mtBuf == (void *)-1 ) return mtBuf; // get buffer of ptrs to the msg99 replies for this url //SafeBuf *mqbuf = getMatchingQueries ( false ); //if ( ! mqbuf || mqbuf == (void *)-1 ) return mqbuf; // just use the MissingTerm class for these as well!! SafeBuf *maBuf = getMatchingTermBuf(); if ( ! maBuf || maBuf == (void *)-1 ) return maBuf; // // alloc space for the insertable terms in its safebuf // int32_t need = 0; char *p; char *pend; p = mtBuf->getBufStart(); pend = mtBuf->getBuf(); for ( ; p < pend ; ) { MissingTerm *mt = (MissingTerm *)p; p += mt->getSize(); need += sizeof(InsertableTerm); need += mt->getTermSize(); } // these are the matching terms, but use the same MissingTerm class p = maBuf->getBufStart(); pend = maBuf->getBuf(); for ( ; p < pend ; ) { MissingTerm *mt = (MissingTerm *)p; p += mt->getSize(); need += sizeof(InsertableTerm); need += mt->getTermSize(); } if ( ! m_insertableTermsBuf.reserve ( need ,"itblbuf" ) ) return NULL; // // now interleave the matching terms with the related terms // char *p1 = mtBuf->getBufStart(); char *p1End = mtBuf->getBuf(); char *p2 = maBuf->getBufStart(); char *p2End = maBuf->getBuf(); // int16_tcut SafeBuf *ib = &m_insertableTermsBuf; int32_t count; for ( count = 0 ; ; count++ ) { // . just get top 50 insertable terms // . use #define MAX_INSERTABLE_TERMS 50? if ( count >= 50 ) break; bool add1 = false; bool add2 = false; if ( ( count % 2 ) == 0 && p1 < p1End ) add1 = true; if ( ( count % 2 ) == 1 && p2 < p2End ) add2 = true; if ( ! add1 && ! add2 ) break; MissingTerm *mt; if ( add1 ) { mt = (MissingTerm *)p1; p1 += mt->getSize(); } if ( add2 ) { mt = (MissingTerm *)p2; p2 += mt->getSize(); } // make an insertable term InsertableTerm it; if ( add1 ) it.m_isRelatedTerm = true; else it.m_isRelatedTerm = false; // sum of traffic of the queries that contained this term it.m_trafficSum = mt->m_traffic; // hash it up char *term = mt->getTerm(); int32_t termSize = mt->getTermSize(); it.m_termHash64 = hash64 ( term , termSize - 1 ); it.m_termSize = termSize; // reset this for later use it.m_bestTrafficGain = -1; it.m_bestInsertPos = -1; // store that insertable term ib->safeMemcpy(&it,sizeof(InsertableTerm)); // then the term string itself follows for easy serialization // into cachedb... ib->safeMemcpy(term,termSize); } if ( ib->length() > need ) { char *xx=NULL;*xx=0; } //m_numInsertableTerms = count; m_insertableTermsBufValid = true; return &m_insertableTermsBuf; } static void gotMsg95ReplyWrapper ( void *state , UdpSlot *slot ) { XmlDoc *THIS = (XmlDoc *)state; THIS->gotMsg95Reply( slot ); } void XmlDoc::gotMsg95Reply ( UdpSlot *slot ) { // count it m_numMsg95Replies++; // return if still waiting if ( m_numMsg95Replies < m_numMsg95Requests ) return; // . store each msg95reply // . TODO: do we need m_msg95ReplyAlloc[] like m_msg99 has? 
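// . this is the fan-in point for the msg95 requests: we return above until
//   every outstanding request has replied, then fall through and re-enter
//   m_masterLoop below
// . ownership note: the reply is detached from the UdpSlot by NULLing
//   slot->m_readBuf below, so the udp layer will not free it; we keep the
//   pointer/size per replying hostId and free the buffer ourselves later
//   (presumably in reset())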
m_msg95ReplyPtrs [slot->m_hostId] = slot->m_readBuf; m_msg95ReplySizes[slot->m_hostId] = slot->m_readBufSize; // do not let it free it, we will free it slot->m_readBuf = NULL; // all done! should call getScoredInsertableTerms() indirectly m_masterLoop ( m_masterState ); } #include "seo.h" // for Msg95Request class /* // return a buffer of WordFreqInfo instances for every word in the // insertable terms buffer. we use this so the msg95 handler can get the // term freqs of any term in any matching query consistently, because // we are host #0 calling this presumably. msg95 handler will use these // to set the termfreqs in the Msg39Request when calling msg39. // TODO: run through related queries as well! why didn't insertable terms // work!?!?! it should... SafeBuf *XmlDoc::getInsertableWordFreqInfoBuf ( ) { // must always be host 0 or it's twin! we have to ensure // consistency always when calling getTermFreq()... if ( g_hostdb.m_groupId != 0 ) { char *xx=NULL;*xx=0; } if ( m_iwfiBufValid ) return &m_iwfiBuf; // get the same top word ids we pass to the msg95 request, // because handleRequest95() uses those to get the queries // that we match, and it evaluates each of those queries on each // insertion we do. // So that is the ptr_twid32Buf, which MUST include all // insertable terms as well, like those insertable terms that are // new to us!! // scan list of insertable terms SafeBuf *itBuf = getInsertableTerms(); if ( ! itBuf || itBuf == (void *)-1 ) return itBuf; // . true means to get synonyms // . itBuf non-null will append new insertable terms we don't have int32_t *twids = getTopTermsVectorWithNewTerms ( true , itBuf ); if ( ! twids || twids==(void *)-1 ) return (SafeBuf *)twids; // int16_tcut //InsertableTerm *its = (InsertableTerm *)itBuf->getBufStart(); //int32_t ni = itBuf->length() / sizeof(InsertableTerm); // get buffer of ptrs to the msg99 replies for this url //SafeBuf *mqbuf = getMatchingQueries ( false ); //if ( ! mqbuf || mqbuf == (void *)-1 ) return mqbuf; //Msg99Reply **mrp = (Msg99Reply **)mqbuf->getBufStart(); //int32_t nmrp = mqbuf->length() / 4; // use table to dedup so we do not store dups HashTableX dups; if ( ! dups.set ( 8,0,8192,NULL,0,false,m_niceness,"iwfidup") ) return NULL; // . first store the langid in the buf!!! // . then the wordfreqinfos follow! if ( ! m_iwfiBuf.safeMemcpy ( &docLangId , 1 ) ) return NULL; char *p = itBuf->getBufStart(); char*pend = itBuf->getBuf(); // scan each "term" which might be one or more words for ( ; p < pend ; ) { //for ( int32_t i = 0 ; i < nmrp ; i++ ) { QUICKPOLL(m_niceness); // cast it InsertableTerm *it = (InsertableTerm *)p; p += it->getSize(); // add it in if ( ! addTermFreqsForTerm ( it->getTerm() , &dups ) ) return NULL; } // do the same for all words and bigram terms in doc as well m_iwfiBufValid = true; return &m_iwfiBuf; } bool XmlDoc::addTermFreqsForTerm ( char *term , HashTableX *dups ) { // we need this for synonyms //uint8_t langId = langEnglish; uint8_t *langIdPtr = getLangId(); // this should have been set by parent caller if ( ! langIdPtr || langIdPtr == (uint8_t *)-1 ) {char *xx=NULL;*xx=0;} // get the language this doc is in uint8_t docLangId = *langIdPtr; // if uknown, use english! if ( docLangId == langUnknown ) docLangId = langEnglish; //Msg99Reply *mr = mrp[i]; //Words ww; //ww.set3 ( it->m_termStr ); //ww.set3(it->getTerm() );//mr->m_queryStr );//it->m_termStr ); Query qq; // false = query expansion? i.e. use synonyms? 
//qq.set2 ( it->getTerm(),docLangId,true); qq.set2 ( term,docLangId,true); //if ( strstr ( mr->m_queryStr, "bio wagner")) // log("hey"); log("adding %s",term); //int64_t *wids = ww.getWordIds(); // scan each word for term freq for ( int32_t j = 0 ; j < qq.m_numTerms ; j++ ) { // int16_tcut QueryTerm *qt = &qq.m_qterms[j]; // get the full 64-bit hash of the word int64_t wid = qt->m_rawTermId; // skip if punct if ( ! wid ) continue; // dup? if ( dups->isInTable ( &wid ) ) continue; // add it int64_t tf = g_posdb.getTermFreq ( cr->m_coll, wid ); if ( ! dups->addKey ( &wid ) ) return NULL; WordFreqInfo wfi; wfi.m_wordId64 = wid; wfi.m_wordFreq64 = tf; // note it SafeBuf bb; bb.safePrintf("seo: tf for term=\""); bb.safeMemcpy ( qt->m_term, qt->m_termLen); bb.safePrintf("\" = %"INT64"",tf); log("seo: %s",bb.getBufStart()); // store it if(!m_iwfiBuf.safeMemcpy(&wfi,sizeof(WordFreqInfo))) return NULL; } return true; } */ // 2. now transmit all the insertable terms to each host in the network. each // host will evaluate each term in the list for every query that that // host has in its memory for every new word position. kick this process // off with the getNewRanks() function which returns a list of // query terms where each query term has a wordposition/trafficgain // array. [try to also insert entire phrases not just words] // Each host will return an InsertedTerm class for each term. But then // WE have to merge the InsertedTerm classes together for a particular // term. That can be a bit tricky since we do not list a wordposition // if it's traffic gain was the same as its previous wordposition. // PASS in the entire doc's termlist with each request in case not in cache // so it can evaluate each query's scores very quickly! // // . send a msg95 request to each host consisting of a list of terms to // insert, and the entire termlists of this document. // . then merge the replies into a final list of InsertedTerms. // . returned is buffer of InsertableTerms SafeBuf *XmlDoc::getScoredInsertableTerms ( ) { setStatus ( "getscoredinsertableterms" ); if ( m_scoredInsertableTermsBufValid ) return &m_insertableTermsBuf; uint8_t *langIdPtr = getLangId(); if ( ! langIdPtr || langIdPtr == (void *)-1 ) return (SafeBuf *)langIdPtr; SafeBuf *itBuf = getInsertableTerms(); if ( ! itBuf || itBuf == (void *)-1 ) return itBuf; // these are the posdb keys of our document, makes it fast // and easy for msg39 to return a serp score restricted to our docid SafeBuf *termListBuf = getTermListBuf(); if ( ! termListBuf || termListBuf==(void *)-1 ) return termListBuf; // this has all our documents terms and their synonyms in it, // as well as the new terms we plan to insert that our doc does not // have, from the getMissingTerms() buffer. in addition it // has the term freq of each one! SafeBuf *ntiBuf = getNewTermInfoBuf(); if ( ! ntiBuf || ntiBuf == (void *)-1 ) return (SafeBuf *)ntiBuf; // get list of TermFreqInfo instances for all words in the // lits of insertable terms //SafeBuf *wfib = getInsertableWordFreqInfoBuf ( ); //if ( ! wfib || wfib == (void *)-1 ) return wfib; SafeBuf *wpib = getWordPosInfoBuf(); if ( ! wpib || wpib == (void *)-1 ) return wpib; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // if still waiting for replies to come in, return -1 if ( m_numMsg95Requests > 0 && m_numMsg95Replies < m_numMsg95Requests ) return (SafeBuf *)-1; top: // otherwise, we are done! if ( m_numMsg95Requests > 0 && m_numMsg95Replies >=m_numMsg95Requests){ // . 
calculate the best insertable position for each // Insertable Term. // . we get a QueryChange array back from each host for // the same term, but for queries local on that host, // so add them all up here and set // InsertableTerm::m_bestTrafficGain/m_bestTermPosition // . queries that did not have us in the top 50 will not // be in the reply processMsg95Replies(); // show how long it took int64_t now = gettimeofdayInMilliseconds(); int64_t took = now - m_beginMsg95s; log("seopipe: time: getscoredinsertableterms took %"INT64" ms", took); // return the list of InsertableTerms, scored m_scoredInsertableTermsBufValid = true; // cache it! if it blocks that is ok, since it is valid n // disable for debug... MDW!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! if ( ! storeScoredInsertableTermsIntoCachedb() ) return (SafeBuf *)-1; return &m_insertableTermsBuf; } // now send every term in this list to every host in the // network so it can evaluate with each of the queries it contains // in memory from the query log for every position in the doc. // then it will return InsertableTerm::m_wordPositions/m_trafficGain // arrays for each InsertableTerm. // time how long this whole thing takes m_beginMsg95s = gettimeofdayInMilliseconds(); // reset this crap i guess m_numMsg95Requests = 0; m_numMsg95Replies = 0; // from seo.h Msg95Request mr; if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; } mr.m_docId = m_docId; mr.m_docLangId = *langIdPtr; mr.m_seoDebug = m_seoDebug; mr.ptr_posdbTermList = termListBuf->getBufStart(); // a buffer of TermInfos. used to set the termFreq of each term // and used to determine what queries match the doc and should be // evaluated for every insertion. mr.ptr_termInfoBuf = ntiBuf->getBufStart(); mr.ptr_coll = cr->m_coll; //mr.ptr_wordFreqInfoBuf = wfib->getBufStart(); mr.ptr_wordPosInfoBuf = wpib->getBufStart(); // why do we need this? doesn't termInfoBuf have all that? no, // because we limit insertableterms to like the top 300 highest // scoring, so they are separate. the termInfoBuf is sorted by // termid (lower 32-bits) and has a termfreq and is used to // get the matching queries in seo.cpp:handlerequest95() mr.ptr_insertableTerms = m_insertableTermsBuf.getBufStart(); mr.size_posdbTermList = termListBuf->length(); mr.size_termInfoBuf = ntiBuf->length();//m_numTwids * 4; mr.size_coll = gbstrlen(cr->m_coll)+1; //mr.size_wordFreqInfoBuf = wfib->length(); mr.size_wordPosInfoBuf = wpib->length(); mr.size_insertableTerms = m_insertableTermsBuf.length(); int32_t requestSize; char *req = serializeMsg ( sizeof(Msg95Request), &mr.size_posdbTermList ,// firstSizeParm &mr.size_insertableTerms,//lastSizeP &mr.ptr_posdbTermList ,// firststrptr &mr ,// thisPtr &requestSize , NULL , 0 , true ); if ( ! req ) return NULL; int32_t numHosts = g_hostdb.m_numHosts; // do not re-send if we already did this! if ( m_numMsg95Requests > 0 ) numHosts = 0; // send one msg95 request to each host. skip if dead. for ( int32_t i = 0; i < numHosts ; i++ ) { // get ptr to the host Host *host = g_hostdb.getHost(i); // get hostid of host #i int32_t hostId = host->m_hostId; // count it m_numMsg95Requests++; // skip if dead. i guess no queries from that guy. we can't // send to a twin because the twin does not have the same // queries in its in-memory query log. once we get more // machines we should probably make the twin have the same // copy so we can be redundant. if ( g_hostdb.isDead(hostId) && host->m_wasEverAlive ) { log("seo: warning. 
host %"INT32" is dead so we could " "not do the keyword tool right",hostId); m_numMsg95Replies++; continue; } // . send our posdb termlist to each host so it can // call msg39 restricted to our docid very quickly // . also send a ALL of the insertable terms to each // host so they can evaluate the insertion for all of the // relevant queries. // . each host should be smart enough to realize that some // queries need not be performed for an insertion because // it is impossible to break the minimum score to be in the // top 50 for that query. but we'll only have a minimum // score for each query once we run a batch to eval // each query at least partially to get a rough idea of // the score needed to be in the top 50. // . reply should be an array of QueryChanges for each // insertable term for every query that matches this doc // in the g_qlog buffer. // . in most cases these arrays will be empty because we are // not in the top 50 for that query if ( ! g_udpServer.sendRequest ( req , requestSize , 0x95 , // msgtype host->m_ip , // ip host->m_port , // port hostId, NULL, // retslot this, gotMsg95ReplyWrapper, 10000 , // timeout -1 , // backoff -1 , // maxwait NULL, // replybuf 0, // replybufmaxsize m_niceness // niceness )) { // let admin know about error log("seopipe: sendRequest 95 had error: %s", mstrerror(g_errno)); // count it as replied then m_numMsg95Replies++; continue; } } // wait for all msg95 replies to come in if ( m_numMsg95Requests > m_numMsg95Replies ) return (SafeBuf *)-1; // somehow we finished without blocking goto top; // dummy return return NULL; } // now sort the huge ptr buffer to QueryChanges first by: // 1: QueryChange::m_termHash64 // 2: QueryChange::m_queryHash32 // 3: QueryChange::m_insertPos int queryChangeCmp ( const void *a, const void *b ) { QueryChange *qa = *(QueryChange **)a; QueryChange *qb = *(QueryChange **)b; // smallest term hash should be at the head of the list if ( qa->m_termHash64 < qb->m_termHash64 ) return -1; if ( qa->m_termHash64 > qb->m_termHash64 ) return 1; if ( qa->m_queryHash32 < qb->m_queryHash32 ) return -1; if ( qa->m_queryHash32 > qb->m_queryHash32 ) return 1; if ( qa->m_insertPos < qb->m_insertPos ) return -1; if ( qa->m_insertPos > qb->m_insertPos ) return 1; return 0; } // . make each InsertableTerm point to a linked list of QueryChanges for it. // . each QueryChange is a word position and a rank change // . the linked list will be sorted by QueryChange::m_insertPos // . there can be multiple QueryChanges for a single m_insertPos, but // they will be fore different queries. bool XmlDoc::processMsg95Replies() { int32_t need = 0; // each reply is a list of QueryChanges for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) { // get reply Msg95Reply *mr = (Msg95Reply *)m_msg95ReplyPtrs[i]; // skip if empty, error? if ( ! mr ) continue; // deserialize the msg95replies first deserializeMsg ( sizeof(Msg95Reply) , (int32_t *)&mr->size_queryChangeBuf,//1stszparm (int32_t *)&mr->size_queryLogBuf,//lastszparm (char **)&mr->ptr_queryChangeBuf,//1ststrptr mr->m_buf ); // scan the QueryChanges //QueryChange *qcs = (QueryChange *)mr->ptr_queryChangeBuf; int32_t ncs = mr->size_queryChangeBuf/sizeof(QueryChange); need += ncs * 4; } // alloc now SafeBuf hugePtrBuf; if ( ! hugePtrBuf.reserve ( need ,"hpbuf" ) ) return false; // how big are all query log bufs? int32_t sumTotal = 0; for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) { // get reply Msg95Reply *mr = (Msg95Reply *)m_msg95ReplyPtrs[i]; // skip if empty, error? if ( ! 
mr ) continue; // how big sumTotal += mr->size_queryLogBuf; } m_queryLogBuf.reset(); if ( ! m_queryLogBuf.reserve ( sumTotal ,"qlogbuf") ) return false; char *orig = m_queryLogBuf.getBufStart(); int32_t ongoingOffset = 0; int32_t ongoingDebugOffset = 0; int32_t ongoingOrigOffset = 0; // . fill up higePtrBuf for sorting below // . also fill up m_queryLogBuf now for store*IntoCachedb() for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) { // get reply Msg95Reply *mr = (Msg95Reply *)m_msg95ReplyPtrs[i]; // skip if empty, error? if ( ! mr ) continue; // ref it //char *ref = m_queryLogBuf.getBuf(); //int32_t ref = m_queryLogBuf.length(); // add to our big buffer m_queryLogBuf.safeMemcpy ( mr->ptr_queryLogBuf , mr->size_queryLogBuf ); // debug scores. should be length 0 if not debugging. m_debugScoreInfoBuf.safeMemcpy ( mr->ptr_debugScoreInfoBuf , mr->size_debugScoreInfoBuf ); // original scores buf m_origScoreInfoBuf.safeMemcpy ( mr->ptr_origScoreInfoBuf , mr->size_origScoreInfoBuf ); // scan the QueryChanges QueryChange *qcs = (QueryChange *)mr->ptr_queryChangeBuf; int32_t ncs = mr->size_queryChangeBuf/sizeof(QueryChange); for ( int32_t j = 0 ; j < ncs ; j++ ) { QueryChange *qc = &qcs[j] ; // this is relative to ptr_queryLogBuf qc->m_replyQueryOffset += ongoingOffset; // if we have debug score info if ( m_seoDebug >= 2 ) { if ( qc->m_debugScoreInfoOffset < 0 ) { char *xx=NULL;*xx=0; } if ( qc->m_origScoreInfoOffset < 0 ) { char *xx=NULL;*xx=0; } qc->m_debugScoreInfoOffset += ongoingDebugOffset; qc->m_origScoreInfoOffset += ongoingOrigOffset; } // that's relative to the msg95reply's ptr_queruStrBuf //QueryLogEntry *qe; //qe = (QueryLogEntry *)(mr->ptr_queryLogBuf + qoff); //qe = (QueryLogEntry *)(ref + qoff); // HACK that in. RELATIVE to m_queryLogBuf!!! //qc->m_queryOffset3 = ref;//(int32_t)qe; // add ptr to our global buffer hugePtrBuf.pushPtr ( qc ); } // sum it up ongoingOffset += mr->size_queryLogBuf; ongoingDebugOffset += mr->size_debugScoreInfoBuf; ongoingOrigOffset += mr->size_origScoreInfoBuf; } // sanity. make sure doesn't grow since we reference it if ( m_queryLogBuf.getBufStart() != orig ) { char *xx=NULL;*xx=0; } // now sort the huge ptr buffer to QueryChanges first by: // 1: QueryChange::m_termHash64 // 2: QueryChange::m_queryHash32 // 3: QueryChange::m_insertPos char *hhh = hugePtrBuf.getBufStart(); int32_t size = hugePtrBuf.length(); // this should breath with niceness!! gbqsort ( hhh , size/4 , sizeof(QueryChange *), queryChangeCmp , m_niceness ) ; // now store those sorted query changes into m_queryChangeBuf // so we can cache them in store*IntoCached() easily int32_t nqc = (need / 4) ; if ( ! m_queryChangeBuf.reserve ( nqc * sizeof(QueryChange),"qcbuf") ) return false; // for sanity check char *orig2 = m_queryChangeBuf.getBufStart(); // copy over sorted into m_queryChangeBuf so we can cache it in cachedb char *p = hhh; char *pend = hhh + size; for ( ; p < pend ; p += sizeof(QueryChange *) ) { // cast it QueryChange *qc = *(QueryChange **)p; // save ptr to it char *ref = m_queryChangeBuf.getBuf(); // save it m_queryChangeBuf.safeMemcpy ( qc , sizeof(QueryChange) ); // now ref that instead *(QueryChange **)p = (QueryChange *)ref; } // sanity test if ( m_queryChangeBuf.getBufStart() != orig2 ) { char *xx=NULL;*xx=0;} // now we can free the replies since we stored the replies into // m_queryLogBuf and m_queryChangeBuf for store*IntoCachedb() for ( int32_t i = 0;i < g_hostdb.m_numHosts;i++) { if ( ! 
m_msg95ReplyPtrs[i] ) continue; mfree ( m_msg95ReplyPtrs[i] , m_msg95ReplySizes[i] , "95rep" ); m_msg95ReplyPtrs[i] = NULL; } // . now set QueryChange::m_next to make our linked list // . if it is for a different query or termhash then end the linked // list by setting m_next to NULL QueryChange *lastqc = NULL; for ( p = hhh ; p < pend ; p += 4 ) { // cast it QueryChange *qc = *(QueryChange **)p; // assume we are the last one in the linked list qc->m_next = NULL; // make linked list if ( lastqc && // terms must match to be in same linked list lastqc->m_termHash64 == qc->m_termHash64 ) // link them lastqc->m_next = qc; // set this for next qc lastqc = qc; } // now set InsertableTerm::m_firstQueryChange to point to the head // of the linked list for that term based on it's m_termHash64. // but the insertable terms are sorted by m_trafficSum. // map a termHash64 to its corresponding first QueryChange. HashTableX tit; if ( ! m_insertableTermsBufValid ) { char *xx=NULL;*xx=0; } int32_t ni = m_insertableTermsBuf.length() / sizeof(InsertableTerm); if ( ! tit.set ( 8,4, ni*4,NULL,0,false,m_niceness,"tittbl") ) return false; int64_t lastHash64 = 0LL; // . store ptr to first querychange for each termhash64 into hash table // . should be the head of the linked list for a termid for ( p = hhh ; p < pend ; p += 4 ) { // cast it QueryChange *qc = *(QueryChange **)p; // skip if not a new term hash if ( qc->m_termHash64 == lastHash64 ) continue; // update it lastHash64 = qc->m_termHash64; // . map it in the hash table then // . it should be pre-allocated! if (!tit.addKey(&qc->m_termHash64,&qc)){char *xx=NULL;*xx=0;} } // now scan the insertable terms and set their // InsertableTerm::m_firstQueryChange ptr. points to the head // of the QueryChange linked list for this insertable term SafeBuf *itBuf = getInsertableTerms(); p = itBuf->getBufStart(); pend = itBuf->getBuf(); for ( ; p < pend ; ) { InsertableTerm *it = (InsertableTerm *)p; p += it->getSize(); // assume none it->m_firstQueryChange = NULL; char *val = (char *)tit.getValue(&it->m_termHash64); // i guess there is none if ( ! val ) continue; // cast it QueryChange *qc = *(QueryChange **)val; // and assign it->m_firstQueryChange = qc; } SafeBuf *wpib = getWordPosInfoBuf(); if ( ! wpib || wpib == (void *)-1 ) { char *xx=NULL;*xx=0; } WordPosInfo *wpis = (WordPosInfo *)wpib->getBufStart(); int32_t nwpis = wpib->length() / sizeof(WordPosInfo); // now set InsertableTerm::m_bestTrafficGain/m_bestInsertPos/ // m_bestQueryChange by scanning the linked list and scoring each // QueryChange::m_insertPos to see which is the highest traffic gain. // and in the case of ties prefer the lowest word position. p = itBuf->getBufStart(); pend = itBuf->getBuf(); for ( ; p < pend ; ) { InsertableTerm *it = (InsertableTerm *)p; p += it->getSize(); // . use this function now so seo.cpp can call it too! // . sets WordPosInfo::m_trafficGain members setWordPosInfosTrafficGain ( it ); // now find the insert position with the most traffic gain! int32_t bestTrafficGain = -1; int32_t bestInsertPos = -1; for ( int32_t j = 0 ; j < nwpis ; j++ ) { // skip if not the best scoring position if ( wpis[j].m_trafficGain <= bestTrafficGain && // and if not first time! 
		     bestInsertPos != -1 )
			continue;
		// we got a new winner
		bestTrafficGain = wpis[j].m_trafficGain;
		bestInsertPos   = wpis[j].m_wordPos;//insertPos;
	}
	// set it
	it->m_bestTrafficGain = bestTrafficGain;
	it->m_bestInsertPos   = bestInsertPos;
	}
	return true;
}

void XmlDoc::setWordPosInfosTrafficGain ( InsertableTerm *it ) {

	// get the wordposinfobuf!
	SafeBuf *wpib = getWordPosInfoBuf();
	if ( ! wpib || wpib == (void *)-1 ) { char *xx=NULL;*xx=0; }
	WordPosInfo *wpis = (WordPosInfo *)wpib->getBufStart();
	int32_t nwpis = wpib->length() / sizeof(WordPosInfo);

	// . use the wordposinfo array to accumulate traffic gains
	//   for each word position, WordPosInfo::m_insertPos.
	// . TODO: ignore tags like gblangid:
	// . so reset the traffic gains first
	for ( int32_t j = 0 ; j < nwpis ; j++ )
		wpis[j].m_trafficGain = 0;

	if ( ! it ) return;

	// head of the linked list of QueryChanges for this InsertableTerm
	QueryChange *qc = it->m_firstQueryChange;
	// skip if no list. leave traffic gains set to 0 for all
	if ( ! qc ) return;

	// accumulate traffic gains
	int32_t k = 0;
	int32_t lastQueryHash32 = 0;
	//bool firstQueryChangeForQuery;
	QueryChange *lastqc = NULL;

	// . scan the linked list of query changes
	// . this is sorted by query first then m_insertPos
	for ( ; qc ; qc = qc->m_next ) {
		// assume NOT the first QueryChange for this query
		//firstQueryChangeForQuery = false;
		// . reset stuff for each different query
		// . QueryChanges are sorted by m_queryHash32 secondly
		//   and by m_insertPos thirdly now...
		if ( qc->m_queryHash32 != lastQueryHash32 ) {
			// reset our WordPosInfo cursor
			k = 0;
			// for detecting the next set of QueryChanges
			// for a different query
			lastQueryHash32 = qc->m_queryHash32;
			//firstQueryChangeForQuery = true;
			lastqc = NULL;
		}
		// sanity
		if ( lastqc && lastqc->m_insertPos > qc->m_insertPos ) {
			char *xx=NULL;*xx=0; }
		// compute the traffic in advance from the rank changes
		int32_t trafficGain = getTrafficGain( qc );
		// checkpoint
		/*
		if ( trafficGain > 0 )
			log("got some traffic gain qh=%"UINT32" "
			    "pos=%"INT32" term=%s gain=%"INT32"",
			    qc->m_queryHash32,
			    qc->m_insertPos,
			    it->m_termStr,
			    trafficGain);
		*/
		// get next query change
		QueryChange *nqc = qc->m_next;
		// make it NULL if for a different query
		if ( nqc && nqc->m_queryHash32 != qc->m_queryHash32 )
			nqc = NULL;
		// . we use a compression where we only store a
		//   QueryChange if different than the last QueryChange
		// . so advance the WordPosInfos cursor "k" until
		//   we catch up to the qc->m_insertPos.
		for ( ; k < nwpis ; k++ ) {
			// stop if we are caught up
			if ( wpis[k].m_wordPos >= qc->m_insertPos ) break;
		}
		// now this position and up to next qc "nqc" gets the traffic
		for ( ; k < nwpis ; k++ ) {
			// stop if we are caught up
			if ( nqc && wpis[k].m_wordPos >= nqc->m_insertPos )
				break;
			wpis[k].m_trafficGain += trafficGain;
		}
	}

	/*
	// print out positives - debug
	for ( int32_t k = 0 ; k < nwpis ; k++ ) {
		// stop if we are caught up
		if ( ! wpis[k].m_trafficGain ) continue;
		if ( wpis[k].m_trafficGain <= 0 ) continue;
		// note it
		log("seo: gain pos=%"INT32" gain=%"INT32"",
		    wpis[k].m_wordPos,
		    wpis[k].m_trafficGain);
	}
	*/
}

double getTrafficPercent ( int32_t rank ) {
	// from aol's query logs from that same searchenginewatch.com url
	static double s_posClicks[1000] = {
		.4230, // #1
		.1192,
		.0844,
		.0603,
		.0486,
		.0399,
		.0337,
		.0298,
		.0283,
		.0270  // #10 (was .297 but for our purposes, make it <)
	};
	//static float s_pageClicks[5]; // set total of clicks each page gets
	static bool s_init = false;
	if ( !
s_init ) { s_init = true; //float sum = 0.0; //for ( int32_t i = 0 ; i < 10 ; i++ ) // sum += s_posClicks[i]; // this is about .11 or so //float pageFactor = 1.0 - sum; // HACK! make it pass the sanity check below! //pageFactor *= .50; // sanity. do not allow top result on 2nd page // to rank higher!! //if ( pageFactor * s_posClicks[0] > s_posClicks[9] ) { // char *xx=NULL;*xx=0; } // will be like .11 for second page, .01 for 3rd, etc. //float pageMult = 1.0; // fill in the rest for ( int32_t i = 10 ; i < 1000 ; i++ ) { // just make it linear since there is too much // chaos as to our diffs with google. so this is // a good estimation way... s_posClicks[i] = .0270 - .0007 * i; if ( s_posClicks[i] < 0 ) s_posClicks[i] = 0.0; } // sanity to make sure all in order for ( int32_t i = 1 ; i < 1000 ; i++ ) { if ( s_posClicks[i-1] < s_posClicks[i] ) { char *xx=NULL;*xx=0; } if ( s_posClicks[i] < 0 ) { char *xx=NULL;*xx=0; } } } if ( rank >= 1000 ) rank = 999; if ( rank < 0 ) { char *xx=NULL;*xx=0; } return s_posClicks[rank]; } // . based on difference between m_oldRank and m_newRank // . m_*Rank starts at 0 and goes to 9 for first page of results int32_t XmlDoc::getTrafficGain ( QueryChange *qc ) { // no rank change? this can both be -1 if it is a missing // term i guess... and we're not inserting it. if ( qc->m_oldRank == qc->m_newRank ) return 0; // get old clicks int32_t oldRank = qc->m_oldRank; double oldp; // if not ranked before because this was inserting a brand new // missing term, this will be -1 if ( oldRank == -1 ) oldp = 0.0; else oldp = getTrafficPercent ( oldRank ); //if ( oldRank < 50 ) oldp = s_posClicks[oldRank]; // get new clicks int32_t newRank = qc->m_newRank; float newp = getTrafficPercent ( newRank ); //if ( newRank < 50 ) newp = s_posClicks[newRank]; // HACK // we stored the entire querylogreply buf in here char *ref = m_queryLogBuf.getBufStart(); // so we can use the replyqueryoffset then... QueryLogEntry *qe = (QueryLogEntry *)(ref + qc->m_replyQueryOffset); int32_t traffic = qe->m_gigablastTraffic; traffic *= GB_TRAFFIC_MODIFIER; int32_t trafficChange = (int32_t)((newp - oldp) * traffic); // sanity. if ( qc->m_oldRank > qc->m_newRank && trafficChange < 0 ) { char *xx=NULL;*xx=0; } // ignore this sanity check if not ranked before. i.e. inserting // a new missing term... if ( qc->m_oldRank != -1 && qc->m_oldRank < qc->m_newRank && trafficChange > 0 ) { char *xx=NULL;*xx=0; } // return the change. it might be negative! return trafficChange; } // 4. then we just dump out all the InsertedTerms into xml so they can be // displayed on the front end. // dump the list of InsertedTerms into "sbuf" as xml bool XmlDoc::printScoredInsertableTerms ( SafeBuf *sbuf ) { // print the header sbuf->safePrintf("\t\n"); // scan each term SafeBuf *itBuf = getInsertableTerms(); // has to be there if ( ! itBuf || itBuf == (void *)-1 ) { char *xx=NULL;*xx=0; } SafeBuf *wpib = getWordPosInfoBuf(); if ( ! wpib || wpib == (void *)-1 ) { char *xx=NULL;*xx=0; } WordPosInfo *wpis = (WordPosInfo *)wpib->getBufStart(); int32_t nwpis = wpib->length() / sizeof(WordPosInfo); // cast it //InsertableTerm *its = (InsertableTerm *)itBuf->getBufStart(); // how many terms do we have? //int32_t ni = m_insertableTermsBuf.length() / sizeof(InsertableTerm); // dedup queries used in query changes HashTableX qdups; if ( ! qdups.set(4,0,32,NULL,0,false,m_niceness,"qddd") ) return false; // // . print query map // . 
print all query ids we use and their strings // bool firstTime = true; char *p = itBuf->getBufStart(); char *pend = itBuf->getBuf(); for ( ; p < pend ; ) { QUICKPOLL(m_niceness); // cast it InsertableTerm *it = (InsertableTerm *)p; p += it->getSize(); // scan its query changes QueryChange *qc = it->m_firstQueryChange; for ( ; qc ; qc = qc->m_next ) { // skip if already printed if ( qdups.isInTable(&qc->m_queryHash32) ) continue; if ( firstTime ) { sbuf->safePrintf("\t\t\n"); sbuf->safePrintf("\t\t\t" "\n" ); } firstTime = false; // HACK char *ref = m_queryLogBuf.getBufStart(); QueryLogEntry *qe; qe = (QueryLogEntry *)(ref + qc->m_replyQueryOffset); // new query, print it. map the hash to the string // so we can just show the hash when printing // out all the QueryChanges below to save space sbuf->safePrintf("\t\t\t" "" "\n" , qc->m_queryHash32 // hack... , qe->getQueryStr() ); // do not re-print if ( ! qdups.addKey(&qc->m_queryHash32) )return false; } } if ( ! firstTime ) sbuf->safePrintf("\t\t\n"); // . now the word position map // . we only provided querychange if it has a different score than // the previously stored querychange. this is a kind of compression // . so you need to know all the possible word positions we tried // for each insertion we did sbuf->safePrintf("\t\t\n"); sbuf->safePrintf("\t\t\t" "\n" ); for ( int32_t i = 0 ; i < nwpis ; i++ ) { WordPosInfo *wpi = &wpis[i]; sbuf->safePrintf("\t\t\t\n" "\t\t\t\t%"INT32"\n" "\t\t\t\t%"INT32"\n" "\t\t\t\t%s\n" "\t\t\t\t%"INT32"\n" "\t\t\t\t%"INT32"\n" "\t\t\t\n" ,wpi->m_wordPos ,wpi->m_sentNum ,getHashGroupString(wpi->m_hashGroup) ,(int32_t)wpi->m_densityRank ,(int32_t)wpi->m_wordSpamRank ); } sbuf->safePrintf("\t\t\n"); // scan all the insertable terms p = itBuf->getBufStart(); pend = itBuf->getBuf(); for ( ; p < pend ; ) { QUICKPOLL(m_niceness); // cast it InsertableTerm *it = (InsertableTerm *)p; p += it->getSize(); // print the term sbuf->safePrintf("\t\t\n"); // the string sbuf->safePrintf("\t\t\t\n", it->getTerm()); // sum of traffic of all queries containing this term sbuf->safePrintf("\t\t\t%"INT32"\n", it->m_trafficSum); // is it contained in the doc/linktext or is it "related" sbuf->safePrintf("\t\t\t%"INT32"\n", (int32_t)it->m_isRelatedTerm); // get the first query change if any QueryChange *qc = it->m_firstQueryChange; // limit to fix firefox crash //int32_t queryChangeLimit = 30; // skip if no list if ( ! qc ) goto skip; // print the insert position that gives us the most traffic sbuf->safePrintf("\t\t\t%"INT32"" "\n", it->m_bestInsertPos); sbuf->safePrintf("\t\t\t%"INT32"" "\n", it->m_bestTrafficGain); // print query changes if ( it->m_firstQueryChange ) sbuf->safePrintf("\t\t\tm_firstQueryChange ; qc ; qc = qc->m_next ) { // fix firefox crash for now //if ( --queryChangeLimit <= 0 ) break; // now store in binary sbuf->pushLong(qc->m_insertPos); sbuf->pushLong(qc->m_queryHash32); sbuf->pushChar(qc->m_oldRank); sbuf->pushChar(qc->m_newRank); /* // . TODO: make sure to remove QueryChanges that have // the same old and new rank // . 
print it sbuf->safePrintf("\t\t\t\n"); sbuf->safePrintf("\t\t\t\t%"INT32"" "\n", qc->m_insertPos); sbuf->safePrintf("\t\t\t\t%"INT32"" "\n",(int32_t)qc->m_oldRank); sbuf->safePrintf("\t\t\t\t%"INT32"" "\n",(int32_t)qc->m_newRank); sbuf->safePrintf("\t\t\t\t%"INT32"" "\n", qc->m_queryHash32 ); sbuf->safePrintf("\t\t\t\n"); */ } if ( it->m_firstQueryChange ) sbuf->safePrintf("]]>\n"); skip: // print the term end sbuf->safePrintf("\t\t\n"); } sbuf->safePrintf("\t\n"); return true; } /* static int wordPosInfoCmp ( const void *a, const void *b ) { WordPosInfo *wa = (WordPosInfo *)a; WordPosInfo *wb = (WordPosInfo *)b; // smallest word position should be at the head of the list if ( wa->m_wordPos < wb->m_wordPos ) return -1; if ( wa->m_wordPos > wb->m_wordPos ) return 1; return 0; } */ static int wpPosdbKeyCmp ( const void *a, const void *b ) { int32_t wpa = g_posdb.getWordPos((char *)a); int32_t wpb = g_posdb.getWordPos((char *)b); return wpa - wpb; } SafeBuf *XmlDoc::getWordPosSortedPosdbListBuf ( ) { if ( m_wpSortedPosdbListBufValid ) return &m_wpSortedPosdbListBuf; // get the lists. forDelete = false. char *metaList = getMetaList ( false ); if ( ! metaList || metaList==(void *)-1 ) return (SafeBuf *)metaList; // sanity if ( ! m_docIdValid ) { char *xx=NULL;*xx=0; } // make a tmp buf to hold posdb keys //SafeBuf tmp; if ( ! m_wpSortedPosdbListBuf.reserve ( m_metaListSize,"wpsbuf" ) ) return NULL; // point into it char *dst = m_wpSortedPosdbListBuf.getBufStart(); // scan the meta list for posdb keys char *p = metaList; char *pend = p + m_metaListSize; // stole this loop from getMetaList() for ( ; p < pend ; ) { // breathe QUICKPOLL(m_niceness); // save it with the flag char byte = *p; // get rdbId char rdbId = byte & 0x7f; // skip that p++; // key size int32_t ks = getKeySizeFromRdbId(rdbId); // get key char *key = p; // skip that p += ks; // get datasize int32_t ds = getDataSizeFromRdbId(rdbId); // assume we do not store the datasize //bool neg = false; // . if key is negative, no data is present // . the doledb key is negative for us here if ( (key[0] & 0x01) == 0x00 ) ds = 0; // if datasize variable, read it in if ( ds == -1 ) { // get data size ds = *(int32_t *)p; // skip data size int32_t p += 4; } // point to data //char *data = p; // skip data if not zero p += ds; // if not posdb skip rec if ( rdbId != RDB_POSDB ) continue; // skip negative keys if ( (key[0] & 0x01) == 0x00 ) continue; // add to new buf now gbmemcpy ( dst , key , sizeof(POSDBKEY) ); // advance dst += sizeof(POSDBKEY); } char *start = m_wpSortedPosdbListBuf.getBufStart(); // update tmp m_wpSortedPosdbListBuf.incrementLength ( dst - start ); // sanity if ( m_wpSortedPosdbListBuf.length() > m_metaListSize ) { char *xx=NULL;*xx=0; } // point char *pbuf = m_wpSortedPosdbListBuf.getBufStart(); int32_t numKeys = m_wpSortedPosdbListBuf.length()/sizeof(POSDBKEY); // sort keys by word position gbqsort ( pbuf , numKeys, sizeof(POSDBKEY), wpPosdbKeyCmp , m_niceness ); m_wpSortedPosdbListBufValid = true; return &m_wpSortedPosdbListBuf; } // now pass this into Msg95Request so we only try to insert right before // or after m_wordPos values in this WordPosInfo vector. SafeBuf *XmlDoc::getWordPosInfoBuf ( ) { // if it is valid and we have not yet added to cachedb... if ( m_wordPosInfoBufValid && ! m_triedToAddWordPosInfoToCachedb ) { // only do this once m_triedToAddWordPosInfoToCachedb = true; // store the m_wordPosInfoBuf into cachedb if ( m_doingSEO && ! 
	     storeWordPosInfoBufIntoCachedb ( ) )
			return (SafeBuf *)-1;
	}

	if ( m_wordPosInfoBufValid ) return &m_wordPosInfoBuf;

	// it should be valid now from our logic in hashWords3() if
	// m_doingSEO is set to true
	char *xx=NULL; *xx=0;

	// these are FULL 18-byte keys, no compression, sorted by word pos
	SafeBuf *posdbBuf = getWordPosSortedPosdbListBuf ();
	if ( ! posdbBuf || posdbBuf == (void *)-1 ) return posdbBuf;

	// scan posdb keys
	int32_t numKeys = posdbBuf->length() / sizeof(POSDBKEY);

	// . reserve mem for new buf
	int32_t need = numKeys * sizeof(WordPosInfo);
	if ( ! m_wordPosInfoBuf.reserve ( need ,"wpibuf" ) )
		return NULL;

	int32_t sentNum     = 0;
	int32_t lastWordPos = -1;
	//int32_t lastwp = -1;
	int32_t lastSentNum = -1;

	// scan all the sorted posdb keys and build a WordPosInfo for each
	// unique word position, storing them into m_wordPosInfoBuf
	char *p    = posdbBuf->getBufStart();
	char *pend = p + posdbBuf->length();
	for ( ; p < pend ; ) {
		// breathe
		QUICKPOLL(m_niceness);
		// get the key
		char *key = p;
		// sanity
		if ( g_posdb.getKeySize(p) != 18 ) { char *xx=NULL;*xx=0; }
		// skip del keys
		if ( (p[0] & 0x01) == 0x00 ) { char *xx=NULL;*xx=0; }
		// skip it
		p += sizeof(POSDBKEY);
		// get key termid
		//int64_t termId = g_posdb.getTermId ( key );
		// sanity
		//int64_t docId = g_posdb.getDocId ( key );
		//if ( docId != m_docId ) { char *xx=NULL;*xx=0; }
		// log it for debug
		//if ( docId == 192304365235LL )
		//	log("tlist: docid=%"INT64" termId=%"INT64" wpos=%"INT32"",
		//	    docId,
		//	    termId,
		//	    g_posdb.getWordPos(key));
		WordPosInfo wpi;
		int32_t wp = g_posdb.getWordPos(key);
		// set "m_sentNum"
		if ( wp >= lastWordPos + 50 ) sentNum++;
		wpi.m_wordPos      = wp;
		wpi.m_sentNum      = sentNum;
		wpi.m_hashGroup    = g_posdb.getHashGroup    (key);
		wpi.m_densityRank  = g_posdb.getDensityRank  (key);
		wpi.m_wordSpamRank = g_posdb.getWordSpamRank (key);
		wpi.m_trafficGain  = 0;
		// log it
		/*
		log("seopipe: term=%"INT64" pos=%"INT32" sent=%"INT32" hg=%s dr=%"INT32"",
		    g_posdb.getTermId(key),
		    (int32_t)wp,
		    sentNum,
		    getHashGroupString(wpi.m_hashGroup),
		    (int32_t)wpi.m_densityRank);
		*/
		// bigrams share the same word position as the single term.
		// so ignore them. we only want unique insertion positions.
		if ( wp == lastWordPos ) continue;
		// . i thought sorted by word position??
		// . word position 0 is used by generic terms, like tags
		if ( wp < lastWordPos ) { char *xx=NULL;*xx=0; }
		// additional position at the end of a sentence?
		//if ( lastwp != wp && lastSentNum == sentNum )
		//	// store it
		//	m_wordPosInfoBuf.safeMemcpy(&wpi,sizeof(WordPosInfo ));
		// to right as well! so it can be in same sentence, if this
		// word is at the end of the sentence.
		//wpi.m_wordPos = wp;// + 2;
		// add it
		m_wordPosInfoBuf.safeMemcpy(&wpi,sizeof(WordPosInfo ));
		int32_t nextSent = -1;
		if ( p < pend ) {
			// assume same as current sentence
			nextSent = sentNum;
			// get word position of next term
			int32_t nextwp = g_posdb.getWordPos(p);
			// same as us? then it is a bigram, so try the
			// word after that!
			if ( nextwp == wp && p+18 < pend )
				nextwp = g_posdb.getWordPos(p+18);
			if ( nextwp >= wp + SENT_UNITS ) nextSent = sentNum+1;
		}
		// HACK. if next word starts a new sentence, add a WordPosInfo
		// here so we can insert term at end of THIS sentence.
		// otherwise we are inserted BEFORE the term whose position
		// we use.
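		// (an illustrative example: if a sentence ends "... quick
		//  brown fox" at word positions 10/12/14, the loop emits
		//  WordPosInfos at 10, 12 and 14, each meaning "try an
		//  insertion before this word"; the branch below then adds
		//  one more at 16 = 14+2 so the keyword tool can also try
		//  an insertion after "fox", i.e. at the end of that
		//  sentence.)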
if ( nextSent != sentNum ) { wpi.m_wordPos += 2; m_wordPosInfoBuf.safeMemcpy(&wpi,sizeof(WordPosInfo )); } // set these lastWordPos = wp; //lastwp = wp;// + 2; lastSentNum = sentNum; } /* // point to raw buf char *raw = m_wordPosInfoBuf.getBufStart(); int32_t size = m_wordPosInfoBuf.length(); // this shit is sorted by termid then pos, so sort just by pos // this should breath with niceness!! gbqsort ( raw , size / sizeof(WordPosInfo), sizeof(WordPosInfo) , wordPosInfoCmp , m_niceness ) ; */ m_wordPosInfoBufValid = true; return &m_wordPosInfoBuf; } // . i made this easy to serialize by using offsets and not ptrs // . so we can add to cachedb easily // . and so its immune to reallocs() on m_linkSourceBuf SafeBuf class LinkSource { public: int32_t m_linkSiteRank; // the actual url of the link, references into m_buf int32_t m_linkUrlOffset; // the title of the link, references into m_buf int32_t m_linkTitleOffset; // . we store the offsets of the RelatedDocIds in m_relatedDocIdBuf // . these are the related docids that are linked to by this link src int32_t m_offsetOfRelatedDocIdOffsets; int32_t m_numRelatedDocIds; char m_buf[0]; char *getLinkUrl ( SafeBuf *linkSourceBuf ) { char *buf = linkSourceBuf->getBufStart(); buf += m_linkUrlOffset; return buf; }; char *getLinkTitle ( SafeBuf *linkSourceBuf ) { char *buf = linkSourceBuf->getBufStart(); buf += m_linkTitleOffset; return buf; }; // crap, do we store RelatedDocIds into cachedb? we should // make it use offsets and not ptrs too... int32_t *getRelatedDocIdOffsets ( SafeBuf *linkSourceBuf ) { // how can this be? //if ( m_numRelatedDocIds == 0 ) return NULL; char *buf = linkSourceBuf->getBufStart(); buf += m_offsetOfRelatedDocIdOffsets; return (int32_t *)buf; }; }; /* static void gotLinkInfoReplyWrapper ( void *state ) { //XmlDoc *newxd = (XmlDoc *)state; Msg25 *msg25 = (Msg25 *)state; XmlDoc *xd = msg25->m_xd; // count it as returned xd->m_numLinkRequestsIn++; // this will nuke the msg25 as well after copying its linkinfo xd->processLinkInfoMsg20Reply ( msg25 ); // try to send out more requests or intersect them if done xd->m_masterLoop ( xd->m_masterState ); } // . before we were just looking at the LinkInfo the msg25 makes from // all the Msg20Replies it gets, but let's keep the msg20 replies // intact because they have the titles we need! // . return false on error, true otherwise bool XmlDoc::processLinkInfoMsg20Reply ( Msg25 *msg25 ) { // int16_tcut //LinkInfo *info = msg25->getLinkInfo (); // store into our buffer //bool status ; // i guess info can be NULL on error //if ( info ) // status = m_linkInfoReplyBuf.safeMemcpy (info, info->getSize()); // give front-end the progress bar info if ( m_seoSocket && m_progressBar ) { // tmp buf char tmp[16]; float percent = (float)m_rdCursor; SafeBuf *rdbuf = getRelatedDocIdsWithTitles(); int32_t numRelated = rdbuf->length() / sizeof(RelatedDocId); percent /= (float)numRelated; // 80% of the pipeline was doing the full queries percent *= .20; percent += .80; percent *= 100.0; int32_t percentLong = (int32_t)percent; if ( percentLong >= 100 ) percentLong = 99; int32_t tmpLen = sprintf(tmp,"%02"INT32"%%",percentLong); if ( tmpLen !=3)log("seo: bad progress bar output %"INT32"",tmpLen); // try a send on non-blocking socket int32_t n = ::send ( m_seoSocket->m_sd , tmp,tmpLen , 0 ); if ( n != tmpLen ) log("seo: bad progress bar send %"INT32"",n); // forget error errno = 0; } // store this int32_t nr = msg25->m_numReplyPtrs; // reserve space if ( ! 
m_msg20ReplyPtrBuf.reserve ( 8 + nr * 4 * 2 ) ) { m_hadLinkInfoError = g_errno; nr = 0; } // first store related docid ptr into m_relatedDocIdBuf safebuf RelatedDocId *rd = (RelatedDocId *)msg25->m_hackrd; m_msg20ReplyPtrBuf.pushLong((int32_t)rd); // then store the # of msg20 replies m_msg20ReplyPtrBuf.pushLong(nr); // . scan each msg20reply it got, each msg20reply is an inlink // for this docid // . seems like they are only freed in Msg25::reset() for ( int32_t i = 0 ; i < nr ; i++ ) { // get one Msg20Reply *r = msg25->m_replyPtrs[i]; int32_t size = msg25->m_replySizes[i]; // steal it, we will free them ourselves below m_msg20ReplyPtrBuf.pushLong((int32_t)r); // we need this since we need to free it when done m_msg20ReplyPtrBuf.pushLong(size); } // . do not allow Msg25 to free it, we will free it below // . on OOM error above we set nr to 0 on error, so allow msg25 // to free the replies in that case if ( nr ) msg25->m_numReplyPtrs = 0; // nuke it mdelete ( msg25 , sizeof(Msg25), "m25li" ); delete ( msg25 ); return true; } */ static int riCmp ( const void *a, const void *b ) { RecommendedLink *wa = *(RecommendedLink **)a; RecommendedLink *wb = *(RecommendedLink **)b; int32_t diff = wb->m_votes - wa->m_votes; if ( diff ) return diff; if ( wb->m_totalRecommendedScore > wa->m_totalRecommendedScore ) return 1; if ( wb->m_totalRecommendedScore < wa->m_totalRecommendedScore ) return -1; // docid to break all ties if ( wb->m_rl_docId > wa->m_rl_docId ) return 1; if ( wb->m_rl_docId < wa->m_rl_docId ) return -1; return 0; } static void gotLinkdbListWrapper ( void *state ) { Msg0 *msg0 = (Msg0 *)state; XmlDoc *xd = msg0->m_hackxd; // free it's memory here lest we have a leak //msg0->reset(); xd->m_numLinkRequestsIn++; xd->m_masterLoop ( xd->m_masterState ); } #define MAX_RECOMMENDED_LINKS 300 // . returns safebuf of RecommendedLinks // . use RecommendedLink::getSize() to skip over element in array/safebuf // . these are the recommended link sources // . these are the links that your relateddocids (i.e. competing pages) have // in common the most // . TODO: store the returned safebuf in cachedb as well! SafeBuf *XmlDoc::getRecommendedLinksBuf ( ) { // try to set from cachedb record if ( ! checkCachedb() ) return (SafeBuf *)-1; if ( m_recommendedLinksBufValid ) return &m_recommendedLinksBuf; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // what docids share our matching queries? SafeBuf *rdbuf = getRelatedDocIdsWithTitles(); if ( ! rdbuf || rdbuf == (void *)-1 ) return rdbuf; // cast then RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart(); // how many related docids do we have? int32_t numRelatedDocIds = rdbuf->length() / sizeof(RelatedDocId); if ( m_numLinkRequestsOut == 0 ) { // reset these on first call m_rdCursor = 0; m_numLinkRequestsIn = 0; m_hadLinkInfoError = 0; m_numMsg20sIn = 0; m_numMsg20sOut = 0; m_numValidMsg20s = 0; m_titleCursor = 0; m_msg20Phase = 0; m_recommendedLinkError = 0; } if ( ! m_relatedTitleBufValid ) { char *xx=NULL;*xx=0; } // if we are looking up the title/url of each docid in // the m_recommendedLinksBuf now, go back there if ( m_msg20Phase ) return lookupTitles(); for ( ; m_rdCursor < numRelatedDocIds ; m_rdCursor++ ) { // wait if too many out. only allow 2 out. otherwise each // one can send out like 500 msg20s if ( m_numLinkRequestsOut - m_numLinkRequestsIn > 60 ) // wait for 1 to come back return (SafeBuf *)-1; // skip the rest on error if ( m_hadLinkInfoError ) continue; // cast it RelatedDocId *rd = &rds[m_rdCursor]; // bogus? 
a not found, EDOCBANNED/EDOCFILTERED or it // linked to our domain if ( rd->rd_url_off < 0 ) continue; // bogus? if ( ! rd->getUrl( &m_relatedTitleBuf ) ) { log("seo: skipping null url"); continue; } if ( ! rd->getSite( &m_relatedTitleBuf ) ) { log("seo: skipping null site"); continue; } // allocate msg0 array into m_tmpMsg0Buf safebuf if ( ! m_tmpMsg0Buf.length() ) { // fill tmpmsg0 buf int32_t need = sizeof(Msg0) * numRelatedDocIds; if ( ! m_tmpMsg0Buf.reserve ( need , "tmp20s" ) ) return NULL; // do not re-call! m_tmpMsg0Buf.setLength(need); char *p = m_tmpMsg0Buf.getBufStart(); char *pend = p + need; for ( ; p < pend ; p += sizeof(Msg0) ) { Msg0 *msg0 = (Msg0 *)p; msg0->constructor(); } } // debug it if ( m_seoDebug >= 2 ) log("seo: getting inlinks to related docid=%"INT64" " "weight=%f " "url=%s", rd->m_docId, rd->m_relatedWeight, rd->getUrl(&m_relatedTitleBuf)); // just get his linkdb list! Msg0 *array = (Msg0 *)m_tmpMsg0Buf.getBufStart(); Msg0 *msg0 = &array[m_rdCursor]; key224_t startKey; key224_t endKey; char *rdurl = rd->getUrl(&m_relatedTitleBuf); // by default, just hash of hostname, unless overridden // with "site" tag in tagdb, or has a path like /~mwells int32_t siteHash32 = rd->m_rd_siteHash32; int64_t linkHash64 = hash64n(rdurl); startKey = g_linkdb.makeStartKey_uk (siteHash32,linkHash64 ); endKey = g_linkdb.makeEndKey_uk (siteHash32,linkHash64 ); // hack that thing msg0->m_hackxd = this; // consider it outstanding m_numLinkRequestsOut++; // int16_tcut, piggyback on the msg0 RdbList *list = &msg0->m_handyList; //RdbList list2; if ( ! msg0->getList ( -1 , // hostId, -1 if none 0 , // hostId ip 0 , // hostId port 0 , // max cache age -secs false , // addToCache? RDB_LINKDB , cr->m_collnum , list , // linkdb list to fill (char*)&startKey, (char*)&endKey , 1000000 , // 1MB minrecsizes msg0 , gotLinkdbListWrapper , m_niceness , true , // error correct? true , // includeTree true , // do merge -1,//hostId 0 , // startFileNum -1 , // numFiles 60*60*24*365 )){//timeout of one year // blocked? keep chugging continue; } // . maybe it was cached or something, or we had an error! // . this will nuke the msg25 // . returns false and sets g_errno on error //processLinkInfoMsg20Reply ( msg25 ); m_numLinkRequestsIn++; // save g_errno int32_t saved = g_errno; // free it's memory here lest we have a leak //msg0->reset(); // error? it will not have blocked then if ( ! saved ) continue; // save error, and stop launching any more requests m_hadLinkInfoError = saved; log("xmldoc: linksrc error3 = %s",mstrerror(saved)); } // return -1 if waiting for more requests to come in if ( m_numLinkRequestsOut > m_numLinkRequestsIn ) return (SafeBuf *)-1; // vote table to allow inlink voting HashTableX riTable; // do not return on error setting this table because we'll leave // the msg20 replies unfreed! if ( ! riTable.set ( 8,4,1024,NULL,0,false,m_niceness,"ritbl") ) m_hadLinkInfoError = g_errno; RecommendedLink *ri; HashTableX dedupVotesTable; if ( ! dedupVotesTable.set(8,0,1024,NULL,0,false,m_niceness,"dvtt") ) return NULL; // need this for computing rdOff char *rdStart = m_relatedDocIdBuf.getBufStart(); // store recommended links bufs here temporarily SafeBuf tmpBuf; if ( ! tmpBuf.reserve ( 10000000 ,"tt5buf" ) ) return NULL; // all done. scan linkdb lists and intersect. there is one list // per related docid. 
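	// . roughly, the intersection below does this (simplified; the loop
	//   itself also handles lost links, same-c-block linkers and the
	//   per-(linker,competitor) dedup):
	//
	//     for each related docid rd (one linkdb list each):
	//       for each linkdb key = one page linking to rd:
	//         if the linker's docid is already in riTable:
	//           bump that RecommendedLink's m_votes, add
	//           rd->m_relatedWeight to m_totalRecommendedScore and
	//           record rd's offset in m_relatedDocIdOff[]
	//         else:
	//           append a fresh RecommendedLink to tmpBuf and store its
	//           offset in riTable keyed by the linker's docid
	//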
for ( int32_t i = 0 ; i < numRelatedDocIds ; i++ ) { // get related docid that had the following msg20replies RelatedDocId *rd = &rds[i]; // his offset in his buf int32_t rdOff = (char *)rd - rdStart; // get linkdb list loaded from msg0 call above Msg0 *msg0 = &((Msg0 *)m_tmpMsg0Buf.getBufStart())[i]; RdbList *list = &msg0->m_handyList; list->resetListPtr(); // scan the docids in list for ( ; ! list->isExhausted() ; list->skipCurrentRec() ) { // get the current key if list has more left key224_t key; list->getCurrentKey( &key ); //int32_t itop = g_linkdb.getLinkerIp24_uk ( &key ); int32_t ip32 = g_linkdb.getLinkerIp_uk ( &key ); //bool isLinkSpam = g_linkdb.isLinkSpam_uk ( &key ); int64_t docId = g_linkdb.getLinkerDocId_uk ( &key ); //int32_t discovered = g_linkdb.getDiscoveryDate_uk(&key); // skip if no longer there on page, we keep these // only to graph lost links over time int32_t lostDate = g_linkdb.getLostDate_uk ( &key ); if ( lostDate ) continue; // if the inlink is from the same c-block IP as the // related docid it links to, then do not consider. // the ip used in linkdb is the current ip not the // first ip actually. if ( ipdom(ip32)==ipdom(rd->m_relatedCurrentIp)) continue; if ( ipdom(ip32)==ipdom(rd->m_relatedFirstIp)) continue; // if the linking document links to the same related // docid multiple times/ we need to dedup so m_votes // is not incremented multiple times! // actually make it use c-block not docid to fix // links/pages getting two m_votes for linking to // two competitors, where each competitor linked to // is on the same c-block... kinda strange. int64_t dkey = docId ^ ipdom(rd->m_relatedFirstIp); if ( dedupVotesTable.isInTable(&dkey) ) continue; if ( ! dedupVotesTable.addKey(&dkey) ) return NULL; // now we associate a new class with each unique linker int32_t *poff = (int32_t *)riTable.getValue ( &docId ); // if there, it will be an offset into the links buf if ( poff ) { char *ptr = tmpBuf.getBufStart(); ptr += *poff; RecommendedLink *rip = (RecommendedLink *)ptr; rip->m_totalRecommendedScore += rd->m_relatedWeight; rip->m_votes++; // add to array of rd offs int32_t k; for ( k = 0 ; k < 10 ; k++ ) { if ( rip->m_relatedDocIdOff[k]==-1) break; } if ( k < 10 ) rip->m_relatedDocIdOff[k] = rdOff; continue; } // reserve space int32_t need = sizeof(RecommendedLink); // reserve if ( ! tmpBuf.reserve ( need , "tt5buf" ) ) { m_hadLinkInfoError = g_errno; continue; } // save this int32_t firstOff = tmpBuf.length(); // ref it char *buf = tmpBuf.getBuf(); ri = (RecommendedLink *)buf; // advance over that int32_t over = sizeof(RecommendedLink); // increase buf length tmpBuf.incrementLength(over); // this is how similar the relatedDocId is to the // main url. these dotproducts are all relative // with the other relatedDocIds for this url. // the dotproduct was basically a dotproduct // of the score vector of "rd" with that of // the main url for the same queries. and that // was normalized by the score of the top result // for each query that have in common. see the // the algo above for the "m_dotProduct" computation. ri->m_totalRecommendedScore = rd->m_relatedWeight; ri->m_votes = 1; ri->m_rl_docId = docId; // we do not know these things until we call msg20 // on the docid: ri->m_rl_siteRank = -1;//reply->m_siteRank; ri->m_rl_firstIp = 0;//reply->m_firstIp; // each recommended link links to one or more // related docids. so record them! 
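			// remember which related docids this linker points
			// at; ten slots, unused ones stay -1 (equivalently:
			//   for ( int32_t k = 0 ; k < 10 ; k++ )
			//	   ri->m_relatedDocIdOff[k] = -1;
			//   ri->m_relatedDocIdOff[0] = rdOff;
			// the explicit assignments below do the same thing)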
ri->m_relatedDocIdOff[0] = rdOff; ri->m_relatedDocIdOff[1] = -1; ri->m_relatedDocIdOff[2] = -1; ri->m_relatedDocIdOff[3] = -1; ri->m_relatedDocIdOff[4] = -1; ri->m_relatedDocIdOff[5] = -1; ri->m_relatedDocIdOff[6] = -1; ri->m_relatedDocIdOff[7] = -1; ri->m_relatedDocIdOff[8] = -1; ri->m_relatedDocIdOff[9] = -1; ri->m_urlSize = 0; ri->m_titleSize = 0; // store it in table then, pointing into the new buf if ( ! riTable.addKey ( &docId, &firstOff ) ) m_hadLinkInfoError = g_errno; } // free that list now to save mem list->freeList(); } // free the msg0s now, including Msg0::m_handyList, what we used // to hold the linkdb list for ( int32_t i = 0 ; i < numRelatedDocIds ; i++ ) { Msg0 *array = (Msg0 *)m_tmpMsg0Buf.getBufStart(); Msg0 *msg0 = &array[i]; // free the mem and the handylist now that we've processed them msg0->reset(); } // no longer need the msg0s and linkdb lists (Msg0::m_handyLists) m_tmpMsg0Buf.purge(); // // now sort RecommendedLinks in tmpBuf by their scores // // get the top 300 recommended links so we can save mem and // store this beastie in cachedb SafeBuf ptrBuf; int32_t maxNumPtrs = tmpBuf.length() / sizeof(RecommendedLink); if ( ! ptrBuf.reserve(maxNumPtrs *sizeof(RecommendedLink *),"ptrbuf")) return NULL; char *p = tmpBuf.getBufStart(); char *pend = tmpBuf.getBuf(); int32_t numPtrs = 0; for ( ; p < pend ; ) { RecommendedLink *ri = (RecommendedLink *)p; ptrBuf.pushPtr ( ri ); p += sizeof(RecommendedLink); // we have no title or url at this point... if ( ri->getSize() != sizeof(RecommendedLink) ) { char *xx=NULL;*xx=0; } numPtrs++; } // now sort! RecommendedLink **ptrs = (RecommendedLink **)ptrBuf.getBufStart(); gbqsort ( ptrs , numPtrs , sizeof(RecommendedLink *), riCmp, m_niceness ); // copy over the top recommended links into permanent buffer in order // of score int32_t need2 = tmpBuf.length(); // increase for storing titles/urls into here need2 = numPtrs * sizeof(RecommendedLink); // allocate that now if ( ! m_recommendedLinksBuf.reserve ( need2 ,"rlkbuf") ) return NULL; // and copy over from tmpBuf, sorted by the score for ( int32_t i = 0 ; i < numPtrs ; i++ ) m_recommendedLinksBuf.safeMemcpy(ptrs[i], sizeof(RecommendedLink)); // this can be really huge! > 30MB tmpBuf.purge(); // free the ptrs too! ptrBuf.purge(); // // now m_recommendedLinksBuf is a bunch of RecommendedLinks sorted // by score. now use msg20 to lookup the top 300 or so that // do not link to our main doc // m_msg20Phase = true; return lookupTitles (); } //static void gotLinkerTitleWrapper ( void *state ) { // Msg20 *msg20 = (Msg20 *)state; // XmlDoc *THIS = (XmlDoc *)msg20->m_state2; // THIS->gotLinkerTitle ( msg20 ); // THIS->m_masterLoop ( THIS->m_masterState ); //} SafeBuf *XmlDoc::lookupTitles ( ) { CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // none have a title/url following them in here yet int32_t numLinkers = m_recommendedLinksBuf.length(); numLinkers /= sizeof(RecommendedLink); if ( ! m_msg20Array.length() ) { int32_t need = numLinkers * sizeof(Msg20); if ( ! m_msg20Array.reserve ( need,"m20arr" ) ) return (SafeBuf *)-1; // do not re-call! m_msg20Array.setLength(need); char *p = m_msg20Array.getBufStart(); char *pend = p + need; for ( ; p < pend ; p += sizeof(Msg20) ) ((Msg20 *)p)->constructor(); } Msg20 *msg20s = (Msg20 *)m_msg20Array.getBufStart(); // one per linker int32_t numMsg20s = numLinkers; // we can use the array model because each element is fixed size // because they do not have the url/title string following them // yet... 
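	// . i.e. right now m_recommendedLinksBuf is a plain array of
	//   fixed-size RecommendedLinks (m_urlSize/m_titleSize are still 0),
	//   so indexing by element number below is safe. once
	//   gotLinkerTitle() has copied each survivor plus its url+title
	//   into m_newLinkerBuf and we steal that buffer, records become
	//   variable length and must be walked with
	//   RecommendedLink::getSize() instead.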
char *ppp = m_recommendedLinksBuf.getBufStart(); RecommendedLink *ptr = (RecommendedLink *)ppp; // scan the msg20s we allocated to see if any got a reply for ( int32_t i = 0 ; i < numMsg20s ; i++ ) { // int16_tcut Msg20 *msg20 = &msg20s[i]; // skip if never launched if ( ! msg20->m_launched ) continue; // skip if it is in progress, awaiting its reply if ( msg20->m_inProgress ) continue; // ok, it has a reply. could be NULL if g_errno was set. if ( ! gotLinkerTitle ( msg20 ) ) m_recommendedLinkError = g_errno; // reset it for later us... or not... msg20->reset(); } // // call a msg20 on each recommendedlink to get url/title and // see if it links to any url on our main url's site/domain // for ( ; m_titleCursor < numLinkers ; m_titleCursor++ ) { // bail? if ( m_numMsg20sOut - m_numMsg20sIn > 60 ) break; // stop launching if got enough if ( m_numValidMsg20s >= MAX_RECOMMENDED_LINKS ) break; // cast it RecommendedLink *rl = &ptr[m_titleCursor]; // get avail msg20 int32_t i; for ( i = 0 ; i < 100 ; i++ ) { if ( msg20s[i].m_inProgress ) continue; break; } // sanity! if ( i >= 100 ) { char *xx=NULL;*xx=0; } // look it up Msg20 *msg20 = &msg20s[i]; // make request Msg20Request req; req.m_docId = rl->m_rl_docId; //req.m_state = msg20; req.m_state = m_masterState;//this; req.m_callback2 = m_masterLoop;//gotLinkerTitleWrapper; //req.ptr_coll = cr->m_coll; //req.size_coll = gbstrlen(cr->m_coll)+1; req.m_collnum = cr->m_collnum; req.m_expected = true; req.m_niceness = m_niceness; // do not get summary stuff. too slow. req.m_numSummaryLines = 0; // if it has an outlink to our site/domain set // Msg20Reply::m_hasLinkToOurDomOrHost req.m_ourHostHash32 = getHostHash32a(); req.m_ourDomHash32 = getDomHash32(); // store cursor in msg20 itself so we know what rd it's using msg20->m_hack2 = m_titleCursor; // assume outstanding m_numMsg20sOut++; // debug //log("seo: DEBUG: launching msg20 d=%"INT64"",req.m_docId); // get it. continue if blocked if ( ! msg20->getSummary ( &req ) ) continue; // error? if ( ! gotLinkerTitle ( msg20 ) ) m_recommendedLinkError = g_errno; // save mem msg20->reset(); } // wait for all to return? if ( m_numMsg20sOut > m_numMsg20sIn ) return (SafeBuf *)-1; // we called gotLinkerTitle() on all msg20s, so destroy them for ( int32_t i = 0 ; i < numMsg20s ; i++ ) { // int16_tcut Msg20 *msg20 = &msg20s[i]; // free msg20->destructor(); } // and free the lot of them m_msg20Array.purge(); // now revert back m_recommendedLinksBuf.stealBuf ( &m_newLinkerBuf ); // . this is an array of Inlinks // . shit, but we need to add a count of how many related docids // had the inlink, and what the weight or score of it was // . it should be based on the weights/scores of the related docids // . maybe just hijack "Inlink::m_numUniqueIPs" or something // . crap, we also need to store the RelatedDocIds, i guess we // could store a list of offsets to them in m_relatedDocIdBuf m_recommendedLinksBufValid = true; // store in cachedb. if it blocks return -1. bufvalid is set to // true so when this function is re-entered it should return // the safebuf ptr right away. if ( ! 
storeRecommendedLinksBuf () ) return (SafeBuf *)-1; return &m_recommendedLinksBuf; } // returns false and sets g_errno on error bool XmlDoc::gotLinkerTitle ( Msg20 *msg20 ) { // count it as returned m_numMsg20sIn++; // debug //log("seo: DEBUG: got msg20 reply"); // get the recommendedlink for this (titleCursor) char *vvv = m_recommendedLinksBuf.getBufStart(); RecommendedLink *rptrs = (RecommendedLink *)vvv; int32_t titleCursor = msg20->m_hack2; RecommendedLink *rl = &rptrs[titleCursor]; // sanity if ( titleCursor < 0 ) {char *xx=NULL;*xx=0;} // not found? if ( g_errno ) { log("seo: lookuptitles: %s",mstrerror(g_errno)); // ignore g_errno = 0; return true; } // get reply Msg20Reply *reply = msg20->getReply(); // skip if linked to our site! if ( reply->m_hasLinkToOurDomOrHost ) { if ( m_seoDebug >= 2 ) log("seo: inlinker %s links to our " "domain. ignoring.", reply->ptr_ubuf); return true; } // or if banned/filtered.. then skip if ( reply->m_errno ) { if ( m_seoDebug >= 2 ) log("seo: inlinker %s had error: %s", reply->ptr_ubuf, mstrerror(reply->m_errno)); return true; } // wtf? if ( reply->size_ubuf <= 1 ) { return true; } // set basic info rl->m_rl_siteRank = reply->m_siteRank; rl->m_rl_firstIp = reply->m_firstIp; // sanity if ( rl->m_rl_docId != reply->m_docId ) { char *xx=NULL;*xx=0; } char *title = reply->ptr_tbuf; int32_t titleSize = reply->size_tbuf; if ( titleSize == 0 ) { title = "\0"; titleSize = 1; } // debug //log("seo: DEBUG: got VALID msg20 reply #%"INT32"",m_numValidMsg20s); // count as valid m_numValidMsg20s++; rl->m_urlSize = reply->size_ubuf; rl->m_titleSize = titleSize; if ( ! m_newLinkerBuf.safeMemcpy ( rl , sizeof(RecommendedLink) ) ) return false; if ( ! m_newLinkerBuf.safeMemcpy ( reply->ptr_ubuf,reply->size_ubuf)) return false; if ( ! m_newLinkerBuf.safeMemcpy ( title , titleSize ) ) return false; // i guess we are done then return true; } /* // returns false if blocked, true otherwise. sets g_errno on error bool XmlDoc::printRecommendedLinksBuf ( SafeBuf *sb ) { SafeBuf *recBuf = getRecommendedLinksBuf(); if ( ! recBuf ) return true; if ( recBuf == (void *)-1 ) return false; int32_t count = 1; char *p = recBuf->getBufStart(); char *pend = recBuf->getBuf (); for ( ; p < pend ; ) { // cast it RecommendedLink *ri = (RecommendedLink *)p; // skip it p += ri->getSize(); // print it out sb->safePrintf("%"INT32") %.04f %s | %s
" ,count++ ,ri->m_totalRecommendedScore ,ri->getUrl(recBuf) ,ri->getTitle(recBuf) ); } return true; } */ // . use Msg25::m_numReplyPtrs and Msg25::m_replyPtrs[i] to access the // Msg20s of the inlinks // . NOT the same as getLinkInfo() because this does not filter out the // "bad" inlinks, it gets everything and keeps the full Msg20Replies!! Msg25 *XmlDoc::getAllInlinks ( bool forSite ) { // if valid, return it now if ( forSite && m_tempMsg25SiteValid ) return m_tempMsg25Site; if ( ! forSite && m_tempMsg25PageValid ) return m_tempMsg25Page; Msg25 *myMsg25 ; if ( forSite ) myMsg25 = m_tempMsg25Site; else myMsg25 = m_tempMsg25Page; int32_t *ipp = getIp(); if ( ! ipp || ipp == (void *)-1 ) return (Msg25 *)ipp; int64_t *d = getDocId(); if ( ! d || d == (int64_t *)-1 ) return (Msg25 *)d; char *site = getSite (); if ( ! site || site == (char *)-1 ) return (Msg25 *)site; CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; Url *fu = getFirstUrl(); // make a new one if ( ! myMsg25 ) { Msg25 *msg25 = NULL; try { msg25 = new ( Msg25 ); } catch ( ... ) { g_errno = ENOMEM; log("xmldoc: linksrc error2 = %s",mstrerror(g_errno)); m_hadLinkInfoError = g_errno; } mnew ( msg25, sizeof(Msg25),"m25li"); // record it for freeing/deleting later if ( forSite ) m_tempMsg25Site = msg25; else m_tempMsg25Page = msg25; // reference it myMsg25 = msg25; } int32_t type ; if ( forSite ) type = cr_Msg25SiteInfo; else type = cr_Msg25PageInfo; // get list RdbList *myList; if ( forSite ) myList = &m_siteReplyList; else myList = &m_pageReplyList; int32_t uh32 =(uint32_t)((uint64_t)getFirstUrlHash64()); // first check cachedb! bool checkIt = false; if ( forSite && ! m_checkedCachedbForSite ) checkIt = true; if ( ! forSite && ! m_checkedCachedbForPage ) checkIt = true; if ( checkIt ) { // do not repeat if ( forSite ) m_checkedCachedbForSite = true; else m_checkedCachedbForPage = true; // use 0 for content hash since the link info is independent // of your page's or site's content key_t sk = g_cachedb.makeStartKey2 ( uh32 , 0 , type ); key_t ek = g_cachedb.makeEndKey2 ( uh32 , 0 , type ); // . get it from the appropriate host // . get cachedb rec for all types of safebufs for this // url/content // . then we will set safebufs based on what recs we find // in the returned list if ( ! m_msg0.getList ( -1, // hostid 0 , // ip 0 , // port 0 , // maxcacheage false, // addtocache? RDB_CACHEDB, cr->m_collnum , myList, // &m_cacheList, (char *)&sk , (char *)&ek , 30000000, // minrecsizes 30MB m_masterState, m_masterLoop, m_niceness ) ) // blocked? return (Msg25 *)-1; } Msg20Reply *reply; // even if it had 0 msg20replies, list should be non-zero length if ( ! myList->isEmpty() ) { // get # replies char *p = myList->getList(); // first is key p += 12; // then datasize p += 4; // then # msg20 replies int32_t numReplies = *(int32_t *)p; p += 4; myMsg25->m_numReplyPtrs = numReplies; // do not free any replies, they reference into m_pageList myMsg25->m_ownReplies = false; // loop over replies for ( int32_t i = 0 ; i < numReplies ; i++ ) { // get reply size int32_t replySize = *(int32_t *)p; p += 4; // reply itself reply = (Msg20Reply *)p; // reconstruct ptrs from the offsets relative // to start of "reply" int32_t used = reply->deserialize(); if ( used < 0 ) { log("xmldoc: reply deserialize error"); g_errno = ECORRUPTDATA; return NULL; } // skip reply p += replySize; // store it myMsg25->m_replyPtrs[i] = reply; } // validate! if ( forSite ) m_tempMsg25SiteValid = true; else m_tempMsg25PageValid = true; // all done! 
return myMsg25; } bool *calledItPtr ; if ( forSite ) calledItPtr = &m_calledMsg25ForSite; else calledItPtr = &m_calledMsg25ForPage; // ok, get it the hard way // send out the request now if ( ! *calledItPtr ) { // do not re-call! *calledItPtr = true; // call it now if ( ! myMsg25->getLinkInfo2( site, fu->getUrl() , // url false , // isSiteLinkInfo? *ipp, *d, // docid m_collnum,//cr->m_coll, NULL, // qbuf 0, // qbufSize m_masterState, // state m_masterLoop, // callback false, // isInjecting? false, // pbuf (for printing) this, // xd holder (Msg25::m_xd) // this is irrelevant since we // are getting all inlinks: 0, // siteNumInlinks, irrelevant NULL, // oldlinkinfo m_niceness, true, // doLinkSpamCheck? true, // onevoteperip. unused? false,// can be cancelled? 0, // lastupdatetime // !!!!!!!!!! // we want all!!!!!!!!!!!!!!!!!!! // !!!!!!!!!! false ,//onlyneedgoodinlinks? false,//getlinkertitles? 0, // ourhosthash32 (special) 0, // ourdomhash32 (special) &m_myTempLinkInfoBuf ) ) // blocked? return (Msg25 *)-1; } // validate it so when msg1 below returns and calls this function // again at the top we return the ptr right away if ( forSite ) m_tempMsg25SiteValid = true; else m_tempMsg25PageValid = true; // serialize the msg20 reply ptrs into a buf for list SafeBuf listBuf; // compute datasize int32_t dataSize = 0; // # of replies dataSize += 4; // each reply for ( int32_t i = 0 ; i < myMsg25->m_numReplyPtrs ; i++ ) { // reply size dataSize += 4; // reply data //dataSize += myMsg25->m_replySizes[i]; // we can't use replySizes[i] because Linkdb.cpp will // MODIFY the msg20 requests to add ptr_note/size_note reply = myMsg25->m_replyPtrs[i]; // so we have to calculate the new serialized size dataSize += reply->getStoredSize(); } // how much to reserve? int32_t need = sizeof(key_t) + 4 + dataSize; // reserve that space! if ( ! listBuf.reserve ( need ,"listbuf" ) ) { // just ignore error g_errno = 0; // and return if ( forSite ) return m_tempMsg25Site; else return m_tempMsg25Page; } // make key for it, contenthash is 0, since it is irrelevant key_t kk = g_cachedb.makeKey ( uh32 , 0 , type ); // store key listBuf.safeMemcpy ( &kk , sizeof(key_t) ); // store datasize listBuf.pushLong ( dataSize ); // # of replies listBuf.pushLong ( myMsg25->m_numReplyPtrs ); // store each reply then for ( int32_t i = 0 ; i < myMsg25->m_numReplyPtrs ; i++ ) { // get reply reply = myMsg25->m_replyPtrs[i]; // . how many bytes to store the MODIFIED msg20reply? // . Linkdb.cpp adds the ptr_note AFTER it receives all replies // so we can't just use Msg25::m_replySizes[i] int32_t replySize = reply->getStoredSize(); listBuf.pushLong ( replySize ); // store that int32_t stored = reply->serialize ( listBuf.getBuf() , listBuf.getAvail() ); // skip that listBuf.incrementLength ( stored ); // sanity if ( stored != replySize ) { char *xx=NULL;*xx=0; } } // sanity if ( listBuf.length() != need ) { char *xx=NULL;*xx=0; } // make the list to add to cachedb RdbList storeList; key_t startKey = g_cachedb.makeStartKey2 ( uh32, 0 , type ); key_t endKey = g_cachedb.makeEndKey2 ( uh32, 0 , type ); m_storeList.set ( listBuf.getBufStart() , listBuf.length() , listBuf.getBufStart() , // alloc listBuf.getCapacity(), // allocsize startKey, endKey, -1, // fixeddatasize true, // owndata? false ); // use half keys? 
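// . note on the cachedb rec built above: it mirrors what the read path
//   at the top of this function parses, i.e. a key_t, then a 4-byte
//   dataSize, then a 4-byte count of msg20 replies, then for each reply
//   a 4-byte serialized size followed by the serialized Msg20Reply itself
// . m_storeList.set() was called with ownData set to true, so the RdbList
//   now owns listBuf's allocation, which is why listBuf.detachBuf() below
//   must disconnect the SafeBuf to avoid a double free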
// disconnect it from safebuf so it doesn't get freed listBuf.detachBuf(); //m_storeList.printList(); QUICKPOLL(m_niceness); char *tt ; if ( forSite ) tt = "site"; else tt = "page"; log("xmldoc: adding msg20%slinkreplies list of %"INT32" bytes to cachedb", tt,m_storeList.m_listSize); // returns false if it blocks, true otherwise if ( ! m_msg1.addList ( &m_storeList, RDB_CACHEDB , cr->m_collnum, m_masterState, m_masterLoop, false, // forcelocal? m_niceness ) ) // blocked? return (Msg25 *)-1; if ( forSite ) return m_tempMsg25Site; else return m_tempMsg25Page; } // . returns false and sets g_errno on error // . sets RelatedDocId::m_relatedWeight // . when printing the competitor pages, we sort by this, highest first // 1. then scan the list of queries for each related docid // 2. determine each of those matching queries weights // 3. add up the weights and set RelatedDocId::m_relatedWeight to that bool XmlDoc::setRelatedDocIdWeightAndRank ( RelatedDocId *rd ) { // get our site hash int32_t *shp = getSiteHash32(); if ( ! shp ) return false; if ( shp == (int32_t *)-1 ) { char *xx=NULL;*xx=0; } if ( ! m_siteHash32Valid ) { char *xx=NULL;*xx=0; } int32_t mainUrlSiteRank = getSiteRank(); // max queries int32_t nc = rd->m_numCommonQueries; int32_t unit = 0; unit += sizeof(float); //unit += sizeof(Msg99Reply *); unit += sizeof(Query); unit += sizeof(HashTableX); unit += sizeof(QueryNumLinkedNode *); int32_t need = nc * unit; char *mem = (char *)mmalloc ( need , "qrybuf" ); if ( ! mem ) { log("seo: failed to set related docid weight: %s", mstrerror(g_errno)); return false; } char *p = mem; float *queryWeights = (float *)p; p += nc * sizeof(float); //Msg99Reply **replyPtrs = (Msg99Reply **)p; //p += nc * sizeof(Msg99Reply *); Query *queries = (Query *)p; p += nc * sizeof(Query); QueryNumLinkedNode **qnPtrs = (QueryNumLinkedNode **)p; p += nc * sizeof(QueryNumLinkedNode *); HashTableX *htables = (HashTableX *)p; p += nc * sizeof(HashTableX); // sanity if ( p != mem + need ) { char *xx=NULL;*xx=0; } // initialize the mem for ( int32_t i = 0 ; i < nc ; i++ ) { queryWeights[i] = 1.0; qnPtrs[i] = NULL; queries[i].constructor(); htables[i].constructor(); } // total pages indexed! //int64_t numPagesIndexed = g_titledb.getGlobalNumDocs(); float totalWeight; // get matching queries //SafeBuf *qpbuf = getMatchingQueriesScored(); //if ( ! qpbuf || qpbuf == (void *)-1 ) { char *xx=NULL;*xx=0; } // cast it //Msg99Reply **qptrs=(Msg99Reply **)qpbuf->getBufStart(); SafeBuf *mq = getMatchingQueryBuf(); if ( mq == NULL || mq == (void *)-1 ) { char *xx=NULL;*xx=0; } int32_t nks = mq->length() / sizeof(QueryLink); QueryLink *qks = (QueryLink *)mq->getBufStart(); // print the queries in common! int32_t firstOff = rd->m_firstCommonQueryNumOff; int32_t offset = firstOff; int32_t qc = 0; //int64_t numPagesIndexed = g_titledb.getGlobalNumDocs(); // this is fixed at the time we set QueryLogEntry::m_numResultsInSlice int64_t numPagesIndexed = 1114000000; int64_t point0 = numPagesIndexed / 119LL; int64_t point1 = numPagesIndexed / 15LL; // loop over the query/score pairs this related docid matched for ( ; offset >= 0 ; qc++ ) { // get that node char *buf = m_commonQueryNumBuf.getBufStart(); // and offset buf += offset; // then cast QueryNumLinkedNode *qn; qn = (QueryNumLinkedNode *)buf; // advance. 
will be -1 when done if ( qn ) offset = qn->m_nextOff; else offset = -1; // get #qn into there //Msg99Reply *rp = qptrs[qn->m_queryNum]; if ( qn->m_queryNum < 0 || qn->m_queryNum >= nks ) { char *xx=NULL;*xx=0; } QueryLink *qk = &qks[qn->m_queryNum]; QueryLogEntry *qe ; qe = qk->getQueryLogEntry(&m_matchingQueryStringBuf); char *qstr = qe->getQueryString(); qnPtrs[qc] = qn; // save ptrs too //replyPtrs[qc] = rp; // get main url score for query //float mainUrlScore = rp->m_myScore; int32_t mainUrlSiteHash26 = m_siteHash32; // seems like clusterdb masks them a bit in // Clusterdb::getSiteHash() mainUrlSiteHash26 &= 0x03ffffff; int32_t mainUrlRank = -1; int32_t rdRank = -1; //float mainUrlSerpScore = -1.0; // . the relateddocidnumhack // . this is used as the topdocidnum # in the case of // m_matchingQueryBuf (doMatchingQueries) int32_t tdnum = qk->m_relatedDocIdNum; TopDocIds *tds = (TopDocIds *)m_topDocIdsBuf.getBufStart(); int32_t maxnum = m_topDocIdsBuf.length()/sizeof(TopDocIds); if ( tdnum < 0 || tdnum >= maxnum ) { char *xx=NULL;*xx=0; } TopDocIds *td = &tds[tdnum]; // assume none //float rdScore = 0.0; // find docid for this related docid //TopDocIds *td = rp->getTopDocIds(&m_topDocIdsBuf); int32_t nd = td->m_numDocIds; for ( int32_t y = 0 ; y < nd ; y++ ) { // if we first encounter a result from the same // site as the main url then stop! you don't get // the 10x bonus then! if ( td->m_topSiteHashes26[y] == mainUrlSiteHash26 && mainUrlRank == -1 ) { //mainUrlSerpScore = td->m_topScores[y]; mainUrlRank = y; } // set our score? if ( td->m_topDocIds[y] == rd->m_docId ) { //rdScore = td->m_topScores[y]; rdRank = y; } } // these should always be set! even if not ranked in the // top 300 because of our new logic using msg4f in // getRelatedDocIdsScored() float rdScore = qn->m_relatedDocIdSerpScore; float mainUrlSerpScore = qk->m_serpScore; bool better = false; // give it a weight of 10 if higher-scoring! //if ( rdRank < mainUrlRank ) better = true; if ( rdScore >= mainUrlSerpScore ) better = true; // if your site not in top 300 or so, and he is, he's better //if ( mainUrlRank == -1 && rdRank >= 0 ) better = true; // this is the specific url, not the SITE, like // mainUrlRank is, for the entire site //if ( rdScore > mainUrlScore ) better = true; // how many search results does this query have total? int64_t numResults = qe->m_numTotalResultsInSlice; // fix it to be global numResults *= (int64_t)g_hostdb.getNumShards(); // big indexes did the "slice logic" restricting docid // range to MAX_DOCID * .10 when setting this! if ( numPagesIndexed > 10000000 ) numResults *= 10; //////////////////// // // Scoring is what we do when the number of combinations // it too high to effectively compute. - matt // //////////////////// // lower from 10 so google still won't dominate generic qyries? // crap, at 2.0 gigablast.com had bad competitors because // they all matc queries with gigablast in them. // i put it down from 30.0 to 5.0 to fix chessusa.com // who was getting bad competitor pages that had just // 'ccc' matching non-generic queries having them come up too // high of score. //if ( better ) // queryWeights[qc] = 1.0;//30.0;//100.0; // 10.0; // // do not give related docid query that has YOUR brand in it // much weight. we do not want it talking about you, because // it is a competitor. // // PROBLEM: "cheatcodes.com"'s brand is descriptive! // // . if not generic and it beats YOU, give more! // . 
try to fix ibm.com gigablast.com seomoz.org ahrefs.com // that suffer because of matching their brand. actually // maybe only do this if seomoz.org matches this query // with their link text only...??? thus, pages that contain // "seo moz" will match the "seo moz" query but will gain // RELATIVELY little because they can't be seomoz.org on it. // . crap though this will hurt chessusa.com right?? try again // since algo changed a lot since then bool isBrand = true; // if other guy ranks better than you, probably not // your brand, or if it is, it could be his brand too? if ( better ) // && numResults < point0 ) isBrand = false; // or if you are not in the top 100 it is probably not // your brand name either! if ( mainUrlRank == -1 ) isBrand = false; // fix chessusa.com for 'chess' by lowering from 100 to 20... if ( mainUrlRank >= 20 ) isBrand = false; // fix 'corporation' for ibm.com. it is too generic to // be a brand. on our 1.1B page index, point0 is like 9.3M. // 'ibm' is 5.5M, 'corporation' is 25M,... if ( numResults >= point0 ) isBrand = false; // or for ibm.com ... or other pages with high siteranks, // your brand queries should be in the top 10!! otherwise, // ibm has so many other matching queries in the top 100 that // are not brands for it because its siterank is so high. if ( mainUrlSiteRank >= 10 && mainUrlRank >= 10 ) isBrand = false; // top 5 for brands in siterank 11 sites if ( mainUrlSiteRank >= 11 && mainUrlRank >= 5 ) isBrand = false; // . good competitors will be in top 30 for a query // . let's keep in mind though that we use these competitors // to find backlinks AND to generate related terms, so // it's not so important that they dominate a query, but // rather that they match your content... /* if ( better && numResults < point0 && rdRank >= 0 && rdRank < 20 ) queryWeights[qc] *= 1.2;//50.0; // top ten??? if ( better && numResults < point0 && rdRank >= 0 && rdRank < 10 ) queryWeights[qc] *= 1.3;//51.0; // top 5? if ( better && numResults < point0 && rdRank >= 0 && rdRank < 5 ) queryWeights[qc] *= 1.4;//52.0; */ // weight it by how relevant the query it matches is to us //if ( better && numResults < point0 ) // queryWeights[qc] = (qk->m_serpScore / 1000000.0); // // generic query? // float weight = 1.0; if ( numResults < point0 ) weight = 100.0; else if ( numResults < point1 ) weight = 10.0; queryWeights[qc] *= weight; // // weight by related docid's serp score // float ss = qk->m_serpScore; float w2 = 1.0; if ( ss > 1000000000.0 ) w2 = 10.0; // > 1B else if ( ss > 100000000.0 ) w2 = 9.0; // > 100M else if ( ss > 10000000.0 ) w2 = 8.0; // > 10M else if ( ss > 1000000.0 ) w2 = 7.0; // > 1M else if ( ss > 100000.0 ) w2 = 6.0; // > 100k else if ( ss > 10000.0 ) w2 = 5.0; // > 10k else if ( ss > 1000.0 ) w2 = 4.0; // > 1k else if ( ss > 100.0 ) w2 = 3.0; // > 100 else if ( ss > 10.0 ) w2 = 2.0; // > 10 queryWeights[qc] *= w2; // // weight by main url's serp score as well! // ss = mainUrlSerpScore;//qk->m_serpScore; w2 = 1.0; if ( ss > 1000000000.0 ) w2 = 10.0; // > 1B else if ( ss > 100000000.0 ) w2 = 9.0; // > 100M else if ( ss > 10000000.0 ) w2 = 8.0; // > 10M else if ( ss > 1000000.0 ) w2 = 7.0; // > 1M else if ( ss > 100000.0 ) w2 = 6.0; // > 100k else if ( ss > 10000.0 ) w2 = 5.0; // > 10k else if ( ss > 1000.0 ) w2 = 4.0; // > 1k else if ( ss > 100.0 ) w2 = 3.0; // > 100 else if ( ss > 10.0 ) w2 = 2.0; // > 10 queryWeights[qc] *= w2; // punish query weight if it is your brand most likely //if ( isBrand ) // queryWeights[qc] = 0.01; // . 
store related docid rank and your rank // . then we do not need cache m_topDocIdsBuf and seo.cpp // has this info readily available. qn->m_relatedDocIdRank = rdRank; qn->m_mainUrlRank = mainUrlRank; //qn->m_mainUrlSerpScore = mainUrlSerpScore; /* int64_t numResults = qe->m_numTotalResultsInSlice; // fix it to be global numResults *= (int64_t)g_hostdb.getNumGroups(); // big indexes did the "slice logic" restricting docid // range to MAX_DOCID * .10 when setting this! if ( numPagesIndexed > 10000000 ) numResults *= 10; // fix divide by zero and make all rare queries similar weight //if ( numResults < 1000 ) numResults = 1000; // divide by # results query has so more generic stuff // is down weighted //queryWeights[qc] /= (float)numResults; if ( numResults < 1000 ) queryWeights[qc] /= 1; else if ( numResults < 10000 ) queryWeights[qc] /= 2; else if ( numResults < 100000 ) queryWeights[qc] /= 4; else if ( numResults < 1000000 ) // 1M queryWeights[qc] /= 8; else if ( numResults < 10000000 ) // 10M queryWeights[qc] /= 16; else if ( numResults < 10000000 ) // 100M queryWeights[qc] /= 32; else queryWeights[qc] /= 64; */ //int32_t qlen = gbstrlen(qstr); // int16_tcuts Query *qp = &queries[qc]; HashTableX *ht = &htables[qc]; // this is currently a int64_t bit vector int32_t vs = sizeof(qvec_t); if ( ! ht->set ( 8,vs,128,NULL,0,false,m_niceness,"wbvbuf") ) // hopefully g_errno is preserved goto done; // if unknown use english so pandora's -> pandora,pandoras? // because 'pandora's tower' was not matching // 'pandoras tower' because both words could have been // english or german, thus the queries were thought to be // independent! giving rise to high-scoring competitive pages // that matched only those two queries. uint8_t qlangId = qe->m_langId; if ( ! qlangId ) qlangId = langEnglish; qp->set2 ( qstr , qlangId , true ); // hash it up for ( int32_t i = 0 ; i < qp->m_numTerms ; i++ ) { // int16_tcut QueryTerm *qt = &qp->m_qterms[i]; // bigrams imply 2 explicit bits, one from each term // in the bigram. synonym terms should share the same // bit as the term they are a synonym of int64_t bits = qt->m_implicitBits; // . add bit vec. use rawTermId? // . hash to wordbit vector of query words contained if ( ! ht->addKey ( &qt->m_termId , &bits ) ) goto done; } } // . set the dup flags! // . scan queries related docid matches for ( int32_t i = 0 ; i < qc ; i++ ) { // get it Query *qpi = &queries[i]; HashTableX *hti = &htables[i]; // scan all queries above for ( int32_t j = i+1 ; j < qc ; j++ ) { // reset bool jIsSubQueryOfi = false; bool iIsSubQueryOfj = false; // skip ourselves //if ( j == i ) continue; // get it Query *qpj = &queries[j]; HashTableX *htj = &htables[j]; // scan every query term in query #j and map each // termid to the term bit vector that indicates what // terms query #j has in query #i. qvec_t totalVec = 0LL; // is it a dup? for ( int32_t k = 0 ; k < qpj->m_numTerms ; k++ ) { // int16_tcut QueryTerm *qt = &qpj->m_qterms[k]; // see if in there char *val ; val = (char *)hti->getValue(&qt->m_termId); if ( ! val ) continue; // get implied term bits qvec_t vec = *(qvec_t *)val; // this is the termbit vector for query #i. // it tells us what terms query #j shares. totalVec |= vec; } // we only care about "required" terms. i.e. bigrams // are essentially ignored if not in quotes. totalVec &= qpi->m_requiredBits; // how many words do we match? if ( sizeof(qvec_t) != 8 ) { char *xx=NULL;*xx=0; } int32_t numSharedWithQueryi = getNumBitsOn64(totalVec); // how many required bits does it have? 
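// (note: getNumBitsOn64() counts the set bits. totalVec holds the
//  required term bits of query #i that were also found among query #j's
//  terms, so if that count equals the number of required bits in query
//  #i then every required term of #i appears in #j and #i is a sub-query
//  of #j. the same test is repeated in the other direction below.)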
int32_t needi = getNumBitsOn64(qpi->m_requiredBits); // if all terms in query #i are in query #j then subset if ( numSharedWithQueryi == needi ) iIsSubQueryOfj = true; // // now go the other way // totalVec = 0LL; // is it a dup? for ( int32_t k = 0 ; k < qpi->m_numTerms ; k++ ) { // int16_tcut QueryTerm *qt = &qpi->m_qterms[k]; // see if in there char *val; val = (char *)htj->getValue(&qt->m_termId); if ( ! val ) continue; // get implied term bits qvec_t vec = *(qvec_t *)val; // this is the termbit vector for query #j. // it tells us what terms query #i shares. totalVec |= vec; } // we only care about "required" terms. i.e. bigrams // are essentially ignored if not in quotes. totalVec &= qpj->m_requiredBits; // how many words do we match? if ( sizeof(qvec_t) != 8 ) { char *xx=NULL;*xx=0; } int32_t numSharedWithQueryj = getNumBitsOn64(totalVec); // how many required bits does it have? int32_t needj = getNumBitsOn64(qpj->m_requiredBits); // if all terms in query #i are in query #j then subset if ( numSharedWithQueryj == needj ) jIsSubQueryOfi = true; // now set dup bit if query #i is same as query #j // taking into account "missing spaces" so that we // have two terms in one query , and their bigram // in the other query. OR we have synonyms. OR we // have differences of "ignored" words. // "leg" = "legs" // "cheat code" = "cheatcodes" // "the tigers" = "tigers" if(jIsSubQueryOfi&& iIsSubQueryOfj&& queryWeights[j]>.02){ // debug? if ( m_seoDebug >= 2 ) log("seo: %s ISDUPOF %s", qpj->m_orig, qpi->m_orig); // the dup weight is .02 queryWeights[j] *= .1; // = .02 } // proper subquery examples: // "leg" is subquery of "nice legs" else if ( jIsSubQueryOfi && ! iIsSubQueryOfj && queryWeights[j] > .05 ) { // debug? if ( m_seoDebug >= 2 ) log("seo: %s SUBQUERYOF %s", qpj->m_orig, qpi->m_orig); // the subquery weight is .05 queryWeights[j] *= 0.1; // = 5.0;//.05; } // is query #i a PROPER subquery of query #j else if ( iIsSubQueryOfj && ! jIsSubQueryOfi && queryWeights[i] > .05 ) { // debug? if ( m_seoDebug >= 2 ) log("seo: %s SUBQUERYOF %s", qpi->m_orig, qpj->m_orig); // the subquery weight is .05 // increase to 5.0 to try to drown out the // anomaly queries promoting poker sites // for cheatcodes.com competitors queryWeights[i] *= 0.1; // = 5.0;//.05; } else { // debug? //if ( debug ) //log("seo: %s UNRELATEDTO %s", // qpi->m_orig, // qpj->m_orig); } } } // scan the queries again and add up their weights this time! totalWeight = 0.0; for ( int32_t i = 0 ; i < qc ; i++ ) { totalWeight += queryWeights[i]; qnPtrs[i]->m_queryScoreWeight = queryWeights[i]; //Msg99Reply *ptr = replyPtrs[i]; Query *qp = &queries[i]; char *qstr = qp->m_orig;//ptr->m_queryStr; // log it if ( m_seoDebug >= 2 ) log("seo: docid=%"INT64" weight=%f qry=%s", rd->m_docId, queryWeights[i], qstr); } // that is the docid related weight now rd->m_relatedWeight = totalWeight; done: for ( int32_t i = 0 ; i < nc ; i++ ) { queries[i].destructor(); htables[i].destructor(); } mfree ( mem , need , "qrybuf" ); return true; } // returns false and sets g_errno on error bool XmlDoc::addRelatedDocIdInfo ( int64_t docId , int32_t queryNum , float score , int32_t rank , int32_t siteHash26 ) { // do not add if does not match the query if ( score <= 0.0 ) return true; // alloc space if first time calling if ( ! m_rdtab.m_numSlots ) { if ( ! 
m_rdtab.set(8,sizeof(RelatedDocId),1024,NULL,0, false,0,"rdtab")) return false; } // get the related docid as it exists in m_relatedDocIdBuf RelatedDocId *rd = NULL; // now we also store these for intersecting // in phase 2 to see what urls are most // similar to us int32_t slot = m_rdtab.getSlot(&docId); // if not there, add it if ( slot < 0 ) { // make one RelatedDocId rdx; // the most important thing is the docid! rdx.m_docId = docId; // and now the 32-bit site hash rdx.m_siteHash26 = siteHash26; // how many search results we are in rdx.m_numCommonQueries = 0; // the queryImportance should be our score // for this query divided by m_minTop50Score // to normalize it. //float qimp=qp->m_queryInfo.m_queryImportance; // just add up the query importance for // each query we share in common with main url //rd.m_similarityScore = qip; // now we do a dot product of this related // docids score vector with the main url's // score vector. both vector's are normalized // using the score of the 1st result! //rd.m_dotProduct = score; // reset this rdx.m_rd_siteRank = -1; rdx.m_rd_langId = 255; rdx.rd_title_off = -1; rdx.rd_url_off = -1; rdx.rd_site_off = -1; // point to beginning of linked list of qrynums rdx.m_firstCommonQueryNumOff = -1;//off; //rdx.m_lastCommonQueryNumOff = -1;//off; // remember offset int32_t rdOff = m_relatedDocIdBuf.length(); // store it m_relatedDocIdBuf.safeMemcpy ( &rdx , sizeof(RelatedDocId) ); // add OFFSET to table. data is 12 bytes if(! m_rdtab.addKey(&docId,&rdOff)) return false; // all done then //continue; // set this for adding to the linked list char *p = m_relatedDocIdBuf.getBufStart() + rdOff; // cast it rd = (RelatedDocId *)p; } else { // get the data int32_t rdOff = *(int32_t *)m_rdtab.getValueFromSlot(slot); // point to it char *p = m_relatedDocIdBuf.getBufStart() + rdOff; // cast it rd = (RelatedDocId *)p; } // before we add the querynumlinkednode make sure not a dup! char *qnbuf = m_commonQueryNumBuf.getBufStart(); // . offset of first node for this related docid // . this is the start of his linked list of query/score nodes int32_t firstOff = rd->m_firstCommonQueryNumOff; // sanity if ( firstOff == -1 && rd->m_numCommonQueries ) { char *xx=NULL;*xx=0;} // assume no linked list QueryNumLinkedNode *node = NULL; // only a linked list if firstOff is not -1 if ( firstOff >= 0 ) node = (QueryNumLinkedNode *)(qnbuf + firstOff); // scan the nodes (query/score pairs) we got for this related docid for ( ; node ; ) { // if this query is already in the linked list, stop! we // do not want to add dup QueryNumLinkedNode nodes. if ( node->m_queryNum == queryNum ) return true; // end of linked list? if ( node->m_nextOff == -1 ) break; // advance to next node in linked list node = (QueryNumLinkedNode *)(qnbuf+node->m_nextOff); } // store query num element in a linked list so // we can print the actualy queryNums a related // docid has in common with the main url int32_t nodeOff = m_commonQueryNumBuf.length(); // we can record our rank and your rank in this! QueryNumLinkedNode qn; qn.m_queryNum = queryNum; // qp->m_queryNum; qn.m_nextOff = -1; qn.m_relatedDocIdRank = rank; qn.m_relatedDocIdSerpScore = score; qn.m_mainUrlRank = -1; //qn.m_mainUrlSerpScore = -1.0; int32_t sq = sizeof(QueryNumLinkedNode); // point to it if ( ! m_commonQueryNumBuf.safeMemcpy(&qn,sq) ) return false; // point to node we stored in the buf so we can adjust it below QueryNumLinkedNode *stored ; stored = (QueryNumLinkedNode *)(m_commonQueryNumBuf.getBuf() - sq); // increment the count. 
the # of nodes in his linked list. rd->m_numCommonQueries++; // continue the linked list qnbuf = m_commonQueryNumBuf.getBufStart(); // the first node? if ( firstOff == -1 ) { rd->m_firstCommonQueryNumOff = nodeOff; //rd->m_lastCommonQueryNumOff = nodeOff; return true; } // get the current first int32_t oldFirstOff = rd->m_firstCommonQueryNumOff; //char *vv = qnbuf + rd->m_firstCommonQueryNumOff; //QueryNumLinkedNode *first = (QueryNumLinkedNode *)vv; // we are the new first rd->m_firstCommonQueryNumOff = nodeOff; // we point to old first as our next stored->m_nextOff = oldFirstOff; // and update that node's next link //last->m_nextOff = nodeOff; // and our new tail //rd->m_lastCommonQueryNumOff = nodeOff; return true; } // . safebuf returned is a buffer of QueryLinks // . use m_matchingQueryBuf/m_matchingStringBuf SafeBuf *XmlDoc::getMatchingQueryBuf ( ) { setStatus ( "getmatchingqueries" ); // try to set from cachedb record if ( ! checkCachedb() ) return (SafeBuf *)-1; if ( m_matchingQueryBufValid ) return &m_matchingQueryBuf; if ( ! m_beginTimeAllMatch ) m_beginTimeAllMatch = gettimeofdayInMilliseconds(); if ( m_docIdListBuf.length() == 0 ) m_docIdListBuf.pushLongLong(m_docId); // true = doMatchingQueries? SafeBuf *qkbuf = getQueryLinkBuf ( &m_docIdListBuf , true ); if ( ! qkbuf || qkbuf == (void *)-1 ) return qkbuf; m_matchingQueryBuf .stealBuf ( qkbuf ); m_matchingQueryStringBuf.stealBuf ( &m_queryLinkStringBuf ); // show time int64_t now = gettimeofdayInMilliseconds(); int64_t took = now - m_beginTimeAllMatch; log("seopipe: time: getMatchingQueries took %"INT64" ms",took); m_matchingQueryBufValid = true; // if getRelatedQueryBuf calles getQueryLinkBuf() it should // do a recompute, so set this to false m_queryLinkBufValid = false; m_docIdListBuf.purge(); // store it if ( ! storeMatchingQueriesIntoCachedb() ) return (SafeBuf *)-1; return &m_matchingQueryBuf; } // . returns safebuf of QueryLinks, representing the intersected matching // queries of all the related docids SafeBuf *XmlDoc::getRelatedQueryBuf () { // try to set from cachedb record if ( ! checkCachedb() ) return (SafeBuf *)-1; if ( m_relatedQueryBufValid ) return &m_relatedQueryBuf; // we need these SafeBuf *rdbuf = getRelatedDocIdsWithTitles(); if ( ! rdbuf || rdbuf == (void *)-1 ) return rdbuf; if ( ! m_beginRelatedQueries ) m_beginRelatedQueries = gettimeofdayInMilliseconds(); if ( m_docIdListBuf.length() == 0 ) { int32_t numRelatedDocIds = rdbuf->length()/sizeof(RelatedDocId); // just use the top 50 for related queries for speed! if ( numRelatedDocIds > 50 ) numRelatedDocIds = 50; RelatedDocId *rds = (RelatedDocId *)rdbuf->getBufStart(); for ( int32_t i = 0 ; i < numRelatedDocIds ; i++ ) { RelatedDocId *rd = &rds[i]; m_docIdListBuf.pushLongLong(rd->m_docId); } } // false = doMatchingQueries? SafeBuf *qkbuf = getQueryLinkBuf ( &m_docIdListBuf , false ); if ( ! qkbuf || qkbuf == (void *)-1 ) return qkbuf; m_relatedQueryBuf .stealBuf ( qkbuf ); m_relatedQueryStringBuf.stealBuf ( &m_queryLinkStringBuf ); m_relatedQueryBufValid = true; m_queryLinkBufValid = false; m_docIdListBuf.purge(); // show time int64_t now = gettimeofdayInMilliseconds(); int64_t took = now - m_beginRelatedQueries; log("seopipe: time: getRelatedQueries took %"INT64" ms",took); // store it if ( ! 
storeRelatedQueriesIntoCachedb() ) return (SafeBuf *)-1; return &m_relatedQueryBuf; } static void gotMsg8eReplyWrapper ( void *state , UdpSlot *slot ) { XmlDoc *THIS = (XmlDoc *)state; int32_t hostId = slot->m_hostId; THIS->m_msg8eReply [hostId] = slot->m_readBuf; THIS->m_msg8eReplySize[hostId] = slot->m_readBufSize; // do not let udpserver.cpp free it, we will later slot->m_readBuf = NULL; log("seo: got msg8e reply #%"INT32" of %"INT32" from host #%"INT32"", (int32_t)THIS->m_numMsg8eReplies, (int32_t)THIS->m_numMsg8eRequests, (int32_t)hostId); THIS->m_numMsg8eReplies++; // do not free send buf until last reply! if ( THIS->m_numMsg8eReplies < THIS->m_numMsg8eRequests ) { slot->m_sendBufAlloc = NULL; return; } // ok, sendBuf will auto free in UdpServer.cpp when we return from this THIS->m_masterLoop ( THIS->m_masterState ); } //static void gotMsg20ReplyWrapper ( void *state ) { // XmlDoc *THIS = (XmlDoc *)state; // THIS->m_numMsg20Replies++; // if ( THIS->m_numMsg20Replies < THIS->m_numMsg20Requests ) // return; // THIS->m_masterLoop ( THIS->m_masterState ); //} // . returned safebuf is array of QueryLinks // . gets all matching queries from all related docids and store them // compactly as QueryLinks, otherwise we'd run out of memory because // each docid has like 50,000 matching queries on avg. // . we now get matching queries in modulus parts to avoid OOM, because // with my new changes i made we are getting like a few hundred thousand // matching queries per related docid. // . we do not store the query string, etc, for the QueryLink, // just the query offset and the hostid that has the query in its // memory (g_qbuf). after we intersect the QueryLinks we will get the // query strings, etc. there will be a lot fewer in the intersection. SafeBuf *XmlDoc::getQueryLinkBuf(SafeBuf *docIdList, bool doMatchingQueries) { if ( m_queryLinkBufValid ) return &m_queryLinkBuf; bool doRelatedQueries = true; if ( doMatchingQueries ) doRelatedQueries = false; // get the 32-bit terms the main doc matches, so we may determine // what terms in a related query are novel to this document. SafeBuf *mainUrlTwidBuf32 = NULL; if ( doRelatedQueries ) { mainUrlTwidBuf32 = getTermId32Buf() ;//InfoBuf(); if ( ! mainUrlTwidBuf32 || mainUrlTwidBuf32 == (void *)-1 ) return mainUrlTwidBuf32; } CollectionRec *cr = getCollRec(); if ( ! cr ) return NULL; // // SHIT! we can't use the keys in the termlistbuf for dual purpose // role as terms the doc contains, because they do not have the // synonym forms!!! So we have to get this terminfobuf as wells // as the termlistbuf for each docid!!!! // // so we might as well not sort by the lower 32 bit hack as well // // // // 1. get termlistbuf for each docid possibly using msg20s // // we need this for getting the QueryLink::m_serpScores in // handleRequest8e // // //int32_t numDocIds = docIdList->length() / 8; //int64_t *docIds = (int64_t *)docIdList->getBufStart(); //SafeBuf *tlistBuf = NULL; //SafeBuf *twidBuf32 = NULL; // . we just want the termlistbuf of each related docid // . hack: it should be sorted by the LOWER 32 bits of termid // so handlerequest8e does not need to sort its termid32/twid32 buf //if ( doMatchingQueries ) { // tlistBuf = getTermListBuf(); // if ( ! tlistBuf || tlistBuf == (void *)-1 ) return tlistBuf; // twidBuf32 = getTermId32Buf(); // if ( ! twidBuf32 || twidBuf32 == (void *)-1 ) return twidBuf32; //} /* if ( doRelatedQueries && ! 
m_launchedAll ) { int32_t need = sizeof(Msg20) * numDocIds; // we also use this same buf in getRelatedDocIdsWithTitles if ( ! m_msg20Buf.reserve ( need,"m20buf3" ) ) return NULL; // mark it all in use m_msg20Buf.setLength(need); // init them Msg20 *mp = (Msg20 *)m_msg20Buf.getBufStart(); int32_t numMsg20s = m_msg20Buf.length()/sizeof(Msg20); for ( int32_t i = 0 ; i < numMsg20s ; i++ ) mp[i].constructor(); // reset cursor to start with first related docid m_numMsg20Replies = 0; m_numMsg20Requests = 0; // launch all! for ( int32_t i = 0 ; i < numMsg20s ; i++ ) { // int16_tcut Msg20 *msg20 = &mp[i]; // get current related docid //RelatedDocId *rd = &rds[i]; // make the request Msg20Request req; req.ptr_coll = cr->m_coll; req.size_coll = gbstrlen(cr->m_coll)+1; req.m_docId = docIds[i]; req.m_expected = true; req.m_niceness = m_niceness; //req.m_state = m_masterState; //req.m_callback2 = m_masterLoop; req.m_state = this; req.m_callback2 = gotMsg20ReplyWrapper; // do not get summary stuff. too slow. req.m_numSummaryLines = 0; // get this req.m_getTermListBuf = true; // count these! m_numMsg20Requests++; // store cursor in msg20 itself so we know the rd //msg20->m_hack2 = i; // launch it if ( ! msg20->getSummary ( &req ) ) continue; // error? if ( ! g_errno ) { char *xx=NULL;*xx=0; } // note it log("seo: error getting termlistbuf docid=%"INT64"", docIds[i]); // reset it //msg20->reset(); // count reply as back now m_numMsg20Replies++; } m_launchedAll = true; } // wait for one reply per related docid if ( doRelatedQueries && m_numMsg20Replies < m_numMsg20Requests ) return (SafeBuf *)-1; */ // // // 2. send one msg8e request to each host with those termlistbufs // // it has one termlistbuf per relateddocid, enough info // for handlerequest8e to return the list of matching QueryLinks // intersected for all related docids. // if ( m_numMsg8eRequests == 0 ) { SafeBuf request; // how big is the request? int32_t need = 0; need += 1; // for the byte flag int32_t collLen = gbstrlen(cr->m_coll); need += collLen + 1; // list of docids (just one for matching queries) need += 4; need += docIdList->length(); // twidtable alloc if ( doRelatedQueries ) { need += 4; need += mainUrlTwidBuf32->length(); } //if ( doMatchingQueries ) { // // just our main url's termlistbuf // need += 4; // need += tlistBuf->length(); // need += 4; // need += twidBuf32->length(); //} // // make the 8e request // if ( ! 
request.reserve ( need ,"rep8ebuf" ) ) return NULL; // first store flag to indicate if getting matching or // related queries if ( doMatchingQueries ) request.pushChar(1); else request.pushChar(0); // then coll\0 request.safeMemcpy ( cr->m_coll, collLen ); request.pushChar ( 0 ); // then docids after the collection name request.pushLong ( docIdList->length() ); request.safeMemcpy ( docIdList ); // then if doing related queries we need to store our // 32-bit twids of the main url for setting m_uniqueRound if ( doRelatedQueries ) { request.pushLong(mainUrlTwidBuf32->length()); request.safeMemcpy(mainUrlTwidBuf32->getBufStart(), mainUrlTwidBuf32->length() ); } /* // then store each termlistbuf from each msg20 for ( int32_t i = 0 ; doRelatedQueries && i < numDocIds ; i++ ) { // int16_tcut Msg20 *mp = &mps[i]; Msg20Reply *rep = mp->getReply(); if ( rep ) { request.pushLong ( rep->size_tlistBuf ); request.safeMemcpy ( rep->ptr_tlistBuf , rep->size_tlistBuf ); // then the 32-bit termid buf with synonyms // that the above posdblist termlists don't // have so we can match queries request.pushLong ( rep->size_tiBuf ); request.safeMemcpy ( rep->ptr_tiBuf, rep->size_tiBuf ); } // make them empty i guess else { request.pushLong ( 0 ); request.pushLong ( 0 ); } } */ /* // just our main url's termlistbuf if ( doMatchingQueries ) { request.pushLong (tlistBuf->length()); request.safeMemcpy (tlistBuf); // then the 32-bit termid buf with synonyms that // the above posdblist termlists don't have so // we can match queries request.pushLong (twidBuf32->length()); request.safeMemcpy (twidBuf32); } */ // sanity if ( request.length() != need ) { char *xx=NULL;*xx=0; } // do not free it here, let udpserver free it char *req = request.getBufStart(); int32_t reqSize = request.length(); request.detachBuf(); // we've formulated the 8e request, no need for msg20s anymore //for ( int32_t i = 0 ; doRelatedQueries && i < numDocIds ; i++ ){ // // int16_tcut // Msg20 *mp = &mps[i]; // mp->destructor(); //} // free the mem as well //m_msg20Buf.purge(); // must be host #0 for this next algo to work if ( g_hostdb.m_hostId != 0 ) { char *xx=NULL;*xx=0; } // // send msg8e request to each host. skip if dead. // for ( int32_t k = 1; k <= g_hostdb.m_numHosts ; k++ ) { // breathe QUICKPOLL(m_niceness); // send to ourselves last so we can do all in parallel int32_t hosti = k; if ( k == g_hostdb.m_numHosts ) hosti = 0; // get ptr to the host Host *host = g_hostdb.getHost(hosti); // get hostid of host #i int32_t hostId = host->m_hostId; if ( hostId != hosti ) { char *xx=NULL;*xx=0; } // count it m_numMsg8eRequests++; // skip if dead. i guess no queries from that guy. we // can't send to a twin because the twin does not have // the same queries in its in-memory query log. if ( g_hostdb.isDead(hostId) && host->m_wasEverAlive) { log("seo: skipping msg8e to dead host %"INT32"", hostId); m_msg8eReply [hostId] = NULL; m_msg8eReplySize[hostId] = 0; m_numMsg8eReplies++; continue; } // . send request to him // . reply is the query strings // . when reply comes in we store it in the query // string buf and make the QueryLinks reference it // with their QueryLink::m_queryStringOffset if ( ! 
g_udpServer.sendRequest ( req , reqSize , 0x8e , // msgtype host->m_ip , // ip host->m_port , // port hostId, NULL, // retslot this, gotMsg8eReplyWrapper, 999999, // timeout -1 , // backoff -1 , // maxwait NULL, // replybuf 0, // replybufmaxsize m_niceness // niceness )) { // let admin know about error log("seopipe: sendRequest 8e had error: %s", mstrerror(g_errno)); // count it as replied then m_numMsg8eReplies++; continue; } } } // this should never happen now with our new wrapper if ( m_numMsg8eReplies < m_numMsg8eRequests ) return (SafeBuf *)-1; // // // 3. MERGE the msg8e replies from all hosts // // // gotMsg8eReplyWrapper() should have recorded each one into // m_msg8eReply[i], the msg8e reply ptr. set up for merging. char *bestPtr[MAX_HOSTS]; char *bufEnd [MAX_HOSTS]; for ( int32_t i = 0; i < g_hostdb.m_numHosts ; i++ ) { char *reply = m_msg8eReply [i]; // this happens if host is dead... if ( ! reply ) { bestPtr[i] = NULL; bufEnd [i] = NULL; continue; } //int32_t replySize = m_msg8eReplySize [i]; // it should be a list of QueryLinks char *p = reply; int32_t queryLinkBufSize = *(int32_t *)p; p += 4; bestPtr[i] = p; // bufEnd[i] also marks the start of the querystringbuf bufEnd [i] = p + queryLinkBufSize; } int32_t count = 0; int32_t maxQueryLinks = MAX_RELATED_QUERIES; if ( doMatchingQueries ) maxQueryLinks = MAX_MATCHING_QUERIES; // now merge the top "max" highest scoring // QueryLinks and their corresponding QueryLogEntries into // m_queryLinkBuf/m_queryLinkStringBuf storeMore: // get the max scoring QueryLink from the 8e replies int32_t maxi = -1; float maxScore = -1.0; for ( int32_t i = 0 ; i < g_hostdb.m_numHosts ; i++ ) { // skip if exhausted if ( bestPtr[i] >= bufEnd[i] ) continue; // cast it QueryLink *qk = (QueryLink *)bestPtr[i]; // sanity, if not list head core if ( ! qk->m_isFirst ) { char *xx=NULL;*xx=0; } // skip if score is not the current maximum if ( qk->m_totalQueryImportance < maxScore ) continue; // we got a new max! maxScore = qk->m_totalQueryImportance; maxi = i; } // store max into m_queryLinkBuf and m_queryLinkStringBuf if ( maxi >= 0 ) { // shortcut QueryLink *best = (QueryLink *)bestPtr[maxi]; // get # to copy int32_t toCopy = sizeof(QueryLink); if ( doRelatedQueries ) // how many querylinks in this list? i.e. those // that all share the same query, but different // relateddocid? toCopy = best->m_numInList * sizeof(QueryLink); // copy the querylink if ( ! m_queryLinkBuf.reserve ( toCopy ) ) return NULL; // point to it QueryLink *qk = (QueryLink *)m_queryLinkBuf.getBuf(); // THEN store it m_queryLinkBuf.safeMemcpy( best , toCopy ); // point to its querylogentry buf, it occurs right // after the list of QueryLinks! char *p = bufEnd[maxi]; // and the query it is for p += qk->m_queryStringOffset; // cast that QueryLogEntry *qe = (QueryLogEntry *)p; // ensure enough space if ( ! m_queryLinkStringBuf.reserve(qe->getSize(),"rqbb" ) ) return NULL; // we are moving it into the final buf qk->m_queryStringOffset = m_queryLinkStringBuf.length(); // store query log entry here now m_queryLinkStringBuf.safeMemcpy ( qe, qe->getSize() ); // advance bestPtr[maxi] += toCopy; } // limit if ( ++count < maxQueryLinks ) goto storeMore; // liberate those msg8e reply buffers for ( int32_t i = 0; i < g_hostdb.m_numHosts;i++) { if ( ! 
m_msg8eReply[i] ) continue; mfree ( m_msg8eReply[i] , m_msg8eReplySize[i] , "8erep" ); m_msg8eReply[i] = NULL; } // reset our parms if we are re-called for related queries m_numMsg8eReplies = 0; m_numMsg8eRequests = 0; m_queryLinkBufValid = true; // show time int64_t now = gettimeofdayInMilliseconds(); int64_t took = now - m_beginRelatedQueries; log("seopipe: getrelatedquerybuftook %"INT64" ms",took); m_beginRelatedQueries = 0LL; // validate m_queryLinkBufValid = true; /* // log for debug qks = (QueryLink *)m_queryLinkBuf->getBufStart(); nks = m_queryLinkBuf->length() / sizeof(QueryLink); for ( int32_t k = 0 ; k < nks ; k++ ) { // now we use offsets into m_relatedQueryBuf.m_buf[] QueryRel *qk = &qks[k]; // skip if not a head if ( ! qk->m_isFirst ) continue; char *qstr = qk->getQueryString(&m_queryLinkStringBuf); log("seopipe: relquery=\"%s\" imp=%f votes=%"INT32"", qstr, qk->m_rq_totalScore, qk->m_docIdVotes); } */ return &m_queryLinkBuf; } // scan matches like XmlDoc::getSummary() does and get all sentences // containing a query term... //void XmlDoc::getGigabitExcerpts ( ) { //} // this is still used by Title.cpp to get the title: field quickly char *getJSONFieldValue ( char *json , char *field , int32_t *valueLen ) { if ( ! json ) return NULL; // get length int32_t fieldLen = gbstrlen(field); // keep track of in a quote or not bool inQuotes = false; char *stringStart = NULL; char *p = json; bool gotOne = false; int32_t depth = 0; // scan for ( ; *p ; p++ ) { // escaping a quote? ignore quote then. if ( *p == '\\' && p[1] == '\"' ) { // skip two bytes then.. p++; continue; } // count {} depth if ( ! inQuotes ) { if ( *p == '{' ) depth++; if ( *p == '}' ) depth--; } // a quote? if ( *p == '\"' ) { inQuotes = ! inQuotes; // set start of the string if quote is beginning if ( inQuotes ) stringStart = p + 1; // if quote is ending and a colon follows then // it was a json field name. so if it matches the // field we want return the following field for it. else if ( ! inQuotes && ! gotOne && p[1] == ':' && // {"title":"whatever",...} // could be product:{title:... depth=2 (depth == 1 ||depth==2) && stringStart && (p - stringStart) == fieldLen && strncmp(field,stringStart,fieldLen)==0 ) { // now, the next time we set stringStart // it will be set to the VALUE of this field // assuming the field is a STRING!!!! gotOne = true; // return after the quote //return p + 2; } // ok, we got the string after the field string... else if ( ! inQuotes && gotOne ) { if ( valueLen ) *valueLen = p - stringStart; return stringStart; } // keep chugging continue; } } // done, not found return NULL; } Json *XmlDoc::getParsedJson ( ) { if ( m_jpValid ) return &m_jp; // core if not a json object if ( m_contentTypeValid && m_contentType != CT_JSON ) { char *xx=NULL;*xx=0; } // \0 terminated char **pp = getUtf8Content(); if ( ! pp || pp == (void *)-1 ) return (Json *)pp; // point to the json char *p = *pp; // empty? all done then. //if ( ! p ) return (char *)pp; // . returns NULL and sets g_errno on error // . if p is NULL i guess this should still be ok and be empty if ( ! m_jp.parseJsonStringIntoJsonItems ( p , m_niceness ) ) { g_errno = EBADJSONPARSER; return NULL; } m_jpValid = true; return &m_jp; } // . returns -1 if blocked, returns NULL and sets g_errno on error // . hash each json VALUE (not FIELD) ... AND ... hash each json // VALUE with its FIELD like "title:cool" or "description:whatever" // . 
example: // [{"id":"b7df5d33-3fe5-4a6c-8ad4-dad495b586cd","finish":1378322570280,"matched":64,"status":"Stopped","start":1378322184332,"token":"poo","parameterMap":{"token":"poo","seed":"www.alleyinsider.com","api":"article"},"crawled":64},{"id":"830e0584-7f69-4bdd- #include "Json.h" char *XmlDoc::hashJSONFields ( HashTableX *table ) { setStatus ( "hashing json fields" ); HashInfo hi; hi.m_tt = table; hi.m_desc = "json object"; // use new json parser Json *jp = getParsedJson(); if ( ! jp || jp == (void *)-1 ) return (char *)jp; JsonItem *ji = jp->getFirstItem(); char nb[1024]; SafeBuf nameBuf(nb,1024); //int32_t totalHash32 = 0; for ( ; ji ; ji = ji->m_next ) { QUICKPOLL(m_niceness); // skip if not number or string if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING ) continue; // reset, but don't free mem etc. just set m_length to 0 nameBuf.reset(); // get its full compound name like "meta.twitter.title" JsonItem *p = ji; char *lastName = NULL; char *nameArray[20]; int32_t numNames = 0; for ( ; p ; p = p->m_parent ) { // empty name? if ( ! p->m_name ) continue; if ( ! p->m_name[0] ) continue; // dup? can happen with arrays. parent of string // in object, has same name as his parent, the // name of the array. "dupname":[{"a":"b"},{"c":"d"}] if ( p->m_name == lastName ) continue; // update lastName = p->m_name; // add it up nameArray[numNames++] = p->m_name; // breach? if ( numNames < 15 ) continue; log("build: too many names in json tag"); break; } // if we are the diffbot reply "html" field do not hash this // because it is redundant and it hashes html tags etc.! // plus it slows us down a lot and bloats the index. if ( ji->m_name && numNames==1 && strcmp(ji->m_name,"html")==0) continue; // assemble the names in reverse order which is correct order for ( int32_t i = 1 ; i <= numNames ; i++ ) { // copy into our safebuf if ( ! nameBuf.safeStrcpy ( nameArray[numNames-i]) ) return NULL; // separate names with periods if ( ! nameBuf.pushChar('.') ) return NULL; } // remove last period nameBuf.removeLastChar('.'); // and null terminate if ( ! nameBuf.nullTerm() ) return NULL; // change all :'s in names to .'s since : is reserved! char *px = nameBuf.getBufStart(); for ( ; *px ; px++ ) if ( *px == ':' ) *px = '.'; // // DIFFBOT special field hacks // char *name = nameBuf.getBufStart(); hi.m_hashGroup = HASHGROUP_BODY; if ( strstr(name,"title") ) hi.m_hashGroup = HASHGROUP_TITLE; if ( strstr(name,"url") ) hi.m_hashGroup = HASHGROUP_INURL; if ( strstr(name,"resolved_url") ) hi.m_hashGroup = HASHGROUP_INURL; if ( strstr(name,"tags") ) hi.m_hashGroup = HASHGROUP_INTAG; if ( strstr(name,"meta") ) hi.m_hashGroup = HASHGROUP_INMETATAG; // // now Json.cpp decodes and stores the value into // a buffer, so ji->getValue() should be decoded completely // // . get the value of the json field // . if it's a number or bool it converts into a string int32_t vlen; char *val = ji->getValueAsString( &vlen ); char tbuf[32]; // if the value is clearly a date, just hash it as // a number, so use a temporary value that holds the // time_t and hash with that... this will hash // diffbot's article date field as a number so we can // sortby and constrain by it in the search results if ( name && strcasecmp(name,"date") == 0 ) { // this is in HttpMime.cpp int64_t tt = atotime1 ( val ); // we can't store 64-bit dates... so truncate to -2147483648 // which is Dec 13 1901. so we don't quite get the 1898 date // for the new york times dbpedia entry. 
maybe if we added // an extra termlist for more precision to indicate century or // something. if ( tt && tt < (int32_t)0x80000000 ) tt = (int32_t)0x80000000; // likewise, we can't be too big, passed 2038 if ( tt && tt > 0x7fffffff ) tt = (int32_t)0x7fffffff; if ( tt ) { // print out the time_t in ascii vlen = sprintf(tbuf,"%"INT32"",(int32_t)tt); // and point to it for hashing/indexing val = tbuf; } } // // for deduping search results we set m_contentHash32 here for // diffbot json objects. // we can't do this here anymore, we have to set the // contenthash in ::getContentHash32() because we need it to // set EDOCUNCHANGED in ::getIndexCode() above. // /* if ( hi.m_hashGroup != HASHGROUP_INURL ) { // make the content hash so we can set m_contentHash32 // for deduping int32_t nh32 = hash32n ( name ); // do an exact hash for now... int32_t vh32 = hash32 ( val , vlen , m_niceness ); // accumulate, order independently totalHash32 ^= nh32; totalHash32 ^= vh32; } */ // index like "title:whatever" hi.m_prefix = name; hashString ( val , vlen , &hi ); // hash gbfieldmatch:some.fieldInJson:"case-sens field Value" if ( name ) hashFieldMatchTerm ( val , (int32_t)vlen , &hi ); // hash without the field name as well hi.m_prefix = NULL; hashString ( val , vlen , &hi ); /* // a number? hash special then as well if ( ji->m_type != JT_NUMBER ) continue; // use prefix for this though hi.m_prefix = name; // hash as a number so we can sort search results by // this number and do range constraints float f = ji->m_valueDouble; if ( ! hashNumber2 ( f , &hi ) ) return NULL; */ } //m_contentHash32 = totalHash32; //m_contentHash32Valid = true; return (char *)0x01; } char *XmlDoc::hashXMLFields ( HashTableX *table ) { setStatus ( "hashing xml fields" ); HashInfo hi; hi.m_tt = table; hi.m_desc = "xml object"; hi.m_hashGroup = HASHGROUP_BODY; Xml *xml = getXml(); int32_t n = xml->getNumNodes(); XmlNode *nodes = xml->getNodes (); SafeBuf nameBuf; // scan the xml nodes for ( int32_t i = 0 ; i < n ; i++ ) { // breathe QUICKPOLL(m_niceness); // . skip if it's a tag not text node skip it // . we just want the "text" nodes if ( nodes[i].isTag() ) continue; // assemble the full parent name // like "tag1.tag2.tag3" nameBuf.reset(); xml->getCompoundName ( i , &nameBuf ); // this is \0 terminated char *tagName = nameBuf.getBufStart(); // get the utf8 text char *val = nodes[i].m_node; int32_t vlen = nodes[i].m_nodeLen; // index like "title:whatever" if ( tagName && tagName[0] ) { hi.m_prefix = tagName; hashString ( val , vlen , &hi ); } // hash without the field name as well hi.m_prefix = NULL; hashString ( val , vlen , &hi ); } return (char *)0x01; } // if our url is that of a subdoc, then get the url of the parent doc // from which we were a subsection char *XmlDoc::getDiffbotParentUrl( char *myUrl ) { // remove -diffbotxyz if ( ! m_kbuf.safeStrcpy( myUrl ) ) return NULL; char *p = m_kbuf.getBufStart(); char *s = strstr(p,"-diffbotxyz"); if ( s ) { *s = '\0'; return p; } // temporarily until we inject "diffbotreply" uncomment this /* // otherwise i guess we got dan's format of -article|%"INT32"|%"INT32" char *e = m_kbuf.getBuf() - 1; for ( ; *e && is_digit(*e) ; e-- ); if ( *e != '|' ) return NULL; e--; for ( ; *e && is_digit(*e) ; e-- ); if ( *e != '|' ) return NULL; e--; // now to hyphen char *estart = m_kbuf.getBufStart(); for ( ; e>estart && *e !='-' ; e-- ); if ( *e != '-' ) return NULL; *e = '\0'; return p; */ return NULL; } bool XmlDoc::storeFacetValues ( char *qs , SafeBuf *sb , FacetValHash_t fvh ) { // sanity if ( ! 
m_contentTypeValid ) { char *xx=NULL;*xx=0; } storeFacetValuesSite ( qs, sb, fvh ); // if "qa" is a gbxpathsitehash123456 type of beastie then we // gotta scan the sections if ( strncasecmp(qs,"gbxpathsitehash",15) == 0 ) return storeFacetValuesSections ( qs , sb , fvh ); // if a json doc, get json field if ( m_contentType == CT_JSON ) return storeFacetValuesJSON ( qs , sb , fvh ); if ( m_contentType == CT_HTML ) return storeFacetValuesHtml ( qs , sb , fvh ); if ( m_contentType == CT_XML ) return storeFacetValuesXml ( qs , sb , fvh ); return true; } // Store facet for site bool XmlDoc::storeFacetValuesSite ( char *qs , SafeBuf *sb , FacetValHash_t fvh ) { char* val = getSite(); int vlen = gbstrlen(val); FacetValHash_t val32 = hash32 ( val , vlen ); // skip if not for us if ( fvh && val32 != fvh ) return false; if ( strcmp("gbtagsite",qs) ) return false; // otherwise add facet FIELD to our buf if ( ! sb->safeStrcpy(qs) ) return false; if ( ! sb->pushChar('\0') ) return false; // then add facet VALUE if ( !sb->safePrintf("%"UINT32",",(uint32_t)val32)) return false; if ( val && vlen && ! sb->safeMemcpy(val,vlen) ) return false; if ( ! sb->pushChar('\0') ) return false; return true; } bool XmlDoc::storeFacetValuesSections ( char *qs , SafeBuf *sb , FacetValHash_t fvh ) { // scan all sections Sections *ss = getSections(); if ( ! ss ) return false; if ( ss == (void *)-1 ) { char *xx=NULL;*xx=0; } Words *ww = getWords(); if ( ! ww ) return false; if ( ww == (void *)-1 ) { char *xx=NULL;*xx=0; } int32_t siteHash32 = *getSiteHash32(); // qs is like gbxpathsitehash1234567 // so get the digit part char *p = qs; for ( ; *p && ! is_digit(*p); p++ ); uint64_t xsh = (uint64_t)atoll(p); bool isString = false; if ( strncmp(qs-4,"str:",4) == 0 ) isString = true; Section *si = ss->m_rootSection; //sec_t mflags = SEC_SENTENCE | SEC_MENU; for ( ; si ; si = si->m_next ) { // breathe QUICKPOLL(m_niceness); // is it a match? uint64_t mod; mod = (uint32_t)si->m_turkTagHash32; mod ^= (uint32_t)siteHash32; if ( mod != xsh ) continue; // . then add facet VALUE // . hash of the innerhtml of sentence // . get hash of sentences this tag contains indirectly uint32_t val32 = (uint32_t)si->m_indirectSentHash64; if ( ! val32 ) continue; // if a facetvalhash was provided we must match if ( fvh && val32 != fvh ) continue; // got one print the facet field if ( ! sb->safeStrcpy(qs) ) return false; if ( ! sb->pushChar('\0') ) return false; if ( isString && ! sb->safePrintf("%"UINT32",",val32) ) return false; // put ALSO print the string somewhat char *a = m_words.m_words[si->m_next->m_a]; char *b = m_words.m_words[si->m_next->m_b-1]; b += m_words.m_wordLens [si->m_next->m_b-1]; if ( ! sb->safeTruncateEllipsis (a,b-a,160) ) return false; if ( ! sb->pushChar('\0') ) return false; // if wanted a specific string, we are done if ( fvh ) return true; } return true; } bool XmlDoc::storeFacetValuesHtml(char *qs, SafeBuf *sb, FacetValHash_t fvh ) { Xml *xml = getXml(); int32_t qsLen = gbstrlen(qs); bool isString = false; if ( strncmp(qs-4,"str:",4) == 0 ) isString = true; // check for gblang:en etc. // if ( isString && strncmp(qs,"gblang",6)==0 ) { // if (!sb->safeStrcpy(qs) ) return false; // if (!sb->pushChar('\0') ) return false; // // find the lang that has that hash! 
// if (!sb->safePrintf("%"UINT32",",(uint32_t)val32))return false; // if (!sb->safeMemcpy(content,contentLen) ) return false; // if (!sb->pushChar('\0') ) return false; //} char *content; int32_t contentLen; int32_t nameLen; char *s; int32_t i = 0; bool uniqueField = false; // a title tag can count now too if ( strcmp(qs,"title") == 0 ) { // skip leading spaces = false content = xml->getString ("title",&contentLen,false); uniqueField = true; goto skip; } // find the first meta summary node for ( i = 0 ; i < xml->m_numNodes ; i++ ) { // continue if not a meta tag if ( xml->m_nodes[i].m_nodeId != TAG_META ) continue; // . does it have a type field that's "summary" // . // . s = xml->getString ( i , "name", &nameLen ); // "s" can be "summary","description","keywords",... if ( nameLen != qsLen ) continue; if ( strncasecmp ( s , qs , qsLen ) != 0 ) continue; // point to the summary itself content = xml->getString ( i , "content" , &contentLen ); if ( ! content || contentLen <= 0 ) continue; skip: // hash it to match it if caller specified a particular hash // because they are coming from Msg40::lookUpFacets() function // to convert the hashes to strings, like for rendering in // the facets box to the left of the search results FacetValHash_t val32 = hash32 ( content, contentLen); if ( fvh && fvh != val32 ) continue; // otherwise add facet FIELD to our buf if ( ! sb->safeStrcpy(qs) ) return false; if ( ! sb->pushChar('\0') ) return false; // then add facet VALUE if ( isString && !sb->safePrintf("%"UINT32",",(uint32_t)val32)) return false; if ( !sb->safeMemcpy(content,contentLen) ) return false; if ( !sb->pushChar('\0') ) return false; // if only one specified, we are done if ( fvh ) return true; if ( uniqueField ) return true; } return true; } bool XmlDoc::storeFacetValuesXml(char *qs, SafeBuf *sb, FacetValHash_t fvh ) { Xml *xml = getXml(); int32_t qsLen = gbstrlen(qs); bool isString = false; if ( strncmp(qs-4,"str:",4) == 0 ) isString = true; int32_t i = 0; bool uniqueField = false; SafeBuf nameBuf; // find the first meta summary node for ( i = 0 ; i < xml->m_numNodes ; i++ ) { // skip text nodes if ( xml->m_nodes[i].m_nodeId == 0 ) continue; // assemble the full parent name // like "tag1.tag2.tag3" nameBuf.reset(); xml->getCompoundName ( i , &nameBuf ); int32_t nameLen = nameBuf.length(); char *s = nameBuf.getBufStart(); // . does it have a type field that's "summary" // . // . //s = xml->getString ( i , "name", &nameLen ); // "s" can be "summary","description","keywords",... if ( nameLen != qsLen ) continue; if ( strncasecmp ( s , qs , qsLen ) != 0 ) continue; // got it... // wtf? if ( i + 1 >= xml->m_numNodes ) continue; // point to the content! this is a text node? // skip if not a text node, we don't return tag nodes i guess if ( xml->m_nodes[i+1].m_nodeId ) continue; char *content = xml->m_nodes[i+1].m_node; int32_t contentLen = xml->m_nodes[i+1].m_nodeLen; // skip if empty if ( ! content || contentLen <= 0 ) continue; // skip commen cases too! like white space if ( contentLen == 1 && is_wspace_a(content[0]) ) continue; // hash it to match it if caller specified a particular hash // because they are coming from Msg40::lookUpFacets() function // to convert the hashes to strings, like for rendering in // the facets box to the left of the search results FacetValHash_t val32 = hash32 ( content, contentLen); if ( fvh && fvh != val32 ) continue; // otherwise add facet FIELD to our buf if ( ! sb->safeStrcpy(qs) ) return false; if ( ! 
sb->pushChar('\0') ) return false; // then add facet VALUE if ( isString && !sb->safePrintf("%"UINT32",",(uint32_t)val32)) return false; if ( !sb->safeMemcpy(content,contentLen) ) return false; if ( !sb->pushChar('\0') ) return false; // if only one specified, we are done if ( fvh ) return true; if ( uniqueField ) return true; } return true; } bool XmlDoc::storeFacetValuesJSON (char *qs, SafeBuf *sb,FacetValHash_t fvh ) { // use new json parser Json *jp = getParsedJson(); // jp can be NULL (parse error) or -1 (blocked), so guard like // hashJSONFields() does before dereferencing it if ( ! jp || jp == (void *)-1 ) return false; JsonItem *ji = jp->getFirstItem(); char nb[1024]; SafeBuf nameBuf(nb,1024); bool isString = false; if ( strncmp(qs-4,"str:",4) == 0 ) isString = true; for ( ; ji ; ji = ji->m_next ) { QUICKPOLL(m_niceness); // skip if not number or string if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING ) continue; // reset, but don't free mem etc. just set m_length to 0 nameBuf.reset(); // get its full compound name like "meta.twitter.title" ji->getCompoundName ( nameBuf ); // skip if not for us if ( strcmp(nameBuf.getBufStart(),qs) ) continue; // // now Json.cpp decodes and stores the value into // a buffer, so ji->getValue() should be decoded completely // int32_t vlen; char *val = ji->getValueAsString( &vlen ); // hash it to match it if caller specified a particular hash // because they are coming from Msg40::lookUpFacets() function // to convert the hashes to strings, like for rendering in // the facets box to the left of the search results FacetValHash_t val32 = hash32 ( val , vlen ); if ( fvh && val32 != fvh ) continue; // otherwise add facet FIELD to our buf if ( ! sb->safeStrcpy(qs) ) return false; if ( ! sb->pushChar('\0') ) return false; // then add facet VALUE if ( isString && !sb->safePrintf("%"UINT32",",(uint32_t)val32)) return false; if ( val && vlen && ! sb->safeMemcpy(val,vlen) ) return false; if ( ! sb->pushChar('\0') ) return false; // if wanted a specific string, then we are done if ( fvh ) return true; } return true; }
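// . a minimal sketch, not part of the original code and never called, of
//   how a caller might walk the flat buffer that the storeFacetValues*()
//   functions above build: the buffer is a sequence of NUL-terminated
//   (facet field, facet value) string pairs appended back to back, where
//   string facets (and the gbtagsite facet) carry a leading "<hash32>,"
//   prefix on the value
// . "printFacetPairsSketch" is a hypothetical name used only to illustrate
//   the buffer layout; SafeBuf, gbstrlen() and log() are the file's own
//   helpers
static void printFacetPairsSketch ( SafeBuf *sb ) {
	char *p    = sb->getBufStart();
	char *pend = sb->getBuf();
	while ( p < pend ) {
		// facet field, e.g. "gbtagsite" or "title"
		char *field = p;
		p += gbstrlen(field) + 1;
		if ( p >= pend ) break;
		// facet value, e.g. "1234567890,Some Title" for string facets
		char *value = p;
		p += gbstrlen(value) + 1;
		log("facet: %s = %s", field, value);
	}
}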