open-source-search-engine/LanguageIdentifier.h
2014-11-10 14:45:11 -08:00

186 lines
5.0 KiB
C++

/// \file LanguageIdentifier.h \brief Language detection utility routines.
///
/// Contains the main utility function, guessLanguage(), and all
/// the support routines for detecting the language of a web page.
///
/// 2007 May 24 09:02:52
/// $ID$
/// $Author: John Nanney$
/// $Workfile$
/// $Log$
///
// using a different macro because there's already a Language.h
#ifndef LANGUAGEIDENTIFIER_H
#define LANGUAGEIDENTIFIER_H
#include "gb-include.h"
#include "Xml.h"
#include "Linkdb.h"
//#include "LinkInfo.h"
#include "Query.h"
/// Groups the language (and country) guessing routines behind one object.
///
/// Each guess* method inspects a single signal -- markup, TLD, links,
/// IP address, user agent, query terms, DMOZ data -- and reports a
/// language as a uint8_t, using langUnknown when the signal is
/// inconclusive.  Only maxOf() and the destructor are defined here;
/// every other member is defined outside this header.
class LanguageIdentifier {
public:
    /// Constructor; defined outside this header.
    LanguageIdentifier();

    /// Destructor; there is nothing to release.
    ~LanguageIdentifier() {}

    /// Read the language from a lang="x" attribute in the markup.
    ///
    /// The HTML, BODY and HEAD tags are examined and the first match
    /// wins.  An explicit tag is usually a very accurate signal,
    /// because the page author put it there deliberately.
    ///
    /// @param xml the page's xml object
    ///
    /// @return the language, or langUnknown
    ///
    uint8_t guessLanguageFromTag(Xml *xml);

    /// Guess the language from the TLDs of the page's outlinks.
    ///
    /// Ambiguous TLDs such as .com are skipped.
    ///
    /// @param links a list of links
    ///
    /// @return the language, or langUnknown
    ///
    uint8_t guessLanguageFromOutlinks(Links *links);

    /// Guess the language from the page's own TLD.
    ///
    /// @param linktext the ascii URL
    ///
    /// @return the language, or langUnknown
    ///
    uint8_t guessLanguageFromTld(char *linktext);

    /// Guess the language from the languages of the inlinks.
    ///
    /// @param linkInfo the inlink information for the page
    /// @param ip       an IP address; NOTE(review): presumably the
    ///                 page's own IP -- its role was never documented,
    ///                 confirm against the definition
    ///
    /// @return the language, or langUnknown
    ///
    uint8_t guessLanguageFromInlinks(LinkInfo *linkInfo, int32_t ip);

    /// Classify a TLD for the purposes of language detection.
    ///
    /// NOTE(review): the original comment read "true if suitable,
    /// false if not", yet the name suggests that true means the TLD
    /// is ambiguous (i.e. NOT suitable).  Confirm the polarity against
    /// the definition before relying on either reading.
    ///
    /// @param tld the TLD in ascii
    /// @param len the length of tld
    ///
    /// @return a bool classifying the TLD (see the note above)
    ///
    inline bool isAmbiguousTLD(char *tld, int len);

    /// Return the greater of two ints; yields a when they are equal.
    inline int maxOf(int a, int b) {
        return (b > a) ? b : a;
    }

    /// Guess the language from the DOCTYPE string present in many pages.
    ///
    /// @param xml     the page's xml object
    /// @param content the page's content, for finding the doctype
    ///
    /// @return the language, or langUnknown
    ///
    uint8_t guessLanguageFromDoctype(Xml *xml, char *content);

    /// Guess a language from a tag in the user agent string.
    ///
    /// @param str the user agent string
    ///
    /// @return the language, or langUnknown
    ///
    uint8_t guessLanguageFromUserAgent(char *str);

    /// Find a language for a given IP address.
    ///
    /// @param address the address
    ///
    /// @return the language ID, or langUnknown
    ///
    uint8_t guessLanguageFromIP(uint32_t address);

    /// Find a country code for a given IP address.
    ///
    /// Probably not something user code should call directly, though
    /// it may be useful in some situations.
    ///
    /// @param address the IP address as an unsigned int
    /// @param max     highest entry to search (required: it has no
    ///                default argument)
    /// @param min     optional, lowest entry to search (defaults to 0)
    /// @param ldepth  optional, filled in with the highest depth of
    ///                recursion (defaults to NULL)
    ///
    /// @return country code on success, or NULL (or "zz") on failure
    ///
    char *findGeoIP(uint32_t address,
                    uint32_t max,
                    uint32_t min = 0,
                    uint32_t *ldepth = NULL);

    /// Find the language DMOZ associates with an address.
    ///
    /// Looks up the page address in the category language tables.
    ///
    /// @param addr the page address
    ///
    /// @return language, or langUnknown if not found
    ///
    uint8_t guessLanguageFromDMOZ(char *addr);

    /// Guess the query language from the query terms.
    ///
    /// The algorithm looks for two consecutive terms that share the
    /// same language.
    ///
    /// @param q the query object
    ///
    /// @return the language, or langUnknown
    ///
    uint8_t guessLanguageFromQuery(Query *q);

    /// Find a language from a DMOZ topic.
    ///
    /// The name is a bit misleading: the argument is expected to be
    /// the country taken from the Top/World/X node.
    ///
    /// @param topic the country name
    ///
    /// @return the language, or langUnknown
    ///
    uint8_t findLangFromDMOZTopic(char *topic);

    /// NOTE(review): undocumented in the original.  Judging by the
    /// name and parameters it presumably combines the individual
    /// guessers and reports which one succeeded through *method --
    /// confirm against the definition.
    uint8_t getBestLanguage(char **method,
                            Url *url,
                            Xml *xml,
                            Links *links,
                            LinkInfo *linkInfo,
                            char *content);

    /// NOTE(review): undocumented in the original; the meaning of the
    /// vectors and of the return value must be taken from the
    /// definition.  A SiteType* typeVec parameter used to sit between
    /// langCount and langIds and is no longer part of the signature.
    uint8_t getBestLangsFromVec(char *langCount,
                                int32_t *langIds,
                                uint8_t *langScores,
                                int32_t tagVecSize);

    /// NOTE(review): undocumented in the original; presumably a
    /// URL-based guess -- how it differs from guessLanguageFromUrl()
    /// is not visible here.
    uint8_t guessGBLanguageFromUrl(char *url);

    /// NOTE(review): undocumented in the original; presumably guesses
    /// the language from the URL text -- confirm against the definition.
    uint8_t guessLanguageFromUrl(char *url);

    /// NOTE(review): undocumented in the original.  pageLimit has no
    /// default here (the "= 512" is commented out), so every caller
    /// must pass it explicitly.
    uint8_t guessLanguageFreqCount(Xml *xml,
                                   int pageLimit /* = 512 */);

    /// NOTE(review): undocumented in the original; presumably maps the
    /// URL's TLD to a country id -- confirm against the definition.
    uint8_t guessCountryTLD(const char *url);

    /// NOTE(review): undocumented in the original; presumably maps an
    /// IP address to a country id -- confirm against the definition.
    uint8_t guessCountryIP(uint32_t ip);

    /// NOTE(review): undocumented in the original; presumably derives
    /// a country id from the user agent string -- confirm against the
    /// definition.
    uint8_t guessCountryFromUserAgent(char *ua);
};
extern class LanguageIdentifier g_langId;
extern const uint8_t *langToTopic[];
#endif // LANGUAGEIDENTIFIER_H