
20886 lines
607 KiB
Raw Normal View History

2013-08-03 00:12:24 +04:00
//-*- coding: utf-8 -*-
#include "Proxy.h"
class Address *g_address; // for debug
#define CRID_ANY 0
#define CRID_US 226
// if you have "in <city/adm1 name>" in same sentence as street then
// require that that item be a city/adm1 in any address you try to do.
// i would set "long long inPrepPhrase" to be the city/adm1 place hash.
// so if it is not zero, check for it. but add it with addProperPlaces()
// first to see if it added anything!! then we can
//and fix it so "1914" years and older years are pub dates!
//and include days of the week in pub dates like "sunday, april 11, 2004"
//do not allow lower case 'or' in place name!
//do not allow place names starting with "arrangements by" or "sponsored by"
// test on
// test on
// TODO: FOR ADDRESS overlap detection, just hash every word index for
// every Place which can not be shared. then store the score and
// Address ptr as the data value, so we can do a quick compare!
// TODO: also add conflicting addresses with the same score as winners.
// if we can't resolve a winner then we should just eliminate both/all
// to be on the safe side. like the page has both albuquerque
// and santa fe in the <title> tag so it is really just lucky that we
// pick albuquerque most of the time... we might be able to bring in
// street name to city map to help us fix this one. if both cities have
// the same street name, then nuke both! any other ideas?
// TODO: for the page we need to determine the most popular
// city/adm1 pair over the whole page and use that as another default
// option. also consider if we should have several and score them...
// TODO: for all the phrases in "small" sections and all phrases following
// "at" or "at the" look those phrases up in placedb as place names
// to get their addresses. also confirm the place names we extract
// that are immediately before street names. also get all the possible
// city/adm1/ctry tuples that each place name might have. if these
// are not right next to it then i guess you need to get them from
// the title and tagdb. that way the placedb lookup can integrate
// the tuples into the key and greatly narrow the list. we may have
// to then do multiple lookups for the same place name in placedb,
// so another reason we should distribute them and keep them in memory
// or at least on an SSD. use *namedb* to index place names just like
// indexdb. then we can conduct a search for a place name on namedb
// and get the corresponding keys of the place records in placedb.
// namedb will need to be mostly in memory then!
// TODO: verify street addresses we do extract by looking up each one in
// placedb by the street. each street may have multiple city/adm1/ctry
// tuples, so this lookup should narrow it down!
// test zipcode hyphen fix on
#include "gb-include.h"
#include "Address.h"
#include "Sections.h"
//#include "DateParse2.h"
#include "Abbreviations.h"
#include "Phrases.h"
//#include "Weights.h"
#include "XmlDoc.h" // hashWords()
#include "Hostdb.h"
#include "Placedb.h"
#include "sort.h"
#include "HttpServer.h"
//#define CF_UNIQUE (((unsigned long long)1LL)<<63)
bool getBestLatLon ( RdbList *list ,
double *bestLat ,
double *bestLon ,
long *numVotes ,
long niceness ,
long winnerSnh ) ;
char *getLatLonPtrFromStr ( char *data ) ;
void getLatLonFromStr ( char *data , double *lat , double *lon);
char *getStateAbbr ( uint64_t bit ) ;
long long getWordXorHash ( char *s ) ;
long long getWordXorHash2 ( char *s ) ;
long getStateOffset ( long long *h ) ;
class StateDesc *getStateDescFromBits ( uint64_t bit ) ;
// returns 0 if not a state:
uint64_t getStateBitFromHash ( long long *h ) ;
static bool setHashes ( class Place *p , Words *ww , long niceness ) ;
static bool addIndicator ( char *s , char bit , float boost );
static bool addIndicator ( long long h , char bit , float boost );
//static void printAddress ( class Address *A , class SafeBuf *pbuf , long i);
static void printPlaces ( PlaceMem *pm , SafeBuf *pbuf ,
class Sections *sections ,
class Address *base ) ;
static bool getZipLatLon ( char *zip ,
long zipLen ,
float *zipLat ,
float *zipLon ) ;
// new stuff
static bool generatePlacesFile ( ) ;
static bool loadPlaces ( ) ;
class PlaceDesc *getState_new ( uint64_t pd64 , uint8_t crid , long niceness );
PlaceDesc *getState2_new ( char *state , uint8_t crid , long niceness ) ;
class PlaceDesc *getCity_new ( uint64_t ch64 ,
char *stateAbbr ,
uint8_t crid ,
long niceness ) ;
class PlaceDesc *getCity2_new ( char *city ,
char *stateAbbr ,
uint8_t crid ,
long niceness ) ;
PlaceDesc *getCity3_new ( uint64_t ch64 ,
uint64_t stateHash64,
uint8_t crid ,
long niceness ) ;
bool getLongestPlaceName_new ( long i,
long alnumPos,
Words *w,
uint8_t placeType,
uint8_t crid, // can be CRID_ANY
char *stateAbbr, // can be NULL
uint64_t *placeHash64,
long *placeAlnumA,
long *placeAlnumB,
long *placeA,
long *placeB ,
// set to most popular match
PlaceDesc **pdp ) ;
bool getZip_new ( long a ,
long alnumPos ,
Words *words ,
uint64_t *zipHash64 ,
uint64_t *zipCityHash64 ,
uint64_t *zipStateHash64 ,
long *zipAlnumA,
long *zipAlnumB,
long *zipA,
long *zipB ,
float *zipLat,
float *zipLon) ;
PlaceDesc *getMostPopularPlace_new ( long long cityHash64,
uint8_t crid ,
uint8_t placeType,
long niceness );
char *g_pbuf = NULL;
long g_pbufSize = 0;
HashTableX g_nameTable;
// Returns a pointer to this place's official name, which lives in the
// global g_pbuf buffer at byte offset m_officialNameOffset.
char *PlaceDesc::getOfficialName ( ) {
	return g_pbuf + m_officialNameOffset;
}
// Returns the official name of the state identified by the two-character
// code in m_adm1, or NULL when no PlaceDesc can be found for that code.
char *PlaceDesc::getStateName ( ) {
	// copy our two-char state abbr into a NUL-terminated string
	char buf[3];
	buf[0] = m_adm1[0];
	buf[1] = m_adm1[1];
	buf[2] = '\0';
	// does this convert to lowercase? yes... it should
	uint64_t placeHash64 = getWordXorHash ( buf );
	// look up the place desc for the state
	PlaceDesc *sd = getPlaceDesc ( placeHash64 ,
				       buf, // state abbr
				       0 ); // niceness
	// no such state known
	if ( ! sd ) return NULL;
	return sd->getOfficialName();
}
// Returns the country name that g_countryCode reports for this place's
// country id (m_crid).
const char *PlaceDesc::getCountryName ( ) {
	return g_countryCode.getName ( m_crid );
}
HashTableX g_indicators;
static HashTableX g_timeZones;
static HashTableX g_cities;
static HashTableX g_states;
static HashTableX g_aliases;
static HashTableX g_zips;
char *g_cityBuf = NULL;
long g_cityBufSize = 0;
// . NOW each slot in the g_cities has a ptr to a CityDesc in SafeBuf g_cityBuf
// . so now we can put all the alternate names and aliases into the same table
// Descriptor for one city. Per the comment above g_cityBuf, each slot in
// g_cities points at one of these.
class CityDesc {
 public:
	// set bit for each state that the city is in
	uint64_t m_adm1Bits;
	// for chicago, we would pick "13" since s_states[13] is illinois
	char m_mostPopularState;
	// variable-length trailing text, e.g.
	// "us.nm,us.ny,es.a1,...|en-nl-fi=cincinnati,es-de=cincinnatus,..."
	char m_data[];
};
//bool setFromStr(Address *a,char *s,pbits_t flags ,
// Place *places , long *np , long maxPlaces, long niceness );
static uint64_t getAddressHash ( Place *street ,
Place *city ,
Place *adm1 ,
Place *zip ) ;
static void verifiedWrapper ( void *state ) ;
static void gotMsg2cReplyWrapper ( void *state , void *state2 ) ;
static void gotList2c ( void *state , RdbList *xxx , Msg5 *yyy ) ;
static void sendBackAddress ( class State2c *st ) ;
Place *g_pa = NULL;
#define MIN_POP_COUNT 500
//#define MAX_STREETS 300
//#define MAX_PLACES 3500
// i raised from 15 to 25 since "Virginia Beach" city was not being picked up
// on
#define MAX_CITIES 25
#define MAX_ADM1 80 // 1500
#define MAX_ZIPS 5
// stock g_zips with these zip code descriptors
// Descriptor for one zip code; g_zips is stocked with these.
class ZipDesc {
 public:
	// . this is unique within the country code only
	// . see /gb/geo/geonames/admin1Codes.txt for the list
	// . remove the "CC." country code prefixing each
	// . example from that file: "NL.09 Utrecht\n"
	char m_adm1[2];
	// a single byte country id (converted to from a 2 char country id)
	//uint8_t m_crid;
	// hash of the city it is in
	long long m_cityHash;
	// offset into g_cityBuf of the city name
	long m_cityOffset;
	// now we use the adm1 bits since US-only now
	uint64_t m_adm1Bits;
	// lat/lon of centroid. for sorting by dist when user's zip is known
	float m_latitude;
	float m_longitude;
	//void reset() {m_crid = 0; m_adm1[0] = m_adm1[1] = 0;};
	// clears only the state fields (m_adm1Bits and m_adm1); the city
	// hash/offset and lat/lon are left as they were
	void reset() {
		m_adm1Bits = 0;
		m_adm1[0]  = 0;
		m_adm1[1]  = 0;
	}
};
static char *s_days[] = {
// Table of state descriptors. Every row visible here has the form
// { two-letter code , lowercase full name , dotted abbreviation }.
// NOTE(review): only 11 rows and no closing "};" are visible in this copy,
// while a comment elsewhere in the file refers to s_states[13] -- the table
// looks truncated here; confirm against the original file before relying
// on row indexes.
static StateDesc s_states[] = {
{"dc","district of columbia","d.c."},
{"nh","new hampshire","n.h."},
{"nj","new jersey","n.j."},
{"nm","new mexico","n.m."},
{"ny","new york","n.y."},
{"nc","north carolina","n.c."},
{"nd","north dakota","n.d."},
{"ri","rhode island","r.i."},
{"sc","south carolina","s.c."},
{"sd","south dakota","s.d."},
{"wv","west virginia","w.v."},
#include "StopWords.h"
static HashTableX s_doyTable;
static bool s_doyInit = false;
// Maps a word hash to a day-of-week index.
// . on first use builds s_doyTable from s_days via initWordTable() and
//   remembers the result in s_doyInit; returns -1 if that init fails
// . otherwise looks up the score stored for hash "h" and returns
//   (score-1) % 7
// . NOTE(review): the initWordTable() call below is cut off in this copy
//   (its trailing arguments and the closing braces of this function are not
//   visible) -- confirm against the original file
long getDayOfWeek ( long long h ) {
// lazy one-time init of the lookup table
if ( ! s_doyInit ) {
s_doyInit = initWordTable(&s_doyTable, s_days ,sizeof(s_days),
if ( ! s_doyInit ) return -1;
// . get from table
// . score should be 1 for sunday i guess
long score = s_doyTable.getScore ( &h );
// make it 0-6
// NOTE(review): presumably getScore() yields 0 for an unknown hash, in
// which case this expression evaluates to -1 in C++ -- verify
score = (score-1) % 7;
// that's it
return score;
// (243k! do not truncate!!)
// (rss)
// (journal pavilion)
// (
// **
// address parsing test cases:
// address examples:
// Marina Costa e Silva
// Rua Afonso Canargo, 805
// Santana
// 85070-200 Guarapuava - PR
// University of New Mexico
// Department of Physics and Astronomy
// MSC07 4220
// 800 Yale Blvd NE
// Albuquerque, New Mexico 87131-0001 USA
// US-380
// Lincoln, NM
// Saturday, August 8, 2009
static bool s_init = false;
// Puts the object into an empty state: no buffers owned, no Msg2c
// allocated, nothing validated. Every member that reset() later reads or
// assigns is given a defined value here so reset() is safe to call on a
// freshly constructed object.
Addresses::Addresses ( ) {
	m_buf                = NULL;
	m_bufSize            = 0;
	m_calledGeocoder     = false;
	m_xd                 = NULL;
	m_msg2c              = NULL;
	m_sorted             = NULL;
	// size passed to mfree() for m_sorted in reset()
	m_sortedSize         = 0;
	m_sortedValid        = false;
	// same starting values reset() uses
	m_firstBreach        = true;
	m_breached           = false;
	m_numValid           = 0;
	m_uniqueStreetHashes = 0;
}
// Releases everything this object owns (m_buf, m_sorted and m_msg2c) by
// delegating to reset().
Addresses::~Addresses ( ) {
	reset();
}
// Frees all owned memory and returns the object to its empty state so it
// can be reused for another document.
// . frees m_buf (label "adata") and m_sorted (label "asortbuf") via mfree()
// . destroys the Msg2c helper, if one was allocated
// . clears the breach/valid/geocoder bookkeeping
void Addresses::reset ( ) {
	// main address buffer
	if ( m_buf && m_bufSize )
		mfree ( m_buf , m_bufSize , "adata");
	m_buf     = NULL;
	m_bufSize = 0;
	//m_ptValid = false;
	//m_msg2c.m_requests = 0;
	//m_msg2c.m_replies  = 0;
	m_firstBreach    = true;
	m_breached       = false;
	m_numValid       = 0;
	m_calledGeocoder = false;
	// the placedb verifier: mdelete() pairs with the mnew() done when it
	// was allocated, then the object itself is destroyed
	if ( m_msg2c ) {
		mdelete ( m_msg2c , sizeof(Msg2c),"aamsg2c");
		delete (m_msg2c);
		m_msg2c = NULL;
	}
	// free buf
	if ( m_sorted )
		mfree ( m_sorted , m_sortedSize , "asortbuf");
	m_sorted      = NULL;
	// do not leave a stale size behind for a buffer we no longer own
	m_sortedSize  = 0;
	m_sortedValid = false;
	m_uniqueStreetHashes = 0;
}
// . cached 64-bit hashes of common words (this one and the h_* statics that
//   follow it)
// . they are all assigned once, on the first call to Addresses::set(), from
//   hash64n()/hash64() of the lowercase word each one is named after
static long long h_court;
static long long h_i;
static long long h_interstate;
static long long h_page ;
static long long h_corner ;
static long long h_between ;
static long long h_btwn ;
static long long h_bet ;
static long long h_streets ;
static long long h_sts ;
static long long h_at ;
static long long h_come ;
static long long h_is ;
static long long h_located ;
static long long h_intersection;
static long long h_law ;
static long long h_address ;
static long long h_added ;
static long long h_copy ;
static long long h_search ;
static long long h_find ;
static long long h_go ;
static long long h_town ;
static long long h_city ;
static long long h_street ;
static long long h_telephone;
static long long h_tel ;
static long long h_ph ;
static long long h_fax ;
static long long h_where ;
static long long h_location;
static long long h_venue ;
static long long h_map ;
static long long h_office ;
static long long h_center ;
static long long h_mailing ;
static long long h_mail ;
static long long h_snail ;
static long long h_edit ;
static long long h_email ;
static long long h_phone ;
static long long h_inc ;
static long long h_llc ;
static long long h_review ;
static long long h_reviews ;
static long long h_write ;
static long long h_add ;
static long long h_view ;
static long long h_favorites ;
static long long h_more ;
static long long h_info ;
static long long h_information ;
static long long h_the ;
static long long h_in ;
static long long h_a ;
static long long h_paseo ;
static long long h_de ;
static long long h_del ;
static long long h_all ;
static long long h_rights ;
static long long h_reserved ;
static long long h_contact ;
static long long h_us ;
static long long h_by ;
static long long h_of ;
static long long h_for ;
static long long h_arrangements ;
static long long h_arranged ;
static long long h_sponsored ;
static long long h_to ;
static long long h_every ;
static long long h_p ;
static long long h_b ;
static long long h_hwy ;
static long long h_state ;
static long long h_county ;
static long long h_cnty ;
static long long h_cty ;
static long long h_road ;
static long long h_route ;
static long long h_rte ;
static long long h_rt ;
static long long h_highway ;
static long long h_hiway ;
static long long h_cr ;
static long long h_o ;
static long long h_po ;
static long long h_post ;
static long long h_box ;
static long long h_top ;
static long long h_one ;
static long long h_noon ;
static long long h_midnight ;
static long long h_daily ;
static long long h_st ;
static long long h_nd ;
static long long h_rd ;
static long long h_th ;
static long long h_away ;
static long long h_results ;
static long long h_days ;
static long long h_blocks ;
static long long h_block ;
static long long h_miles ;
static long long h_mile ;
static long long h_year ;
static long long h_years ;
static long long h_yr ;
static long long h_yrs ;
static long long h_hours ;
static long long h_hrs ;
static long long h_hour ;
static long long h_hr ;
static long long h_mi ;
static long long h_kilometers;
static long long h_km ;
static long long h_copyright ;
static long long h_and ;
static long long h_or ;
static long long h_suite ;
static long long h_ste ;
static long long h_bldg ;
static long long h_bld ;
static long long h_building ;
static long long h_unit ;
static long long h_room ;
static long long h_pier ;
static long long h_rm ;
static long long h_run ;
static long long h_ne ;
static long long h_nw ;
static long long h_se ;
static long long h_sw ;
static long long h_n ;
static long long h_s ;
static long long h_e ;
static long long h_w ;
static long long h_north;
static long long h_northeast;
static long long h_northwest;
static long long h_east;
static long long h_west;
static long long h_south;
static long long h_southeast;
static long long h_southwest;
static long long h_heart ;
static long long h_core ;
static long long h_least ;
static long long h_most ;
static long long h_this ;
static long long h_appeared ;
static long long h_role ;
static long long h_studied;
static long long h_prize;
static long long h_finish;
static long long h_door;
static long long h_entrance;
static long long h_area;
static long long h_left ;
static long long h_right ;
static long long h_stare ;
static long long h_sea ;
static long long h_discount ;
static long long h_discounted ;
static long long h_www;
static long long h_gaze ;
static long long h_look ;
static long long h_looking;
static long long h_be ;
static long long h_determined ;
static long long h_call ;
static long long h_details;
static long long h_tba;
static long long h_avenue;
static long long h_ave;
static long long h_register;
static long long h_sign;
static long long h_up;
static long long h_signup;
static long long h_tickets;
static long long h_purchase;
static long long h_get;
static long long h_enroll;
static long long h_buy;
static long long h_presale ;
static long long h_pre ;
static long long h_sale ;
static long long h_on ;
static long long h_sales ;
static long long h_end ;
static long long h_begin ;
static long long h_start ;
static long long h_am;
static long long h_fm;
// . first identifies all the "Places" using the rules above
// . then clusters the "Places" together into an "Address"
// . we use the address at the top of the page, and the site contact info,
// etc. to be defaults, so we can inherit, city, state, etc. from those
// . returns false if blocked, true otherwise. sets g_errno on error.
// . entry point: remembers all the inputs, fills the file-level h_* word
//   hashes on the first call, runs set2() and then asks Msg2c to verify the
//   addresses
// . "state"/"callback" are invoked (through verifiedWrapper) when the
//   Msg2c verification completes after having blocked
// . returns false if Msg2c::verifyAddresses() blocked, true otherwise
//   (including on error, in which case g_errno is set)
bool Addresses::set ( Sections  *sections ,
		      Words     *words ,
		      Bits      *bits ,
		      TagRec    *gr ,
		      Url       *url ,
		      long long  docId ,
		      //char      *coll ,
		      collnum_t  collnum ,
		      long       domHash32 ,
		      long       ip ,
		      //long       tagPairHash ,
		      long       niceness ,
		      SafeBuf   *pbuf ,
		      void      *state ,
		      void     (*callback) (void *state) ,
		      uint8_t    contentType ,
		      // from XmlDoc::ptr_addressReply in a title rec
		      //char      *addressReply ,
		      //long       addressReplySize ,
		      //bool       addressReplyValid ,
		      char      *siteTitleBuf ,
		      long       siteTitleBufSize ,
		      XmlDoc    *xd ) {
	// save stuff
	m_xd          = xd;
	m_sections    = sections;
	m_words       = words;
	m_wptrs       = words->m_words;
	m_wlens       = words->m_wordLens;
	m_nw          = words->m_numWords;
	m_wids        = words->getWordIds();
	m_tids        = words->getTagIds();
	m_bits        = bits;
	m_gr          = gr;
	m_url         = url;
	m_docId       = docId;
	m_collnum     = collnum;
	m_domHash32   = domHash32;
	m_ip          = ip;
	//m_tagPairHash = tagPairHash;
	m_niceness    = niceness;
	m_pbuf        = pbuf;
	m_state       = state;
	m_callback    = callback;
	m_contentType = contentType;
	//m_addressReply      = addressReply;
	//m_addressReplySize  = addressReplySize;
	//m_addressReplyValid = addressReplyValid;
	m_siteTitleBuf     = siteTitleBuf;
	m_siteTitleBufSize = siteTitleBufSize;

	// compute the shared word hashes only once per process
	static bool s_setHashes = false;
	if ( ! s_setHashes ) {
		// flag it
		s_setHashes = true;
		// shortcuts
		h_i = hash64n ("i");
		h_court = hash64n ("court");
		h_interstate = hash64n ("interstate");
		h_page = hash64n ("page");
		h_corner = hash64n ("corner");
		h_between = hash64n ( "between");
		h_btwn = hash64n ( "btwn");
		h_bet = hash64n ( "bet");
		h_streets = hash64n ( "streets");
		h_sts = hash64n ( "sts");
		h_at = hash64n ( "at" );
		h_come = hash64n ("come");
		h_is = hash64n ( "is" );
		h_located = hash64n ( "located" );
		h_intersection = hash64n("intersection");
		h_law = hash64 ( "law" ,3);
		h_address = hash64 ( "address",7);
		h_added = hash64 ( "added",5);
		h_copy = hash64 ( "copy",4);
		h_search = hash64 ( "search",6);
		h_find = hash64 ( "find",4);
		h_go = hash64 ( "go",2);
		h_town = hash64n ( "town");
		h_city = hash64n ( "city");
		h_street = hash64 ( "street",6);
		h_telephone = hash64 ( "telephone",9);
		h_tel = hash64 ( "tel",3);
		h_ph = hash64 ( "ph",2);
		h_fax = hash64 ( "fax",3);
		h_where = hash64 ( "where",5);
		h_location= hash64 ( "location",8);
		h_venue = hash64n("venue");
		h_map = hash64 ( "map" ,3);
		h_office = hash64 ( "office" ,6);
		h_center = hash64n ("center");
		h_mailing = hash64 ( "mailing" ,7);
		h_mail = hash64 ( "mail" ,4);
		h_snail = hash64 ( "snail" ,5);
		h_edit = hash64 ( "edit" ,4);
		h_email = hash64 ( "email" ,5);
		h_phone = hash64 ( "phone" ,5);
		h_inc = hash64 ( "inc" ,3);
		h_llc = hash64 ( "llc" ,3);
		h_review = hash64 ( "review" ,6);
		h_reviews = hash64 ( "reviews" ,7);
		h_write = hash64 ( "write", 5);
		h_add = hash64 ( "add",3 );
		h_view = hash64 ( "view", 4);
		h_favorites = hash64 ( "favorites", 9);
		h_more = hash64 ( "more",4 );
		h_info = hash64 ( "info",4 );
		h_information = hash64 ( "information", 11);
		h_the = hash64 ( "the" ,3);
		h_in = hash64 ( "in" ,2);
		h_a = hash64 ( "a" ,1);
		h_paseo = hash64n ( "paseo");
		h_de = hash64n ( "de");
		h_del = hash64n ( "del");
		h_all = hash64 ( "all" ,3);
		h_rights = hash64 ( "rights" ,6);
		h_reserved = hash64 ( "reserved" ,8);
		h_contact = hash64 ( "contact" , 7);
		h_us = hash64 ( "us" , 2);
		h_by = hash64 ( "by" ,2);
		h_of = hash64 ( "of" ,2);
		h_for = hash64 ( "for" ,3);
		h_arrangements = hash64("arrangements",12);
		h_arranged = hash64("arranged",8);
		h_sponsored = hash64("sponsored",9);
		h_to = hash64 ( "to" ,2);
		h_every = hash64 ( "every",5);
		h_p = hash64 ( "p" ,1);
		h_b = hash64n ( "b" );
		h_hwy = hash64 ( "hwy" ,3);
		h_state = hash64 ( "state" ,5);
		h_county = hash64 ( "county" , 6 );
		h_cnty = hash64 ( "cnty" , 4 );
		h_cty = hash64 ( "cty" , 3 );
		h_road = hash64 ( "road" ,4);
		h_route = hash64 ( "route" ,5);
		h_rte = hash64 ( "rte" ,3);
		h_rt = hash64 ( "rt" ,2);
		h_highway = hash64 ( "highway" ,7);
		h_hiway = hash64 ( "hiway" ,5);
		h_cr = hash64 ( "cr" ,2);
		h_o = hash64 ( "o" ,1);
		h_po = hash64 ( "po" ,2);
		h_post = hash64 ( "post" ,4);
		h_box = hash64 ( "box" ,3);
		h_top = hash64n ( "top" );
		h_one = hash64 ( "one" ,3);
		h_noon = hash64n ( "noon" );
		h_midnight = hash64n ( "midnight" );
		h_daily = hash64n ( "daily" );
		h_st = hash64 ( "st" ,2);
		h_nd = hash64 ( "nd" ,2);
		h_rd = hash64 ( "rd" ,2);
		h_th = hash64 ( "th" ,2);
		h_away = hash64 ( "away" ,4);
		h_results = hash64 ( "results" , 7 );
		h_days = hash64 ( "days", 4 );
		h_blocks = hash64 ( "blocks",6);
		h_block = hash64 ( "block",5);
		h_miles = hash64 ( "miles",5);
		h_mile = hash64n ( "mile");
		h_year = hash64n("year");
		h_years = hash64n("years");
		h_yr = hash64n("yr");
		h_yrs = hash64n("yrs");
		h_hours = hash64 ( "hours",5);
		h_hrs = hash64 ( "hrs",3);
		h_hour = hash64n ( "hour");
		h_hr = hash64n ( "hr");
		h_mi = hash64 ( "mi",2);
		h_kilometers= hash64 ( "kilometers",10);
		h_km = hash64 ( "km",2);
		h_copyright = hash64 ( "copyright",9);
		h_and = hash64 ( "and" , 3 );
		h_or = hash64 ( "or" , 2 );
		h_suite = hash64 ( "suite",5);
		h_ste = hash64 ( "ste",3);
		h_bldg = hash64 ( "bldg",4);
		h_bld = hash64n ( "bld");
		h_building = hash64 ( "building",8);
		h_unit = hash64 ( "unit",4);
		h_room = hash64 ( "room",4);
		h_pier = hash64 ( "pier",4);
		h_rm = hash64 ( "rm",2);
		h_run = hash64n ("run");
		h_ne = hash64 ( "ne" ,2);
		h_nw = hash64 ( "nw" ,2);
		h_se = hash64 ( "se" ,2);
		h_sw = hash64 ( "sw" ,2);
		h_n = hash64 ( "n" ,1);
		h_s = hash64 ( "s" ,1);
		h_e = hash64 ( "e" ,1);
		h_w = hash64 ( "w" ,1);
		// the four cardinal directions were assigned a second time
		// further down with the exact same calls; once is enough
		h_north = hash64n("north");
		h_south = hash64n("south");
		h_east = hash64n("east");
		h_west = hash64n("west");
		h_northeast = hash64n("northeast");
		h_northwest = hash64n("northwest");
		h_southeast = hash64n("southeast");
		h_southwest = hash64n("southwest");
		h_heart = hash64n ( "heart" );
		h_core = hash64n ( "core" );
		h_least = hash64n ( "least" );
		h_most = hash64n ( "most" );
		h_this = hash64n ( "this" );
		h_appeared = hash64n ( "appeared" );
		h_role = hash64n ( "role" );
		h_studied = hash64n ( "studied" );
		h_prize = hash64n ( "prize" );
		h_finish = hash64n("finish");
		h_door = hash64n("door");
		h_entrance = hash64n("entrance");
		h_area = hash64n("area");
		h_left = hash64n ( "left" );
		h_right = hash64n ( "right" );
		h_stare = hash64n ( "stare" );
		h_sea = hash64n ( "sea" );
		h_discount = hash64n("discount");
		h_discounted = hash64n("discounted");
		h_www = hash64n("www");
		h_gaze = hash64n ( "gaze" );
		h_look = hash64n ( "look" );
		h_looking = hash64n ( "looking" );
		h_be = hash64n("be");
		h_determined = hash64n("determined");
		h_call = hash64n("call");
		h_details = hash64n("details");
		h_tba = hash64n("tba");
		h_avenue = hash64n("avenue");
		h_ave = hash64n("ave");
		h_register = hash64n("register");
		h_sign = hash64n("sign");
		h_up = hash64n("up");
		h_signup = hash64n("signup");
		h_tickets = hash64n("tickets");
		h_purchase = hash64n("purchase");
		h_get = hash64n("get");
		h_enroll = hash64n("enroll");
		h_buy = hash64n("buy");
		h_presale = hash64n("presale");
		h_pre = hash64n("pre");
		h_sale = hash64n("sale");
		h_on = hash64n("on");
		h_sales = hash64n("sales");
		h_end = hash64n("end");
		h_begin = hash64n("begin");
		h_start = hash64n("start");
		h_am = hash64n("am");
		h_fm = hash64n("fm");
	}

	// sanity check -- did set2() corrupt our junk?
	//if ( m_msg2c.m_mcast.m_ownMsg && m_msg2c.m_mcast.m_msgSize > 5000 ){
	//	char *xx=NULL;*xx=0; }

	// returns false and sets g_errno on error
	bool status = set2 ( );

	// sanity check -- did set2() corrupt our junk?
	//if ( m_msg2c.m_mcast.m_ownMsg && m_msg2c.m_mcast.m_msgSize > 5000 ){
	//	char *xx=NULL;*xx=0; }

	// sanity check: a failed set2() must have set g_errno, otherwise
	// force a crash so the inconsistency is noticed
	if ( ! status && ! g_errno ) { char *xx=NULL;*xx=0; }
	// return true on error now
	if ( ! status ) return true;

	// . ok, go no further if from msg13
	// . it will have to check m_good or something, not m_valid
	if ( ! m_sections ) return true;

	// if valid and empty, we are done
	//if ( m_addressReplyValid && ! m_addressReply ) return true;

	// Disabled legacy algorithm, kept verbatim for reference. It still
	// uses the old m_addresses[] array and by-value Place members, so it
	// must stay commented out.
	/*
	-- mdw took this out because it had too many false positives. often
	the place name 1 and/or 2 was wrong and was calling nonsense a
	place! for many urls... and now that i removed the
	SEC_CONTENDED_ADDRESS algo all the events on a page even if
	different tag hashes, can share the same address. to replace
	that algo i am ignore events with SEC_TITLE_OUTLINKED if the
	event title is an outlink to another page, and also i am trying
	to identify all place names in events. this outlinked bit should
	fix the url, since it has a
	little section that has "You may Also Like..." for events at
	different venues, mentioned by name.
	// . now use the addresses that were inlined to verify those
	// that were not inlined, assuming the place name matches
	// . this will allow "The Filling Station" to be verified in
	// 88543421-the-love-song-of-j-robert-oppenheimer-by-carson-kreitzer
	// . first scan the addresses for inlined ones
	// . logic taken basically from hashForPlacedb()
	// init the table
	HashTableX pt;
	// returns true with g_errno set on error
	if ( ! pt.set ( 8,4,256,NULL,0,false,m_niceness) ) return true;
	for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
	// get it
	Address *a = &m_addresses[i];
	// must be inlined
	if ( ! ( a->m_flags & AF_INLINED ) ) continue;
	// sometimes a street can exist in two cities or states
	if ( a->m_flags & AF_AMBIGUOUS ) continue;
	// must not have a place name in place of the street name
	if ( a->m_street.m_flags2 & PLF2_IS_NAME ) continue;
	// hash into table only if valid
	long long h1 = a->m_name1.m_hash;
	// adjust it since setHashes() xors in 0x123456 for street
	// names that are actually place names in disguise
	h1 ^= 0x123456;
	// incorporate the adm1 and city and ctry
	h1 = hash64 ( a->m_city.m_hash , h1 );
	h1 = hash64 ( a->m_adm1.m_hash , h1 );
	h1 = hash64 ( a->m_ctry.m_hash , h1 );
	// put it in
	if ( a->m_name1.m_strlen && ! pt.addKey ( (char *)&h1, &a ) )
	return true;
	// same for second place name
	long long h2 = a->m_name2.m_hash;
	// adjust it since setHashes() xors in 0x123456 for street
	// names that are actually place names in disguise
	h2 ^= 0x123456;
	// incorporate the adm1 and city and ctry
	h2 = hash64 ( a->m_city.m_hash , h2 );
	h2 = hash64 ( a->m_adm1.m_hash , h2 );
	h2 = hash64 ( a->m_ctry.m_hash , h2 );
	// hash into table only if valid
	if ( a->m_name2.m_strlen && ! pt.addKey ( (char *)&h2, &a ) )
	return true;
	// now scan our addresses that have a place name in place of
	// the street name and see if we can get a match
	for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
	// get it
	Address *a = &m_addresses[i];
	// we want a place name in place of the street name now
	if ( ! ( a->m_street.m_flags2 & PLF2_IS_NAME ) ) continue;
	// . USE the STREET here, not the name
	// . it should already have had the 0x123456 xor'ed in
	// in the logic below because PLF2_IS_NAME is set.
	long long h1 = a->m_street.m_hash;
	// incorporate the adm1 and city and ctry
	h1 = hash64 ( a->m_city.m_hash , h1 );
	h1 = hash64 ( a->m_adm1.m_hash , h1 );
	h1 = hash64 ( a->m_ctry.m_hash , h1 );
	// note it
	//logf(LOG_DEBUG,"add: lookuphash=%llx",a->m_street.m_hash);
	// test that
	//if ( a->m_street.m_hash == 0x14a2446f2d5a2647LL ) {
	// setHashes ( &a->m_street );
	// logf(LOG_DEBUG,"Add: had=%llx",a->m_street.m_hash);
	// get hash of street, i.e. hash of name
	// see if we have that in the table
	long slot = pt.getSlot ( &h1 );
	// skip if not there
	if ( slot < 0 ) continue;
	// kewl, we got a match, get the matching address
	Address *ma = *(Address **)pt.getValueFromSlot ( slot );
	// . now use it, i.e. replace ourselves with its info
	// . this logic is from above.
	// shortcuts
	Place *name1 = &a->m_name1;
	Place *street = &a->m_street;
	// street name was name1
	memcpy ( name1 , street , sizeof(Place) );
	// and set the street to what it should be
	street->m_str = ma->m_street.m_str;
	street->m_strlen = ma->m_street.m_strlen;
	// let it fly
	a->m_flags |= AF_VERIFIED_STREET;
	// do not verify place name though!
	a->m_flags |= AF_VERIFIED_PLACE_NAME_1;
	// so set hashes makes its own words class
	street->m_a = -1;
	street->m_b = -1;
	// clear these, since PLF2_IS_NAME should be clear for us!!
	// otherwise it causes setHashes() function below to set
	// our hash as if we were a place name!!!
	street->m_flags2 = 0;
	// compute the street hash
	// Events.cpp relies on this to make substitutions to places
	// that have verified place names
	// and in case hashForPlacedb() is called on us we
	// have to tell it to not hash us!! so put flag back!!
	street->m_flags2 |= PLF2_IS_NAME;
	// free mem
	*/

	// update status
	if ( m_xd ) // && ! m_addressReplyValid )
		m_xd->setStatus ( "consulting placedb" );

	// make a msg2c first
	try { m_msg2c = new (Msg2c); }
	catch ( ... ) {
		g_errno = ENOMEM;
		// NOTE(review): the argument list of this call is cut off
		// in the copy under review; if the project has an
		// errno-to-string helper, pass that for the %s instead
		log("addr: msg2c: new(%i): %s", (int)sizeof(Msg2c),
		    "out of memory");
		// return true on error with g_errno set
		return true;
	}
	mnew ( m_msg2c , sizeof(Msg2c) , "aamsg2c" );

	// . use niceness 0 if we are a turk injecting
	// . m_xd may be NULL (see the guard on setStatus() above) so check
	//   it before looking at its members
	// . NOTE(review): niceness2 is computed here but m_niceness is what
	//   gets handed to verifyAddresses() below; that is kept as found --
	//   confirm which one is intended
	long niceness2 = m_niceness;
	if ( m_xd &&
	     m_xd->m_oldsrValid &&
	     m_xd->m_oldsr.m_isInjecting &&
	     ( m_xd->m_oldsr.m_isPageInject ||
	       m_xd->m_oldsr.m_isPageReindex ) )
		niceness2 = 0;
	(void)niceness2;

	// rather than look up stuff in placedb, if we have m_addressReply
	// provided, then that data represents placedb when we first
	// indexed this titleRec and we need to use that to ensure
	// parsing consistency
	if ( //! m_addressReplyValid &&
	     ! m_msg2c->verifyAddresses ( this ,
					  m_collnum ,
					  m_domHash32 ,
					  m_ip ,
					  m_niceness ,
					  this ,
					  verifiedWrapper ) )
		// blocked; verifiedWrapper() will be called when done
		return false;

	// . update addresses from the table
	// . returns false and sets g_errno on error
	updateAddresses ( );

	// all done
	return true;
}
// Completion callback handed to Msg2c::verifyAddresses() by
// Addresses::set(). "state" is the Addresses object that started the
// verification. Folds the replies into the addresses (unless g_errno is
// set) and then invokes the caller's own callback.
void verifiedWrapper ( void *state ) {
	// get us
	Addresses *THIS = (Addresses *)state;
	// update addresses from replies
	if ( ! g_errno ) THIS->updateAddresses();
	// try this now. return if it blocked
	//if ( ! g_errno && ! THIS->getGeocoderLatLon() ) return;
	// call callback
	THIS->m_callback ( THIS->m_state );
}
Address *g_aa = NULL;
// . return false with g_errno set on error
// . take the msg2c replies we got in m_sb.m_buf or in m_addressReply,
// which is a save of m_sb.m_buf in the titleRec (XmlDoc), and use
// those replies to set Address::m_flags bits.
// . also use those replies to update the place names in your addresses
// to verified place names
bool Addresses::updateAddresses ( ) {
// bail on error
if ( g_errno ) return false;
// sanity check - i think
// loop over replies in the replyBuf
char *p = m_sb.getBufStart();
char *pend = p + m_sb.length();
// . but use this buffer from title rec if valid though
// . this will ensure parsing consistency
//if ( m_addressReplyValid ) {
// p = m_addressReply;
// pend = p + m_addressReplySize;
// loop over the msg2c replies
for ( ; p < pend ; ) {
// breathe
QUICKPOLL ( m_niceness );
// parse this reply
long addrNum = *(long *)p; p += 4;
long replySize = *(long *)p; p += 4;
char *reply = p; p += replySize;
// sanity check
if ( addrNum >= m_am.getNumPtrs() ) { char *xx=NULL;*xx=0;}
if ( addrNum < 0 ) { char *xx=NULL;*xx=0;}
// skip if none!
if ( replySize == 0 ) continue;
// sanity check... why was this here? it was coring for
// a bunch of suites in 500 marquette ave.
//if ( replySize > 3000 ) { char *xx=NULL;*xx=0; }
if ( replySize > 5000 )
logf(LOG_DEBUG,"addr: got large addr reply of %li "
// sanity check
if ( replySize < 0 ) { char *xx=NULL;*xx=0; }
// sanity check
if ( p > pend ) { char *xx=NULL;*xx=0; }
// shortcut
Address *a = (Address *)m_am.getPtr(addrNum);
// make sure never got a reply for this
if ( a->m_flags & AF_GOT_REPLY ) { char *xx=NULL;*xx=0; }
// mark it
a->m_flags |= AF_GOT_REPLY;
// . parse it up
// . both reply types now have this same header
char *p = reply; // + 1;
// # of voters for the following lat/lon
long numVotes = *(long *)p; p += 4;
// then the lat lon
double lat = *(double *)p; p += sizeof(double);
double lon = *(double *)p; p += sizeof(double);
// sanity check
if ( p > reply + replySize ) { char *xx=NULL;*xx=0; }
// do not confuse with a->m_latitude/m_longitude
// because we do not want to re-serialize these back
// into the placedb record voting framework that
// would create some kind of feedback loop
a->m_importedLatitude = lat;
a->m_importedLongitude = lon;
a->m_importedVotes = numVotes;
// is the street really a place name (Tingley Colesium)
char isName = ( a->m_street->m_flags2 & PLF2_IS_NAME );
// deal with normal case
if ( ! isName ) {
// must be one byte
//if ( replySize != 1 ) { char *xx=NULL;*xx=0; }
// or in the flags
a->m_flags |= *p; p++; // *reply;
// then the alternate placedb names
char *str = p;
// set end
char *replyEnd = reply + replySize;
// and now we have a list of score/names separated
// by \0's
a->m_placedbNames = str;
a->m_placedbNamesEnd = replyEnd;
// assume no best
a->m_bestPlacedbName = NULL;
// max score
long max = 0;
// set the best one
for ( ; ; str += gbstrlen(str) + 1 ) {
// stop if that was it
if ( str >= replyEnd ) break;
// get score
long vote = *(long *)str;
// skip vote
str += 4;
// skip if not max
if ( vote <= max ) continue;
// set max
max = vote;
// got new max
a->m_bestPlacedbName = str;
// if no best, make this null too
if ( ! a->m_bestPlacedbName ) a->m_placedbNames = NULL;
// all done integrating this reply
// if the address parser changes a lot of times the addrNum
// is incorrect, so really we should do it by the unique
// hash of the entire string
//if ( replySize == 1 ) {
// log("addr: addr num out of sync with addr data. "
// "addr parser change and was not versioned.");
// continue;
//if ( replySize == 1 ) { char *xx=NULL;*xx=0; }
// parse out street from reply (name1;name2;suite;street;...)
char *sp = p; // reply;
// reset count
long scount = 0;
char *replyEnd = reply+replySize;
// advance
for ( ; sp < replyEnd && scount < 3 ; sp++ )
if ( *sp == ';' ) scount++;
// crazy! must be the street
if ( ! *sp ) {
// print it out
log("addr: no street for %s",p);
//char *xx=NULL;*xx=0; }
return false;
// get end
char *spend = sp;
// advance to next ;
for ( ; *spend && *spend != ';' ; spend++ );
// sanity check
if ( ! *spend ) {
// print it out
log("addr: no street end for %s",p);
//char *xx=NULL;*xx=0; }
return false;
// shortcuts
//Place *name1 = a->m_name1;
//Place *street = a->m_street;
// now we just ptr swap
a->m_name1 = a->m_street;
// make that street reference this address then
// i guess we are supplanting the Place::m_address setting
// logic below here
a->m_name1->m_address = a;
// but we need a new street place
//if ( m_np >= MAX_PLACES ) { char *xx=NULL;*xx=0; }
Place *street = (Place *)m_pm.getMem(sizeof(Place));
if ( ! street ) return false;
a->m_street = street;
// street name was name1
//memcpy ( name1 , street , sizeof(Place) );
// and set the street to what it should be
street->m_str = sp;
street->m_strlen = spend - sp;
// this means from placedb i guess... HACK!
street->m_bits |= PLF_FROMTAG;//|PLF_FROMTITLE;
// let it fly
a->m_flags |= AF_VERIFIED_STREET;
a->m_flags |= AF_VERIFIED_PLACE_NAME_1;
// so set hashes makes its own words class
street->m_a = -1;
street->m_b = -1;
// clear these, since PLF2_IS_NAME should be clear for us!!
// otherwise it causes setHashes() function below to set
// our hash as if we were a place name!!!
street->m_flags2 = 0;
// fix this before doing hash, otherwise setHashes() is wrong
street->m_type = PT_STREET;
// compute the street hash
// Events.cpp relies on this to make substitutions to places
// that have verified place names
setHashes(street, m_words, m_niceness );
// and in case hashForPlacedb() is called on us we
// have to tell it to not hash us!! so put flag back!!
street->m_flags2 |= PLF2_IS_NAME;
// . what is this then??
// . we use this for setting the lat/lon, etc.
a->m_hash = getAddressHash ( a->m_street,
a->m_zip );
//if ( m_np < MAX_PLACES ) continue;
//log("addr: hit np limit");
Section **sp = m_sections->m_sectionPtrs;
// . auto verify place names if in <eventVenue> tag
// . supports injection of our xml format
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
// get address
Address *aa = (Address *)m_am.getPtr(i);
// get place name
Place *name1 = aa->m_name1;
// skip if none
if ( ! name1 ) continue;
// now we always set this so we can make it a turk
// venue candidate
name1->m_unverifiedAddress = aa;
// set this too!
if ( aa->m_name2 ) aa->m_name2->m_unverifiedAddress = aa;
// get word pos
long a = name1->m_a;
// skip if not in doc
if ( a < 0 ) continue;
// get section its in
Section *ns = sp[a];
// go up if sentence or implied
for ( ; ns ; ns = ns->m_parent ) {
// breathe
// need a tag
if ( m_tids[ns->m_a] ) break;
// stop if not in tag at all
if ( ! ns ) continue;
// get tag word then
a = ns->m_a;
// get tagid, must be xml
if ( m_tids[a] != TAG_XMLTAG ) continue;
// get tag name
if ( ! strncasecmp(m_wptrs[a],"<eventVenue",11) )
// it's a match!
aa->m_flags |= AF_VERIFIED_PLACE_NAME_1;
// loop over all addresses
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// get address
Address *a = &m_addresses[i];
// get the reply byte
char *replyFlags = (char *)m_avt.getValue(&a->m_avtKey);
// skip if not there
if ( ! replyFlags ) continue;
// grab em
a->m_flags |= *replyFlags;
// skip if not ambiguous
//if ( ! ( a->m_flags & AF_AMBIGUOUS ) ) continue;
// needs to have verified at least the street/city/ctry
//if ( ! ( a->m_flags & AF_VERIFIED_STREET ) ) continue;
// ok, remove the ambiguous flag
//a->m_flags &= ~AF_AMBIGUOUS;
// . now re-set the AF_AMBIGUOUS flags
// . we do this again now that we have set a lot of Address::m_flags
// like AF_VERIFIED_PLACE_NAME_1 etc from the msg2c replies
// (or msg2c replies saved in the titleRec/XmlDoc)
// keep count of unique street hashes
long count = 0;
// keep a table
char tmp[5000];
HashTableX ds; ds.set(8,0,300,tmp,5000,false,m_niceness,"addr-strhsh");
// count how many distinct street hashes we have
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get address
Address *a = (Address *)m_am.getPtr(i);//&m_addresses[i];
// get street hash
long long sh = a->m_street->m_hash;
// skip if already got
if ( ds.isInTable ( &sh ) ) continue;
// add it. i guess ignore if on error
if ( ! ds.addKey ( &sh ) ) return false;
// count it
// set it
m_uniqueStreetHashes = count;
// shortcuts
long x , y;
wbit_t *bits = m_bits->m_bits;
unsigned char vflags = 0;
vflags |= AF_INLINED;
// now that we have verified the addresses, set the D_IS_IN_ADDRESS
// bit for those words in verified addresses... but only for
// words in verified portions or any portion of an inlined address
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
// get address
Address *a = (Address *)m_am.getPtr(i);//&m_addresses[i];
// must have something verified or be inlined
if ( ! ( a->m_flags & vflags ) ) continue;
// is it inlined
bool inlined = (a->m_flags & AF_INLINED);
// . even if inlined, if its a "fake" street it
// needs to be verified
// . fixes "RAFFLE ... Rio Rancho NM" for which
// thought that "RAFFLE" was a "street" and we ended up
// setting D_IS_IN_ADDRESS for it, and then in Events.cpp
// it got demoted for being a title even though it was
// part of the actual event title!
if ( inlined && (a->m_street->m_flags2 & PLF2_IS_NAME) )
inlined = false;
// get flags
if ( inlined || (a->m_flags & AF_VERIFIED_STREET) ) {
// loop over words in street
x = a->m_street->m_a;
y = a->m_street->m_b;
if ( y > m_nw ) { char *xx=NULL;*xx=0; }
for ( ; x >= 0 && x < y ; x++ )
bits[x] |= D_IS_IN_ADDRESS;
// now all place names must be verified only to avoid
// false positives in the event title scoring algo
if ( a->m_name1 ){//(a->m_flags & AF_VERIFIED_PLACE_NAME_1) ) {
// loop over words in place name 1
x = a->m_name1->m_a;
y = a->m_name1->m_b;
// verified or not?
wbit_t af ;
if ( a->m_flags & AF_VERIFIED_PLACE_NAME_1 )
if ( y > m_nw ) { char *xx=NULL;*xx=0; }
if ( ! a->m_name1->m_str ) { x = 0; y = 0; }
for ( ; x >= 0 && x < y ; x++ )
if ( (a->m_flags & AF_VERIFIED_PLACE_NAME_2) ) {
// loop over words in place name 2
x = a->m_name2->m_a;
y = a->m_name2->m_b;
if ( y > m_nw ) { char *xx=NULL;*xx=0; }
if ( ! a->m_name2->m_str ) { x = 0; y = 0; }
for ( ; x >= 0 && x < y ; x++ )
// suite
if ( a->m_suite ) {
x = a->m_suite->m_a;
y = a->m_suite->m_b;
if ( y > m_nw ) { char *xx=NULL;*xx=0; }
//if ( ! a->m_suite->m_str ) { x = 0; y = 0; }
for ( ; x >= 0 && x < y ; x++ )
bits[x] |= D_IS_IN_ADDRESS;
// verified if anything was
if ( a->m_city ) {
x = a->m_city->m_a;
y = a->m_city->m_b;
if ( y > m_nw ) { char *xx=NULL;*xx=0; }
for ( ; x>= 0 && x < y ; x++ )
bits[x] |= D_IS_IN_ADDRESS;
if ( a->m_adm1 ) {
x = a->m_adm1->m_a;
y = a->m_adm1->m_b;
if ( y > m_nw ) { char *xx=NULL;*xx=0; }
for ( ; x >= 0 && x < y ; x++ )
bits[x] |= D_IS_IN_ADDRESS;
// zip
if ( a->m_zip ) {
x = a->m_zip->m_a;
y = a->m_zip->m_b;
if ( y > m_nw ) { char *xx=NULL;*xx=0; }
//if ( ! a->m_zip->m_str ) { x = 0; y = 0; }
for ( ; x >= 0 && x < y ; x++ )
bits[x] |= D_IS_IN_ADDRESS;
// hash the words in such address names into this hash table, name tble
HashTableX nt1;
//HashTableX nt2;
HashTableX nt3;
char ntbuf1[5000];
//char ntbuf2[5000];
char ntbuf3[5000];
nt1.set ( 8,8,256,ntbuf1,5000,true,m_niceness,"addr-nt1");
//nt2.set ( 8,4,256,ntbuf2,5000,true,m_niceness);
nt3.set ( 8,4,256,ntbuf3,5000,true,m_niceness,"addr-nt3");
long goodCount = 0;
// hash words of the addresses
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
// get address
Address *ad = (Address *)m_am.getPtr(i);//&m_addresses[i];
// is it inlined
bool inlined = (ad->m_flags & AF_INLINED);
// is its name verified?
bool vn1 = ( ad->m_flags & AF_VERIFIED_PLACE_NAME_1) ;
bool vn2 = ( ad->m_flags & AF_VERIFIED_PLACE_NAME_2) ;
bool vs = ( ad->m_flags & AF_VERIFIED_STREET);
// must be inlined or verified or after "at"
// add place name even if not verified, because if we match
// an unverified place name the alias must have its
// PLF2_AFTER_AT flag set, meaning it was after the word "at"
// so it is a lot less likely to be a false positive.
// this fixes the solstice seed swap url:
// 88884664-solstice-seed-swap because it was not allowing
// "exploratorium" to be an alias with the exploratorium
// inlined address because its place name was not verified.
// so down below we make sure to only allow such aliasing if
// the place name alias is "after an at"... so it is clearly
// a place name and not just menu cruft.
if ( ! inlined && ! vn1 && ! vn2 && ! vs ) continue;
// . i don't want aliases to a po box
// . fixes which aliases
// "at the adobe theater" to the po box address at the
// bottom of the page because it is a better match than
// the placedbName "adobe theater" that we have as an
// alternative name for the non-pobox address...
if ( ad->m_street->m_flags2 & PLF2_IS_POBOX ) continue;
// do not add if ambiguous and known to be BAD city/state
if ( ad->m_flags3 & AF2_BADCITYSTATE ) continue;
// sometimes a street can exist in two cities or states
//if ( ad->m_flags & AF_AMBIGUOUS ) continue;
// count
uint64_t v = ((uint64_t)((unsigned long)ad));
// . hash place name 1
// . use "0" for the name number
if ( ad->m_name1 &&
! hashPlaceName (&nt1,
return false;
// use "1" for the name number
if ( ad->m_name2 &&
! hashPlaceName (&nt1,
v| (1LL<<32) ) )
return false;
// hash the verified alternative names
char *s = ad->m_placedbNames;
char *send = ad->m_placedbNamesEnd;
uint64_t count = 2;
// scan them
for ( ; s && s < send ; count++ ) {
// breathe
// skip score
s += 4;
// empty? strange...
if ( ! *s ) { char *xx=NULL;*xx=0; }
// hash that
Words tmp;
if ( ! tmp.set9 ( s, m_niceness ) ) return false;
long nw = tmp.m_numWords;
if ( ! hashPlaceName (&nt1,&tmp,0,nw,v|(count<<32)) )
return false;
// skip that and the \0
s += gbstrlen(s) + 1;
// hash their street hash and street num hash
long long ch = 0;
ch ^= ad->m_street->m_hash;
ch ^= ad->m_street->m_streetNumHash;
ch ^= ad->m_street->m_streetIndHash;
if ( ! nt3.addKey ( &ch , &ad ) ) return false;
// hash the street as a name!
if ( ! nt3.addKey(&ad->m_street->m_wordHash64,&ad))
return false;
// . and exact name too for placedb verified names
// . it includes a xor'ed 0x123456 in its hash to distinguish
// from street names that are the same name
if ( vn1 && ! nt3.addKey ( &ad->m_name1->m_hash , &ad ) )
return false;
if ( vn2 && ! nt3.addKey ( &ad->m_name2->m_hash , &ad ) )
return false;
// . if we had no inlined or verified addresses, bail at this point
// . no, might be able to add some lat/lon only addresses below!
//if ( goodCount == 0 ) {
// // validate this
// m_numSorted = 0;
// m_sortedValid = true;
// return true;
// Lastly, set Street/Place::m_alias and m_address
// So now streets point to the inlined/verified address that uses them.
// make the match table
char mtbuf[5000];
HashTableX mt;
mt.set ( 8,4,32,mtbuf,5000,true,m_niceness,"plmtchtbl");
//Section **sp = m_sections->m_sectionPtrs;
// no! scan the streets since maybe alias did not pair up with
// a city/adm1 and make it into the m_addresses[] array
for ( long i = 0 ; i < m_sm.getNumPtrs() ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get it
Place *street = (Place *)m_sm.getPtr(i);
// skip if already has an address set from above in this func
if ( street->m_address ) continue;
// is it a name?
bool isName = street->m_flags2 & PLF2_IS_NAME ;
// if we are a street like "111 Maple SE" for
// because it is listed twice! one time is inlined and the
// other is not!
if ( ! isName ) {
// make special hash
long long ch = 0;
ch ^= street->m_hash;
ch ^= street->m_streetNumHash;
ch ^= street->m_streetIndHash;
Address **pad = (Address **) nt3.getValue ( &ch );
if ( ! pad ) continue;
if ( (*pad)->m_street->m_a == street->m_a )
street->m_address = *pad;
street->m_alias = *pad;
// need a place name
//if ( ! isName ) continue;
// match name to name of address that was verified in placedb
Address **pad = (Address **) nt3.getValue ( &street->m_hash );
// sometimes what is really the street has isName set to
// true. we do not know its a street name in this context
// because it does not end in an indicator. but the address
// we are trying to alias to it does end in an indicator
// or in a city/state. like for
// "1160 Camino Cruz Blanca". it is used twice on the page.
// the first time it is clearly a street, the 2nd time is
// why we are doing this! Same for "705 Camino Lejo" on
// that page as well!
if ( ! pad ) {
pad =(Address **) nt3.getValue (&street->m_wordHash64);
// are we a street address ourself?
if ( pad ) {
street->m_alias = *pad;
if ( pad &&
(*pad)->m_name1 &&
(*pad)->m_name1->m_a == street->m_a ) {
street->m_address = *pad;
if ( pad &&
(*pad)->m_name2 &&
(*pad)->m_name2->m_a == street->m_a ) {
street->m_address = *pad;
// . and make it after at i guess
// . no we need "Explora" as an alias too!
// . no! for "santa fe playhouse" it is not preceded by an at
// ... so i hope commenting this out is ok
//if ( ! afterAt ) continue;
// grabs its name
long a = street->m_a;
long b = street->m_b;
// . are we after at?
// . this also includes being after "location: " and some
// other strong place indicators
bool afterAt = street->m_flags2 & PLF2_AFTER_AT ;
// reset mt
// count its words
long need = 0;
// scan its words
for ( long k = a ; k < b ; k++ ) {
// skip if not word
if ( ! m_wids[k] ) continue;
// . we do not need to match an initial the
// . fix for aliasing "The Adobe Theater" to
// "Adobe Theater" for
if ( need == 0 && m_wids[k] == h_the ) continue;
// count it
// get possible candidates
long slot1 = nt1.getSlot ( &m_wids[k] );
// if no match, forget it! we need to match
// all our words
//if ( slot1 < 0 ) break;
// loop
// get the value
uint64_t val =
*(uint64_t *)nt1.getValueFromSlot(slot1);
// lower 32 bits is the address ptr
Address *cand = (Address *)(val & 0xffffffff);
// upper 32 bits is the name number
long nn = (val >> 32);
// sanity check
if ( nn < 0 ) { char *xx=NULL;*xx=0; }
if ( nn > 10000 ) { char *xx=NULL;*xx=0; }
// get street flags
pflags_t sf = cand->m_street->m_flags2;
// if name number is 0, then place name 1 must
// be verified or at least "after at"
if ( nn==0 &&
!afterAt )
// same goes for place name 2
if ( nn==1 &&
!afterAt )
// other nn's are place names with 2+ votes
// from placedb in Address::m_placedbNames
// so let them ride.
// store in match table, add one point
if(!mt.addTerm((long long *)&val))return false;
// scan match table for best matches
long dups = 0;
Address *best = NULL;
long bestScore = 0;
Section *bestContainer = NULL;
long bestnn = -1;
// shortcut
char vmask1 = 0;
for ( long y = 0 ; y < mt.m_numSlots ; y++ ) {
// skip if empty bucket/slot
if ( ! mt.m_flags[y] ) continue;
// get score
long score = mt.getScoreFromSlot ( y );
// need to match all of our words
if ( score < need ) continue;
// skip if not max
//if ( score < max ) continue;
// get the address ptr that has this score
//Address *matcher = *( Address **)mt.getKey ( y );
uint64_t v = *(uint64_t *)mt.getKey ( y );
// get name number
long nn = v>>32;
// sanity check
if ( nn < 0 || nn > 10000 ) { char *xx=NULL;*xx=0; }
// get matching address
Address *matcher = (Address *)(v & 0xffffffff);
// get our alias section
Section *ads = sp[street->m_a];//ad->m_section;
// . telescope our alias up
// . see which address it hits first, "best" or
// "matcher"
// . if it hits both at the same time then it is
// ambiguous and we can't make a decision
// . keep telescoping out matcher until it contains
// the alias
Section *sm = matcher->m_section;
for ( ; sm ; sm = sm->m_parent )
if ( sm->contains ( ads ) ) break;
// we got one, or tied for max
if ( ! best ) {
bestScore = score;
best = matcher;
bestContainer = sm;
bestnn = nn;
// if our container is smaller we win!
if ( bestContainer->contains ( sm ) ) {
bestScore = score;
best = matcher;
bestContainer = sm;
dups = 0;
bestnn = nn;
// if we contain him, he stays winning
if ( sm->contains ( bestContainer ) )
// otherwise we are brothers or in the same section
// if it is a dup of the best just ignore it
if ( matcher->m_street->m_hash ==
best->m_street->m_hash &&
matcher->m_street->m_streetNumHash ==
best->m_street->m_streetNumHash &&
matcher->m_street->m_streetIndHash ==
best->m_street->m_streetIndHash )
// ok, it is a tie! we won't be able to alias him!
// if winner is ambiguous, this address, "ad", has no alias
if ( dups ) continue;
// or if no winner
if ( ! best ) continue;
// . had an address like
// "Aztec, NM<br />398 S Light Plant Rd, Aztec, NM 87410-1826"
// and then referred to NM below, and we thought it was
// an alias for that address!
// . BUT it turns out that when i fixed the bug above for
// incorrectly checking to make sure that matching places
// had verified place name 1 or 2, then that fixed this bug,
// but if the place name had the word "NM" or "Aztec" in it
// AND was verified, i would expect us to need this code
// so let's make sure we are "after at" if only doing a
// partial alias
if ( ! afterAt ) {
// get alnum words in best
//long aw1 = 0;
//long aw2 = 0;
Place *n1 = best->m_name1;
Place *n2 = best->m_name2;
//if ( n1 ) aw1 = n1->m_alnumB - n1->m_alnumA;
//if ( n2 ) aw2 = n2->m_alnumB - n2->m_alnumA;
// crap, what if we matched a str in m_placedbName,
// we don't know which one we matched! yes we do,
// its # "nn-2" in the string
char *ps = NULL;
long pslen;
if ( bestnn == 0 ) {ps=n1->m_str; pslen=n1->m_strlen;}
if ( bestnn == 1 ) {ps=n2->m_str; pslen=n2->m_strlen;}
// subtract
bestnn -= 2;
// otherwise, gotta cycle
char *s = best->m_placedbNames;
char *send = best->m_placedbNamesEnd;
// scan them and set "aw"
for ( ; bestnn>= 0 && s && s < send ; bestnn-- ) {
// breathe
// skip score
s += 4;
// point to it
char *wp = s;
// get this
long slen = gbstrlen(s);
// skip that and the \0
s += slen + 1 ;
// skip if not 0
if ( bestnn > 0 ) continue;
// set the process string
ps = wp;
pslen = slen;
// and break for processing
// make into word array
Words tmp;
if ( ! tmp.setx (ps,pslen,m_niceness)) return false;
// count the alnumwords, but ignore "the"
long aw = 0;
for (long x=0;x<tmp.m_numWords;x++) {
if ( ! tmp.m_wordIds[x] ) continue;
if ( tmp.m_wordIds[x] == h_the) continue;
bool fullMatch = false;
if ( aw == need ) fullMatch = true;
if ( ! fullMatch ) continue;
// shortcut
char vmask2 = 0;
Address *ak = NULL;
// might not be ordered by position
long k = 0;
// get the min position right above us
long abovePos = -1;
Address *above = NULL;
long belowPos = -1;
Address *below = NULL;
// now the winner must also be the first verified address
// above or below us!!!
for ( k = 0 ; k < m_am.getNumPtrs() ; k++ ) {
// get it
ak = (Address *)m_am.getPtr(k);//&m_addresses[k];
// ignore if a place name
if ( ak->m_street->m_flags2 & PLF2_IS_NAME )
// skip if not inlined or verified
bool inlined = (ak->m_flags & AF_INLINED);
// is its name verified?
bool verified = ( ak->m_flags & vmask2);
// skip if not either!
if ( ! inlined && ! verified ) continue;
// ignore if after us, must be ABOVE us since we
// are referencing it as an alias
if ( ak->m_street->m_a < a ) {
// skip if doesn't beat the current "above" one
if ( ak->m_street->m_a <= abovePos ) continue;
// set it
above = ak;
abovePos = ak->m_street->m_a;
// ok, below winner?
// skip if doesn't beat the current "below" one
if ( belowPos >= 0 &&
ak->m_street->m_a >= belowPos ) continue;
// set it
below = ak;
belowPos = ak->m_street->m_a;
// skip if not one before us
if ( ! above && ! below ) continue;
// try "above"
if ( above ) {
// skip if not a match with the winner, "best"
if ( best ->m_street->m_hash !=
above->m_street->m_hash )
above = NULL;
if ( above &&
best ->m_street->m_streetNumHash !=
above->m_street->m_streetNumHash )
above = NULL;
if ( above &&
best ->m_street->m_streetIndHash !=
above->m_street->m_streetIndHash )
above = NULL;
// try "below"
if ( below ) {
// skip if not a match with the winner, "best"
if ( best ->m_street->m_hash !=
below->m_street->m_hash )
below = NULL;
if ( below &&
best ->m_street->m_streetNumHash !=
below->m_street->m_streetNumHash )
below = NULL;
if ( below &&
best ->m_street->m_streetIndHash !=
below->m_street->m_streetIndHash )
below = NULL;
// pick the non null one
if ( ! above && ! below ) continue;
// ok, use him as our alias
if ( above ) street->m_alias = above;
else if ( below ) street->m_alias = below;
Place *prev = NULL;
// set m_alias for intersections
for ( long i = 0 ; i < m_sm.getNumPtrs() ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get it
Place *street = (Place *)m_sm.getPtr(i);
// if intersection, check if alias of prev street
if ( ! ( street->m_flags2 & PLF2_INTERSECTION ) ) {
// update this so its a real street always
prev = street;
// if we are actually in an address like
// "CORNER OF HWY 64& HWY 38\0 EAGLE NEST, NM 87718"
// then skip it as well!
if ( street->m_address ) continue;
// try next street
Place *next = NULL;
//Place *prev = NULL;
// if we can get it, get it
//if ( i - 1 >= 0 ) prev = &m_streets[i-1];
if ( i + 1 < m_sm.getNumPtrs() )
next = (Place *)m_sm.getPtr(i+1);
// ignore if also intersection
if ( prev && (prev->m_flags2 & PLF2_INTERSECTION)) prev = NULL;
if ( next && (next->m_flags2 & PLF2_INTERSECTION)) next = NULL;
// try prev first
Place *first = prev;
// declare up here
long a;
long b;
bool good;
long long commonIds[32];
long nc;
// loop over both
// need a street above us to be alias of
if ( ! first ) goto done;
// must be an address
if ( !first->m_address && !first->m_alias ) goto done;
// must match up
a = first ->m_b;
b = street->m_a;
// swap em
// forget it if too big
if ( b - a > 200 ) continue;
// scan to make sure only good words in between
long j; for ( j = a ; j < b ; j++ ) {
// skip if not wid
if ( ! m_wids[j] ) continue;
// must be special word
if ( m_wids[j] == h_of ) continue;
if ( m_wids[j] == h_at ) continue;
if ( m_wids[j] == h_intersection ) continue;
if ( m_wids[j] == h_corner ) continue;
if ( m_wids[j] == h_sw ) continue;
if ( m_wids[j] == h_ne ) continue;
if ( m_wids[j] == h_nw ) continue;
if ( m_wids[j] == h_se ) continue;
// set if good - if only words we permit in between
good = (j >= b);
// if that failed we could still succeed by containing
// a street name in common!
if ( ! good ) {
nc = getCommonWordIds ( street->m_a ,
street->m_b ,
first->m_a ,
first->m_b ,
m_wids ,
commonIds ,
32 ,
m_niceness );
for ( long k = 0 ; k < nc ; k++ ) {
// get it
long long cid = commonIds[k];
// skip if indicator, must be non-indicator
IndDesc *id;
id = (IndDesc *)g_indicators.getValue(&cid);
if ( id ) continue;
// that is good enough!
good = true;
// if it was not an alias, go on to next place
if ( ! good ) goto done;
// assign our m_alias
if ( first->m_address )
street->m_alias = first->m_address;
else if ( first->m_alias )
street->m_alias = first->m_alias;
// give up if really done
if ( first == next ) continue;
// try next now
first = next;
goto subloop;
// set m_alias for intersections more loosely
// fixes "14th and Curtis, Denver CO" on
// which is a proper address and has the full address next to it
for ( long i = 0 ; i < m_sm.getNumPtrs() ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get it
Place *street = (Place *)m_sm.getPtr(i);
// if intersection, check if alias of prev street
if ( ! ( street->m_flags2 & PLF2_INTERSECTION ) ) {
// update this so its a real street always
prev = street;
// must be full address for this algo
if ( ! street->m_address ) continue;
// try next street
Place *next = NULL;
// if we can get it, get it
if ( i + 1 < m_sm.getNumPtrs() )
next = (Place *)m_sm.getPtr(i+1);
// ignore if also intersection
if ( prev && (prev->m_flags2 & PLF2_INTERSECTION)) prev = NULL;
if ( next && (next->m_flags2 & PLF2_INTERSECTION)) next = NULL;
// try prev first
Place *first = prev;
if ( ! first ) first = next;
if ( ! first ) continue;
char cmpbuf[1024];
HashTableX cmp;
// see if matches one non-indicator in street
for ( long j = first->m_a ; j < first->m_b ; j++ ) {
// get it
long long h = m_wids[j];
// skip punct
if ( ! h ) continue;
// skip if indicator
if ( g_indicators.isInTable(&h) ) continue;
// hash it otherwise
if ( ! cmp.addKey(&h) ) return false;
// assume intersection does not match any words
bool matched = false;
// now compare to our intersection streets
for ( long j = street->m_a ; j < street->m_b ; j++ ) {
// get it
long long h = m_wids[j];
// skip punct
if ( ! h ) continue;
// skip if indicator
if ( g_indicators.isInTable(&h) ) continue;
// hash it otherwise
if ( ! cmp.isInTable(&h) ) continue;
// got a match!
matched = true;
// all done
// if no match, forget the alias
if ( ! matched ) {
// give up if really done
if ( first == next ) continue;
// or if next is NULL
if ( ! next ) continue;
// try next now
first = next;
goto subloop2;
// it matched!
if ( first->m_address )
street->m_alias = first->m_address;
else if ( first->m_alias )
street->m_alias = first->m_alias;
// set D_IS_IN_ADDRESS[_NAME] for places that alias an address
// . now scan the places. if not in an address, but aliases one then
// we need to set D_IS_IN_ADDRESS[_NAME] for it...
// . this fixes the aliased streets and names in from
// being event titles...
for ( long i = 0 ; i < m_sm.getNumPtrs() ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get it
Place *street = (Place *)m_sm.getPtr(i);//&m_streets[i];
// skip if no alias
Address *alias = street->m_alias;
if ( ! alias ) continue;
// is it a name?
bool isName = street->m_flags2 & PLF2_IS_NAME ;
// if a street, set this
wbit_t flag;
if ( isName ) flag = D_IS_IN_VERIFIED_ADDRESS_NAME;
else flag = D_IS_IN_ADDRESS;
// set bits for alias
long x = street->m_a;
long y = street->m_b;
if ( y > m_nw ) { char *xx=NULL;*xx=0; }
for ( ; x >= 0 && x < m_nw && x < y ; x++ )
bits[x] |= flag;
// set m_numNonDupAddresses
m_numNonDupAddresses = 0;
for ( long i = 0 ; i < m_am.getNumPtrs() - 1 ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get it
Address *aa = (Address *)m_am.getPtr(i);//&m_addresses[i];
// get street position
long a = aa->m_street->m_a;
// sanity check
if ( a < 0 ) continue;
// get section
Section *ss = sp[a];
// skip if dup
//if ( ss->m_flags & SEC_DUP ) continue;
if ( ss->m_votesForDup > 0 ) continue;
// count it otherwise
// set Address::m_flags AF_VENUE_DEFAULT bit
m_numVenues = 0;
// what are the addresses of this website? (assuming this website
// is essentially the website of a venue or physical place)
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
// get address
Address *ad = (Address *)m_am.getPtr(i);//&m_addresses[i];
// is its name verified?
bool vn1 = (ad->m_flags & AF_VERIFIED_PLACE_NAME_1) ;
bool vn2 = (ad->m_flags & AF_VERIFIED_PLACE_NAME_2) ;
// we might have some alternative verified names too!
bool vn3 = (bool) ad->m_bestPlacedbName;
// must be inlined or verified
if ( ! vn1 && ! vn2 && ! vn3 ) continue;
// if address used the dc[] array that consist of elements
// from the venue tag in tagdb, then do not add it back
// to tagdb
//bool add = true;
//if ( ad->m_street->m_a < 0 ) add = false;
// see if its place name 1 is in the siteTitleBuf
char *p1 = NULL;
char *p2 = NULL;
if ( vn1 && ad->m_name1 ) p1 = ad->m_name1->m_str;
if ( vn2 && ad->m_name2 ) p2 = ad->m_name2->m_str;
// temp null term
char c1;
char c2;
long plen1;
long plen2;
if ( p1 ) plen1 = ad->m_name1->m_strlen;
if ( p2 ) plen2 = ad->m_name2->m_strlen;
char *saved1 = NULL;
char *saved2 = NULL;
if ( p1 ) saved1 = &p1[plen1];
if ( p2 ) saved2 = &p2[plen2];
if ( p1 ) { c1 = *saved1; *saved1 = 0; }
if ( p2 ) { c2 = *saved2; *saved2 = 0; }
// . skip "the"
// . fixes "the adobe theater" in title and "adobe theater"
// being the verified place name for
if ( p1 && strncasecmp(p1,"the ",4) == 0 ) p1 += 4;
if ( p2 && strncasecmp(p2,"the ",4) == 0 ) p2 += 4;
// scan m_siteTitleBuf for either p1 or p2
char *d = m_siteTitleBuf;
char *dend = m_siteTitleBuf + m_siteTitleBufSize;
// loop over the \0 delimited list of titles
for ( ; d < dend ; d += gbstrlen(d) + 1 ) {
// skip "the"
if ( strncasecmp(d,"the ",4) == 0 ) d += 4;
// compare
bool match = false;
if ( p1 && gb_strcasestr ( d , p1 ) ) match = true;
if ( p2 && gb_strcasestr ( d , p2 ) ) match = true;
// loop over all possible alternative placedb names
// that have 2 or more votes as well
char *s = ad->m_placedbNames;
for ( ; s && s<ad->m_placedbNamesEnd;s+=gbstrlen(s)+1){
// breathe
// skip score of 4 bytes
s += 4;
// skip "the"
if ( strncasecmp(s,"the ",4)==0) s += 4;
// compare
if ( ! gb_strcasestr(d,s) ) continue;
// got a match
match = true;
// stop
// go to next title if no match
if ( ! match ) continue;
// we got a match!
ad->m_flags |= AF_VENUE_DEFAULT;
// count it
// done
if ( saved1 ) *saved1 = c1;
if ( saved2 ) *saved2 = c2;
//long imax = m_nw;
// skip if no streets... no might add a lat/lon "street" below
//if ( m_sm.getNumPtrs() <= 0 ) imax = 0;
// we gotta call this twice. once here and once below
if ( ! setFirstPlaceNums() ) return false;
// scan for lat/long coordinates
// US lat from 24.450000 to 47.4666666
// US lon from -71.083333 to -114.1333333
// <span class="latitude" id="map-latitude">35.146292</span>
// <span class="longitude" id="map-longitude">-90.0148638</span>
// <span class="latitude">37.793126</span>
// <span class="longitude">-122.42289</span>
// <div style="display:none" class="result_json">{"lat":"35.084278",
// "lon":"-106.649467","cb":false,"photo":""}</div>
// <a href="
// ... &POI1lat=039396979&POI1lng=-076564398&POI1name=Baynesville+..
// /listing-map.png?lat=35.0981&amp;long=-106.6694
// use "center=" cgi parm on
// google maps link
// src=";hl=en&amp;msa=0&amp;msid=104870349047867594566.0004626e9d41225400a1c&amp;ll=40.761325,-73.977642&amp;sp...
char *bufEnd = m_words->getContentEnd();
char *bufStart = m_words->getContent ();
// now we do a generic scan for any numbers that look like lat/lon
p = m_words->getContent();
// must be latitude then longitude, in that order
long lastScore = -1;
double lastVal ;
char *lastPos = NULL;
char lastType;
char *lastAddedPos = NULL;
long lastAddedWord = -1;
long lastAddedWordDist;
long lastAddedCharDist;
bool addedSomething = false;
if ( ! p ) p = "\0";
for ( ; *p ; p++ ) {
// breathe
// skip if not digit
if ( ! is_digit(*p) ) continue;
// set start
char *start = p;
// avoid %3D from url encodings
if ( p > bufStart && p[-1] == '%' &&
p[0] == '3' &&
to_lower_a(p[1]) == 'd' ) {
// skip over that encoded equal sign
p += 2;
start += 2;
// skip over negative sign
if ( *p == '-' ) { p++; start++; }
// forget it if got a negative sign or a non-digit
if ( ! is_digit(*p) ) continue;
// negative sign?
if ( p>bufStart && p[-1] == '-' ) start--;
// reset counts
long digitCount = 0;
long decimalCount = 0;
// do not scan so far
char *pmax = p + 20;
if ( pmax > bufEnd ) pmax = bufEnd;
// scan until no digit or period
for ( ; *p && p < pmax ; p++ ) {
// count the digits
if ( is_digit(*p) ) {
// decimal point is ok
if ( *p == '.' ) {
// stop on other crap
// give up if end of doc
if ( ! *p ) break;
// give up if less than 6 digits encountered
if ( digitCount < 6 ) continue;
// some pages have no period in it
// and we just have to assume the first
// 3 digits are before the period. like for
// urls
if ( decimalCount >= 2 ) continue;
// convert
double dval = atod2(start,p-start);
// fix stuff which has no decimal pt
if ( decimalCount == 0 ) {
// how many digits to left of decimal
long left = 3;
// make a divisor
double ddd = 1;
for ( long vv = 0 ; vv<digitCount-left; vv++)
ddd *= 10;
// fix it
dval /= ddd;
// bail if bad
if ( dval < -180.0 || dval > 180.0 ) continue;
// the continental US ranges from
// latitude : 24 27/60 (
// latitude : 49 (
// longitude: 71 5/60 (
// longitude: 114 8/60 (
// which is lat from 24.450000 to 47.4666666
// which is lon from 71.083333 to 114.1333333
// in the usual decimal it is
// lat from 24.450000 to 47.4666666
// lon from -71.083333 to -114.1333333
char type = 0;
if ( dval >= 24.45 && dval <= 50.0 ) type = 1; // lat
if ( dval >= -125.0 && dval <= -66.1 ) type = 2; // lon
// this overrides though
char *r = start -1;
char *rend = start - 10;
if ( rend < bufStart + 5 ) rend = bufStart + 5;
for ( ; r >= rend ; r-- ) {
if ( ! is_alpha_a(*r) ) continue;
// <latitude> facebook/brazil
if ( to_lower_a(r[ 0]) == 'e' &&
to_lower_a(r[-1]) == 'd' &&
to_lower_a(r[-2]) == 'u' &&
to_lower_a(r[-3]) == 't' &&
to_lower_a(r[-4]) == 'i' &&
to_lower_a(r[-5]) == 't' ) {
type = 1;
// <longitude> facebook/brazil
if ( to_lower_a(r[ 0]) == 'e' &&
to_lower_a(r[-1]) == 'd' &&
to_lower_a(r[-2]) == 'u' &&
to_lower_a(r[-3]) == 't' &&
to_lower_a(r[-4]) == 'i' &&
to_lower_a(r[-5]) == 'g' ) {
type = 2;
// bail if unknown lat or lon
if ( type == 0 ) continue;
// . need a latitude before longitude can be accepted
// . fixes
// which had a bogus large number (no decimal) after the
// first legit lat/lon pair in the filename of a url i think
//if ( needLat && type == 2 ) continue;
// get word position for this function
long wn2 = m_words->getWordAt ( start );
// sanity check
if ( wn2 < 0 ) { char *xx=NULL;*xx=0; }
// find nearest place. the associated place must be a verified
// place name or a true street.
Place *ap2 = getAssociatedPlace ( wn2 );
// get the address that contains the place
Address *aa = NULL;
// try address
if ( ! aa && ap2 ) aa = ap2->m_address;
// try alias
if ( ! aa && ap2 ) aa = ap2->m_alias;
// if this lat/lon had an associated place but the associated
// place had no address because it is like "at Effex"
// (after at) then allow it through. we should add the lat/lon
// as its own address and alias the simple place, ap2, to
// that. i.e. ap2->m_alias = newlatlonaddress
//if ( ! aa && ap2 ) continue;
// assign it
double *ptr = NULL;
if ( type == 1 && aa ) ptr = &aa->m_latitude;
if ( type == 2 && aa ) ptr = &aa->m_longitude;
// are we from google maps url?
// src=";hl=en&amp;msa=0&amp;msid=104870349047867594566.0004626e9d41225400a1c&amp;ll=40.761325,-73.977642&amp;sp...
// compute the score of the lat/lon pair
long score = -1;
bool inFormat = false;
// . ll=lat,lon
// . this is the center of the map and almost always not
// exactly the exact place of the business which tends to be
// a little lower down below the center of the map, however
// if a query is specified then google highlights all
// locations on the map that match that query
if ( start - 10 >= bufStart &&
start[-1] == '=' &&
start[-2] == 'l' &&
start[-3] == 'l' &&
(start[-4] == ';'||start[-4]=='&') ) {
// this is the correct one
score = 100;
inFormat = true;
// cbll=lat,lon
else if ( start - 15 >= bufStart &&
start[-1] == '=' &&
start[-2] == 'l' &&
start[-3] == 'l' &&
start[-4] == 'b' &&
start[-5] == 'c' &&
(is_punct_a(start[-6])) ) {
// this is street view coords
score = 50;
inFormat = true;
// sll=lat,lon (this is not good!?!?!)
else if ( start - 15 >= bufStart &&
start[-1] == '=' &&
start[-2] == 'l' &&
start[-3] == 'l' &&
start[-4] == 's' ) {
// business search thingy? MAKE IT NEGATIVE SCORE!
score = -20;
inFormat = true;
// geocode=0,lat,lon
else if ( start - 20 >= bufStart &&
start[-1] == ',' &&
start[-2] == '0' &&
start[-3] == '=' &&
start[-4] == 'e' &&
start[-5] == 'd' &&
start[-6] == 'o' &&
start[-7] == 'c' &&
start[-8] == 'o' &&
start[-9] == 'e' &&
start[-10] == 'g' &&
(is_punct_a(start[-11])) ) {
// related to directions somehow
score = 30;
inFormat = true;
score = 10;
// save that
char *savePos = lastPos;
long saveScore = lastScore;
char saveType = lastType;
double saveVal = lastVal;
// then update
lastPos = start;
lastScore = score;
lastType = type;
lastVal = dval;
// if first number, skip
if ( ! savePos ) continue;
// if too far apart, forget it! most likely not a lat/lon pair
//if ( start - savePos > 100 ) continue;
// skip if both are lats or both are lons
if ( saveType == type ) continue;
// if it is a google url thing then we need to wait for
// the longitude right after the latitude
if ( inFormat && type == 1 ) continue;
// a negative score curses the longitude that follows
if ( saveScore < 0 ) continue;
// get word # and associated place of previous lat/lon #
long wn1 = m_words->getWordAt ( savePos );//start );
if ( wn1 < 0 ) { char *xx=NULL;*xx=0; }
// find nearest place. the associated place must be a verified
// place name or a true street.
Place *ap1 = getAssociatedPlace ( wn1 );
if ( ap1 != ap2 ) continue;
// super crazy? try to fiz which pairs together
// to bogus numbers that are really far apart
long wordDist = wn2 - wn1;
if ( wordDist > 30 )
// better distance counting. should fix
//;gid=2415 which
// has multiple lat/lon pairs all that had a different #
// of chars between them, but this will make their distances
// equal where they should be now
long dist = 0;
bool inalnum = false;
bool inpunct = false;
for ( char *d = savePos ; d < start ; d++ ) {
// breathe
// skip if space
if ( is_wspace_a(*d) ) {
inalnum = false;
inpunct = false;
// count words
if ( is_alnum_a(*d) ) {
if ( inalnum ) continue;
inalnum = true;
inpunct = false;
// punctuation
if ( inpunct ) continue;
inpunct = true;
inalnum = false;
// this is likewise bad as well...
if ( dist > 30 )
bool addLatLonAddress = false;
if ( ! ap1 ) addLatLonAddress = true;
if ( ap1 && ! ap1->m_alias && ! ap1->m_address )
addLatLonAddress = true;
// if neither lat nor lon has associated place then add addr
if ( addLatLonAddress ) {
// if last address we added used the number at
// savePos then we can't both be right. so compare
if ( lastAddedPos == savePos &&
lastAddedWordDist == 0 &&
wordDist >= 2 )
if ( lastAddedPos == savePos &&
lastAddedCharDist > 1 &&
lastAddedCharDist < dist/2 &&
dist > 10 )
if ( lastAddedWord == wn1 &&
lastAddedWord == wn2 ) {
// nuke what we had added just before
if ( addedSomething ) {
addedSomething = false;
addedSomething = true;
// note what we add
if ( wn1 == wn2 ) lastAddedWord = wn1;
else lastAddedWord = -1;
lastAddedPos = start;
lastAddedWordDist = wordDist;
lastAddedCharDist = dist;
// set this to the added address
Address *retAddr = NULL;
// . now try to add place vec to our array of addresses
// . we now supply the containing section, "sec"
// so we can vote on which tag hash supplied the best
// addresses
if ( ! addAddress ( NULL,//name1 ,
NULL,//name2 ,
NULL,//suite ,
NULL,//street ,
NULL,//city ,
NULL,//adm1 ,
NULL,//zip ,
NULL , // ctry ,
-1, // startAlnum ,
&retAddr ) ) return false;
// set lat/lon
if ( type == 2 ) {
retAddr->m_latitude = saveVal;
retAddr->m_longitude = dval;
else {
retAddr->m_latitude = dval;
retAddr->m_longitude = saveVal;
// add the lat or lon as a simple place
Place *pp = (Place *)m_sm.getMem(sizeof(Place));
if ( ! pp ) return false;
pp->m_address = retAddr;
// this seems good to do
retAddr->m_street = pp;
pp->m_str = savePos;//start;
pp->m_strlen = p - savePos;//start;
long long h1 = *(long long *)&retAddr->m_latitude;
long long h2 = *(long long *)&retAddr->m_longitude;
pp->m_hash = hash64h ( h1 , h2 );
pp->m_bits = 0; // |= PLF_FROMTAG;//|PLF_FROMTITLE;
pp->m_a = wn1;
pp->m_b = wn2+1;
pp->m_flags2 = 0;
pp->m_type = PT_LATLON;
pp->m_flags2 = 0; // PLF2_IS_NAME;
// address hash is usually set by calling
// getAddressHash() but just use the hash of the
// lat/lon from "street" we already computed
retAddr->m_hash = pp->m_hash;
//a->m_street = street;
Section *as = NULL;
if ( m_sections ) {
as = m_sections->m_sectionPtrs[pp->m_a];
retAddr->m_section = as;
// add the nearest city to that lat/lon so
// that Address::getTimeZone() works
float distInMilesSquared = 100.0;
uint32_t cid32 = getNearestCityId(retAddr->m_latitude ,
m_niceness ,
// only set this if nearby...
if ( distInMilesSquared < 1000)
retAddr->m_cityId32 = cid32;
retAddr->m_cityId32 = 0;
// if we had "at Effex" then alias "Effex" to
// this lat/lon address
if ( ap1 ) ap1->m_alias = retAddr;
// if we had matching associated places but the associated
// place is not part of a good address, skip it
if ( ! aa ) continue;
// pick the highest score between us and the last guy,
// AS LONG AS WE ARE A LONGITUDE since google maps always
// has latitude then longitude
if ( saveScore > score && type == 2 )
score = saveScore;
// get our distance
//long dist = start - savePos;
// if we are know to be right, and it wasn't we can override
// it without triggering the ambiguous flag
if ( score > aa->m_latLonScore ||
// if score is tied but distance is less than, we can
// win on that too!
( score == aa->m_latLonScore && dist<aa->m_latLonDist) ) {
if ( type == 2 ) {
aa->m_latitude = saveVal;
aa->m_longitude = dval;
else {
aa->m_latitude = dval;
aa->m_longitude = saveVal;
aa->m_latLonScore = score;
aa->m_latLonDist = dist;
// if we lost, bail
if ( score < aa->m_latLonScore || dist > aa->m_latLonDist)
// . if already has one set flag
// . but only mark it as ambiguous if the conflicting location
// is more than .010 of a degree off. this fixes
// which has a few different &ll=x,y values in its goog url
// . don't worry about it now since we have a geocoder
// . this was causing a core because it was resetting the
// lat/lon of lat/lon only address for
// and was coring in Dates::getIntervals2() because the
// timezone was like "66" because the lat/lon was reset
// here to 888 or 999 or whatever
// . but we need this in case there is ambiguity as to
// which lat/lon pair is the real deal when there are
// mutiple ones in the same vicinity...
// . so we have to nuke the address somehow if its lat/lon
// only
if ( *ptr != dval && fabs(*ptr - dval) > .010 ) {
// blank out the lat/lon if we do not have both for an address
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
// get address
Address *ad = (Address *)m_am.getPtr(i);//&m_addresses[i];
// skip address if no lat/lon
bool haveBoth = true;
if ( ad->m_latitude == NO_LATITUDE ) haveBoth = false;
if ( ad->m_latitude == AMBIG_LATITUDE ) haveBoth = false;
if ( ad->m_longitude == NO_LONGITUDE ) haveBoth = false;
if ( ad->m_longitude == AMBIG_LONGITUDE ) haveBoth = false;
if ( haveBoth ) continue;
// blank out both otherwise
ad->m_latitude = NO_LATITUDE;
ad->m_longitude = NO_LONGITUDE;
// blank out all lat/lon of two are identical
// if two different addresses have the same lat/lon then disregard
// all on that page
class Coordinate { public: double lat; double lon; };
HashTableX dat;
char datbuf[2000];
dat.set ( 16 , 8 , 32 , datbuf , 2000 , false ,m_niceness,"latlontbl");
Coordinate nukeList[5000];
long nc = 0;
// scan the addresses and hash the lat/lon of each one
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
// get address
Address *ad = (Address *)m_am.getPtr(i);//&m_addresses[i];
// skip address if no lat/lon
if ( ad->m_latitude == NO_LATITUDE ) continue;
// skip if its a plain lat/lon address
if ( ad->m_flags3 & AF2_LATLON ) continue;
// make the coordinate
Coordinate cc; = ad->m_latitude;
cc.lon = ad->m_longitude;
// get it as a hash
//long long h1 = *(long long *)((double *)&ad->m_latitude);
//long long h2 = *(long long *)((double *)&ad->m_latitude);
//long long h = hash64 ( h1 , h2 );
//double pr = ad->m_latitude*ad->m_longitude;
//long long h = *(long long *) &pr;
// mix it up some more
//h = hash64 ( h , h1 );
//h = hash64 ( h , h2 );
// if another entry that has this same lat/lon exists but
// different address hash, then nuke them all!
uint64_t *addrHash = (uint64_t *) dat.getValue ( &cc );
// check if there
if ( addrHash && *addrHash != ad->m_hash ) {
//nuke = true;
// now just add to the nuke list
if ( nc < 5000 ) nukeList[nc++] = cc;
// hash it in "Dup Address Table"
if ( ! dat.addKey ( &cc , &ad->m_hash ) ) return false;
for ( long i = 0 ; nc > 0 && i < m_am.getNumPtrs() ; i++ ) {
// breathe
// get address
Address *ad = (Address *)m_am.getPtr(i);//&m_addresses[i];
// skip if its a plain lat/lon address
if ( ad->m_flags3 & AF2_LATLON ) continue;
// see if in nuke like
for ( long j = 0 ; j < nc ; j++ ) {
if ( nukeList[j].lat != ad->m_latitude ) continue;
if ( nukeList[j].lon != ad->m_longitude ) continue;
// blank it out
ad->m_latitude = NO_LATITUDE;
ad->m_longitude = NO_LONGITUDE;
// set m_latitude and m_longitude for the same address
HashTableX nt4;
HashTableX nt5;
char ntbuf4[5000];
char ntbuf5[5000];
nt4.set ( 8,4,256,ntbuf4,5000,false,m_niceness,"nt4addr");
nt5.set ( 8,4,256,ntbuf5,5000,false,m_niceness,"nt5addr");
// hash words of the addresses
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
// get address
Address *ad = (Address *)m_am.getPtr(i);//&m_addresses[i];
// skip address if no lat/lon
if ( ad->m_latitude != NO_LATITUDE &&
// do not add if already in there
! nt4.isInTable(&ad->m_hash) )
// return false if error adding
if ( ! nt4.addKey(&ad->m_hash,&ad) ) return false;
// deal with imported lat/lon too
if ( ad->m_importedLatitude != NO_LATITUDE &&
// do not add if already in there
! nt5.isInTable(&ad->m_hash) )
// return false if error adding
if ( ! nt5.addKey(&ad->m_hash,&ad) ) return false;
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get it
Address *ad = (Address *)m_am.getPtr(i);//&m_addresses[i];
// see if other same address but with lat/lon exists
Address **pad = (Address **) nt4.getValue ( &ad->m_hash );
// inherit otherwise
if ( pad && ad->m_latitude == NO_LATITUDE ) {
ad->m_latitude = (*pad)->m_latitude;
ad->m_longitude = (*pad)->m_longitude;
// see if other same address but with lat/lon exists
Address **pad2 = (Address **) nt5.getValue ( &ad->m_hash );
// inherit otherwise
if ( pad2 && ad->m_importedLatitude == NO_LATITUDE ) {
ad->m_importedLatitude = (*pad2)->m_importedLatitude;
ad->m_importedLongitude = (*pad2)->m_importedLongitude;
ad->m_importedVotes = (*pad2)->m_importedVotes;
// . set AF2_LATLONDUP for dup lat/lons like stubhub has
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
// get address
Address *ad = (Address *)m_am.getPtr(i);
// skip if its a plain lat/lon address
if ( !(ad->m_flags3 & AF2_LATLON) ) continue;
// see if in matches another
for ( long j = i+1 ; j < m_am.getNumPtrs() ; j++ ) {
Address *aj = (Address *)m_am.getPtr(j);
// must also be lat/lon
if ( !(aj->m_flags3 & AF2_LATLON) ) continue;
// compute distance
float d1 = ad->m_latitude - aj->m_latitude;
float d2 = ad->m_longitude - aj->m_longitude;
if ( d1 > .01 ) continue;
if ( d2 > .01 ) continue;
if ( d1 < -.01 ) continue;
if ( d2 < -.01 ) continue;
// . ok, they are the same i guess
// . prefer the one with the longest digits as the orig
// and the other as the alias
if ( ad->m_street->m_strlen > aj->m_street->m_strlen){
//aj->m_street->m_alias = ad;
ad->m_street->m_flags3 |= PLF3_LATLONDUP;
else {
//ad->m_street->m_alias = aj;
aj->m_street->m_flags3 |= PLF3_LATLONDUP;
// . fixes xml feed
// . supplant afterAt and other lat/lon addresses with a single
// winning lat/lon address
// . the problem with the getAssociatedPlace() logic above is
// that it only aliases out true street names or verified street
// names that are afterat... so we have to fix afterat streets
// that are not verified here.
// . fixes "blah blah at STUBHUB. <lat=yyy>><lon=xxx>" so that
// STUBHUB gets AF3_SUPPLANTED set so that Events.cpp ignores it
// as a competing address.
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
// get address
Address *ad = (Address *)m_am.getPtr(i);
// skip if not a lat/lon ADDRESS (unassociated with street)
// i.e. an independent lat/lon because getAssociatedPlace()
// above was returning NULL for this lat/lon..
if ( !(ad->m_flags3 & AF2_LATLON) ) continue;
// skip if dup lat/lon though
if ( ad->m_street &&
(ad->m_street->m_flags3 & PLF3_LATLONDUP) )
// get its section and blow it up until right before we
// hit a verified fake street name or we hit a street name
// or we hit a latlon that is not a latlondup.
// use Section::m_firstPlaceNum. we set that above, but
// we also set it right below in a secon call to
// setFirstPlaceNums().
Section *sk = sp[ad->m_street->m_a];
// telescope section up around this lat/lon address
for ( ; sk ; sk = sk->m_parent ) {
// breathe
// get it
long pi = sk->m_firstPlaceNum;
bool hitRealStreet = false;
// . scan places in this section
// . just like Events.cpp address assigning algo does
for ( ; pi >= 0 && pi < m_numSorted ; pi++ ) {
// get it
Place *sr = m_sorted[pi];
// stop if section breach
if ( sr->m_a >= sk->m_b ) break;
// sanity
if ( sr->m_a < 0 ) { char *xx=NULL;*xx=0; }
// skip us
if ( sr == ad->m_street ) continue;
// ignore if POBOX
if ( sr->m_flags2 & PLF2_IS_POBOX ) continue;
// skip if dup latlon
if ( sr->m_flags3 & PLF3_LATLONDUP ) continue;
// is the street name really a place name?
bool isName = ( sr->m_flags2 & PLF2_IS_NAME );
// skip if fake name
if ( isName ) continue;
// stop on real street (not-fake name)
hitRealStreet = true;
// stop if we hit real street!
if ( hitRealStreet )
// ok, supplant all if no real street name to go
// with our lat/lon
pi = sk->m_firstPlaceNum;
// do the scan again
for ( ; pi >=0 && pi < m_numSorted ; pi++ ) {
// get it
Place *sr = m_sorted[pi];
// stop if section breach
if ( sr->m_a >= sk->m_b ) break;
// sanity
if ( sr->m_a < 0 ) { char *xx=NULL;*xx=0; }
// skip us
if ( sr == ad->m_street ) continue;
// flag it
sr->m_flags3 |= PLF3_SUPPLANTED;
// normalize m_latitude and m_longitude to be from 0 to 360
// no! - just do in Events::hash() now
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
// get address
Address *ad = &m_addresses[i];
// skip address if no lat/lon
if ( ad->m_latitude == NO_LATITUDE ) continue;
if ( ad->m_latitude == AMBIG_LATITUDE ) continue;
ad->m_latitude += 180.0;
ad->m_longitude += 180.0;
// set Address::m_timeZoneOffset (from GMT)
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
// get it
Address *aa = &m_addresses[i];
Place *city = aa->m_city;
Place *zip = aa->m_zip;
Place *adm1 = aa->m_adm1;
// and city hash
uint64_t cityHash = 0;
if ( city ) cityHash = city->m_hash;
else if ( zip ) cityHash = zip->m_cityHash;
if ( ! cityHash ) { char *xx=NULL;*xx=0; }
// need this
char *adm1Str = NULL;
if ( adm1 ) adm1Str = adm1->m_adm1;
else if ( zip ) adm1Str = zip->m_adm1;
else if ( city && city->m_adm1[0] ) adm1Str = city->m_adm1;
else { char *xx=NULL;*xx=0; }
// sanity check
if ( is_upper_a(adm1Str[0]) ) { char *xx=NULL;*xx=0; }
if ( is_upper_a(adm1Str[1]) ) { char *xx=NULL;*xx=0; }
uint32_t adm1Hash32 = (uint32_t)*((uint16_t *)adm1Str);
uint32_t cityHash32 = (uint32_t)cityHash;
// combine the two hashes
uint32_t cityStateHash = hash32h(cityHash32,adm1Hash32);
// get timezone
long slot = g_timeZones.getSlot ( &cityStateHash );
// call it 0 if not good
aa->m_timeZoneOffset = 0;
// otherwise, set m_timeZoneOffset appropriately
if ( slot >= 0 )
aa->m_timeZoneOffset = *(char *)g_timeZones.
// set Section::m_firstPlaceNum
// . so we can quickly scan the places contained by a section
if ( ! setFirstPlaceNums() ) return false;
// count # of valid/inlined addresses we have
m_numValid = 0;
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
// get it
Address *aa = (Address *)m_am.getPtr(i);//&m_addresses[i];
// is inlined or verified?
bool valid = false;
if ( aa->m_flags & AF_INLINED ) valid = true;
// but unverified streetisname is not good
if ( aa->m_street && (aa->m_street->m_flags2 & PLF2_IS_NAME) )
valid = false;
if ( aa->m_flags & AF_VERIFIED_PLACE_NAME_1 ) valid = true;
if ( aa->m_flags & AF_VERIFIED_PLACE_NAME_2 ) valid = true;
if ( aa->m_flags & AF_VERIFIED_STREET ) valid = true;
if ( ! valid ) continue;
aa->m_flags3 |= AF2_VALID;
return true;
// . completion callback handed to g_httpServer.getDoc() by
//   Addresses::setGeocoderLatLons()
// . "state" is the Addresses object that issued the request
// . "s" is the socket holding the geocoder's reply
static void gotGeocoderReply ( void *state , TcpSocket *s ) {
	// get us
	Addresses *THIS = (Addresses *)state;
	// parse the reply into each address's m_geocoderLat/m_geocoderLon
	THIS->processGeocoderReply ( s );
	// resume whoever was waiting on setGeocoderLatLons()
	THIS->m_callback ( THIS->m_state );
}
// . set m_geocoderLat/m_geocoderLon for the valid addresses used in events
// . builds a single HTTP POST holding one "addrN=" parm per address that
//   still needs geocoding and hands it to g_httpServer.getDoc() with
//   gotGeocoderReply() as the completion callback
// . "state" and "callback" are saved in m_state/m_callback
// . returns false if blocks
// . returns true with g_errno set on error
// . also returns true without sending anything if it was already called,
//   if no geocoder ip is configured, or if the computed request size is 0
// . only call from Events.cpp if we have 1+ valid event that will be
// indexed...
// . NOTE(review): several statements in this function appear to be missing
//   from this copy of the file (closing braces, the loop-control and copy
//   statements that some comments below refer to, and the tail of the
//   sprintf() call). confirm against a pristine copy before relying on
//   the control flow as written here.
bool Addresses::setGeocoderLatLons ( void *state,
void (*callback) (void *state) ) {
// only call this once unless we get reset()
if ( m_calledGeocoder ) return true;
m_calledGeocoder = true;
// remember who to call back once the geocoder reply has been processed
m_callback = callback;
m_state = state;
// store candidates to select from here
long cands[MAX_GEOCODERS];
long nc = 0;
// select a geocoder by IP
for ( long i = 0 ; i < MAX_GEOCODERS ; i++ ) {
// check ip, a zero entry is skipped
if ( ! g_conf.m_geocoderIps[i] ) continue;
// add to candidates
cands[nc++] = g_conf.m_geocoderIps[i];
// if none, bail, we do not do this
if ( nc <= 0 ) return true;
// "need" accumulates how many bytes the request buffer requires
long need = 0;
// loop over each valid address and add to request size
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
// get it
Address *aa = (Address *)m_am.getPtr(i);
// reset. 999 is the "not geocoded" value; the request-building loop
// below skips any address whose m_geocoderLat is no longer 999
aa->m_geocoderLat = 999;
aa->m_geocoderLon = 999;
// is inlined or verified?
if ( ! ( aa->m_flags3 & AF2_VALID ) ) continue;
// only do it if used in event now
if ( ! ( aa->m_flags3 & AF2_USEDINEVENT ) ) continue;
// skip if lat/lon address
if ( aa->m_flags3 & AF2_LATLON ) {
// just inherit that
aa->m_geocoderLat = aa->m_latitude;
aa->m_geocoderLon = aa->m_longitude;
// check the cache first!!! used by Repair.cpp to speed up!!
// the cache key is the address hash
long long key64 = aa->m_hash;
double *recs;
long recSize;
bool inCache = m_latLonCache.getRecord ( (collnum_t) 0,
(char *)&key64 ,
(char **)&recs ,
&recSize ,
false ,
3600 ,
false );
// a cached record is used as recs[0]=lat, recs[1]=lon, unless the
// cached lat is itself the 999 "not geocoded" value
if ( inCache && recs && recs[0] != 999 ) {
aa->m_geocoderLat = recs[0];
aa->m_geocoderLon = recs[1];
// request needs street,state,city (and zip if there)
need += aa->m_street->m_strlen + 1;
// get city length
if ( aa->m_city ) need += aa->m_city->m_strlen;
else if ( aa->m_zip ) need += strlen(aa->m_zip->m_cityStr);
else if ( aa->m_flags3 & AF2_LATLON );
// no city, no zip and not a lat/lon address: write through a NULL
// pointer to force a crash
else { char *xx=NULL;*xx=0; }
if ( aa->m_zip ) need += 2 + aa->m_zip->m_strlen;
//need += aa->m_adm1->m_strlen + 1;
need += 2; // use state abbr
need += 20; // addrXXX=...&
// if none valid, bail
if ( need == 0 ) return true;
// need url cruft "http://..../"
need += 100;
// use the stack buffer when the request fits, otherwise allocate
char sbuf[5024];
char *requestBuf = NULL;
if ( need < 5024 ) requestBuf = sbuf;
if ( ! requestBuf ) requestBuf = (char *)mmalloc(need,"geocode");
// give up (still returning true) if the allocation failed
if ( ! requestBuf ) return true;
// make the url
char *p = requestBuf;
// select a geocoder randomly
long r = rand() % nc;
// to request manually:
// make the request. the six x's after Content-Length are a placeholder
// that is located again with strstr() once the content has been written
p += sprintf(p,"POST /xml? HTTP/1.0\r\n"
"Accept: */*\r\n"
"Host: %s:5678\r\n"
"Content-Length: xxxxxx\r\n"
// "num" numbers the addrN parms starting at 1
long num = 1;
// the content length is measured from here
char *contentStart = p;
// loop over each valid address and append it to the request
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
// get it
Address *aa = (Address *)m_am.getPtr(i);
// is inlined or verified?
if ( ! ( aa->m_flags3 & AF2_VALID ) ) continue;
// only do it if used in event now
if ( ! ( aa->m_flags3 & AF2_USEDINEVENT ) ) continue;
// skip if we got it already in the cache above
if ( aa->m_geocoderLat != 999 ) continue;
// for debugging
//char *start = p;
// request needs street,state,city (and zip if there)
p += sprintf(p,"addr%li=",num++);
// NOTE(review): "p" is advanced past the street, city, state and zip
// below but no statement copying those bytes into the buffer is
// visible here -- confirm the copies exist in the real file
p += aa->m_street->m_strlen;
*p++ = ',';
*p++ = ' ';
if ( aa->m_city ) {
p += aa->m_city->m_strlen;
else if ( aa->m_zip ) {
long clen = strlen(aa->m_zip->m_cityStr);
p += clen;
else if ( aa->m_flags3 & AF2_LATLON );
else { char *xx=NULL; *xx=0; }
*p++ = ' ';
// get state abbr
if ( aa->m_adm1 )
else if ( aa->m_zip )
else if ( aa->m_flags3 & AF2_LATLON );
else { char *xx=NULL;*xx=0; }
// the state abbreviation is two bytes
p += 2;
// zip if we got it, seems to help geocoder sometimes
if ( aa->m_zip ) {
*p++ = ' ';
long zlen = aa->m_zip->m_strlen;
p += zlen;
// parm separator
*p++ = '&';
// log debug
//log("addr: GET %s",start);
// null term
*p = '\0';
// fix content-length by finding the placeholder written above
char *qq = strstr(requestBuf,"xxxxxx");
if ( ! qq ) { char *xx=NULL;*xx=0; }
// the placeholder is six chars wide so the length must fit in six digits
if ( p-contentStart > 999999 ) { char *xx=NULL;*xx=0; }
qq[6]='\r'; // sprintf might have written a \0, so put \r back
// finish it
//p += sprintf(p," HTTP/1.0\r\n\r\n");
// size of it
long reqLen = p - requestBuf;
// sanity, we must not have written past our size estimate
if ( reqLen >= need ) { char *xx=NULL;*xx=0; }
// send it off to get back xml reply
bool status = g_httpServer.getDoc( cands[r] , // ip
5678 , // port
requestBuf ,
reqLen ,
this ,
gotGeocoderReply ,
60*1000 , // timeout 60s
-1 , // no max
-1 );// no max
// free the request since it mdups it
if ( requestBuf != sbuf ) mfree ( requestBuf , need , "geocode" );
// return false if it blocked
if ( ! status ) return false;
// error? ENOMEM?
if ( g_errno ) {
log("addr: get geocoder lat lon: %s",mstrerror(g_errno));
return true;
// otherwise, should always block! force a crash if it did not
char *xx=NULL;*xx=0;
return true;
// . process the geocoder's reply held in socket "s"
// . walks the addresses with the same AF2_VALID / AF2_USEDINEVENT / "still
//   999" filters that the request-building loop in setGeocoderLatLons()
//   uses, looks up each one's section of the reply and stores the result
//   in m_geocoderLat/m_geocoderLon
// . every return statement visible here returns true
// . NOTE(review): several statements appear to be missing from this copy of
//   the file (closing braces, the statements that fill in tagName and
//   endTagName, the "num" increment, log arguments and loop control).
//   confirm against a pristine copy before relying on the control flow.
bool Addresses::processGeocoderReply ( TcpSocket *s ) {
// the request itself failed, so just log why
if ( g_errno ) {
log("addr: geocoder reply: %s",mstrerror(g_errno));
return true;
// get reply
char *reply = s->m_readBuf;
//long replyAlloc = s->m_readBufSize;
//long replySize = s->m_readOffset;
// same for an empty reply
if ( ! reply || s->m_readBufSize == 0 ) {
log("addr: geocoder returned empty reply: %s",
return true;
// breathe
// "num" is the address number reported in the log messages below
long num = 0;
// loop over each address that was put into the request
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
// get it
Address *aa = (Address *)m_am.getPtr(i);
// is inlined or verified?
if ( ! ( aa->m_flags3 & AF2_VALID ) ) continue;
// only do it if used in event now
if ( ! ( aa->m_flags3 & AF2_USEDINEVENT ) ) continue;
// skip if we got it already in the cache above
if ( aa->m_geocoderLat != 999 ) continue;
// inc it
// make the tag name
char tagName[32];
// ok now get that reply
char *p = strstr(reply,tagName);
// not found?
if ( ! p ) {
log("addr: missing geocoder reply for addr #%li",num);
// get end tag of it
char endTagName[32];
char *end = strstr(p,endTagName);
// strange!
if ( ! end ) {
log("addr: missing geocoder endtag for addr #%li",num);
// tmp shutoff. NUL-terminate at the end tag so the strstr() scans
// below stop at the end of this address's section of the reply
char c = *end;
*end = '\0';
// the latitude accepted so far, NO_LATITUDE means none
double lastLat = NO_LATITUDE;
// ok, got it, grab all possible lat/lons for it
// (the loop variable "s" shadows the TcpSocket parameter "s")
for ( char *s = strstr(p,"<lat>"); s ; s=strstr(s+1,"<lat>")){
// breathe
// get that. +5 skips over the "<lat>" tag itself
double lat = atof(s+5);
// had a last? if so, and they do not match, then
// give up because i'm not sure which is right
if ( lastLat != NO_LATITUDE && lat != lastLat ) {
lastLat = NO_LATITUDE;
// mark this
lastLat = lat;
// same for longitude
double lastLon = NO_LONGITUDE;
// ok, got it, grab all possible lons for it
for ( char *s = strstr(p,"<lon>"); s ; s=strstr(s+1,"<lon>")){
// breathe
// get that. +5 skips over the "<lon>" tag itself
double lon = atof(s+5);
// had a last? if so, and they do not match, then
// give up because i'm not sure which is right
if ( lastLon != NO_LONGITUDE && lon != lastLon ) {
// mark this
lastLon = lon;
// put back for next address's reply
*end = c;
// skip if not good
if ( lastLat == NO_LATITUDE || lastLon == NO_LONGITUDE ) {
// log it now
SafeBuf sb;
if ( aa->m_city ) {
if ( aa->m_adm1 ) {
if ( aa->m_zip && aa->m_zip->m_strlen ) {
log("addr: geocoder failed on %s",sb.getBufStart());
// otherwise, set it!
aa->m_geocoderLat = lastLat;
aa->m_geocoderLon = lastLon;
// free when done
//mfree ( reply , replyAlloc , "geocodrp");
return true;
// . get the best lat/lon we have for this address
// . prefers the geocoder's coordinates (m_geocoderLat/m_geocoderLon),
//   then the ones stored in m_latitude/m_longitude
// . sets *lat/*lon to NO_LATITUDE/NO_LONGITUDE if neither pair is
//   complete, so the caller never reads uninitialized values
void Address::getLatLon( double *lat, double *lon ) {
	// use geocoder if valid
	if ( m_geocoderLat != NO_LATITUDE && m_geocoderLon != NO_LONGITUDE ) {
		*lat = (double)m_geocoderLat;
		*lon = (double)m_geocoderLon;
		// stop here so the pair below cannot override the geocoder
		return;
	}
	// use other guy otherwise
	if ( m_latitude != NO_LATITUDE && m_longitude != NO_LONGITUDE ) {
		*lat = (double)m_latitude;
		*lon = (double)m_longitude;
		return;
	}
	// otherwise, no go
	*lat = NO_LATITUDE;
	*lon = NO_LONGITUDE;
}
// . add every word id of words [a,b) to hash table "nt1" as a key, each
//   with "v" as its data value
// . entries with a zero word id are not words and are skipped
// . returns false as soon as an add fails, true otherwise
bool hashPlaceName ( HashTableX *nt1,
		     Words *words,
		     long a ,
		     long b ,
		     uint64_t v ) {
	long long *wids = words->m_wordIds;
	// hash
	for ( long k = a ; k < b ; k++ ) {
		// skip if not word
		if ( ! wids[k] ) continue;
		// add it
		if ( ! nt1->addKey ( &wids[k] , &v ) ) return false;
	}
	return true;
}
// . find the word ids that occur in both word ranges [a1,b1) and [a2,b2)
//   of the word id array "wids"
// . entries with a zero word id are not words and are skipped
// . stores at most "max" ids into "commonIds", in the order they occur in
//   the second range (an id occurring twice there is stored twice)
// . returns the number of ids stored, 0 if "max" leaves no room
// . returns -1 and sets g_errno on error
long getCommonWordIds ( long a1 , long b1 ,
			long a2 , long b2 ,
			long long *wids ,
			long long *commonIds ,
			long max ,
			long niceness ) {
	// no room at all? then do not write even one id into commonIds
	if ( max <= 0 ) return 0;
	long nc = 0;
	// small table backed by a stack buffer
	HashTableX ht;
	char sbuf[640];
	ht.set ( 8,0,64,sbuf,640,false,niceness,"cmmnwrds");
	// hash first round
	for ( long i = a1 ; i < b1 ; i++ ) {
		// skip if not word
		if ( ! wids[i] ) continue;
		// add it otherwise
		if ( ! ht.addKey ( &wids[i] ) ) return -1;
	}
	// now check the other guy
	for ( long i = a2 ; i < b2 ; i++ ) {
		// skip if not word
		if ( ! wids[i] ) continue;
		// skip if it was not in the first range
		if ( ! ht.isInTable ( &wids[i] ) ) continue;
		// add him to our common list
		commonIds[nc++] = wids[i];
		// stop if no room left
		if ( nc >= max ) break;
	}
	// return that
	return nc;
}
// . get the place that word #i can be associated with
// . starts at the smallest section containing word #i and telescopes out
//   to the first section that has a first place number (m_firstPlaceNum
//   is not negative), then scans the places of m_sorted[] that start
//   inside that section
// . returns the last place scanned if they all agree on one hash
// . returns NULL if two places disagree, if no section has places, or if
//   we have no sections at all
Place *Addresses::getAssociatedPlace ( long i ) {
	// other code in this file treats m_sections as possibly NULL
	if ( ! m_sections ) return NULL;
	// get smallest section containing word #i
	Section *si = m_sections->m_sectionPtrs[i];
	// scan addresses also in this section
	for ( ; si ; si = si->m_parent ) {
		// key mixing now
		//long key = hash32h((long)si,456789);
		// ok, now telescope our section out until we
		// find the address
		//long slot = pt->getSlot ( &key );
		// get it
		long pi = si->m_firstPlaceNum;
		// telescope if none
		//if ( slot < 0 ) continue;
		if ( pi < 0 ) continue;
		// hash of the previous place scanned, 0 if none yet
		long long lasth = 0LL;
		Place *lastpp = NULL;
		// . scan the addresses in section "si"
		// . the places in m_sorted[] are streets or are verified
		//   place names
		for ( ; pi < m_numSorted ; pi++ ) {
			// get place
			Place *pp = m_sorted[pi];
			// stop if breach
			if ( pp->m_a >= si->m_b ) break;
			// get that place
			//Place *pp = *(Place **)pt->getValueFromSlot(slot);
			// use address or alias
			Address *aa = pp->m_address;
			if ( ! aa ) aa = pp->m_alias;
			// . use the hash of the street of the place's address
			//   when it has one, so that a place name which
			//   aliases to an address compares equal to the other
			//   places of that same address
			// . an address without a street falls back to the
			//   place's own hash
			long long h = pp->m_hash;
			if ( aa && aa->m_street ) h = aa->m_street->m_hash;
			// . compare to last h
			// . if they differ we can not be sure with which
			//   address we are associated
			if ( lasth && h != lasth ) { lastpp = NULL; break; }
			// set it for next guy
			lasth = h;
			// save it
			lastpp = pp;
		}
		// get that address
		//Place *pa = *(Place **)pt->getValue(&key);
		// this returns NULL if we had multiple possible addresses
		return lastpp; // pa;
	}
	return NULL;
}
// . array for setting s_lc hashtable
// . these are words that can be lower case in a place name
// . fixes "Santa Maria de la Paz Catholic Church" not being a place name
// . each word is listed once
static char *s_lcWords[] = {
	"re",  // you're
	"s",   // Slim's
	"y",   // spanish "and" "Pupuseria y Restaurant Salvado"
	"del", // spanish "of the" "Bosque del Apache National Wildlife Refuge"
	"de",  // spanish "of" "Casa de las Chimeneas"
	"las", // spanish "the"
	"ll",  // they'll this'll that'll you'll
	"ve"   // would've should've
};
// . compute the hashes that identify place "p" and store them on "p"
// . always sets m_hash; for non-state places also sets m_wordHash64 and
//   m_simpleHash32; for streets also sets m_streetNumHash/m_streetIndHash
// . "ww" is the Words class that p->m_a/p->m_b index into. if p->m_a is
//   negative then p->m_str/p->m_strlen are tokenized here and used instead
// . returns false with g_errno set on error
bool setHashes ( Place *p , Words *ww , long niceness ) {
	//Words *ww = m_words;
	long a = p->m_a;
	long b = p->m_b;
	// adm1 hash is just hash of the two letters
	if ( p->m_type == PT_STATE ) {
		// must be there
		// do not core here anymore since we could be a foreign
		// latlon only place in which case this will be zero.
		// happens when such a place is in the contactinfo tag
		//if ( ! p->m_adm1Bits ) { char *xx=NULL;*xx=0;}
		//p->m_hash = hash64Lower_utf8 ( p->m_adm1 , 2);
		// will this work?
		p->m_hash = p->m_adm1Bits;
		return true;
	}
	// if place name was taken from a tag or placedb then we have
	// to set the words class ourself
	Words tmp;
	if ( p->m_a < 0 ) {
		// return false with g_errno set on error
		if ( ! tmp.set ( p->m_str ,
				 p->m_strlen ,
				 true ,
				 niceness ) ) return false;
		// set it up
		ww = &tmp;
		a = 0;
		b = ww->m_numWords;
	}
	long long *wids = ww->m_wordIds;
	long *wlens = ww->m_wordLens;
	char **wptrs = ww->m_words;
	long nw = ww->m_numWords;
	// the straight up hash
	long long h = 0LL;
	// hash of the non indicator alpha words in street name
	long long h1 = 0;
	// . includes hash of directional indicators
	// . we only use this if street name is a directional indicator
	long long h2 = 0;
	// back up hash of the street indicators, used if h1 and h2 are 0
	long long h2b = 0;
	// the street num hash, word id of the first number
	long long h3 = 0;
	// hash of the last indicator ("st", "ne", ...)
	long long h4 = 0;
	// word id of previous word
	long long pi = 0LL;
	long alphaCount = 0;
	long long prevIndId = 0LL;
	// to fix the street that is "25 School" we cannot map "school"
	// to h_zero
	bool isStreet = ( p->m_type == PT_STREET );
	// sanity check -- no, suites start with punct!
	//if ( ! wids[a] ) { char *xx=NULL;*xx=0; }
	p->m_simpleHash32 = 0;
	// loop over words
	for ( long i = a ; i < b ; i++ ) {
		// skip if not alnum word
		if ( ! wids[i] ) continue;
		// make a simple hash so setting the EV_STORE_HOURS flag
		// in Events.cpp works, since we compare it to the simple
		// hash of the event title
		p->m_simpleHash32 ^= (uint32_t)wids[i];
		// this logic taken from Sections.cpp where it is setting
		// Section::m_sentenceContentHash
		if ( p->m_simpleHash32 == 0 )
			p->m_simpleHash32 = 123456;
		// get synonym of word id
		long long *swid = getSynonymWord ( &wids[i] , &pi , isStreet );
		// word id of previous word
		pi = wids[i];
		// mix it up
		h <<= 1LL;
		// xor it in
		h ^= *swid;
		// done if not street
		if ( p->m_type != PT_STREET ) continue;
		// is street a place name in disguise? if so, continue
		if ( p->m_flags2 & PLF2_IS_NAME ) continue;
		// shortcut
		bool isNum = ww->isNum2(i);
		// count it
		if ( ! isNum ) alphaCount++;
		// the street num hash, hash of the first number
		if ( isNum && h3 == 0 ) h3 = wids[i];
		// is this word like "st" or "ave" or "blvd"
		IndDesc *id=(IndDesc *)g_indicators.getValue(swid);
		// hash of last "indicator"
		if ( id ) {
			// map them
			h4 = *swid;
			// map "N.E." to "NE"
			if ( prevIndId == h_north && *swid == h_east )
				h4 = h_northeast;
			if ( prevIndId == h_north && *swid == h_west )
				h4 = h_northwest;
			if ( prevIndId == h_south && *swid == h_east )
				h4 = h_southeast;
			if ( prevIndId == h_south && *swid == h_west )
				h4 = h_southwest;
			// save that
			prevIndId = *swid;
		}
		// . prevIndId only means for the previous word, so reset it
		// . this has to be the "else" of the indicator check. if it
		//   ran unconditionally prevIndId would always be zero and
		//   the "N.E." mapping above could never fire
		else
			prevIndId = 0LL;
		// set some flags based on indFlags
		bool isStreetInd = ( id && (id->m_bit & IND_STREET) );
		bool isDir = ( id && (id->m_bit & IND_DIR ) );
		// cancel the 'S' indicator if potential
		// apostrophe! "aug 17 burt's lounge"
		// we do not want "17 burt's"
		if ( isDir &&
		     wlens[i] == 1 &&
		     (wptrs[i][0]=='s' || wptrs[i][0]=='S') &&
		     i > 1 &&
		     wptrs[i][-1] != ' ' )
			isDir = false;
		// . update this.
		// . exclude numbers from this!
		// . allow other numbers if no alpha word before them!
		// . exclude directional indicators from this
		// . MDW: for PLF2_INTERSECTION "streets" we need to allow
		//   when i == a! because we do not have numeric addresses
		//   for intersections, so made it from i>a to i>=a
		if ( i >= a &&
		     // but allow directional indicators if right after
		     // the street number though, like "123 west street"
		     ( ! isDir || i == a + 2 ) &&
		     // commenting this out hurts "100 3/4 road"
		     // but it helps "2001 1/2 montgomery blvd"
		     //( ! isNum || alphaCount == 0 ) &&
		     ! isNum &&
		     ! isStreetInd ) {
			// mix it up
			h1 <<= 1;
			// xor it
			h1 ^= *swid;//wids[j];
		}
		// fix "2804 hwy 250" from excluding the "250"
		if ( isNum && alphaCount > 0 ) {
			// mix it up
			h1 <<= 1;
			// xor it
			h1 ^= *swid;//wids[j];
		}
		// set back up hash in case the others are 0
		if ( isStreetInd ) {
			h2b <<= 1;
			h2b ^= wids[i];
		}
		if ( isDir ) {
			// mix it up
			h2 <<= 1;
			// include it in this
			h2 ^= wids[i];
		}
	}
	// set hash
	p->m_hash = h;
	// keep this as it is
	p->m_wordHash64 = h;
	// . if we are a city look up in g_places and see if we are an
	//   alias for a different city name
	// . fix "abq" so it maps to albuquerque
	// . we now fixed getAddressHash() so this logic is not needed
	//if ( p->m_type == PT_CITY ) { // && (p->m_flags & PF_IS_ALIAS) ) {
	//	// convert hash to alias hash
	//	long long *newh = (long long *)g_aliases.getValue ( &h );
	//	// set that to h now
	//	if ( newh ) p->m_hash = *newh;
	//	// could not find this city in the table... strange
	//	return true;
	//}
	// done if not street
	if ( p->m_type != PT_STREET ) return true;
	// only use the purer hash if it is non-zero
	if ( h1 ) p->m_hash = h1;
	else if ( h2 ) p->m_hash = h2;
	else p->m_hash = h2b;
	// sanity check
	//if ( p->m_hash == 0 ) { char *xx=NULL;*xx=0; }
	p->m_streetNumHash = h3;
	p->m_streetIndHash = h4;
	// if we are a "fake" street
	if ( p->m_flags2 & PLF2_IS_NAME ) {
		// PROBLEM: the street "6201 San Antonio Dr NE" is matching
		// the place name "San Antonio" so let's mix up "h" a little
		// when we are using "place names" in place of the street
		// ALSO, lets revert it back to "h" not "h1", since "h1" is
		// probably zero since i added that extra "continue" above.
		p->m_hash = h ^ 0x123456;
		// . sanity check
		// . no! the word "The" has a hash of 0, and we don't add it
		//   from the caller's point
		//if ( p->m_hash == 0LL ) { char *xx=NULL;*xx=0; }
		// done if a fake street
		return true;
	}
	// do not read wptrs[a] below if word #a does not exist, e.g. when
	// the place string tokenized into zero words
	if ( a < 0 || a >= nw ) return true;
	// done if street was not a "pobox street"
	if ( to_lower_a(wptrs[a][0])!='p' ) return true;
	// assume none. "k" ends up as the word # of the box number
	long k = -1;
	// "p o box 123"
	if ( a + 6 < nw &&
	     wids[a ] == h_p &&
	     wids[a+2] == h_o &&
	     wids[a+4] == h_box &&
	     is_digit(wptrs[a+6][0]) )
		k = a + 6;
	// "post office box 123"
	if ( a + 6 < nw &&
	     wids[a ] == h_post &&
	     wids[a+2] == h_office &&
	     wids[a+4] == h_box &&
	     is_digit(wptrs[a+6][0]) )
		k = a + 6;
	// "po box 123"
	if ( a + 4 < nw &&
	     wids[a ] == h_po &&
	     wids[a+2] == h_box &&
	     is_digit(wptrs[a+4][0]) )
		k = a + 4;
	// "p.o. 81255"
	if ( a + 4 < nw &&
	     wids[a ] == h_p &&
	     wids[a+2] == h_o &&
	     is_digit(wptrs[a+4][0]) )
		k = a + 4;
	// "p o b 81255"
	if ( a + 6 < nw &&
	     wids[a ] == h_p &&
	     wids[a+2] == h_o &&
	     wids[a+4] == h_b &&
	     is_digit(wptrs[a+6][0]) )
		k = a + 6;
	// not a po box i guess
	if ( k == -1 ) return true;
	// xor it in along with h_po so every spelling of the same po box
	// gets the same hash
	p->m_hash = h_po ^ wids[k];
	return true;
}
// table of the words that may be lower case inside a place name. set2()
// looks a word id up in it with isInTable() when deciding if a word counts
// as a stop word. presumably filled from s_lcWords[] at init -- TODO confirm
static HashTableX s_lc;
//static char s_lcbuf[2000];
// NOTE(review): not referenced in the code visible here; presumably a table
// of job-related words -- confirm against where it is populated
static HashTableX s_jobTable;
// . called from above
// . returns false and sets g_errno on error
bool Addresses::set2 ( ) {
// sanity check
if ( ! s_init ) { char *xx=NULL; *xx=0; }
bool printed = false;
// shortcuts
long nw = m_words->getNumWords();
// msg13 provides a NULL sections ptr. it can't set them for speed!
// it is the spider compression proxy...
Section **sp = NULL;
if ( m_sections ) sp = m_sections->m_sectionPtrs;
// shortcut
//Sections *ss = m_sections;
// reset # of addresses we got
//m_na = 0;
// and streets
//m_ns = 0;
// and cities, states, zips
//m_np = 0;
// place mem and street mem and address mem
// init them. poolSize=5000.initnumpoolptrs=300.initnumplaceptrs=3000
// . inherit from contact info page ONLY IF NO OTHERS
// . tag format = "city=x;adm1=*;adm2=*;country=*"
// . get up to 10 addresses from the contact info
Address da[10];
// init
long dc = 0;
// first address is the empty one
memset ( &da[0] , 0 , sizeof(Address) );
// skip it
// get contact info addresses, use their city/state for our addresses
long tt = getTagTypeFromStr ( "contactaddress" );
Tag *tag = NULL;
// . taken from TagRec::getTag() function
// . Msg13.cpp does not have tag..
if ( m_gr ) tag = m_gr->getFirstTag();
// loop over all contact info addresses in the TagRec
for ( ; tag && dc < 10 ; tag = m_gr->getNextTag(tag) ){
// breathe
// . skip if not a "address" tag (ci=contactInfo)
// . no, now these are venue default addresses
if ( tag->m_type != tt ) continue;
// get str
char *str = tag->getTagData();
// reserve mem for it
// . set address, da[dc], from tag "tag"
// . flags to OR into Place::m_bits
return false;
// if it was a latlon only address, just skip it for now
// because i'm not sure what the effects will be. plus its
// m_adm1 and m_city are typically NULL!!
if ( da[dc].m_flags3 & AF2_LATLON )
// check it out
// . this just means it was an AF2_LATLON but we were not
// able to set that because it has the foreign state
// and city and country set.
//if ( ! da[dc].m_adm1->m_hash ) { char *xx=NULL;*xx=0; }
if ( ! da[dc].m_adm1->m_hash ) continue;
// advance
// . inherit from what says about our place
// . tag format = "city=x;adm1=*;adm2=*;country=*"
if ( ( tag = m_gr->getTag("abyznewslinks.address") ) &&
// skip if not a "address" tag (ci=contactInfo)
tag->m_type == tt ) {
// get str
char *str = tag->m_data;
// . set address, da[dc], from tag "tag"
// . flags to OR into Place::m_bits
if ( ! setFromStr ( &da[dc] , str,PLF_FROMTAG,m_niceness))
return false;
// advance
// now use the default venue address, should be more accurate?
tt = getTagTypeFromStr ( "venueaddress" );
// taken from TagRec::getTag() function
if ( m_gr ) tag = m_gr->getFirstTag();
// loop over all contact info addresses in the TagRec
for ( ; tag && dc < 10 ; tag = m_gr->getNextTag(tag) ){
// breathe
// . skip if not a "address" tag (ci=contactInfo)
// . no, now these are venue default addresses
if ( tag->m_type != tt ) continue;
// get str
char *str = tag->getTagData();
// . set address, da[dc], from tag "tag"
// . flags to OR into Place::m_bits
return false;
// if it was a latlon only address, just skip it for now
// because i'm not sure what the effects will be. plus its
// m_adm1 and m_city are typically NULL!!
if ( da[dc].m_flags3 & AF2_LATLON )
// check it out
// . this just means it was an AF2_LATLON but we were not
// able to set that because it has the foreign state
// and city and country set.
//if ( ! da[dc].m_adm1->m_hash ) { char *xx=NULL;*xx=0; }
if ( ! da[dc].m_adm1->m_hash ) continue;
// advance
// stop it
// let's use the meta description as well.
// should get jonson gallery on
//char *md = m_xd->getMetaDescription();
// . if section flag is one of these, ignore the words in it
// . google seems to index marquee, so i took SEC_MARQUEE out
// . SEC_HIDDEN applies to text and tags in style=display:none tags.
// fill this array
//Place streets[MAX_STREETS];
//Place *streets = m_streets;
//long qx = 0;
// the copyright symbol in utf8 (see Entities.cpp for the code)
char copy[3];
copy[0] = 0xc2;
copy[1] = 0xa9;
copy[2] = 0x00;
// shortcuts
Words *ww = m_words;
long long *wids = ww->getWordIds();
char **wptrs = ww->getWordPtrs();
long *wlens = ww->getWordLens();
nodeid_t *tids = ww->getTagIds();
// . if section flag is one of these, ignore the words in it
// . google seems to index marquee, so i took SEC_MARQUEE out
// . SEC_HIDDEN applies to text and tags in style=display:none tags.
// shortcut
wbit_t *bits = NULL;
if ( m_bits ) bits = m_bits->m_bits;
// does the word "at" precede the potential address?
//bool atPreceeds = false;
// reset this position
long alnumPos = -1;
// "b" of last street added
long lastb = -1;
// previous word id
long long savedPrevWid = 0LL;
// scan the entire document
for ( long i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// debug
//if ( wptrs[i][0]=='1' &&
// wptrs[i][1]=='3' &&
// wptrs[i][2]=='1' ) {
// char *xx=NULL;*xx=0; }
// skip if not an alnum word
if ( ! wids[i] ) {
if ( wlens[i] == 1 ) continue;
if ( wlens[i] > 5 ) continue;
if ( ! m_words->hasChar(i,'&') ) continue;
// skip if in a script section
if ( sp && sp[i] && (sp[i]->m_flags & badFlags) ) continue;
// stop if streets are maxed
//if ( m_ns >= MAX_STREETS ) break;
// record
long long prevWid = savedPrevWid;
// and update
savedPrevWid = wids[i];
// it's an alnum OR has " & " (see above)
if ( wids[i] ) alnumPos++;
// . if we are not outside the scope of previous street then
// keep going!
// . fixes "1025 1/2 Lomas Blvd" from picking up the substreet
// of "2 Lomas Blvd" which was causing an AF_AMBIGUOUS
if ( i < lastb ) continue;
// make this the end point
// quickly add po boxes
if ( to_lower_a(wptrs[i][0])=='p' ||
// sometimes they just have "box 27693" like on
to_lower_a(wptrs[i][0])=='b' ) {
// assume none
long j = -1;
// the hash
//long long poh = 0LL;
// "box 123"
if ( i + 2 < nw &&
wids[i ] == h_box &&
is_digit(wptrs[i+2][0]) ) {
j = i + 2;
// "p o box 123"
if ( i + 6 < nw &&
wids[i ] == h_p &&
wids[i+2] == h_o &&
wids[i+4] == h_box &&
is_digit(wptrs[i+6][0]) ) {
j = i + 6;
//poh = h_po ^ wids[j];
// "p o box 123"
if ( i + 6 < nw &&
wids[i ] == h_post &&
wids[i+2] == h_office &&
wids[i+4] == h_box &&
is_digit(wptrs[i+6][0]) ) {
j = i + 6;
//poh = h_po ^ wids[j];
// p o b 123
if ( i + 6 < nw &&
wids[i ] == h_p &&
wids[i+2] == h_o &&
wids[i+4] == h_b &&
is_digit(wptrs[i+6][0]) ) {
j = i + 6;
//poh = h_po ^ wids[j];
// "po box 123"
if ( i + 4 < nw &&
wids[i ] == h_po &&
wids[i+2] == h_box &&
is_digit(wptrs[i+4][0]) ) {
j = i + 4;
//poh = h_po ^ wids[j];
// "p.o. 81255"
if ( i + 4 < nw &&
wids[i ] == h_p &&
wids[i+2] == h_o &&
is_digit(wptrs[i+4][0]) ) {
j = i + 4;
//poh = h_po ^ wids[j];
// skip if no good
if ( j < 0 ) continue;
// shortcuts
long a = i;
long b = j+1;
// add the street
Place *street = (Place *)m_sm.getMem(sizeof(Place));
if ( ! street ) return false;
street->m_a = a;
street->m_b = b;
street->m_alnumA = alnumPos;
street->m_alnumB = alnumPos+(j-i+2)/2;
street->m_type = PT_STREET;
street->m_str = wptrs[i];
street->m_strlen = wptrs[j]+wlens[j]-wptrs[i];
//street->m_adm1[0] = 0;
//street->m_adm1[1] = 0;
street->m_adm1Bits= 0LL;
//street->m_crid = 0;
street->m_flags2 = 0;
street->m_bits = 0;
street->m_address = NULL;
street->m_alias = NULL;
//street->m_hash = poh;
street->m_streetNumHash = wids[j];
street->m_streetIndHash = h_po;
// prevent overlap with next street
lastb = street->m_b;
// . need to know this for getting place name
// . place name must also be in upper case if po box is
if ( is_upper_a(wptrs[i][0]) )
street->m_bits |= PLF_HAS_UPPER;
// and note that it is a po box so Events.cpp can
// exclude it as an event location
street->m_flags2 |= PLF2_IS_POBOX;
// set its m_hash member
setHashes ( street , m_words , m_niceness );
// set some bits
for ( long k = a ; bits && k < b ; k++ )
bits[k] |= D_IS_IN_STREET;
// advance
// stop if overflowing
//if ( m_ns >= MAX_STREETS ) break;
// advance, no! this fux up alnumPos... use lastb
//i = j;
// to next
// we might be a street intersection!
bool hasAmp = m_words->hasChar(i,'&') ;
if ( wids[i] == h_and || hasAmp ) {
//if ( m_words->hasChar(i,'&') ) {
// save it
long old = m_sm.getNumPtrs();
// use this
long alnumPosArg = alnumPos;
// modify alnumPos if we are amp so it doesn't double
// count the word before the ampersand!
if ( hasAmp ) alnumPosArg++;
//m_ns = m_ns;
if ( ! addIntersection(i,alnumPosArg) )
return false;
// show it
long a = i - 8;
long b = i + 8;
if ( a < 0 ) a = 0;
if ( m_ns != old ) {
a = m_streets[m_ns-1].m_a;
b = m_streets[m_ns-1].m_b;
char *str = m_wptrs[a];
long ss = m_words->getStringSize ( a , b );
SafeBuf pp;
char c = str[ss];
str[ss] = 0;
char *gs = "bad";
if ( m_ns != old ) gs = "GOOD";
log("intersect: %s \"%s\"", gs,str);
str[ss] = c;
//m_ns = m_ns;
long ns = m_sm.getNumPtrs();
// if no intersection added, keep on going
if ( ns == old ) continue;
// keep going if not a street before it either
if ( ns <= 1 ) continue;
// get it and street before it
Place *s1 = (Place *)m_sm.getPtr(ns-1);
Place *s2 = (Place *)m_sm.getPtr(ns-2);
// get prev two streets
if ( s2->m_a > s1->m_a ) {
// i saw this for
// "Corner of 1551 State Route 232 and
// State Route 52". the street at m_ns-2
// was "1551 State Route 232" and the
// intersection street started at the word
// "Corner", so its m_a was less than...
// so in this case, let's simply disregard
// this intersection and not core.
// CAUTION. some m_bits are still set to
// D_IS_IN_STREET though...
// url was
//m_ns = old;
m_sm.setNumPtrs ( old );
//char *xx=NULL;*xx=0; }
// do not overlap streets!
//i = streets[m_ns-1].m_b - 1;
lastb = s1->m_b;
// we must now start with a number since we are just doing
// addresses in the usa, BUT i am now allowing "PO Box 1234"
// to be a valid street address
if ( ! is_digit(wptrs[i][0]) && wids[i] != h_one ) continue;
// if we are h_one we must be capitalized!
if ( wids[i] == h_one && wptrs[i][0] != 'O' ) continue;
// must not be in a date!
if ( bits &&
(bits[i] & D_IS_IN_DATE) &&
// noon street?
wids[i] != h_daily &&
wids[i] != h_noon &&
wids[i] != h_midnight )
// a '#' sign cannot precede us
// "KELLY S #7 JUAN TABO 1418 JUAN TABO NE, ..."
// . no! messes up "#3515 Berkeley Place NE"
//if ( i-1 >= 0 && wptrs[i ][-1]=='#' ) continue;
//if ( i-1 >= 0 && wptrs[i-1][ 0]=='#' ) continue;
// do not split hyphens
if ( i-2 >= 0 &&wptrs[i-1][0]=='-'&&wlens[i-1]==1&&wids[i-2])
// do not split periods like '1."5 miles west"'
if ( i-1 >= 0 && wptrs[i-1][0]=='.'&&wlens[i-1]==1 )
// fix "top X", that is not a street name!
if ( i-2 >= 0 && wids[i-2] == h_top )
// fix "route 66 casino" (highway 32 hotdogs) etc.
if ( i-2 >= 0 && wids[i-2] == h_route )
if ( i-2 >= 0 && wids[i-2] == h_rte )
// . fix 'highway "14 on the sandia crest road"'
// . yeah, the "14" is not a street address
if ( i-2 >= 0 && wids[i-2]==h_highway )
// fix 'hwy "14 on the sandia crest road"'
if ( i-2 >= 0 && wids[i-2]==h_hwy )
// fix 'hwy "14 on the sandia crest road"'
if ( i-2 >= 0 && wids[i-2]==h_hiway )
// fix "8600 West Bryn Mawr Avenue, Suite 920-N, Chicago, IL"
if ( prevWid == h_suite )
// and "county road" i guess
if ( i-2 >= 0 && wids[i-2]==h_cr )
// and "state road/route 14" too i guess
if ( i-4 >= 0 &&
(wids[i-4]==h_state ||
wids[i-4]==h_cnty ||
wids[i-4]==h_cty ||
wids[i-4]==h_county ) &&
(wids[i-2]==h_road ||
wids[i-2]==h_rd ||
wids[i-2]==h_rt ||
wids[i-2]==h_rte ||
wids[i-2]==h_route ) )
// . skip if in an "open" section
// . cored on
// . 'continue' was causing us to miss 4915 hawkins street
// for that url, so i commented out
//if ( sp[i]->m_wordEnd == -1 ) {
// char *xx=NULL;*xx=0;
// continue;
// sanity check. make sure its the right section
//if ( i >= sp[i]->m_wordEnd ) {char*xx=NULL;*xx=0;}
// sanity check
if ( sp && i < sp[i]->m_a ) {char*xx=NULL;*xx=0;}
// are we a stop word?
//bool isStop = wlens[i] <=1 || ww->isQueryStopWord(i);
// are we cap?
//bool isCap = ww->isCapitalized(i);
// do not start with uncapitalized stop word
//if ( isStop && ! isCap ) continue;
// never start with "At"
//if ( wids[i] == h_at ) { atPreceeds = true; continue; }
// count the number of numbers
long nums = 0;
// are we delimited on the left end?
//bool leftEnd = false;
// keep an accumulative hash of all the wids in the phrase
bool firstWasDir = false; // 1st word is a direction?
bool hadCornerDir = false;
char uc = -1; // are we capitalized?
long alphaCount = 0;
long indCountStreet = 0;
long indCountDir = 0;
long stopCount = 0;
long numCount = 0;
bool firstWordIsNum = false;
bool lastWasNum = false;
bool lastWasDir = false;
long commaCount = 0;
long alnumsInPhrase = 0;
long long lastIndStreetHash = 0LL;
// hash of the non indicator alpha words in street name
//long long h1 = 0;
// . includes hash of directional indicators
// . we only use this if street name is a directional indicator
//long long h2 = 0;
//long long h2b = 0;
//long long h3 = 0;
//long long h4 = 0;
// word id of previous word
//long long pi = 0LL;
// punct right before us is a left bookend
//if ( i-1 >= 0 && wlens[i-1] >= 2 ) leftEnd = true;
//if ( i-1 >= 0 && wptrs[i-1][0] != ' ' &&
// getUtf8CharSize(wptrs[i-1])==1) leftEnd = true;
// if we are a number that is good too
//if ( is_digit(wptrs[i][0]) ) leftEnd = true;
// or a number is before us
//if ( i-1 >= 0 && is_digit(wptrs[i-1][0]) ) leftEnd = true;
// or tag is before us, no alnumword in between us and the tag
//if ( i-1 >= 0 && tids[i-1] ) leftEnd = true;
//if ( i-2 >= 0 && tids[i-2] ) leftEnd = true;
// if we are cap'd and word before us is not let that be a
// delimeter as well
//if (i-2>= 0 && isCap && wids[i-2] &&!ww->isCapitalized(i-2))
// leftEnd = true;
// need a delimeter on the left
//if ( ! leftEnd ) { atPreceeds = false; continue; }
// save it
long ns_stack = m_sm.getNumPtrs();//m_ns;
// a flag for "1025 1/2 Lomas Blvd NE..."
long fractionj = -1;
// "620-624 Central Ave SW." (El Rey) ?
bool hasRange = false;
// fix for "4909-15 Hawkins NE" for
bool hasHyphenAddress = false;
// reset this
long lastSpecialj = -1;
// loop over it
for ( long j = i ; j < nw ; j++ ) {
// breathe
QUICKPOLL ( m_niceness );
// we can never contain a tag
if ( tids[j] ) {
// skip if <sup>
if ( tids[j] == TAG_SUP ) continue;
if ( tids[j] == (TAG_SUP|BACKBIT) ) continue;
// fix "1024 4th st sw <span>edit</span>" for
// url, but be careful, i think
// a trumba url or something uses spans
// within its addresses
if ( (indCountDir || indCountStreet) &&
tids[j] == TAG_SPAN )
// skip if non-breaking tag
if ( ! isBreakingTagId(tids[j]) )
// . allow br tags since microsofot front page
// . no! this is causing the zip code from
// a previous address to be used as the
// street address for the name of a business
// for
// . well then at least allow it for
// "14 s.<br>2nd street"???? dunno...
//if ( tids[j] == TAG_BR )
// continue;
// allow xml tags
// . NO! this may help because they
// have stupid xml tags in between addresses
// but it hurts
// "86454011</guid>\r\n\t\t\t
// <xCal:summary>9th Annual Thanksgiving..."
// because most people do not do this!
//if ( tids[j] == TAG_XMLTAG )
// continue;
//if ( tids[j] == (TAG_XMLTAG|BACKBIT) )
// continue;
// otherwise, stop it
// are we punctuation?
if ( ! wids[j] ) {
// single space is ok
if (wptrs[j][0]==' '&&wlens[j]==1) continue;
// double space is ok
if (wptrs[j][0]==' '&&wptrs[j][1]==' '&&
wlens[j]==2) continue;
// period only after abbreviation
if ( wptrs[j][0] == '.' && j > 0 &&
// watch out for "4477 9TH AVE. SE"
// from
m_words->isSpaces2(j,1) )
//wptrs[j][1] == ' ' && wlens[j]==2 )
// . period after a single letter as well
// . N. M.
if ( wptrs[j][0] == '.' && j > 0 &&
wlens[j-1]==1 &&
// fix "8. wall street"
!is_digit(wptrs[j-1][0]) &&
wptrs[j][1] == ' ' &&
wlens[j]==2 ) continue;
// N.M.
if ( wptrs[j][0] == '.' && j > 0 &&
// fix 1."5 miles west"
!is_digit(wptrs[j-1][0]) &&
wlens[j-1]==1 && wlens[j]==1 ) continue;
// quote: The Noyes House 2525 "N" Avenue
// National
if (wptrs[j][0]=='\"'&&wptrs[j][1]==' ' &&
// 'closer to 37"' is not a street name!
if (wptrs[j][0]==' ' &&wptrs[j][1]=='\"'&&
wlens[j]==2) continue;
// punct mark: st. michael's drive
if (wptrs[j][0]=='\''&&wlens[j]==1) continue;
// mosby's run: utf8 apostrophe
if (wlens[j]==3&&
wptrs[j][0]==-30 &&
wptrs[j][1]==-128 &&
wptrs[j][2]==-103 )
// village of los ranchos growers' market
if (wptrs[j][0]=='\''&&wptrs[j][1]==' '&&
wlens[j]==2) continue;
// hyphens usually bad, but x-y is ok.
// fix "3650-A Hwy 528..."
is_alpha_a(wptrs[j+1][0])) continue;
// "620-624 Central Ave SW." (El Rey)
if ( hasRange &&j==i+1 ) continue;
// fix for 4909-15 Hawkins NE" for
wlens[j-1]>=4&&is_digit(wptrs[j-1][0]) ) {
hasHyphenAddress = true;
// sequence of whitespace is ok
long k; for(k=0;k<wlens[j];k++)
if(!is_wspace_a(wptrs[j][k])) break;
if(k==wlens[j]) continue;
// '/' is ok if part of a fraction!
if( j == fractionj ) continue;
// . allow commas in foreign street addresses
// . brazil street address:
// "Rua Afonso Canargo, 805"
//if ( wptrs[j][0]==',' && wptrs[j][1]==' ' &&
// is_digit(wptrs[j][2]) &&
// j>0 && !is_digit(wptrs[j][-1]) ) {
// commaCount++;
// continue;
//if ( wptrs[j][0]==' ' && wptrs[j][1]==',' &&
// is_digit(wptrs[j][2]) &&
// j>0 && !is_digit(wptrs[j][-1]) ) {
// commaCount++;
// continue;
// . comma allowed only b4 directional indicatr
// . "131 Monroe St, NE"
// . no because we got a false positive:
// "1024 4th street, sw corner..."
// . ok, this is back again now! BUT... need
// to make sure a tag or city name follows it
// . crap, now we got
// "5305 Gibson, S.E. <b>Albuquerque ..."
// . shoot, also need to watch out for
// "Wisconsin Ave., NW"
if ( j+3 >= nw ) break;
bool commaAfter = false;
if ( wptrs[j][0]==',' )
commaAfter = true;
if ( wptrs[j][0]=='.' && wptrs[j][1]==',')
commaAfter = true;
if ( wptrs[j][0]==' ' && wptrs[j][1]==',')
commaAfter = true;
if ( ! commaAfter ) break;
char gotDir = 0;
if ( wids[j+1] == h_ne ) gotDir = 2;
if ( wids[j+1] == h_nw ) gotDir = 2;
if ( wids[j+1] == h_se ) gotDir = 2;
if ( wids[j+1] == h_sw ) gotDir = 2;
if ( wids[j+1] == h_n&&wids[j+3]==h_e)gotDir=4;
if ( wids[j+1] == h_n&&wids[j+3]==h_w)gotDir=4;
if ( wids[j+1] == h_s&&wids[j+3]==h_e)gotDir=4;
if ( wids[j+1] == h_s&&wids[j+3]==h_w)gotDir=4;
if ( ! gotDir ) break;
// do not breach
if ( j+gotDir >= nw ) continue;
// its great if tag follows the dir indicator
if ( tids[j+gotDir] ) continue;
// do not breach
if ( j+gotDir+1 >= nw ) continue;
// or a punct then a tag
if ( tids[j+gotDir+1] ) continue;
// fix for "700 Louisiana, SE 87108" for
// url
// ok, a cap word must follow
if ( ! is_upper_utf8 (wptrs[j+gotDir+1]))break;
// we are good
// otherwise, stop, we hit bad punct that
// can not be included in a street address
// . otherwise we are alphanumeric
// . more than 10 is too many for a street
if ( alnumsInPhrase++ >= 10 ) break;
// one comma is enough for a street address
if ( commaCount >= 2 ) break;
// . forbidden words
// . fixes "less than ; 1 mile away ; abq nm"
if ( wids[j] == h_away ) break;
// showing "39 results near" Albuquerque, NM
if ( wids[j] == h_results ) break;
// "3 Ave, E 144 To E 145 Sts"
if ( j==i+2 && wids[j] == h_to ) break;
// "11 Ave" implies "11th avenue"
if ( j==i+2 && wids[j] == h_ave ) break;
if ( j==i+2 && wids[j] == h_avenue ) break;
// "24 st to crescent st"
// has
// a ton of street formations describing park
// boundaries. so fix those:
if ( j==i+2 &&
j+2<nw &&
(wids[j] == h_st ||
wids[j] == h_sts ||
wids[j] == h_street ||
wids[j] == h_streets ||
wids[j] == h_ave ||
wids[j] == h_avenue ||
wids[j] == h_road ||
wids[j] == h_rd ) &&
(wids[j+2] == h_bet ||
wids[j+2] == h_between ||
wids[j+2] == h_btwn ||
wids[j+2] == h_to ||
wids[j+2] == h_at ) )
// "90 And E"
if ( j==i+2 && wids[j] == h_and ) break;
// 124 st btwn 5 ave"
if ( wids[j] == h_btwn ) break;
// are we a stop word?
//bool isStopWord=wlens[j]<=1 ||ww->isQueryStopWord(j);
bool isStopWord=wlens[j]<=1 ||s_lc.isInTable(&wids[j]);
// treat this as a stop word, fixes
// "2001 E 7<sup>th</sup>"
if ( lastWasNum ) {
if ( wids[j] == h_th ) isStopWord = true;
if ( wids[j] == h_st ) isStopWord = true;
if ( wids[j] == h_nd ) isStopWord = true;
if ( wids[j] == h_rd ) isStopWord = true;
// are we upper or not?
bool upper = is_upper_utf8(wptrs[j]);
// do we have an upper or lower case word?
if ( uc == -1 && ! is_digit(wptrs[j][0]) ) {
if ( upper ) uc = 1;
else if ( ! isStopWord ) uc = 0;
// mixed case? if so stop!
if ( ! isStopWord &&
! is_digit(wptrs[j][0])&&
upper != uc ) {
// . fix "123 Wyoming ave."
// . fix "123 Wyoming ne"
IndDesc *id;
id=(IndDesc *)g_indicators.getValue(&wids[j]);
// set some flags based on indFlags
if ( ! id ) break;
// must be "avenue" or "ne" etc.
if ( ! (id->m_bit & IND_STREET) &&
! (id->m_bit & IND_DIR) )
// if lower case stop word of two letters or more
// leads then do not allow that
// "1950 in New York, NY"
if ( isStopWord && wlens[j]>=2 && !upper && j==i+2 )
// "7 days a week"
if ( wids[j]==h_days && j==i+2 )
// "2 blocks north"
if ( wids[j]==h_blocks && j==i+2 )
// "1 block north"
if ( wids[j]==h_block && j==i+2 )
// "90 miles north"
if ( wids[j]==h_miles && j==i+2 )
// "1 hour ago"
if ( wids[j]==h_hour && j==i+2 )
if ( wids[j]==h_hr && j==i+2 )
// "8 hours ..."
if ( wids[j]==h_hours && j==i+2 )
if ( wids[j]==h_hrs && j==i+2 )
// "2 mi north"
if ( wids[j]==h_mi && j==i+2 )
// "cross 8 mile road"
if ( wids[j]==h_mile && j==i+2 )
// "90 kilometers north"
if ( wids[j]==h_kilometers && j==i+2 )
// "90 km north"
if ( wids[j]==h_km && j==i+2 )
// "5 reviews"
if ( wids[j]==h_reviews && j==i+2)
// 18 year(s) old
if ( (wids[j] == h_year ||
wids[j] == h_years ||
wids[j] == h_yr ||
wids[j] == h_yrs ) && j==i+2 )
// this is not a street:
// "[copyright] 2008 The E.W. Scripps Co."
if ( j==i && i-1>0 && !tids[i-1] && !wids[i-1] &&
gb_strncasestr(wptrs[i-1],wlens[i-1],copy) )
// this is not a street:
// "[copyright] 1997 - 2009 Albuquerque Journal"
if ( j==i && i-4>0 && is_digit(wptrs[i-2][0]) &&
gb_strncasestr(wptrs[i-1],wlens[i-1],copy) )
// assume not
bool isDir = false;
bool isStreetInd = false;
// shortcut
bool isNum = ww->isNum2(j);
// set "lastWasNum"
if ( isNum ) lastWasNum = true;
else lastWasNum = false;
// treat this as a number too!
if ( wids[j] == h_one ) isNum = true;
// are we a number? (might also be "13a")
if ( isNum ) {
// . only one number per phrase?
// . NO! "2860 state highway 14 N.". needs 2!
if ( ++nums >= 3 ) break;
// if a $ precedes, that is bad!
if ( j-1>=0 && wptrs[j][-1]=='$' ) break;
// . or break in front
// . was messing up "Elk Lodge #929\n
// 1720 N Montana Ave" so i added the tids
// check
// . i took this out because of
// "Albertsons #903 4300 ridge crest..."
// for http://www.estrelladelnortevineyard.
// com/SFV_retloc.php
// !ww->hasChar(j-1,','))
// break;
// . filter "23,000 years ago"
// . filter "ages 8-16"
// . filter "ages 8 - 16"
// . filter "june 3-31"
// . filter "june 3 - 31"
// . filter "tuesday 3 - 5"
// . get first number, make it word #f
if ( wlens[j]==3 && j-2>=0 &&
wlens[j-2]<=3 &&
wptrs[j-1][1]=='-') ) {
// "620-624 Central Ave SW." (El Rey)
// if word was not a number before us
if ( ! hasRange ) break;
if ( j != i+2 ) break;
if ( wlens[j]<=3 && j+2<nw &&
is_digit(wptrs[j+2][0]) &&
wlens[j+2]==3 &&
wlens[j+1]==1 &&
wptrs[j+1][1]=='-') ) {
// "620-624 Central Ave SW." (El Rey)
// if word was not a number before us
if ( j != i ) break;
if ( wptrs[j+1][0]==',') break;
long a = ww->getAsLong(j);
long b = ww->getAsLong(j+2);
if ( a >= b ) break;
if ( b - a > 10 ) break;
// i guess it is ok now
hasRange = true;
// no years.
long n = ww->getAsLong(j);
// possible possessive year?
if ( n>=1980 && n<=2030 &&
j+1<nw && wptrs[j+1][0]=='\'')
// year ending in s (1960s)
// count it
// and if we are first
if ( i == j ) firstWordIsNum = true;
// use for street num hash
//if ( nums == 1 ) h3 = wids[j];
// inc this count if not a number
else alphaCount++;
// time indicator?
//if ( wids[j] == h_am ) break;
//if ( wids[j] == h_pm ) break;
//if ( wids[j] == h_a && j+2<nw &&wids[j+2]==h_m)break;
//if ( wids[j] == h_p && j+2<nw &&wids[j+2]==h_m)break;
// break if we hit a suite indicator
if ( wids[j] == h_suite ) break;
if ( wids[j] == h_ste ) break;
// does a single letter or number follow "room"?
bool numFollows = false;
if ( j+2<nw && is_digit(wptrs[j+2][0]))numFollows=true;
// a single letter counts as a number too!
if (j+2<nw&&wids[j+2] && wlens[j+2]==1)numFollows=true;
// or ends in a number (like "A1")
if ( j+3<nw &&is_digit(wptrs[j+3][-1]))numFollows=true;
// these are like suites but need a number or
// single letter after them
if ( ( wids[j] == h_unit ||
wids[j] == h_bldg ||
wids[j] == h_bld ||
wids[j] == h_building ||
wids[j] == h_room ||
wids[j] == h_pier ||
wids[j] == h_rm ) && numFollows )
// does this number start a fraction?
// 1025 1/2 Lomas Boulevard North West, Albuquerque, NM
if ( isNum && numCount == 2 && j+2<nw &&
wlens[j] == 1 && wptrs[j+1][0]=='/' &&
wlens[j+1]==1 && ww->isNum(j+2) ) {
// ignore it kinda
numCount -= 2;
nums -= 2;
// allow the / to pass
fractionj = j+1;
// no back to back numbers allowed in street address
else if ( isNum && j+3<nw && ww->isNum(j+2) &&
// exception for "1025 1/2 Lomas Blvd..."
( wptrs[j+3][0]!='/' || wlens[j+3]!=1) &&
// exception for "4909-15 hawkins NE"
// for
wlens[j]<=2&&wlens[j-2]>=4) &&
! hasRange )
// street has 2 or less numbers though!
if ( numCount >= 3 ) break;
// . if we are the 2nd number in the street name
// we must follow a "highway" or "state route" or
// "state road" or such abbreviation...
// . if we are "3rd" that should not be considered a
// num so isNum should be false for that,
// but we might have 3<sup>rd</sup>
// . this screws up "Corrales Office Plaza,
// 3611 NM 528 NW, Ste. B, ABQ 87114" and makes us
// think the road is "528 NW" and "3611 NM" is
// part of the place name
if ( isNum && numCount == 2 ) {
// assume not ok!
bool ok = false;
// are we ok?
if ( i-2>=0 && wids[i-2]==h_hwy )
ok = true;
if ( i-2>=0 && wids[i-2]==h_highway )
ok = true;
if ( i-4>=0 &&
wids[i-4]==h_state &&
wids[i-2]==h_road )
ok = true;
if ( i-4>=0 &&
wids[i-4]==h_state &&
wids[i-2]==h_route )
ok = true;
// get next alnum word, should be
// the "th" in "4 th street" for example
long nn = i + 2;
if ( nn<nw && tids[nn] ) nn++;
if ( nn<nw && !wids[nn] ) nn++;
if ( nn<nw && wids[nn]==h_st ) ok = true;
if ( nn<nw && wids[nn]==h_nd ) ok = true;
if ( nn<nw && wids[nn]==h_rd ) ok = true;
if ( nn<nw && wids[nn]==h_th ) ok = true;
if ( ! ok )
// . fix "4701 wyoming blvd. NE abq nm 87111"
// . watch out for "501 elizabeth st. S.E."
// . after dir pretty much stop
// . "204 bryn mawr drive north east" --> 5 --> 6
if ( indCountDir>0 && alphaCount >= 6 ) break;
// containing an indicator qualifies us.
IndDesc *id=(IndDesc *)g_indicators.getValue(&wids[j]);
// set some flags based on indFlags
if ( id && (id->m_bit & IND_STREET) ) {
// invalidate it if it is "8k run"
if ( wids[j] == h_run &&
j-2>0 &&
is_digit(wptrs[j-2][0]) &&
to_lower_a(wptrs[j-1][-1])=='k' )
// otherwise count it
isStreetInd = true;
// save it
lastIndStreetHash = wids[j];
// back up hash
//h2b <<= 1;
//h2b ^= wids[j];
if ( id && (id->m_bit & IND_DIR ) ) {
// cancel the 'S' indicator if potential
// apostrophe! "aug 17 burt's lounge"
// we do not want "17 burt's"
if ( wlens[j]==1&&
(wptrs[j][0]=='s' ||
wptrs[j][0]=='S' ) &&
j>1 && wptrs[j][-1]!=' ' )
id = NULL;
else {
// mix it up
//h2 <<= 1;
// include it in this
//h2 ^= wids[j];
// assume not
lastWasDir = false;
if ( id && (id->m_bit & IND_DIR ) ) {
isDir = true;
if ( alphaCount == 1 ) firstWasDir = true;
// se? ne? nw? sw?
if ( wlens[j] == 2 ) hadCornerDir = true;
// northeast? etc.
if ( wlens[j] >= 9 ) hadCornerDir = true;
lastWasDir = true;
// . fix "1024 4th st sw <span>edit</span>" for
// url
// . this caught "330 Tijeras Ave NW Ofc Albuquerque,"
// . and "1664 Bridge Boulevard Southwest Rea" but i
// don't know what ofc and rea mean??
// . crap we lost "10000 NW Coors Blvd" which is a
// type-o
//if ( hadCornerDir && ! id && alphaCount >= 2 )
// break;
// stop "KELLY S #7 JUAN TABO 1418 JUAN TABO NE"
// from giving "7 JUAN TABO 1418 JUAN TABO NE" street
// basically, do not allow a part of the street name
// to be after this 2nd number...
if ( numCount == 2 &&
! isNum &&
! isDir &&
! isStreetInd &&
! hasRange &&
! hasHyphenAddress &&
wids[j] != h_st &&
wids[j] != h_nd &&
wids[j] != h_rd &&
wids[j] != h_th )
// get synonym of word id
//long long *swid = getSynonymWord ( &wids[j] , &pi );
// word id of previous word
//pi = wids[j];
// this too
//if ( id ) h4 = *swid;//wids[j];
// . update this.
// . exclude numbers from this!
// . allow other numbers if no alpha word before them!
// . exclude directional indicators from this
// . but allow directional indicators if right after
// the street number though
//if ( j > i &&
// ( ! isDir || j == i + 2 ) &&
// // commenting this out hurts "100 3/4 road"
// // but it helps "2001 1/2 montgomery blvd"
// //( ! isNum || alphaCount == 0 ) &&
// ! isNum &&
// ! isStreetInd ) {
// // mix it up
// h1 <<= 1;
// // xor it
// h1 ^= *swid;//wids[j];
// fix "2804 hwy 250" from excluding the "250"
//if ( isNum && alphaCount > 0 ) {
// // mix it up
// h1 <<= 1;
// // xor it
// h1 ^= *swid;//wids[j];
// count stop words
//if ( ! id && ww->isStopWord(j) ) stopCount++;
if ( ! id && s_lc.isInTable(&wids[j]) ) stopCount++;
// need at least one number to be a street address
if ( numCount == 0 ) continue;
// . first or last word must be num
// . now i am deciding to limit to america only so
// we need the first word to be a number
//if ( ! firstWordIsNum && ! isNum ) continue;
if ( ! firstWordIsNum ) continue;
// need at least one alpha word
if ( alphaCount <= 0 ) continue;
// if first was number and we are stop word,
// no stop word right after the number!
// "2009 at the arts alliance gallery,1100 san mateo.."
// what about "488 E. hwy 66" ! E is a stop word!
//if ( numCount == 1 && stopCount == 1 &&
// alnumsInPhrase == 2 )
// break;
// can't have just stop words
if ( alphaCount == stopCount ) continue;
// or if a single char word, skip!
if ( j == i && wlens[i] == 1 ) continue;
// do not split hyphens
if ( j+2 <nw && wlens[j+1]==1 && wptrs[j+1][0]=='-'&&
// if both are digits, it is ok!
(!is_digit(wptrs[j][0])||!is_digit(wptrs[j+2][0])) )
// ok, now we are name, street or suite
bool goodStreet = ( indCountStreet >= 1 );
// if we are not an indicator but "Paseo de" precedes
// us like in "Paseo de Peralta" then consider us to
// be good!
bool isPaseoDe = false;
if ( ! isStreetInd && j-4 > i &&
(wids[j-2]==h_de ||
// "407 paseo del canon" for
wids[j-2]==h_del ) &&
wids[j-4]==h_paseo ) {
isPaseoDe = true;
goodStreet = true;
// . can't end on a lower case word if we have upper
// . "311 Main Street is in" was a street name!!
if ( uc==1 && ! upper && !is_digit(wptrs[j][0]))
goodStreet = false;
// direction is ok too
if ( firstWasDir ) goodStreet = true;
if ( isDir ) goodStreet = true;
// if just one alpha word and one indicator,that is bad
if ( alphaCount == 1 && indCountStreet==1 )
goodStreet = false;
if ( alphaCount == 1 && indCountDir ==1 )
goodStreet = false;
// if we are not good but an indicator follows, wait
if ( ! goodStreet && j+2<nw ) {
IndDesc *id=(IndDesc *)
if ( id && (id->m_bit & IND_STREET) ) continue;
if ( id && (id->m_bit & IND_DIR ) ) continue;
if ( is_digit(wptrs[j+2][0] ) ) continue;
// did we have a highway? (or state route)
bool isHighwayNum = false;
if ( isNum && j-2>=0 && wids[j-2] == h_highway )
isHighwayNum = true;
if ( isNum && j-2>=0 && wids[j-2] == h_hwy )
isHighwayNum = true;
if ( isNum && j-2>=0 && wids[j-2] == h_hiway )
isHighwayNum = true;
if ( isNum && j-2>=0 && wids[j-2] == h_cr )
isHighwayNum = true;
if ( isNum && j-4>=0 &&
(wids[j-4] == h_state ||
wids[j-4] == h_county ||
wids[j-4] == h_cnty ||
wids[j-4] == h_cty ) &&
( wids[j-2] == h_rd ||
wids[j-2] == h_road ) )
isHighwayNum = true;
// 1501 Route 66 (no state or county before it req'd)
if ( wids[j-2] == h_route ||
wids[j-2] == h_rte ||
wids[j-2] == h_rt )
isHighwayNum = true;
// ok if we are like "1300 state route 12" that is good
if ( isHighwayNum )
goodStreet = true;
// two or more street indicators can signify
// a combo of two streets. crap but we have
// "750 North St. Francis Drive" !
// "1300 st. hway 14"
//if ( indCountStreet >= 2 ) goodStreet = false;
// we must end on an indicator (or be like hwy 13)
if ( ! isDir && ! isStreetInd && ! isHighwayNum &&
! isPaseoDe )
goodStreet = false;
// . check this only if we need to
// . fixes "328 galisteo<br>santa fe. NM 87501"
// . should fix's
// "T & D Market 485 Parker, Santa Rosa, NM..."
if ( ! goodStreet &&
alphaCount >= 1 &&
! isNum && j+2<nw &&
// fix for "77kkob am abq nm" (radio station fix)
wids[j] != h_am &&
wids[j] != h_fm ) {
long follows = cityAdm1Follows(j+2);
// good then
if ( follows ) goodStreet = true;
// error? this can never happen...
//if ( follows == -1 ) return false;
// fix for "6th Ave. New York, NY" which
// thinks that the city is "York!" for
if ( follows ) {
long f2 = cityAdm1Follows(j);
// this can never happen... comment out
//if ( f2 == -1 ) return false;
if ( f2 ) goodStreet = false;
// if suite follows that is good too:
// "One Hallidie Plaza, Suite 404,..."
// from
if ( ! goodStreet && alphaCount >= 1 &&
! isNum && j+2<nw &&
( wids[j+2]==h_suite ||
wids[j+2]==h_ste ) ) {
// set it good
goodStreet = true;
// does a single letter or number follow "room"?
bool numFollows2 = false;
if( j+4<nw && is_digit(wptrs[j+4][0]))numFollows2=true;
// a single letter counts as a number too!
if(j+4<nw&&wids[j+4] && wlens[j+4]==1)numFollows2=true;
// or ends in a number (like "A1")
if( j+5<nw &&is_digit(wptrs[j+5][-1]))numFollows2=true;
// room <num> is likewise a good stopping point
if ( ! goodStreet &&
alphaCount >= 1 &&
! isNum &&
numFollows2 &&
( wids[j+2]==h_building ||
wids[j+2]==h_bldg ||
wids[j+2]==h_bld ||
wids[j+2]==h_unit ||
wids[j+2] == h_pier ||
wids[j+2] == h_room ||
wids[j+2] == h_rm ) )
goodStreet = true;
// if we end on "hwy" and a number follows, incl #
if ( (wids[j] == h_hwy ||
wids[j] == h_highway ||
wids[j] == h_hiway ||
wids[j] == h_cr ) &&
j + 2 < nw && ww->isNum(j+2) && wlens[j+1]<=3 &&
! tids[j+1] &&
// fix "86 Old Las Vegas Hwy., 983-2700."
! ww->hasChar(i,',') &&
(j+3>=nw||wptrs[j+3][0]!='-') )
goodStreet = false;
// same goes for state routes/roads
if ( (wids[j] == h_route ||
wids[j] == h_road ||
wids[j] == h_rd ||
wids[j] == h_rte ||
wids[j] == h_route ) &&
j - 2 >= 0 &&
( wids[j-2] == h_state ||
wids[j-2] == h_cty ||
wids[j-2] == h_cnty ||
wids[j-2] == h_county )&&
j + 2 < nw && ww->isNum(j+2) && wlens[j+1]<=3 &&
! tids[j+1] &&
// anticipate similar problem to
// "86 Old Las Vegas Hwy., 983-2700."
! ww->hasChar(i,',') &&
(j+3>=nw||wptrs[j+3][0]!='-') )
goodStreet = false;
// must not end on a lower case stop word of 2+ letters
if ( wids[j] == h_and || wids[j] == h_or ||
// fixes "2006 census for ... abq nm"
wids[j] == h_for )
goodStreet = false;
// fix 'b "9 st n" of boardwalk'
if ( numCount == 1 &&
indCountDir == 1 &&
indCountStreet == 1 &&
// fix "357 Court NE" for
lastIndStreetHash != h_court &&
lastWasDir &&
alphaCount == (indCountDir + indCountStreet) )
goodStreet = false;
// add as a street?
if ( ! goodStreet ) continue;
// only add one street per i
// UNLESS lasti ended right before a city or state
// in which case we should add both
if ( lastSpecialj == -1 )
//m_ns = ns_stack;
// record if a city/state follows us so if we end
// up absorbing that city/state to make a bigger
// street name then we create 2+ streets and do not
// erase the previous one
if ( goodStreet &&
j+4<nw &&
// "9501 Indian School NE" for
// was thinking about "School, Nebraska" so
// let's fix that with this h_ne constraint
m_wids[j+4] != h_ne && // nebraska = NorthEast
cityAdm1Follows(j+2) &&
lastSpecialj < 0 )
lastSpecialj = j;
// . erase previous entry if same starting point
// . like "501 Copper Ave" vs "501 Copper Ave. NW"
//if ( ns > 0 && i == streets[ns-1].m_a ) ns--;
// length of current street (place)
//long plen = (wptrs[j] + wlens[j]) - wptrs[i];
// short cut
long a = i;
long b = j+1;
// fix "corrales bosque gallery
// 4685 Corrales Rd. *in* Corrales NM"
if ( m_wids[b-1] == h_in && alphaCount >= 2 ) {
b -= 2;
alnumsInPhrase -= 1;
// length of current street (place)
long plen = (wptrs[b-1] + wlens[b-1]) - wptrs[a];
// add the street
Place *street = (Place *)m_sm.getMem(sizeof(Place));
if ( ! street ) return false;
street->m_a = a;
street->m_b = b;
street->m_alnumA = alnumPos;
street->m_alnumB = alnumPos + alnumsInPhrase;
street->m_type = PT_STREET;
street->m_str = wptrs[i];
street->m_strlen = plen;
//street->m_adm1[0] = 0;
//street->m_adm1[1] = 0;
street->m_adm1Bits= 0LL;
//street->m_crid = 0;
street->m_flags2 = 0;
street->m_bits = 0;
street->m_address = NULL;
street->m_alias = NULL;
// only use the purer hash if it is non-zero
//if ( h1 ) street->m_hash = h1;
//else if ( h2 ) street->m_hash = h2;
//else street->m_hash = h2b;
//street->m_streetNumHash = h3;
//street->m_streetIndHash = h4;
// set its m_hash member
setHashes ( street , m_words , m_niceness );
// prevent overlap with next street
lastb = street->m_b;
// . need to know this for getting place name
// . place name must also be in upper case if
// the street is...
if ( uc == 1 ) street->m_bits |= PLF_HAS_UPPER;
// . set some bits
// . only do this if we are the unambiguous part,
// otherwise we miss "Sandia Park" in
// the 2nd street has "SANDIA PARK" as part of it
// and is doesn't get considered as a city to add
// to m_places[] below because this bit was getting
// set -- i.e. we don't take cities from street names
if ( lastSpecialj==-1 || lastSpecialj==j ) {
for ( long k = a ; bits && k < b ; k++ )
bits[k] |= D_IS_IN_STREET;
// this is a hack
if ( lastSpecialj >= 0 && lastSpecialj != j ) {
long ns = m_sm.getNumPtrs();
Place *ps = (Place *)m_sm.getPtr(ns-2);
ps ->m_flags2 |= PLF2_COLLISION;
street->m_flags2 |= PLF2_COLLISION;
// had an indicator? ave rd or direction
//if ( indCountDir || indCountStreet )
// street->m_flags2 |= PLF2_HAD_INDICATOR;
// point to next street
// stop if overflowing
//if ( m_ns >= MAX_STREETS ) break;
// nuke this
//atPreceeds = false;
// end i loop - go to next potential start of a phrase
// SET the m_places[] array (m_np) of cities, states and zips
// we now allow any street address to use any city/state mentioned
// anywhere in the document.
// for setting Place
alnumPos = -1;
long ignoreUntil = -1;
long lastCityAlnumB = -1;
long long prevWid = 0LL;
bool inCityIndicator = false;
bool inStateIndicator = false;
// scan the entire document
for ( long i = 0 ; i < nw ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// a tag?
if ( m_tids[i] ) {
// assume not an indicator tag
inCityIndicator = false;
inStateIndicator = false;
// must be xml
if ( m_tids[i] != TAG_XMLTAG ) continue;
// it can indicate things
char *tagName = m_wptrs[i]+1;
if ( strncasecmp(tagName,"eventCity",9) == 0 )
inCityIndicator = true;
if ( strncasecmp(tagName,"eventState",10) == 0 )
inStateIndicator = true;
// skip if not alnum
if ( ! m_wids[i] ) continue;
// skip if in a script section
if ( sp && (sp[i]->m_flags & badFlags) ) continue;
// count alnums
// skip if in a street. avoid getting "NE" for nebraska when
// it is in a street like "1234 girard NE" or something. same
// goes for streets named after cities or states. and using
// zip codes that are street numbers
// . assume if in street not capitalized, fixes
// "123 Main Street Abq" so Abq is not in a phrase too
if ( bits && (bits[i] & D_IS_IN_STREET) ) continue;
// skip if in menu
//if ( sp[i]->m_flags & SEC_MENU ) continue;
if ( i < ignoreUntil ) continue;
// get it
long long lastWid = prevWid;
// update it
prevWid = m_wids[i];
// must be a zip
if ( is_digit(m_wptrs[i][0]) ) {
// shortcut
// this crashed for h=70799779105646092LL
// word="60527"
long long h = m_wids[i];
// 5 digits
if ( m_wlens[i] != 5 ) continue;
// check for zip code
long slot = g_zips.getSlot(&h);
// skip if not
if ( slot < 0 ) continue;
// make sure only one! US-only for now...
// unfortunately we do have zips that have multiple
// city names... so we can't have this here...
// later we should add code to pick the best one...
// get the place
ZipDesc *zd =(ZipDesc *)g_zips.getValueFromSlot(slot);
// sanity check
//if ( m_np >= MAX_PLACES ) { char *xx=NULL;*xx=0; }
// ok, add this entry
Place *p = (Place *)m_pm.getMem(sizeof(Place));
if ( ! p ) return false;
// set it
p->m_adm1Bits = zd->m_adm1Bits;
p->m_adm1[0] = zd->m_adm1[0];
p->m_adm1[1] = zd->m_adm1[1];
p->m_type = PT_ZIP;
p->m_a = i;
p->m_b = i+1;
p->m_alnumA = alnumPos;
p->m_alnumB = alnumPos+1;
p->m_str = m_wptrs[i];
p->m_strlen = m_wlens[i];
p->m_hash = h;
p->m_cityHash = zd->m_cityHash;
p->m_cityStr = g_cityBuf + zd->m_cityOffset;
p->m_bits = 0;
// set PLF_FROMTITLE bit
if ( sp ) {
Section *ss = sp[p->m_a];
if ( ss->m_flags & SEC_IN_TITLE )
p->m_bits |= PLF_FROMTITLE;
// . fix <eventCity>abq</eventCity> for pageaddevent
// . update this now that we set lastWidCapitalized
if ( ! is_upper_utf8(m_wptrs[i]) &&
! inCityIndicator &&
! inStateIndicator )
// . deal with "Kansas City"
// . deal with "New Mexico" where "New" is also a city!
// . does this word start a city?
Place *pc = getCityPlace ( i , alnumPos , m_words );
// or start a state?
Place *ps = getStatePlace ( i , alnumPos , m_words );
// . ignore two letter state codes that are not both capped
// . fixes "In" "De Paul" "Co" "La"
if ( ps &&
ps->m_strlen==2 &&
// unless like <eventState>nm</eventState>
! inStateIndicator &&
!is_upper_a(m_wptrs[ps->m_a][1]) &&
// . unless we follow a city!
// . fixes "New Orleans;La;70113" for
lastCityAlnumB != alnumPos )
ps = NULL;
// if neither, continue on
if ( ! pc && ! ps ) continue;
// set preferred place, "pp"
Place *pp = NULL;
if ( ! pp ) pp = pc;
if ( ! pp ) pp = ps;
// . if tied prefer longer. if length tied prefer state
// . "California" is both a state and a city
if ( pc && ps ) {
// kill state if city longer
if ( pc->m_alnumB > ps->m_alnumB ) ps = NULL;
// or kill city if state is longer
else if ( pc->m_alnumB < ps->m_alnumB ) pc = NULL;
if ( pc )
lastCityAlnumB = pc->m_alnumB;
// set this
if ( pc ) ignoreUntil = pc->m_b;
if ( ps ) ignoreUntil = ps->m_b;
// prevent breach
// leave some room for adding places below...
//if ( m_np + 200 > MAX_PLACES ) {
// log("addr: too many cities/state to store in places "
// "array. truncating.");
// break;
// //char *xx=NULL;*xx=0;
bool inTitle = false;
// do not do this if called from msg13 and have no sections
if ( sp && (sp[i]->m_flags & SEC_IN_TITLE) ) inTitle = true;
if ( pc ) {
// shortcut
Place *p = (Place *)m_pm.getMem(sizeof(Place));
if ( ! p ) return false;
// ok, good to add
memcpy ( p , pc , sizeof(Place) );
// set PLF_FROMTITLE bit
if ( inTitle ) p->m_bits |= PLF_FROMTITLE;
// if last word was in,set this
if ( lastWid == h_in ) p->m_flags2 |= PLF2_REQUIRED;
if ( ps ) {
// shortcut
Place *p = (Place *)m_pm.getMem(sizeof(Place));
if ( ! p ) return false;
// ok, good to add
memcpy ( p , ps , sizeof(Place) );
// set PLF_FROMTITLE bit
if ( inTitle ) p->m_bits |= PLF_FROMTITLE;
// if last word was in,set this
if ( lastWid == h_in ) p->m_flags2 |= PLF2_REQUIRED;
// record end of this
m_npSaved = m_pm.getNumPtrs(); // m_np;
// make a list of occupation names to avoid false positive
// identification of a place because it is after the word "at" but
// really it is something like "john, an engineer at HP, ..." referring
// to where that person works. fixes
// which has
// "jon carpenter, digital strategist at stratacomm"
// left off on
// search for "One who..." in dictionary? "person that ..."
// "meet the engineer at cisco"
// - does the at phrase modify "meet" or "engineer" ???
static char *s_jobs[] = {
// interim rector at St. Margaret's (
"residence", // "artist in residence at the LA county HS"
"monkey", // grease monkey code monkey
static bool s_initJobs = false;
if ( ! s_initJobs ) {
// load it up
if ( ! initWordTable ( &s_jobTable,s_jobs,sizeof(s_jobs),
"jobstbl") )
return false;
// do not re-do
s_initJobs = true;
// "Tingley Colesium"
// We treat POTENTIAL place names as street names for all practical
// purposes.
// flag
char lastWasBreak = 0;
// reset this since we loop anew
alnumPos = -1;
// set if "at" precedes the name
bool atFlag = false;
long long lastWid = 0LL;
// do not do this if we are javascript
long ni = nw;
if ( m_contentType == CT_JS ) ni = 0;
// do not do this if called from msg13
if ( ! m_sections ) ni = 0;
// the first word in a td table cell
long firstWordInCell;
// first we identify the candidate place names
for ( long i = 0 ; i < ni ; i++ ) {
// skip tags
if ( tids[i] ) {
// input tags reset at tag, like
// Location: <input ...> for
if ( tids[i] == TAG_INPUT ) atFlag = false;
// hit a td cell?
if ( sp[i]->m_tagId == TAG_TD )
firstWordInCell = sp[i]->m_firstWordPos;
lastWasBreak = 1;
// skip if in script section or whatever to keep alnumPos right
if ( sp[i]->m_flags & badFlags ) continue;
// skip if not alnum word
if ( ! wids[i] ) {
// if not just spaces, then we are a "break" in which
// case set "lastWasBreak" to true
char *p = wptrs[i];
char *pend = p + wlens[i];
for ( ; p < pend ; p++ ) {
if ( is_wspace_a(*p) ) continue;
// Dave & Buster's
if ( *p == '\'' ) continue;
// Dave & Buster's
if ( *p == '&' ) continue;
// St. John's College
if ( *p == '.' && is_wspace_a(p[1]) &&
i>0 && isAbbr(wids[i-1]) )
lastWasBreak = 1;
// skip this now
// it's an alnum
// remember last i
bool saved = atFlag;
// and update to the new one
atFlag = false;
// save this
long long savedWid = lastWid;
// update it now
lastWid = wids[i];
// do not start with a date
if ( bits && (bits[i]&D_IS_IN_DATE)){lastWasBreak=1;continue;}
// a lower guy followed by an upper guy is a break
if ( is_lower_utf8 ( wptrs[i] ) &&
is_upper_utf8 ( wptrs[i] ) ) {lastWasBreak = 1;continue;}
// if it is the first word in a td cell and the column header
// is like "location" or "venue" then mark it as after at
if ( i == firstWordInCell ) {
// get column header
Section *cp = sp[i]->m_headColSection;
if ( cp &&
cp->m_firstWordPos > 0 &&
// skip the header itself
cp->m_firstWordPos != i &&
// must just be one word for now
cp->m_firstWordPos == cp->m_lastWordPos &&
( wids[cp->m_firstWordPos] == h_location ||
wids[cp->m_firstWordPos] == h_venue ||
wids[cp->m_firstWordPos] == h_where ) ) {
// assume what follows is a place name
saved = true; // atFlag = true;
lastWasBreak = 1;
// this is a break
if ( wids[i] == h_at ) {
// ignore it though if previous word was one of
// these because it could be driving directions!!
// this fixes the "4139 prospect" event because we
// thought it had two locations and it got
// SEC_MULT_LOCATIONS because we thought "at Menaul"
// was a place name and not a driving direction
// for the url
if ( savedWid == h_left ||
savedWid == h_right ||
// appeared at the blah
savedWid == h_appeared ||
// had a role at the world premier
savedWid == h_role ||
savedWid == h_studied ||
// won a prize at the blah
savedWid == h_prize ||
savedWid == h_right ||
// men who stare at goats
savedWid == h_stare ||
savedWid == h_gaze ||
savedWid == h_look ||
savedWid == h_looking ||
//savedWid == ||
savedWid == h_north ||
savedWid == h_south ||
savedWid == h_east ||
savedWid == h_west ) {lastWasBreak=0;continue;}
// "at sea"
if ( i+2<nw &&
( wids[i+2] == h_sea ||
// "at discounted"
wids[i+2] == h_discounted ||
// "at"
wids[i+2] == h_www ||
// $10 at door
wids[i+2] == h_door ||
// "at discount price"
wids[i+2] == h_discount ) ) {
// skip directional at phrases like
// "(at Siler Road)" from
if ( i+4<nw &&
( wids[i+4]==h_road ||
// at the finish [line] (racing)
wids[i+4]==h_finish ||
// "at the door"
wids[i+4]==h_door ||
// "at [a|the] discount[ed]"
wids[i+4]==h_discount ||
wids[i+4]==h_discounted ||
wids[i+4]==h_street ||
wids[i+4]==h_avenue ||
wids[i+4]==h_ave ||
wids[i+4]==h_st ||
wids[i+4]==h_rd ) ) {
// "at the entrance" but not "at the entrance to"
if ( i+4<nw &&
wids[i+4] == h_entrance &&
(i+6>=nw || wids[i+6]!=h_to ) ) {
// . at the X area
// . x = registration (for races)
if ( i+6<nw &&
wids[i+2] == h_the &&
wids[i+6] == h_area ) {
// "[occupation] at [company]"
if ( s_jobTable.isInTable(&savedWid) ) {
// otherwise assume what follows is a place name
atFlag = true;
lastWasBreak = 1;
// location: or where: indicates a location too!
if ( ( wids[i]==h_location ||
wids[i]==h_venue ||
wids[i]==h_where ) &&
i+1<nw && ww->hasChar(i+1,':') &&
// fix "Events at this location:" for
(i-2<0 || wids[i-2]!=h_this) ) {
atFlag = true;
lastWasBreak = 1;
// skip the colon-containing word
// . "come to" is similar to "at"
// . fixes
// show.details/showid/238/metropolis-wine-tasting.html
if ( i+4<nw && wids[i] == h_come && wids[i+2]== h_to ) {
atFlag = true;
lastWasBreak = 1;
i = i + 2;
// skip "at least"
if ( saved && wids[i] == h_least ) {lastWasBreak=0;continue;}
if ( saved && wids[i] == h_most ) {lastWasBreak=0;continue;}
if ( saved && wids[i] == h_this ) {lastWasBreak=0;continue;}
// allow lower case "the" after "at", but skip it
if ( saved && wids[i] == h_the ) {
// check for fake at phrase
if ( i+2 < nw && (wids[i+2] == h_heart ||
wids[i+2] == h_core ) ) {
// skip it
lastWasBreak = 0; continue; }
// if it is lower case skip it so it is not
// included in the place name
if ( is_lower_utf8(wptrs[i]) ) {
atFlag = true; lastWasBreak = 1; continue; }
// otherwise do not do the lower case check right below
// "at the entrance"
else if ( saved && wids[i] == h_entrance ) {
atFlag = true;
// not a break because we need "at the entrance to the"
lastWasBreak = 0;
else if ( saved && wids[i] == h_to && savedWid == h_entrance ){
atFlag = true;
lastWasBreak = 1;
// does it have some kind of delimiter before it?
else if ( is_lower_utf8(wptrs[i])){lastWasBreak = 0; continue;}
// each candidate needs some kind of "break" before it
if ( ! lastWasBreak ) continue;
// skip if in a script section
if ( sp[i]->m_flags & badFlags ) continue;
// or in menu
if ( sp[i]->m_flags & SEC_MENU ) continue;
// . skip if trying to start with a date
// . fixes so we do
// not start fake street names with ":30 pm ..."
if ( bits && (bits[i] & D_IS_IN_DATE) ) continue;
// skip if trying to start with something we have already
// listed as a street in the above loop
if ( bits && (bits[i] & D_IS_IN_STREET) ) continue;
// stop if streets are maxed
//if ( m_ns >= MAX_STREETS ) break;
// ok, we got a candidate, reset this
lastWasBreak = 0;
//long long h = 0LL;
long long pi = 0LL;
bool prevUpper = false;
bool prevAdded = false; // added prev to the street array?
// count em
long alphaCount = 0;
long numCount = 0;
// subalnum count
long subAlnumCount = 0;
long long h = 0LL;
long long lastWid2 = 0LL;
// . now make a hash of all substrings of the following words
// for lookup into namedb
for ( long j = i ; j < nw ; j++ ) {
// tags stop our train
if ( tids[j] ) break;
// or if ventures into a street from above
if ( bits && (bits[j] & D_IS_IN_STREET) ) break;
// do not include a date
if ( bits && (bits[j] & D_IS_IN_DATE) ) break;
// bad punct stops our train
if ( ! wids[j] ) {
char *p = wptrs[j];
char *pend = p + wlens[j];
for ( ; p < pend ; p++ ) {
if ( is_wspace_a(*p) ) continue;
if ( *p == '\'' ) continue;
// Dave & Buster's
if ( *p == '&' ) continue;
// St. John's College
if ( *p == '.' && is_wspace_a(p[1]) &&
j>0 && isAbbr(wids[j-1]) )
// bad punct stops the train!
if ( p < pend ) break;
// otherwise, just skip it
// count it
// . do not add the first word if its "The" into this
// . fixes "The Guild Cinema" not matching placedb
// entries for "Guild Cinema"
//if ( wids[j] == h_the && h == 0LL ) continue;
// are we upper?
bool isUpper = is_upper_utf8 ( wptrs[j] );
// fix for "North 4th Arts Center"
if ( is_digit(wptrs[j][0])){isUpper=true; numCount++; }
else alphaCount++;
// lowercase non-stopword stops our train
//if ( ! isUpper && ! ww->isStopWord(j) ) break;
if ( ! isUpper && ! s_lc.isInTable(&wids[j]) ) break;
// . convert place name word into base word
// . synonyms
// . converts 4th to fourth, theatre to theater, etc.
//long long *hw = getSynonymWord ( &wids[j] , &pi );
// wordid of previous word
pi = wids[j];
// shift and store
h <<= 1LL;
// xor it in
h ^= wids[j];
// save it
long long savedWid2 = lastWid2;
lastWid2 = wids[j];
// do not shorten "Center of Arts" to "Center" because
// it is causing the "Performing Arts Center of the
// the Steinbeck Institute of Art" to be an alias for
// "San Jose Performing Arts Center" because
// "Performing Arts Center" is a subset of
// "San Jose Performing Arts Center".
prevAdded = false;
// do not end on a lower case stop word
if ( ! isUpper ) {
// . got hash in stop words now
// . ignore it if syn table returned 0 (ignore)
//if ( *hw ) {
// h <<= 1LL;
// h ^= *hw;//wids[j];
prevUpper = false;
// prev was upper case and we are upper case,
// overwrite the previous entry
if ( prevAdded && prevUpper && isUpper ) {
prevAdded = false;
// likewise, do not split sequences of lowercase words
if ( prevAdded && ! prevUpper && ! isUpper ) {
prevAdded = false;
// fix "Submit a" in "Submit a New Event"
//if ( ! prevUpper && isUpper ) ns--;
// set this
prevUpper = isUpper;
// ignore it if syn table returned 0 (ignore) (school)
//if ( *hw ) {
// // mix it up
// h <<= 1LL;
// // incorporate
// h ^= *hw; // wids[j];
// do not add if only a number, like 4th or 113
if ( alphaCount == 0 ) continue;
// skip if crazy - fixes
if ( alphaCount > 10 ) continue;
// . do not add if only one word with one letter
// . fixes javascript variables being place names
if ( alphaCount == 1 && wlens[j] == 1 ) continue;
// or if just the word "the"
if ( alphaCount == 1 && wids[j] == h_the ) continue;
// not allowed to have City or Town like in
// "City/Town: Albuquerque NM"
// fixes
// nt:391851?xg_source=activity from getting that
// as a place name in abq
if ( alphaCount ==1 && wids[j] == h_city ) continue;
if ( alphaCount ==1 && wids[j] == h_town ) continue;
// . mdw mdw mdw
// . not allowed to be a city or adm1 name!
// . fixes us getting "albuquerque" as a place name!
if ( g_cities.isInTable ( &h ) ) continue;
// or state name
if ( g_states.isInTable ( &h ) ) continue;
// or zip
if ( g_zips.isInTable ( &h ) ) continue;
// TODO: or country????
// set this flag
prevAdded = true;
// add the street
Place *street = (Place *)m_sm.getMem(sizeof(Place));
if ( ! street ) return false;
street->m_a = i;
street->m_b = j+1;
street->m_alnumA = alnumPos;
street->m_alnumB = alnumPos+subAlnumCount;
street->m_type = PT_STREET;
street->m_str = wptrs[i];
street->m_strlen = wptrs[j]+wlens[j]-wptrs[i];
//street->m_adm1[0] = 0;
//street->m_adm1[1] = 0;
street->m_adm1Bits= 0LL;
//street->m_crid = 0;
street->m_bits = 0;
street->m_address = NULL;
street->m_alias = NULL;
//street->m_hash = h;
//street->m_streetNumHash = 0;//wids[j];
//street->m_streetIndHash = 0;//h_po;
// why do we need this now?
if ( is_upper_a(wptrs[i][0]) )
street->m_bits |= PLF_HAS_UPPER;
// we are SPECIAL!!!!!!
street->m_flags2 = PLF2_IS_NAME;
// or in this
if ( saved ) street->m_flags2 |= PLF2_AFTER_AT;
// set the m_hash member
setHashes ( street , m_words , m_niceness );
// do not add if hash is zero, that usually means it
// is the single word "the"
if ( street->m_hash == 0 ) {
// sanity check
//if(street->m_hash == 0 ) { char *xx=NULL;*xx=0;}
// stop if full
//if ( m_ns >= MAX_STREETS ) break;
// add UNKNOWN addresses
// i.e. "location to be determined"
// i.e. "call for location"
// This will cause Events.cpp to set the EV_UNKNOWN_LOCATION bit!!!
long b2;
bool add = false;
alnumPos = -1;
// do not do this if we are javascript
ni = nw;
if ( m_contentType == CT_JS ) ni = 0;
// do not do this if we have no sections -- call from msg13
if ( ! m_sections ) ni = 0;
// loop over every word
for ( long i = 0 ; i < ni ; i++ ) {
// skip if not word
if ( ! wids[i] ) continue;
// skip if in script section or whatever to keep alnumPos right
// we need this to keep alnumPos in alignment with the other
// places!
if ( sp[i]->m_flags & badFlags ) continue;
// count this
// must match this
if ( i+6<nw &&
wids[i ] == h_location &&
wids[i+2] == h_to &&
wids[i+4] == h_be &&
wids[i+6] == h_determined ) {
add = true;
b2 = i + 7;
if ( i+6<nw &&
wids[i ] == h_call &&
wids[i+2] == h_for &&
wids[i+4] == h_location ) {
add = true;
b2 = i + 5;
// . no,no, i like looking for words that indicate events.
// getting into the meaning of the language seems to be the
// way to go, because's sections are all
// div tags describing the same event really.
// . no, now we fix this right with SEC_TOD_EVENT flags
// set in Dates.cpp. you can't telescope to a brother
// that has that flag set
// . "details tba"
// . fixes where everyone uses the April 2010
// as a header
if ( i+2<nw &&
wids[i ] == h_details &&
wids[i+2] == h_tba ) {
add = true;
b2 = i + 3;
// call x-y-z for location
if ( i+6<nw &&
wids[i ] == h_call &&
wids[i+8] == h_for &&
wids[i+10] == h_location ) {
add = true;
b2 = i + 11;
// call x-y for location
if ( i+6<nw &&
wids[i ] == h_call &&
wids[i+6] == h_for &&
wids[i+8] == h_location ) {
add = true;
b2 = i + 9;
// skip if nothing found
if ( ! add ) continue;
// reset it
add = false;
// stop if full
//if ( m_ns >= MAX_STREETS ) break;
// add the street
Place *street = (Place *)m_sm.getMem(sizeof(Place));
if ( ! street ) return false;
street->m_a = i;//a2;
street->m_b = b2;
// do we need these?
street->m_alnumA = alnumPos;
street->m_alnumB = alnumPos + 1; // this is wrong
street->m_type = PT_STREET;
street->m_str = wptrs[i];
street->m_strlen = wptrs[b2-1]+wlens[b2-1]-wptrs[i];
//street->m_adm1[0] = 0;
//street->m_adm1[1] = 0;
street->m_adm1Bits= 0LL;
//street->m_crid = 0;
street->m_bits = 0;
street->m_address = NULL;
street->m_alias = NULL;
// why do we need this now?
if ( is_upper_a(wptrs[i][0]) )
street->m_bits |= PLF_HAS_UPPER;
// we are SPECIAL!!!!!!
street->m_flags2 = PLF2_IS_NAME | PLF2_AFTER_AT;
// set the m_hash member
setHashes ( street , m_words , m_niceness );
// do not add if hash is zero, that usually means it
// is the single word "the"
if ( street->m_hash == 0 ) continue;
// inc it
// update this
//m_ns = m_ns;
// sanity check
//if ( m_ns > MAX_STREETS ) { char *xx=NULL;*xx=0; }
//if ( m_ns == MAX_STREETS ) {
// log("addr: street buf is maxed out for %s!",m_url->m_url);
// //char *xx=NULL;*xx=0;
// if no streets found, then bail, that is it
if ( m_sm.getNumPtrs() == 0 ) return true;
// breached?
//if ( m_sm.getNumPtrs() > 4000 )
// m_breached = true;
// do not do this logic if we are javascript because we do not set
// SEC_SENTENCE if the file is javascript
long imax = m_sm.getNumPtrs();//m_ns;
if ( m_contentType == CT_JS ) imax = 0;
// if it is a place to buy tickets or register for an event then
// let's set this flag so Events.cpp can ignore it!
for ( long i = 0 ; i < imax ; i++ ) {
// not for msg13's call
if ( ! m_sections ) break;
// get the street that we center the address around
Place *street = (Place *)m_sm.getPtr(i);
// telescope up until we hit the sentence section
Section *ss = m_sections->m_sectionPtrs[street->m_a];
for ( ; ss ; ss = ss->m_parent )
if ( ss->m_flags & SEC_SENTENCE ) break;
// must have it
if ( ! ss ) { char *xx=NULL;*xx=0; }
// . if section is contained in title tag, allow it through
// . fixes "Tingley Coliseum : Buy Tickets , ... " for
if ( ss->m_flags & SEC_IN_TITLE ) continue;
// . use it as the bookends
// . [a,b) may now actually expand beyond the "ss" section
// because of the new split sentence logic in
// Sections::addSentences() to deal with sentences that
// unevenly span multiple sections like in
// and
long a = ss->m_senta;
long b = ss->m_sentb;
// use this i guess
if ( isTicketDate ( a , b , m_wids , m_bits , m_niceness ) )
street->m_flags2 |= PLF2_TICKET_PLACE;
// assume not
bool reg = false;
// now scan forward from there
for ( long j = a ; j < b ; j++ ) {
// skip punct words
if ( ! m_wids[j] ) continue;
// is it register?
if ( m_wids[j] == h_register ) {
reg = true; break; }
if ( m_wids[j] == h_sign && m_wids[j+2] == h_up ) {
reg = true; break; }
if ( m_wids[j] == h_signup ) {
reg = true; break; }
if ( m_wids[j] == h_buy && m_wids[j+2] == h_tickets ) {
reg = true; break; }
if ( m_wids[j] == h_purchase&&m_wids[j+2]==h_tickets) {
reg = true; break; }
if ( m_wids[j] == h_get && m_wids[j+2] == h_tickets ) {
reg = true; break; }
// "give them tickets to" for santafe playhouse url
// to cancel out "Max's or Dish n' Spoon" as a place
if ( m_wids[j] == h_tickets&& m_wids[j+2] == h_to ) {
reg = true; break; }
if ( m_wids[j] == h_presale ) {
reg = true; break; }
if ( m_wids[j] == h_on && m_wids[j+2] == h_sale ) {
reg = true; break; }
if ( m_wids[j] == h_pre && m_wids[j+2] == h_sale ) {
reg = true; break; }
if ( m_wids[j] == h_sales && m_wids[j+2] == h_end ) {
reg = true; break; }
if ( m_wids[j] == h_sales && m_wids[j+2] == h_begin ) {
reg = true; break; }
if ( m_wids[j] == h_sales && m_wids[j+2] == h_start ) {
reg = true; break; }
// it is such a place
if ( reg ) street->m_flags2 |= PLF2_TICKET_PLACE;
// . set Section::numStreets var
// . scan streets and set Section::m_numStreets
// . if streets are adjacent in one continuous mass, then treat as
// a single street for these purposes
for ( long X = 0 ; X < ns ; X++ ) {
// breathe
// get the street that we center the address around
Place *street = &streets[X];
// get street before it
Place *prev = NULL; if ( X > 0 ) prev = &streets[X-1];
// . if we had a street immediately before us, bail
// . we count consecutive streets as a single street
if ( prev && prev->m_alnumB == street->m_alnumA ) continue;
// get it
Section *si = sp[street->m_a];
// inc recursively
for ( ; si ; si = si->m_parent )
// inc it
// debug
//printPlaces( streets , ns , m_pbuf , m_sections );
// . the huge address creation part
// . ultimately sets m_addresses[]/m_na array
// . make 5 lists, one for each place type, to hold all the
// Places in the shortlist[] array we just created
// . include Places in the tagRec and title as well
// . use a NULL ptr to indicate "no place"
// . then do a 6-way nested loop over all the combos
Place *pname [10]; long nn = 0;
Place *padm1 [MAX_ADM1 ]; long na = 0;
Place *pcity [MAX_CITIES]; long nc = 0;
Place *pzip [MAX_ZIPS]; long nz = 0;
Place *psuite [10]; long nu = 0;
// each latlon might be tethered to a street address already
// topologically speaking. we need to telescope it out and
// tether it to the first street we hit. including afterats and
// fake street names? it might be tethered to a place venue name
// that we never recognize. and instead we tether it to a brother
// city/state when we shouldn't.
//Place *latlon [MAX_LATLONS];
//Place *pctry [10]; long ny = 0;
//Place places [ MAX_PLACES ];
//long np = 0;
// sanity check
//if ( 500 > MAX_PLACES ) { char *xx=NULL;*xx=0; }
// add places from the body!
//np = addProperPlaces ( 0 , nw , 500 , places , MAX_PLACES , np ,
// // set this flag Place::m_flags
// add in default adm1/city/zip from title
long a = 0;
long b = 0;
long tapos = 0;
if ( ss ) {
a = ss->m_titleStart;
tapos = ss->m_titleStartAlnumPos;
if ( ss ) b = ss->m_titleEnd ;
// limit those nasty long titles
if ( b > a + 30 ) b = a + 30;
// add proper places from title into "places" array
np = addProperPlaces ( a , b , 20 , places , MAX_PLACES , np ,
// . set this flag Place::m_flags
// alnumPos, subtract -1 since it immediately
// adds 1 to the first alnum it finds
tapos - 1 ,
-1 );
// breach check
if ( np > MAX_PLACES ) { char *xx=NULL;*xx=0; }
// save for popping
//long np_stack = m_np;
// shortcut
char **w = wptrs;
HashTableX dat;
char datbuf[4000];
dat.set ( 4 , 4 , 256, datbuf, 4000,false,m_niceness,"adm1buf");
// . set up the base array of all states
// . "bn" = baseNum
// . TODO: make sure state we select is not in a street!
long bn = 0;
// always have a NULL
padm1 [ bn++ ] = NULL;
// then
for ( long i = 0 ; i < m_npSaved ; i++ ) {
// breathe
// get city, state or zip
Place *p = (Place *)m_pm.getPtr(i);
// . allow state to come from anywhere in the document
// . TODO: later add meta description to get
if ( p->m_type != PT_STATE ) continue;
// skip if intersects a street, like "ohio street"
if ( p->m_a >= 0 && bits && (bits[p->m_a] & D_IS_IN_STREET) )
// make the key for deduping
char key[4];
key[0] = p->m_adm1[0];
key[1] = p->m_adm1[1];
key[2] = 0;
key[3] = 0;
// skip if dup
if ( dat.isInTable ( &key ) ) continue;
// add it to the dedup table
if ( ! dat.addKey ( &key, &p ) ) return false;
// add to our array
padm1 [ bn++ ] = p;
// how can this happen?
if ( bn > 55 ) { char *xx=NULL;*xx=0; }
// "X" loops over all the streets we have
for ( long X = 0 ; X < m_sm.getNumPtrs() ; X++ ) {
// get the street that we center the address around
Place *street = (Place *)m_sm.getPtr(X);
// debug
//logf(LOG_DEBUG,"events: ****** X=%li *****",X);
// reset these
nc = 0;
na = bn;
nz = 0;
nn = 0;
nu = 0;
//ny = 0;
// preserve the places on there from title
//np = np_stack;
// these guys are allowed to have "no place", but everyone else
// must have something
pzip [nz++] = NULL;
//padm1 [na++] = NULL;
//psuite [nu++] = NULL;
//pctry [ny++] = NULL;
//if ( dc > 0 ) pcity [nc++] = NULL;
//if ( dc > 0 ) padm1 [na++] = NULL;
//if ( dc > 0 ) pname [nn++] = NULL;
// add a NULL because if city is unique we can fill this in
//padm1 [na++] = NULL;
// likewise, if we have a zip code we can fill in the city too
pcity [nc++] = NULL;
// search for a suite name BEFORE the street
long k = street->m_a - 1 ;
// re-set this
alnumPos = street->m_alnumA ;
// start of it
long ak = -1;
// flag init
bool gotSuiteBefore = 0;
// ptr
Place *suiteBefore = NULL;
// suite hash
long long suh = 0LL;
// start alnumPos
long akPos = -1;
// now scan for suite, stop after hitting our first alnum word
for ( ; k >= 0 ; k-- ) {
// breathe
QUICKPOLL ( m_niceness );
// skip if non alnum word
if ( ! wids[k] ) continue;
// skip if in a script section
if (sp&&sp[k]&&(sp[k]->m_flags & badFlags) ) continue;
// it's an alnum
// stop if we are not a suite designation
if ( wlens[k] != 1 && ! m_words->hasDigit(k) ) break;
// now before us must be a # sign
if ( k - 1 > 0 && m_words->hasChar(k-1,'#') ) {
// start of it was this punct word i guess
ak = k - 1;
// and this
akPos = alnumPos;
// update suite hash
suh = wids[k];
// or a suite indicator
if ( k - 2 >= 0 &&
( wids[k-2] == h_suite ||
wids[k-2] == h_ste ||
wids[k-2] == h_building ||
wids[k-2] == h_bldg ||
wids[k-2] == h_bld ||
wids[k-2] == h_pier ||
wids[k-2] == h_room ||
wids[k-2] == h_rm ||
wids[k-2] == h_unit ) ) {
// set this
akPos = alnumPos - 1;
// start here
ak = k - 2;
// update suite hash
suh = wids[k];
// skip that
// skip punct word
// update suite hash
suh <<= 1;
// xor it in
suh ^= wids[k];
// and the indicator
suh <<= 1;
suh ^= wids[k-2];
// that is it either way
// add the suite before the place name
if ( suh ) { // && m_np < MAX_PLACES ) {
// note it
gotSuiteBefore = true;
// sanity check
//if ( m_np >= MAX_PLACES ) { char *xx=NULL;*xx=0; }
// point to the suite to add
Place *pp = (Place *)m_pm.getMem(sizeof(Place));
if ( ! pp ) return false;
// point to it
suiteBefore = pp;
// length
long plen = wptrs[k]-wptrs[ak]+wlens[k];
// point to the suite
char *ps = wptrs[ak];
// skip over initial comma
if ( *ps == ',' ) { ps++; plen--; }
// set it
pp->m_a = ak;
pp->m_b = k+1;
pp->m_alnumA = akPos;
pp->m_alnumB = alnumPos+1;
pp->m_type = PT_SUITE;
pp->m_str = ps;
pp->m_strlen = plen;
pp->m_hash = 0LL;//suh;
//pp->m_adm1[0] = 0;
//pp->m_adm1[1] = 0;
//pp->m_crid = 0;
pp->m_bits = 0;
pp->m_flags2 = 0;
// thats a suite
psuite[nu++] = pp;
// now just use this
// point to next place
// search for a suite name after the street
k = street->m_b;
// re-set this
alnumPos = street->m_alnumB - 1;
// suite hash
suh = 0LL;
// remember start of suite
long startk = -1;
long startAlnumPos = -1;
char got = 0;
// point to next street
Place *next = NULL;
if ( X+1 < m_sm.getNumPtrs() )
next = (Place *)m_sm.getPtr(X+1);
// skip until we got a wordid
for ( ; k < nw ; k++ ) {
// breathe
QUICKPOLL ( m_niceness );
// skip if not an alnum word
if ( ! wids[k] ) continue;
// skip if in a script section
if (sp&&sp[k]&&(sp[k]->m_flags & badFlags) ) continue;
// it's an alnum
// start here
if ( wids[k] == h_building ) { got = 3; continue; }
if ( wids[k] == h_bldg ) { got = 3; continue; }
if ( wids[k] == h_bld ) { got = 3; continue; }
if ( wids[k] == h_unit ) { got = 3; continue; }
if ( wids[k] == h_suite ) { got = 2; continue; }
if ( wids[k] == h_ste ) { got = 2; continue; }
if ( wids[k] == h_pier ) { got = 3; continue; }
if ( wids[k] == h_room ) { got = 3; continue; }
if ( wids[k] == h_rm ) { got = 3; continue; }
// having a # sign before us is good!
if ( k-1>=0 && !tids[k-1]&& ! got &&
got = 1;
// stop if no suite indicator
if ( ! got ) break;
// no tag must precede us
if ( tids[k-1] ) break;
// a number follows?
bool isNum = false;
if ( is_digit(wptrs[k][0])) isNum = true;
// a single letter counts as a number too!
if ( wlens[k]==1 ) isNum = true;
// or if we end in a number
if ( is_digit(wptrs[k][wlens[k]-1])) isNum = true;
// everyone but suites need something more stringent
if ( got == 3 && ! isNum ) { got = 0; continue; }
// put back
if ( got == 3 ) got = 2;
// remember the start of it
startk = k - got;
// and this too
if ( got == 2 ) startAlnumPos = alnumPos - 1;
// if just the pound sign, do not change this
else startAlnumPos = alnumPos;
// incorporate into the suite place hash
if ( got == 2 ) suh = wids[k];
else suh = 0;
// incorporate ourselves into "suh" (suite hash)
suh <<= 1;
suh ^= wids[k];
// next is supposed to be the next street name!
// but it can run into the next list of fake street
// names that we added above, so fix that
if ( next && next->m_a <= k ) next = NULL;
// all done?
bool gotExt = true;
if ( k+1 >= nw ) gotExt = false;
else if ( wptrs[k+1][0] != '-' ) gotExt = false;
else if ( wlens[k+1] != 1 ) gotExt = false;
// fix "Suite 920-N"
//if ( ! is_digit(wptrs[k+2][0]) ) gotExt = false;
if ( next && k + 2 >= next->m_a ) gotExt = false;
// if we got something like "Suite G-2" (extension)
// then add these up
if ( gotExt ) {
k += 2;
alnumPos += 1;
// incorporate that too
suh <<= 1;
suh ^= wids[k];
// length
long plen = wptrs[k]-wptrs[startk]+wlens[k];
// sanity check. i've seen this happen before,
// on
// line/lm/default.asp for the $339 price, so let's
// just ignore such beasties now
if ( plen > 100 ) continue;//{ char *xx=NULL;*xx=0; }
// sanity check -- if we have no room, bail!
//if ( m_np >= MAX_PLACES ) break;
// point to the suite to add
Place *pp = (Place *)m_pm.getMem(sizeof(Place));
if ( ! pp ) return false;
// point to the suite
char *ps = wptrs[startk];
// skip over initial comma
if ( *ps == ',' ) { ps++; plen--; }
// set it
pp->m_a = startk;
pp->m_b = k+1;
pp->m_alnumA = startAlnumPos;
pp->m_alnumB = alnumPos+1;
pp->m_type = PT_SUITE;
pp->m_str = ps;
pp->m_strlen = plen;
pp->m_hash = 0;//suh;
//pp->m_adm1[0] = 0;
//pp->m_adm1[1] = 0;
//pp->m_crid = 0;
pp->m_bits = 0;
pp->m_flags2 = 0;
// thats a suite
psuite[nu++] = pp;
// now just use this
// point to next place
// all done
// provide an empty suite if none
if ( nu <= 0 ) psuite [nu++] = NULL;
// "end" is the word # of first word in the street address
long end = street->m_a;
long endAlnum = street->m_alnumA;
// but if we had a suite before... skip over it
if ( gotSuiteBefore ) {
end = suiteBefore->m_a;
endAlnum = suiteBefore->m_alnumA;
// GET THE PLACE NAME before the street (or before the suite)
// start at word before word # end
long i = end - 1;
// start here
long pa2 = m_am.getNumPtrs() - 1; // m_na - 1;
// save start of place array
long savednp = m_pm.getNumPtrs();//m_np;
// save start of name array
long savednn = nn;
// init
Address *preva = NULL;
// assign
if ( pa2 >= 0 ) preva = (Address *)m_am.getPtr(pa2);
// count how many place names we add
long pcount = 0;
// "Tingley Colesium, Abq NM"
// if the street is a place name, skip this next part...
if ( street->m_flags2 & PLF2_IS_NAME ) i = -1;
// we come back up here to filter out street address labels
// set this
long mini = -1;
// get the prev address b boundary
if ( preva ) mini = preva->m_street->m_b;
// if preva was inlined, use zip or adm1 then
if ( preva && (preva->m_flags & AF_INLINED) ) {
if ( preva->m_zip && preva->m_zip->m_b > mini )
mini = preva->m_zip->m_b;
if ( preva->m_adm1 && preva->m_adm1->m_b > mini )
mini = preva->m_adm1->m_b;
if ( preva->m_city && preva->m_city->m_b > mini )
mini = preva->m_city->m_b;
long parensCount = 0;
// keep an ongoing hash of alnum words in the name
//long long h = 0LL;
// backup until we hit an alnum
for ( ; i >= 0 ; i-- ) {
// do not cross a title tag to get place name
if ( tids[i] == TAG_TITLE ) { i = -1; break; }
if ( tids[i] == (TAG_TITLE|BACKBIT)) { i = -1; break; }
// skip if not alnum word
if ( ! wids[i] ) {
// skip tags
if ( tids[i] ) continue;
// see if this punct word has a ')' in it!
char *pp = wptrs[i];
char *ppend = pp + wlens[i];
for ( ; pp < ppend ; pp++ ) {
// count 'em
if ( *pp=='(' ) parensCount--;
if ( *pp==')' ) parensCount++;
// . skip if in bad section
// . the two urls have quite a few
// addresses in common, causing the place names
// to get their SEC_DUP bit set. But our new algo
// plays somewhat nicely with menu cruft because
// we have to verify the place names with another
// website to really make the place name stick,
// so let's no longer use SEC_DUP or'ed in with
// the badFlags. mdw.
if ( sp && (sp[i]->m_flags & badFlags ) ) // |SEC_DUP)
// has their address on every
// web page, but on one web page it was
// "202 Hardvard SE" and another it was SouthEast...
// BUT for the most part this logic is ok!
// if the street does not have SEC_DUP set in its
// section, BUT the name does, then ignore the name!
if ( street->m_a>= 0 &&
// msg13 has no sections
sp &&
// if street section does not have SEC_DUP set
! (sp[street->m_a]->m_votesForDup) &&
// but the ith word does
( sp[i]->m_votesForDup ) )
// then skip over this word and do not
// allow it to be the place name
// . skip if "at"
// . "Post Office & Library at 950 pinetree se ..."
// . no "thru October at 6718 Rio Grande NW."
// . "write elizabeth doak, treasurer at 1606 silver"
// . no no i guess we got date detection now
// . and skip "xyz [is located at] 123 main st"
if ( wids[i] == h_at && is_lower_utf8(wptrs[i]) )
if ( wids[i] == h_is && is_lower_utf8(wptrs[i]) )
if ( wids[i] == h_located && is_lower_utf8(wptrs[i]) )
// skip phone #'s
if ( i>=6 &&
wlens[i]==4 &&
m_words->isNum(i) &&
wlens[i-2]==3 &&
m_words->isNum(i-2) &&
wlens[i-4]==3 &&
m_words->isNum(i-4) ) {
i -= 4;
// phone with no area code
else if ( i>=4 &&
wlens[i]==4 &&
m_words->isNum(i) &&
wlens[i-2]==3 &&
m_words->isNum(i-2) ) {
i -= 2;
// . we are getting place names like "3 baths..."
// for "6769 Guadalupe Trl Nw" for the url
// Bledsoe-Rd-NW_Albuquerque_NM_87107_fa9ca500
// which are in the section of a different street,
// so fix that with this logic.
// . basically expand the section around "i" and see
// if it belongs to street #X or to street #X-1.
// get prev street
Place *prev = NULL;
if ( X>0 ) prev = (Place *)m_sm.getPtr(X-1);
// flags
bool gotOurStreet = false;
bool gotPrevStreet = false;
// keep expanding the section around the
// place name until we get a street or multiple
// streets. if we only get a single street, then
// it must be OUR STREET, "street"
Section *si = NULL;
// msg13 has no sections
if ( sp ) si = sp[i];
// keep expanding section until we got street in it
for ( ; prev && si ; si=si->m_parent ) {
// stop when it contains our street or
// previous street
if ( si->m_a <= street->m_a &&
si->m_b >= street->m_b )
gotOurStreet = true;
if ( si->m_a <= prev->m_a &&
si->m_b >= prev->m_b )
gotPrevStreet = true;
// break on either
if ( gotOurStreet ) break;
if ( gotPrevStreet ) break;
// if it is more closely related to the previous street
// then do not assign this place name to us, i guess
// we do not have a good one for this street!
if ( gotPrevStreet && ! gotOurStreet )
i = -1;
// ok we got a candidate
// . if our place name candidate is in a date, then assume
// that we have no place name!
// . fixes
if ( i >= 0 && i < nw && bits && ( bits[i] & D_IS_IN_DATE ) &&
// incase place name ends in midnight or noon
wids[i] != h_daily &&
wids[i] != h_noon &&
wids[i] != h_midnight )
i = -1;
// fix "copyright ; 2009 Albuquerque Journal; Abq ; NM"
// for
if ( i >= 0 && i < nw && wids[i] == h_copyright) {
// stop getting a name
i = -1;
// and mark street as bad
//street->m_bits |= PLF_IGNORE;
// go to next street!
// set that as our right side
long righti = i;
// reset this count
long alnumCount = 0;
long alphaCount = 0;
// reset this
long atPos = -1;
bool atCityName = false;
long atAlnumCount = -1;
// reset this
bool hadUpper = false;
bool hadLower = false;
bool hadAnd = false;
// save last good i
long lasti = -1;
bool isUpper;
bool isLower;
// . ok, go backwards up to 15 alnum words from there
// . The Harwood Museum of Art of the University of New Mexico
for ( ; i >= 0 && alnumCount < MAX_ALNUMS_IN_NAME ; i-- ) {
// ignore if in script, etc. tags
if ( sp && (sp[i]->m_flags & badFlags) ) continue;
// . ignore if in menu section
// . might be like "<td>place</td>"
// . i know for
// events.eventsmain?action=showEvent&eventID=833142
// we are getting "Address: " as the place name
// because it is in the table like that.
// . TODO: for single event pages we must require at
// least another page from same site with same
// tagPairHash to prevent this kind of thing
// . likewise, for the same reason above, there are
// two urls that share some addresses
// in common and the place name is getting its
// SEC_DUP bit set, so let's rely more on
// verifying place name 1 and 2 than this:
//if ( sp[i]->m_flags & SEC_DUP ) continue;
// stop at tag, not bold tags though
// fix for highlighting terms
// in the place name.
if ( tids[i] ) {
if ( tids[i] == TAG_B ) continue;
if ( tids[i] == (TAG_B | BACKBIT) ) continue;
// count alnums
if ( wids[i] ) {
// do not stop something in parentheses
if ( parensCount > 0 )
goto skipbreak;
// no dates allowed in name
if ( bits && (bits[i] & D_IS_IN_DATE) &&
// "1am gallery"
to_lower_a(wptrs[i][1])!='a') &&
// high noon saloon on
// SFV_retloc.php
wids[i] != h_daily &&
wids[i] != h_noon &&
wids[i] != h_midnight )
// if we are the "last" word in the place name
// then we must always be upper case!
if ( alnumCount == 0 &&
! is_upper_utf8(wptrs[i]) &&
// digits can not be upper case
! is_digit(wptrs[i]) &&
// allow "Subway at 1300 main st."
wids[i] != h_at &&
// allow ""
(i-1<0 || wptrs[i][-1]=='.') )
if ( alnumCount==0 && wids[i]==h_and) break;
// "Property Information for 440 Bledsoe Rd"
// "Map for ..."
if ( alnumCount==0 && wids[i]==h_for) continue;
// "Map of ..."
if ( alnumCount==0 && wids[i]==h_of) continue;
isLower = is_lower_utf8(wptrs[i]);
isUpper = is_upper_utf8(wptrs[i]);
// hack fix for "O'niell's Pub" (apostrophe)
if ( i >= 2 &&
wlens[i-1] == 1 &&
wids[i-2] &&
wlens[i-2] == 1 &&
wptrs[i-2][0] =='O' ) {
// assume it is not lower case
isLower = false;
isUpper = true;
// if this is lower and we had an upper
if ( isLower &&
hadUpper &&
// must not be an allowable lowercase word
! s_lc.isInTable(&wids[i]) &&
// fix "Bandido's Hideout Restaurant" cuz
// it was breaking on the "s" cuz that is
// not a query stop word!
wlens[i] > 1 )
// if we had a lower non-stop word, and then
// we hit an upper...
if ( isUpper && hadLower ) {
// force an abort on this street
lasti = -1;
// if we hit a number followed by am or pm,
// that is a time so stop the scan!
//if (( wids[i] == h_am || wids[i] == h_pm ) &&
// i >= 2 && is_digit(wptrs[i-2][0]) )
// break;
// if we hit "by" and "sponsored" or
// "arranged" precedes it, stop!
// fixes: "arrangements by ..." in
if ( wids[i] == h_by && i-2>=0 &&
( wids[i-2] == h_arrangements ||
wids[i-2] == h_arranged ||
wids[i-2] == h_sponsored ) )
// if we got something and we hit the
// previous address zip or state or city
// then just stop
if ( i < mini && lasti >= 0 )
// to be more strict, no lower at all!
// NO! we lose "explora" then
//if ( is_lower_utf8(wptrs[i]) &&
// ! ww->isQueryStopWord(i) )
// break;
// . cut off here too
// . do not include the previous street name
// as part of your place name
if ( //preva &&
i < mini && // preva->m_street->m_b &&
lasti == -1 ) {
// skip over it
i = preva->m_street->m_a - 1;
// update prev
if ( pa2>=0 )
preva=(Address *)m_am.getPtr(pa2);
preva = NULL;
// now we only redo if this is the
// FIRST place name
if ( pcount == 0 ) goto redo;
// otherwise, stop it!
// if we did have some junk in the place name
// then use that, but do not include this
// street name as part of it
if ( preva && i < preva->m_street->m_b )
// if we hit previous address
// store the last good word position
lasti = i;
// count it
// NO! we are looping backwards, so we
// can't do this here. we now do it below
// mix it up
//h <<= 1;
// hash it into our ongoing hash
//h ^= wids[i];
// skip words starting with a digit
if ( is_digit(wptrs[i][0]) ) continue;
// consider it alpha i guess now
// is it upper?
if ( isUpper ) hadUpper = 1;
if ( wids[i] == h_and ) hadAnd = true;
// caution "Santa Fe Co-op" or "E-mail" is ok ;
// don't set hasLower for "op" or "mail"
if ( i-2>= 0 && wptrs[i][-1]=='-' &&
is_alnum_a(wptrs[i][-2]) )
// same goes for ""
if ( i-2>= 0 && wptrs[i][-1]=='.' &&
is_alnum_a(wptrs[i][-2]) )
// hadLower only valid if not query stop word
if ( isLower && //_lower_utf8(wptrs[i]) &&
// must not be an allowable lowercase word
! s_lc.isInTable(&wids[i])
// for some reason 's' is not a query
// stop word, and we had a bar named
// "Slim's" that we needed to get
// ... this is in s_lc table now
//! ww->isStopWord(i) )
hadLower = 1;
// record first at
if ( wids[i] == h_at && atPos == -1) {
atPos= i;
// save this in case we trim off
atAlnumCount = alnumCount - 1;
// get string from right after "at"
// and before the street and see
// if it is a city name. get hash
// of all those words so we can look
// it up. hashes all alnum words
// in [i+2,righti+1) interval.
atCityName = isCityName(i+2,righti+1);
// skip to next
// keep parensCount up to date
char *pp = wptrs[i];
char *ppend = pp + wlens[i];
for ( ; pp < ppend ; pp++ ) {
// count 'em
if ( *pp=='(' ) parensCount--;
if ( *pp==')' ) parensCount++;
// do not stop something in parentheses
if ( parensCount > 0 ) continue;
// only certain types of punct can be in a place name
if ( wlens[i] == 1 ) {
// single space ok
if ( is_wspace_a(w[i][0]) ) continue;
if ( w[i][0] == '\r' ) continue;
// hyphen ok
if ( w[i][0] == '-' ) continue;
// apostrophe ok
if ( w[i][0] == '\'' ) continue;
// / ok, "QX&V Electro/Mechanical"
// but breaks:
// "Santa Fe Playhouse/Santa Fe Little Theater"
//if ( w[i][0] == '/' ) continue;
// ampersand ok
if ( w[i][0] == '&' ) continue;
// asterisk ok ( e*trade)
if ( w[i][0] == '*' ) continue;
// period ok (,u.s. post office)
if ( w[i][0] == '.' ) continue;
// . apostrophe ok if alnum-locked
// . "Bandido's Hideout"
if ( w[i][0]=='\'' )
if (is_alnum_a(w[i][-1]) &&
is_alnum_a(w[i][1]) )
// otherwise, not
if ( wlens[i] == 2 ) {
// . up to one parenthetical is ok
// . "The Filling Station (Albuquerque, NM)"
// the-filling-station-/V0-001-001121221-1
// . we now have parensCount for this
if ( is_wspace_a(w[i][0])&&
break; // continue;
// double space ok
if ( is_wspace_a(w[i][0])&&
// . comma space
// . i was only allowing inc. or llc. to follow
// but what about:
// "NM Children, Youth, and Families Dept."
// . but then we got "St. John's College,
// Peterson Student Center" which is bad
// so now we require an and i guess
if ( w[i][0]==','&&
is_wspace_a(w[i][1]) &&
( hadAnd ||
wids[i+1] == h_inc ||
wids[i+1] == h_llc ) )
// Yahoo! or Yelp! Inc.
if ( w[i][0]=='!' &&
is_wspace_a(w[i][1]) &&
i+1<nw && wids[i+1]==h_inc )
// colon space
if ( w[i][0]==':'&&
is_wspace_a(w[i][1]) ) {
// NO NO NO, never allow names
// with colons in them now because
// we have "place name 2" to pick
// up the other name if it is a
// compound name containing a ':'
// . Location: not allowed!
// . "Location: Albuquerque Dance Ctr"
if ( i-1>=0 && wids[i-1]==h_location)
// . Address: not allowed!
if ( i-1>=0 && wids[i-1]==h_address)
// stop at Phone: too!
if ( i-1>=0 && wids[i-1]==h_phone)
// otherwise, allow it!
// the $1 store
if ( is_wspace_a(w[i][0])&&
w[i][1]== '$' )
// abbreviation (mtn. supply store)
if ( w[i][0]=='.'&&
// "moving co., inc." (allow comma after)
(is_wspace_a(w[i][1]) ||w[i][1]==',') &&
i-1>=0 && wids[i-1] &&
( isAbbr(wids[i-1]) || wlens[i-1]==1 ) &&
//fix "Institute Inc. All Rights Reserved"
// for
wids[i-1] != h_inc )
// store #13
if ( is_wspace_a(w[i][0]) &&
w[i][1]== '#' )
// apostrophe space is ok (dunkin' donuts)
if ( w[i][0]=='\''&&
// otherwise, not
if ( wlens[i] == 3 ) {
// crazy utf8 apostrophe from
// 197.html
if ( wptrs[i][0] == (char)0xe2 &&
wptrs[i][1] == (char)0x80 &&
wptrs[i][2] == (char)0x99 )
if ( wlens[i] == 3 ) {
// "B & B plumbing"
if ( is_wspace_a(w[i][0])&&w
is_wspace_a(w[i][2]) )
// otherwise, not
// a string of nothing but \n and ' ' is allowed
// and i see that in quite a few pages. microsoft
// front page had this issue as i remember...
long ampCount = 0;
long comCount = 0;
// "Dr. Smith, Obstetrician / Gynecologist"
long slashCount = 0;
// period is ok "Moving Co., Inc."
long kstart = 0;
if ( w[i][0]=='.'&&
(is_wspace_a(w[i][1]) ||w[i][1]==',') &&
i-1>=0 && wids[i-1] &&
( isAbbr(wids[i-1]) || wlens[i-1]==1 ) )
// ok now do the loop
long k ; for ( k = kstart ; k < wlens[i] ; k++ ) {
// "B & B Plumbing"
if ( w[i][k] == '&' ) {
if ( ++ampCount >= 2 ) break;
if ( comCount > 0 ) break;
if ( slashCount > 0 ) break;
if ( w[i][k] == '/' ) {
if ( ++slashCount >= 2 ) break;
if ( comCount > 0 ) break;
if ( ampCount > 0 ) break;
// . this is a good delimiter for place names
// usually, but of course if someone has
// "Gigablast, \nInc." then this will hurt!
// . i was only allowing inc. or llc. to follow
// but what about:
// "NM Children, Youth, and Families Dept."
if ( w[i][k] == ',' &&
( hadAnd ||
wids[i+1]==h_inc ||
wids[i+1]==h_llc)) {
if ( ++comCount >= 2 ) break;
if ( ampCount > 0 ) break;
if ( slashCount > 0 ) break;
if ( ! is_wspace_a(w[i][k]) )
// skip if ok
if ( k == wlens[i] ) continue;
// nothing else allowed
// forget it if too long
if ( alnumCount >= MAX_ALNUMS_IN_NAME )
lasti = -1;
// come back up here after removing the " ... at" substring
// trim off lower case stop words from the beginning
for ( ; lasti >= 0 && lasti <= righti ; lasti++ ) {
// skip if not alnum
if ( ! wids[lasti] ) continue;
// assume nuked!
// is it like "Friday at The Source"?
if ( lasti+2 <= righti &&
//ww->isQueryStopWord(lasti+2) &&
s_lc.isInTable( &wids[lasti+2]) &&
getDayOfWeek ( wids[lasti] ) >= 1 ) continue;
// "monday, wednesday and friday at The Source"
if ( lasti+2 <= righti &&
getDayOfWeek ( wids[lasti+2]) >= 1 &&
getDayOfWeek ( wids[lasti] ) >= 1 ) continue;
// . or stopword + day of week is bad too!
// . "Every Monday at The Source"
if ( lasti+2 <= righti &&
getDayOfWeek ( wids[lasti+2] ) >= 1 &&
//( ww->isQueryStopWord(lasti) ||
( s_lc.isInTable(&wids[lasti]) ||
wids[lasti] == h_every ) ) continue;
// assume not nuked
// stop if not stop word
//if ( ! ww->isQueryStopWord(lasti) ) break;
if ( ! s_lc.isInTable(&wids[lasti]) ) break;
// stop if capitalized
if ( is_upper_utf8(wptrs[lasti]) &&
// trim a capitalized "At" off regardless
wids[lasti] != h_at ) break;
// assume nuked
// trim off lower case stop words from the end (Wat Center, at)
for ( ; righti >= lasti && lasti >= 0 ; righti-- ) {
// skip if not alnum
if ( ! wids[righti] ) continue;
// assume nuked
// . stop if not stop word
// . no! too strong. was removing "com" in ""
// . "Sonic Drive-In" "Stepping Stones-Drop In"
//if ( ! ww->isQueryStopWord(righti) ) break;
if ( wids[righti] == h_at ) continue;
//if ( wids[righti] == h_in ) continue;
//if ( wids[righti] == h_by ) continue;
//if ( wids[righti] == h_and ) continue;
// not nuked
// stop it
// if we included "at" then trim up until we hit the "at"
// UNLESS the place name starts with "The".
// we need to protect "The Lodge at Santa Fe" for instance.
if ( lasti>= 0 && lasti<=righti && wids[lasti] != h_the &&
atPos >= 0 &&
// ignore "at" in "at law" (e.g. "attorney at law")
// or really any other "at phrase" like that
wids[atPos+2] != h_law &&
// if a city name is between the "at" and the street,
// then assume the "at" is actually part of the place
// name!!
! atCityName ) {
lasti = atPos + 1;
// pop this back
alnumCount = atAlnumCount;
// undo
atPos = -1;
// redo filtering
goto subloop;
// "All rights reserved". no place name in this case
if ( alnumCount == 3 &&
lasti >= 0 &&
i+4<nw &&
wids[lasti ] == h_all &&
wids[lasti+2] == h_rights &&
wids[lasti+4] == h_reserved )
lasti = -1;
// "Contact Us". no place name in this case
if ( alnumCount == 2 &&
lasti >= 0 &&
i+2<nw &&
wids[lasti ] == h_contact &&
wids[lasti+2] == h_us )
lasti = -1;
// "[copyrightSign] 2000 Carrier Hotels"
if ( lasti-1>=0 &&
lasti = -1;
// "map of"
if ( alnumCount == 2 &&
lasti>=0 &&
wids[lasti] == h_map &&
lasti+2<nw &&
wids[lasti+2] == h_of )
lasti = -1;
// "map for"
if ( alnumCount == 2 &&
lasti>=0 &&
wids[lasti] == h_map &&
lasti+2<nw &&
wids[lasti+2] == h_for )
lasti = -1;
// fix "copyright ; 2009 Albuquerque Journal; Abq ; NM"
// for
if ( lasti >= 0 && alnumCount==1 && wids[lasti]==h_copyright)
lasti = -1;
// ends on lower case word with a whitespace before it
// so as to not hurt "Wendy's" or ""
if ( lasti >= 0 &&
hadUpper &&
righti>0 && // fix core...
is_wspace_a(wptrs[righti][-1]) &&
!is_digit(wptrs[righti][0]) &&
!is_upper_utf8(wptrs[righti]) )
lasti = -1;
// . we often get zips like "NM 87571" because the previous
// place has no official street but has a state/zip thing
// . fixes
if ( lasti >= 0 &&
alnumCount == 2 &&
lasti + 2 < nw &&
isStateName (lasti) &&
wlens[lasti+2] == 5 &&
is_digit(wptrs[lasti+2][0]) )
lasti = -1;
// "New Mexico 87109"
if ( lasti >= 0 &&
alnumCount == 3 &&
lasti + 4 < nw &&
isStateName (lasti) &&
wlens[lasti+4] == 5 &&
is_digit(wptrs[lasti+2][0]) )
lasti = -1;
// now check to see if we should skip this place name and
// try another before it...
if ( lasti >= 0 ) {
// watch out for "Address:" which often precedes a
// street name when address is in a table
if ( alnumCount == 1 && wids[lasti] == h_address ) {
i = lasti - 1; goto redo; }
if ( alnumCount == 1 && wids[lasti] == h_street ) {
i = lasti - 1; goto redo; }
if ( alnumCount == 1 && wids[lasti] == h_where ) {
i = lasti - 1; goto redo; }
if ( alnumCount == 1 && wids[lasti] == h_location ) {
i = lasti - 1; goto redo; }
if ( alnumCount == 1 && wids[lasti] == h_office ) {
i = lasti - 1; goto redo; }
if ( alnumCount == 1 && wids[lasti] == h_map ) {
i = lasti - 1; goto redo; }
// fix "tel: xxxxxxx 9000 girard"
if ( alnumCount == 1 && wids[lasti] == h_tel ) {
i = lasti - 1; goto redo; }
if ( alnumCount == 1 && wids[lasti] == h_edit ) {
i = lasti - 1; goto redo; }
if ( alnumCount == 1 && wids[lasti] == h_email ) {
i = lasti - 1; goto redo; }
if ( alnumCount == 1 && wids[lasti] == h_added ) {
i = lasti - 1; goto redo; }
if ( alnumCount == 1 && wids[lasti] == h_copy ) {
i = lasti - 1; goto redo; }
if ( alnumCount == 1 && wids[lasti] == h_search ) {
i = lasti - 1; goto redo; }
if ( alnumCount == 1 && wids[lasti] == h_find ) {
i = lasti - 1; goto redo; }
if ( alnumCount == 1 && wids[lasti] == h_go ) {
i = lasti - 1; goto redo; }
if ( alnumCount == 1 && wids[lasti] == h_town ) {
i = lasti - 1; goto redo; }
if ( alnumCount == 1 && wids[lasti] == h_city ) {
i = lasti - 1; goto redo; }
// sometimes "phone:" wedged in there
if ( alnumCount == 1 && wids[lasti] == h_phone ) {
i = lasti - 1; goto redo; }
// "e-mail"
if ( alnumCount == 2 &&
wids[lasti] == h_e &&
wids[lasti+2] == h_mail ) {
i = lasti - 1; goto redo; }
// "mailing address"
if ( alnumCount == 2 &&
wids[lasti] == h_mailing &&
wids[lasti+2] == h_address ) {
i = lasti - 1; goto redo; }
// "mail address"
if ( alnumCount == 2 &&
wids[lasti] == h_mail &&
wids[lasti+2] == h_address ) {
i = lasti - 1; goto redo; }
// "snail mail"
if ( alnumCount == 2 &&
wids[lasti] == h_snail &&
wids[lasti+2] == h_mail ) {
i = lasti - 1; goto redo; }
// . skip over "33 miles..." or "33 mi..."
// . Carlsbad Cavern National Park
// 27 miles S of Carlsbad
// 3225 National Parks Highway
if ( alnumCount >= 2 &&
is_digit(wptrs[lasti][0]) &&
( wids[lasti+2] == h_mi ||
wids[lasti+2] == h_miles ||
wids[lasti+2] == h_km ||
wids[lasti+2] == h_kilometers ) ) {
i = lasti - 1; goto redo; }
// skip over "(1 review)" or "(33 reviews)"
if ( alnumCount == 1 &&
( wids[lasti] == h_review ||
wids[lasti] == h_reviews ) ) {
// skip number before too!
if ( lasti-2>=0 && is_digit(wptrs[lasti-2][0]))
i = lasti - 3;
i = lasti - 1;
goto redo;
// skip over "Write a Review"
if ( alnumCount == 3 &&
wids[lasti] == h_write &&
wids[lasti+2] == h_a &&
wids[lasti+4] == h_review ) {
i = lasti - 1;
// skip back until we hit a tag i guess
// if we have "Be the first to Write a Review"
for ( ; i > 0 && ! tids[i] ; i-- );
goto redo;
// "Fax: "
if ( alnumCount >=2 && wids[lasti] == h_fax &&
m_words->hasChar(lasti+1,':') ) {
i = lasti - 1; goto redo; }
// "Ph: "
if ( alnumCount >=2 && wids[lasti] == h_ph &&
m_words->hasChar(lasti+1,':') ) {
i = lasti - 1; goto redo; }
// "Tel: "
if ( alnumCount >=2 && wids[lasti] == h_tel &&
m_words->hasChar(lasti+1,':') ) {
i = lasti - 1; goto redo; }
// "Telephone: "
if ( alnumCount >=2 && wids[lasti] == h_telephone &&
m_words->hasChar(lasti+1,':') ) {
i = lasti - 1; goto redo; }
// "Street Address:"
if ( alnumCount ==2 && wids[lasti] == h_street &&
wids[lasti+2] == h_address ) {
i = lasti - 1; goto redo; }
// "Location Address:"
if ( alnumCount ==2 && wids[lasti] == h_location &&
wids[lasti+2] == h_address ) {
i = lasti - 1; goto redo; }
// "Add to Favorites"
if ( alnumCount == 3 &&
wids[lasti ] == h_add &&
wids[lasti+2] == h_to &&
wids[lasti+4] == h_favorites ) {
i = lasti - 1; goto redo; }
// "view favorites"
if ( alnumCount == 2 &&
wids[lasti ] == h_view &&
wids[lasti+2] == h_favorites ) {
i = lasti - 1; goto redo; }
// "more info"
if ( alnumCount == 2 &&
wids[lasti ] == h_more &&
wids[lasti+2] == h_info ) {
i = lasti - 1; goto redo; }
// "more information"
if ( alnumCount == 2 &&
wids[lasti ] == h_more &&
wids[lasti+2] == h_information ) {
i = lasti - 1; goto redo; }
// if we just had a sequence of numbers for the place
// name then ignore that. usually a phone number. fixes
// Restaurants/Food+Delivery+Services
if ( alphaCount == 0 && alnumCount > 0 ) {
i = lasti - 1; goto redo; }
// . if street had upper case words, but we had lower case,
// then we are not a good place name!
// . put this after the redo's so we can redo things like
// "map" or "reviews" which may be in lower case
if ( (street->m_bits & PLF_HAS_UPPER) && hadLower ) {
//lasti = -1;
// skip back to a tag like we do for
// "Write a Review" skipping logic below
//i = lasti - 1;
// skip back until we hit a tag i guess
// if we have "Be the first to Write a Review"
for ( ; i > 0 && ! tids[i] ; i-- );
goto redo;
// . add the place name if we found something
// . if we broke out of the loop because of the alnumCount then
// that is NOT good because we want something that has a
// delimiter on the left!
if ( lasti >= 0 && lasti<=righti && alphaCount > 0 &&
// this is restricted above!
//alnumCount <10 &&
nn<10 ) { // && m_np<MAX_PLACES ) {
// point to it
char *p = wptrs[lasti];
// length
long plen = (wptrs[righti]+wlens[righti])-wptrs[lasti];
// set end
char *pend = p + plen;
// end on period if we had it
if ( *pend == '.' ) pend++;
// include terminating ')' if any
long parens = 0;
// start scan
for ( char *s = p ; s < pend ; s++ ) {
if ( *s == '(' ) parens++;
if ( *s == ')' ) parens--;
// term it with a ) if we had a (
if ( parens > 0 ) {
if ( *pend == ')' )
pend += 1;
else if ( is_wspace_a(*pend) && pend[1]==')')
pend += 2;
// re-set length
plen = pend - p;
// note it if crazy...
if ( plen >= 200 )
// note it
log("addr: got place name of %li chars long",
// sanity check
//if ( m_np >= MAX_PLACES ) { char *xx=NULL;*xx=0; }
// point to the place name
Place *pp = (Place *)m_pm.getMem(sizeof(Place));
if ( ! pp ) return false;
// set the type
long ptype = 0;
if ( pcount == 0 ) ptype = PT_NAME_1;
if ( pcount == 1 ) ptype = PT_NAME_2;
if ( ptype == 0 ) { char *xx=NULL;*xx=0; }
// set it
pp->m_a = lasti;
pp->m_b = righti+1;
pp->m_alnumA = -1;//alnumCount;
pp->m_alnumB = -1;//alnumCount + subcount;
pp->m_type = ptype;//PT_NAME;
pp->m_str = p;//wptrs[lasti];
pp->m_strlen = pend - p;//plen;
//pp->m_hash = h;
//pp->m_adm1[0] = 0;//pd->m_adm1[0];
//pp->m_adm1[1] = 0;//pd->m_adm1[1];
//pp->m_crid = 0;//pd->m_crid;
pp->m_bits = 0;//PLF_INFILE;
pp->m_flags2 = 0;
// reset hash
//long long h = 0LL;
// word id of previous word
//long long pi = 0LL;
// we WERE looping backwards, so we need to
// compute the hash here
setHashes ( pp , m_words , m_niceness );
// if name1/name2 is a city/state or state/city then
// do not add it
bool isGood = true;
// get previous two places, see if city/state
Place *prev1 = NULL;
Place *prev2 = NULL;
long np = m_pm.getNumPtrs();
if ( np >= 2 ) {
prev1 = (Place *)m_pm.getPtr(np-1);
prev2 = (Place *)m_pm.getPtr(np-2);
// . fix "Kimo Theater, Albuquerque NM, 423 Central"
// for
// ow/11865-kimo-theatre
// . do not allow a city & state to be the two names
// . sometimes ppl put this before the street
// . only do this after we have two names (pcount==1)
if ( pcount == 1 &&
np > savednp && // we at least added one to np
prev1 &&
prev2 &&
isCityState3 (prev1->m_hash,prev2->m_hash)==1) {
// wipe out previous name
nn = savednn;
// wipe out previous place
//m_np = savednp;
m_pm.setNumPtrs ( savednp );
// reset this too!
pcount = 0;
// skip over these guys to get real name
i = lasti - 1;
// try again
goto redo;
// and do not add this one
//isGood = false;
// too long is bad
if ( plen >= 200 )
isGood = false;
if ( ! pp->m_hash )
isGood = false;
// . if nothing worth hashing, do not add it
// . only really add if length is somewhat sane!!
if ( isGood ) {
// store it
pname[nn++] = pp;
// sanity
//if (m_np>= MAX_PLACES ){char *xx=NULL;*xx=0;}
// advance it, but not if we only had "the" for
// the place name!!
for ( long k = pp->m_a ; k < pp->m_b ; k++ ) {
// skip if not word
if ( ! wids[k] ) continue;
// . do not add the first word if its "The"
// into this
// . fixes "The Guild Cinema" not matching
// placedb entries for "Guild Cinema"
if ( h == 0LL && wids[k] == h_the ) continue;
// . convert place name word into base word
// . synonyms
// . converts 4th to fourth, etc.
long long *hw = getSynonymWord (&wids[k],&pi);
// set previous id
pi = wids[k];
// ignore it if returned 0 (ignore) (school)
if ( ! *hw ) continue;
// mix it up
h <<= 1LL;
// xor it in
h ^= *hw; // wids[k];
// only consummate it if not the single word "the"
if ( h ) {
// set it
pp->m_hash = h;
// store it
pname[nn++] = pp;
// advance it, but not if we only had "the" for
// the place name!!
// point to before us!
i = lasti - 1;
// try to get another one if we only got one
if ( ++pcount == 1 )
goto redo;
// . if no name, beat it. go to the next street we got
// . no, some events just have a street address and no
// place name!
// continue;
// END GET THE PLACE NAME before the street
// . if we had multiple streets RIGHT AFTER us, skip over them!
// . where the "po box 1293" is technically a street
// . had some too
// start looking for city/state here
Place *xstreet = (Place *)m_sm.getPtr(X);
long start = xstreet->m_b;
long startAlnum = xstreet->m_alnumB;
// as = "After Street"
long as = X + 1;
// shortcut
long ns = m_sm.getNumPtrs();
// scan the streets after street #X
for ( ; as < ns ; as++ ) {
// get that
Place *astreet = (Place *)m_sm.getPtr(as);
// stop if "as" is a "fake street"
if ( astreet->m_flags2 & PLF2_IS_NAME ) break;
// if we are NOT the ending word of prev street, then
// stop this loop.
if ( startAlnum != astreet->m_alnumA ) break;
// assign, and do the next
startAlnum = astreet->m_alnumB;
start = astreet->m_b;
// use this
Place *sss = NULL;
if ( as < ns ) sss = (Place *)m_sm.getPtr(as);
// stop if "as" is a "fake street"
if ( as<ns && (sss->m_flags2 & PLF2_IS_NAME)) as=ns;
// skip over punct
if ( start < nw && ! wids[start] ) start++;
// . skip over "in"
// . inlines "950 Pinetree SE, in Rio Rancho, NM" for
if ( start<nw && wids[start] == h_in ) {
start += 2;
// do not scan past this then
long max = nw;
if ( as < m_sm.getNumPtrs() ) max = sss->m_a;
// for and
// the street was "124 ST BTWN 5 AVE" and the intersection
// "AVE AND MT MORRIS PARK WEST" intersected with that
// street and caused this to core!
// sanity check
//if ( max <= street->m_b ) { char *xx=NULL;*xx=0; }
// begin parsing out city/adm1/ctry/zip after street name
// . start scan at street->m_b
// . end scan at "max"
// . end scan after up to 15 alnum words as well
// . adds into our places[] array we started up above that
// includes places from the title
// . i am expanding from 6 words to 15 because of :
// "111 Maple Street SE @ Maple and Central beside "
// Knadjian's Oriental Rugs in Albuquerque, New Mexico "
// 87106. "
// . and to reduce bleeding into another address i am now
// limiting based on the start of the next street, "max"
np =addProperPlaces(start,max,15,places,MAX_PLACES,np,0,
// subtract 1 since it is an OPEN ended
// half interval just like [a,b)
startAlnum - 1,-1);
// breach check
if ( np >= MAX_PLACES ) { char *xx=NULL;*xx=0; }
// check before the street, too, but stay in the sentence!
if ( nn >= 1 ) {
long na = pname[0]->m_a;
long nb = pname[0]->m_b;
pname[0]->m_alnumA - 1,-1);
if ( nn >= 2 ) {
long na = pname[1]->m_a;
long nb = pname[1]->m_b;
pname[1]->m_alnumA - 1,-1);
// breach check
if ( np >= MAX_PLACES ) { char *xx=NULL;*xx=0; }
// ** "... in Santa Fe 213 Washington Ave."
// now scan the sentence this street is in for any
// prepositional phrase beginning with the preposition "in"
// immediately followed by a city or adm1 name.
// this logic was hurting because our sentence
// formation was not good enough and we were allowing the
// many span tags in the sentence to break the sentence into
// many smaller sentences because we decided span tags should
// do that by default. so i made the sentence detection logic
// better so that would keep 213 washington ave
// in the sentence that had "in Santa Fe" still...
Section *ss = m_sections->m_sectionPtrs[street->m_a];
for ( ; ss ; ss = ss->m_parent )
if ( ss && (ss->m_flags & SEC_SENTENCE) ) break;
// might not have a sentence if we are CT_JAVASCRIPT content
// type, since we avoid sentence setting for those doc types
long sa = 0;
long sb = 0;
// scan the first and last word of the sentence this street
// is in. MAY ACTUALLY BE OUTSIDE of the "ss" section because
// of the new logic in Sections::addSentences() which allows
// us to have sentences that split sections now to deal with
//,, etc.
if ( ss ) { sa = ss->m_senta; sb = ss->m_sentb; }
// init this
bool hasRequiredPlace = false;
// set this. does it matter???
long alnumPos = 0;//ss->m_alnumA - 1;
bool afterIn = false;
// scan the sentence
for ( long i = sa ; i < sb ; i++ ) {
// skip if not alnum word
if ( ! m_wids[i] ) continue;
// count it
// skip if not "in"
if ( m_wids[i] == h_in ) {
afterIn = true;
// skip if not after the word "in"
if ( ! afterIn ) continue;
// reset in case we get continued below
afterIn = false;
// to avoid "just in case" or "in time" let's
// require it be capitalized
if ( ! m_words->isCapitalized(i) ) continue;
// find the end of it
long j = i + 1;
long lastj = j;
// loop until we hit something lowercase or number
for ( ; j < sb ; j++ ) {
// stop on tag
if ( m_tids[j] ) break;
// check case
if ( m_wids[j] ) {
// if upper that's ok
if ( ! m_words->isCapitalized(j) &&
! s_lc.isInTable(&m_wids[j]) )
// save it
lastj = j;
// stop on certain punct
char *p = wptrs[j];
char *pend = p + wlens[j];
for ( ; p < pend ; p++ ) {
if ( is_wspace_a(*p) )
// St. James?
if ( *p == '.' )
if ( p < pend ) break;
// save
long oldnp = np;
// reset
np = addProperPlaces(i,i+1,8,places,
// set the required bit
for ( long k = oldnp ; k < np ; k++ )
// set this bit
places[k].m_bits |= PLF2_REQUIRED;
// must contain a required bit?
if ( np > oldnp ) hasRequiredPlace = true;
// stop
// breach check
if ( np >= MAX_PLACES ) { char *xx=NULL;*xx=0; }
// parse up all our accumulated Places into arrays so we can
// loop over them all and get all the possible combinations
// of Place types, Place::m_type.
for ( long i = 0 ; i < np ; i++ ) {
// get it
Place *pi = &places[i];
// sanity check
if ( ! pi->m_hash ) { char *xx=NULL;*xx=0; }
// parse it up
if ( pi->m_type == PT_CITY ) {
if ( nc >= MAX_CITIES2 ) continue;
pcity[nc++] = pi;
if ( pi->m_type == PT_STATE ) {
if ( na >= MAX_ADM1 ) continue;
padm1[na++] = pi;
if ( pi->m_type == PT_ZIP ) {
if ( nz >= 10 ) continue;
pzip[nz++] = pi;
if ( pi->m_type == PT_CTRY ) {
if ( ny >= 10 ) continue;
pctry[ny++] = pi;
// sanity check
if ( pi && ! pi->m_hash ) { char *xx=NULL;*xx=0; }
// . the new way is to telescope out from our street section
// looking for cities
// . we note the telescope depth of each city/state/zip place
// we encounter so that we prefer the city topologically
// closest to us
long sa = xstreet->m_a;
if ( sa < 0 ) { char *xx=NULL;*xx=0; }
// shortcut
Place *st = xstreet;//&streets[X];
// are we a street or place name in the title?
bool streetInTitle = false;
if ( st->m_a > 0 && sp )
streetInTitle = (sp[st->m_a]->m_flags & SEC_IN_TITLE);
Section *ss = NULL;
long senta = -1;
long sentb = -1;
if ( m_sections ) {
ss = m_sections->m_sectionPtrs[street->m_a];
senta = ss->m_senta;
sentb = ss->m_sentb;
long maxZips = nz + 1;
bool hasRequiredCity = false;
bool hasRequiredState = false;
// set pcity[], array of potential cities for this street
for ( long i = 0 ; i < m_npSaved ; i++ ) {
// breathe
// get city, state or zip
Place *p = (Place *)m_pm.getPtr(i);
// sanity check
if ( p->m_alnumA < st->m_alnumA && p->m_a > st->m_a ) {
char *xx=NULL;*xx=0; }
// skip city if it intersects street
if ( p->intersects ( xstreet ) ) continue;
// skip city if it intersects the name too now
//if(nn>0&&pname[0]&&p->intersects(pname[0])) continue;
// or name2, to fix
//if(nn>1&&pname[1]&&p->intersects(pname[1])) continue;
// for zips really, should not be in the suite
if (psuite[0]&&p->intersects(psuite[0])) continue;
// is it required
bool isRequired = ( p->m_flags2 & PLF2_REQUIRED );
// . allow state to come from anywhere in the document
// . TODO: later add meta description to get
// etc.
if ( p->m_type == PT_STATE ) {
// is it in our sentence
bool inSent = (p->m_a>=senta&&p->m_a<sentb);
// if in our sentence and required, set this
if ( inSent &&
isRequired &&
// fix "in NE Albuquerque" so we do not
// think that means nebraska... this
// fixes address in
// Albuquerque/Food+Dining/Restaurants/
// Food+Delivery+Services
m_wids[p->m_a]!= h_ne)
hasRequiredState = true;
// make the key for deduping
char key[4];
key[0] = p->m_adm1[0];
key[1] = p->m_adm1[1];
key[2] = 0;
key[3] = 0;
// get if already in padm1[] array
Place **pp = (Place **)dat.getValue ( &key );
// if it is us already, skip for sure
if ( pp && *pp == p ) continue;
// if we are not near street, skip us
long dist1 = p->m_alnumA - st->m_alnumA;
long dist2 = p->m_alnumA - st->m_alnumB;
if ( dist1 < 0 ) dist1 *= -1;
if ( dist2 < 0 ) dist2 *= -1;
long mdist = dist1;
if ( dist2 < mdist ) mdist = dist2;
if ( mdist > 10 && ! inSent ) continue;
// sanity
if ( na >= 80 ) continue;
// ok, add it in even though this state might
// already be represented by another word
// somewhere else in the document
padm1 [ na++ ] = p;
// that's it
// . stop if far beyond the street
// . if in venue tag then m_a will be < 0
if ( p->m_a >= 0 &&
p->m_alnumA > st->m_alnumB + 10 )
// is place in title?
bool inTitle = (p->m_bits & PLF_FROMTITLE);
// if we are an xml doc they often have multiple
// <title> tags, one for each element, so do not
// consider in that case. this was causing
// to miss its city after the address.
if ( m_contentType == CT_XML ) inTitle = false;
// skip if before us and not in title
if ( p->m_a >= 0 &&
p->m_a < st->m_a &&
// well, allow it to be a few words before us
// to fix some addresses that have the city
// before the street. like
p->m_alnumB < st->m_alnumA - 5 &&
! inTitle )
// zip is not allowed to be before us ever though
// even if in title, which is not allowed
if ( p->m_type == PT_ZIP &&
p->m_a >= 0 &&
p->m_a < st->m_a )
// only use first zip, no because one zip may be
// in the title and the other in the body
if ( p->m_type == PT_ZIP && nz >= MAX_ZIPS )
// skip zip codes in the title
if ( p->m_type == PT_ZIP &&
p->m_a >= 0 &&
inTitle &&
! streetInTitle )
// skip zip codes in the tag
if ( p->m_type == PT_ZIP && p->m_a < 0 )
// only allow one zip from what we started with
if ( p->m_type == PT_ZIP && nz >= maxZips )
if ( p->m_type == PT_ZIP ) {
pzip [nz++] = p;
// limit to like 5 or so, that is indicative of
// a list of cities after us...
if ( nc >= MAX_CITIES )
// this can be a type of PT_NAME since we add tags
// from a tagrec like
// "Albuquerque Center for Peace and Justice;;;202
// Harvard Southeast;Albuquerque;nm;87106;;165445..."
// and that adds its places into m_places[] and
// incs m_np
if ( p->m_type != PT_CITY ) continue;
// add it to place table like how addProperPlaces() did
if ( p->m_type == PT_CITY ) pcity[nc++] = p;
// if in our sentence and required, set this
if ( p->m_a>= senta && p->m_a < sentb && isRequired )
hasRequiredCity = true;
// complain
if ( nn >= 10 ) {
if ( ! printed ) log("events: name breach");
printed = true;
//char *xx=NULL;*xx=0;
if ( nc >= MAX_CITIES ) {
if ( ! printed ) log("addr: cities breach");
printed = true;
// just bail out now to fix the slow parsing of
m_breached = true;
return false;
//char *xx=NULL;*xx=0;
if ( na >= MAX_ADM1 ) {
if ( ! printed ) log("events: adm1 breach");
printed = true;
//char *xx=NULL;*xx=0;
//if ( nc >= MAX_CITIES || nc <= 0 ) {
// log("events: city breach");
// char *xx=NULL;*xx=0;
// need at least one city or zip to make an address
if ( nc <= 1 && nz <= 1 ) continue;
// . PO Boxes do not have names
// . was picking up "yahoo" as the place name for:
// :
// " ** P.O. Box 94766, Albuquerque"
//if ( to_lower_a(street->m_str[0])=='p' ) nn = 0;
// . allow for a null place name
// . some events just have a street address with no official
// place name
if ( nn < 2 ) pname[nn++] = NULL;
if ( nn < 2 ) pname[nn++] = NULL;
// TODO: filter out places using the hashtable adm1/ctryId algo
// adjust nc
//long fakena = na + dc;
// . now the heavily nested loop (BIG LOOP)
// . first over addresses to inherit from
// . default addresses (from tagdb rec - contact info)
// . TODO: fix this i1 < 2 HACK!
for ( long i1 = 0 ; i1 < dc && i1 < 2 ; i1++ ) {
// loop over default address again, but ignore city and
// just use the adm1 (state).
// should fix "913 W. Alameda - Santa Fe" which has no state,
// but "Albuquerque, New Mexico" is in the tag!
for ( long i1b = 0 ; i1b < 2 /*3*/ ; i1b++ ) {
// adm1
for ( long i2 = 0 ; i2 < na ; i2++ ) {
// city
for ( long i3 = 0 ; i3 < nc ; i3++ ) {
// ctry
//for ( long i4 = 0 ; i4 < ny ; i4++ ) {
// zip
for ( long i5 = 0 ; i5 < nz ; i5++ ) {
// suite
for ( long i6 = 0 ; i6 < nu ; i6++ ) {
// place name
//for ( long i7 = 0 ; i7 < nn ; i7++ ) {
// breathe
// we only use i1b for default addresses in da[]
if ( i1b > 0 && i1 == 0 ) continue;
// shortcuts
Place *adm1 = padm1 [i2];
//Place *ctry = pctry [i4];
Place *zip = pzip [i5];
Place *suite = psuite [i6];
Place *name1 = pname [0];
Place *name2 = pname [1];
Place *city = pcity [i3];
// now if city is out of bounds use the venue address
if ( i1 > 0 ) {
// set it
Address *addr = &da [i1];
// always use venue's state!
adm1 = addr->m_adm1;
// 1 means inherit city too!
if ( i1b == 1 )
city = addr->m_city;
// don't take the zip!!
//zip = addr->m_zip;
zip = NULL;
if ( hasRequiredCity ) {
// skip if no city
if ( ! city ) continue;
// skip if city is not "required"
if ( ! ( city->m_flags2 & PLF2_REQUIRED ) )
// must be in our sentence! this fixes
// when we had "... in Central New Mexico"
// in the title, it thought Central was the
// city. but we had "in Abq" in our sentence.
// and both cities had this bit set but
// only Abq should have applied!
if ( city->m_a < senta ) continue;
if ( city->m_a >= sentb ) continue;
if ( hasRequiredState ) {
// skip if no state
if ( ! adm1 ) continue;
// skip if state is not "required"
if ( ! ( adm1->m_flags2 & PLF2_REQUIRED ) )
// see the "city" fix right above
if ( adm1->m_a < senta ) continue;
if ( adm1->m_a >= sentb ) continue;
// no overlap of adm1 and city
if ( adm1 && city &&
adm1->m_a >= 0 &&
adm1->m_a == city->m_a ) continue;
// if we had a prepositional phrase starting with "in"
// then we must contain its city/adm1 name if it
// had one...
if ( hasRequiredPlace ) {
bool gotIt = false;
if ( city && (city->m_bits & PLF2_REQUIRED ) )
gotIt = true;
if ( adm1 && (adm1->m_bits & PLF2_REQUIRED ) )
gotIt = true;
if ( ! gotIt )
// . inherit!
// . "addr" i think is just the default venue addr now
if ( i1b == 0 ) {
// if addr is supplying these, skip if there
// was a collision.
if ( addr->m_adm1 && adm1 ) continue;
if ( addr->m_city && city ) continue;
//if(addr->m_name.m_str && name ) continue;
if ( addr->m_adm1 ) adm1 = addr->m_adm1;
if ( addr->m_city ) city = addr->m_city;
//if(addr->m_name.m_str ) name = &addr->m_name;
// . if i1b is 1 then we only inherit adm1!!!
// . this fixes the bug for 913 W. Alameda described
// above.
else if ( i1b == 1 ) {
// if addr is supplying these, skip if there
// was a collision.
if ( addr->m_adm1 && adm1 ) continue;
if ( addr->m_adm1 ) {
adm1 = addr->m_adm1;
if(!adm1->m_hash){char *xx=NULL;*xx=0;}
// need a city, can be implied by a zip
if ( ! city && ! zip ) continue;
// the CF_UNIQUE is too inaccurate for this!!
//bool hasState = false;
//if ( adm1 ) hasState = true;
//if ( zip ) hasState = true;
//if ( city && city->m_alnumA == st->m_alnumB &&
// city->m_adm1[0] )
// hasState = true;
//if ( ! hasState ) continue;
// . need a state too, can be implied by a zip
// . certain unique cities can also imply the state,
// like "Albuquerque" or "Washington DC"
if ( ! adm1 && ! zip ) continue;
// . how to fix "1024 4th St SW in downtown
// Albuquerque" which has no adm1?
// . get the adm1/state from the city, BUT
// only if city is UNIQUE!!!
if ( ! adm1 && city->m_bits & PLF_UNIQUE ) {
tap.m_crid = city->m_crid;
tap.m_str = city->m_adm1;
tap.m_strlen = 2;
tap.m_adm1[0] = city->m_adm1[0];
tap.m_adm1[1] = city->m_adm1[1];
adm1 = &tap;
// this is required
//if ( ! adm1 ) continue;
//if ( ! name ) continue;
// quickly check adm1 vs. city
//if ( adm1->m_adm1[0] != city->m_adm1[0] ) continue;
//if ( adm1->m_adm1[1] != city->m_adm1[1] ) continue;
//if ( adm1->m_crid != city->m_crid ) continue;
if ( adm1 && city &&
!(adm1->m_adm1Bits & city->m_adm1Bits))
// sanity check
if ( zip && ! zip->m_hash ) { char *xx=NULL;*xx=0; }
// cancel out bad zips
if ( zip && adm1 && adm1->m_adm1Bits!=zip->m_adm1Bits)
zip = NULL;//continue;
//if ( adm1->m_crid !=zip->m_crid )continue;
// cut the long long to a long for this compare
if ( zip && city && city->m_hash != zip->m_cityHash )
zip = NULL;//continue;
// debug
Address tmp;
memset ( &tmp , 0 , sizeof(Address) );
if ( street ) tmp.m_street = street;
if ( adm1 ) tmp.m_adm1 = adm1;
if ( city ) tmp.m_city = city;
//if ( ctry ) tmp.m_ctry = ctry;
if ( zip ) tmp.m_zip = zip;
if ( suite ) tmp.m_suite = suite;
if ( name1 ) tmp.m_name1 = name1;
if ( name2 ) tmp.m_name2 = name2;
//if ( street->m_str[0]=='4' && city->m_str[0]=='R'
// && name->m_str[0]=='E' && adm1->m_str[1]=='M'
// && name->m_str[20]=='n' ) {
//printAddress ( &tmp,NULL,0);
//log("events: i1=%li i2=%li i3=%li i4=%li "
// "i5=%li i6=%li i7=%li",
// i1,i2,i3,i4,i5,i6,i7);
// clear
char flags3 = 0;
// this should be an address flag because we might
// be using a city/state from another sentence
// in which it is required, but it is not for us
// if we are in a different sentence
if ( hasRequiredCity )
if ( hasRequiredState )
// . now try to add place vec to our array of addresses
// . we now supply the containing section, "sec"
// so we can vote on which tag hash supplied the best
// addresses
if ( ! addAddress ( name1 ,
name2 ,
suite ,
street ,
city ,
adm1 ,
zip ,
NULL , // ctry ,
startAlnum ,
flags3 ,
NULL ) ) return false;
//if ( m_breached )
// goto bustout;
} // i1
} // i1b
} //adm1
//} ctry
} //i5 nz
// end the BIG LOOP
// CRAP! this algo was causing many streets to be ignored on
// because it has like "main st" and "central" in multiple cities!
// so comment this algo out and try to think of a better way
// now if all street names are the same but with a
// different city then i would say nuke them! cuz it
// can be a list of some kind of statistic per city,
// like
// Amsterdam Netherlands (114 events)
// Anaheim CA United States (249 events)
// Ann Arbor MI United States (155 events)
// Atlanta GA United States (708 events)
// on
// only allow one city to use a streetHash
HashTableX su;
char subuf[2000];
// set allowDups to true!!!!
su.set ( 8 , 8 , 0 , subuf , 2000 , true , m_niceness );
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get it
Address *a = (Address *)m_am.getPtr(i);
// skip if not inlined
if ( ! ( a->m_flags & AF_INLINED ) ) continue;
// get street hash
long long sh = a->m_street->m_hash;
// get city hash
long long ch = a->m_city.m_hash;
// hash it. return false with g_errno set on error
if ( ! su.addKey ( &sh , &ch ) ) return false;
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get it
Address *a = (Address *)m_am.getPtr(i);
// skip if not inlined
if ( ! ( a->m_flags & AF_INLINED ) ) continue;
// get street hash
long long sh = a->m_street->m_hash;
// how many different cities have this same street?
long slot = su.getSlot ( &sh );
// reset count
long count = 0;
// multiple places might have this hash
for ( ; slot>=0 ; slot = su.getNextSlot ( slot , &sh ) ) {
// count it
// if only 1 city had this street name, keep it
if ( count <= 1 ) continue;
// otherwise, ignore this address
a->m_flags &= ~AF_INLINED;
a->m_flags |= AF_IGNORE;
// free mem just in case
// bustout:
// set the AF_AMBIGUOUS bits of each Address if we should
//log("events: combos=%li",combos);
//char *xx=NULL;*xx=0;
//log("events: sleeping 3 seconds. waiting for possible Ctrl-C");
return true;
// . builds a PT_ZIP Place for the single word at index "a" if that word's
//   id is a key in the g_zips table
// . "a" indexes words->m_words[] and words->m_wordIds[]
// . "alnumPos" is stored as the Place's m_alnumA; m_alnumB becomes alnumPos+1
// . returns NULL if the word does not start with a digit or its id is not
//   found in g_zips
// . otherwise returns a pointer to a function-local static Place, so the
//   next successful call overwrites the previously returned contents
Place *getZipPlace ( long a , long alnumPos , Words *words ) {
// must be a number (only the first character of the word is checked)
if ( ! is_digit(words->m_words[a][0]) ) return NULL;
// return this if we got one. static, so it is shared by every call.
static Place p;
// make hash. XOR with 0 leaves the word id unchanged, so the lookup key
// is just the word id itself.
long long h = 0 ^ words->m_wordIds[a];
// check for zip code
long slot = g_zips.getSlot(&h);
// a negative slot means the key is not in the table
if ( slot < 0 ) return NULL;
// get the zip descriptor stored in that slot
ZipDesc *zd =(ZipDesc *)g_zips.getValueFromSlot(slot);
// copy the state info from the zip descriptor into the place
p.m_adm1Bits = zd->m_adm1Bits;
p.m_adm1[0]  = zd->m_adm1[0];
p.m_adm1[1]  = zd->m_adm1[1];
p.m_type = PT_ZIP;
// the place covers exactly one word: the half-open interval [a,a+1)
p.m_a = a;
p.m_b = a+1;
p.m_bits = 0;
// and exactly one alnum position
p.m_alnumA = alnumPos;
p.m_alnumB = alnumPos+1;
// m_str points into the Words buffer; it is not copied
p.m_str = words->m_words[a];
p.m_strlen = words->m_wordLens[a];
p.m_hash = h;
// city hash and city name come from the zip descriptor; the name is
// addressed as an offset into g_cityBuf
p.m_cityHash = zd->m_cityHash;
p.m_cityStr = g_cityBuf + zd->m_cityOffset;
return &p;
// . tries to match a city name that starts at word index "a"
// . accumulates a running hash over successive alnum words and looks that
//   hash up in g_cities after each word
// . each hit overwrites the function-local static Place "p", and "retp" is
//   what is finally returned (NULL if there was no hit)
// . returns NULL right away if a matched city name is followed, two word
//   slots later, by the word whose id is h_county
// . the returned pointer refers to static storage, so a later call that
//   finds a match overwrites it
// . NOTE(review): in this view of the file the closing braces, and a few
//   statements flagged below, are not visible -- confirm against the full
//   source before relying on the exact control flow
Place *getCityPlace ( long a , long alnumPos , Words *words ) {
// return this if we got one
static Place p;
// init hash to zero
long long h = 0LL;
// number of alnum words consumed so far
long count = 0;
// record start so m_alnumA can be set to where we began
long startAlnumPos = alnumPos;
// fix this
// NOTE(review): no statement follows the "fix this" comment here --
// possibly an adjustment of alnumPos; confirm
// return this
Place *retp = NULL;
// for some filtering: word hashes computed once on the first call and
// cached in function-local statics
static bool s_flag = false;
static long long h_university;
static long long h_of;
if ( ! s_flag ) {
s_flag = true;
h_university = hash64n("university");
h_of = hash64n("of");
// shortcut
long nw = words->m_numWords;
// counts every word slot visited, alnum or not
long wcount = 0;
// loop over words starting at "a", bounded by nw and the limits below
for ( long k = a ; k < nw ; k++ ) {
// stop once the 20th word slot is reached
if ( ++wcount >= 20 ) break;
// skip if not alnum
if ( ! words->isAlnum(k) ) continue;
// count it
// NOTE(review): nothing increments alnumPos in the visible code even
// though m_alnumB below is derived from it -- confirm
// only up to 4 words in a city name
if ( ++count >= 5 ) break;
// get the hash of potential place name
long long wid = words->m_wordIds[k];
// shortcut
long wlen = words->m_wordLens[k];
char *wptr = words->m_words[k];
// if it ended in apostrophe s then fix that
if ( wlen > 2 &&
wptr[wlen-2]=='\'' &&
to_lower_a(wptr[wlen-1]) == 's' )
// hash the word without the 's
wid = hash64Lower_utf8(wptr,wlen-2);
// mix it up: shift the running hash left one bit ...
h <<= 1;
// ... then xor this word's hash into it
h ^= wid; // words->m_wordIds[k];
// might be alias
//long long *ah1 = (long long *) g_aliases.getValue(&h);
//if ( ah1 ) h = *ah1;
// ignore "University" if "of" follows
// NOTE(review): the statement this condition guards is not visible here
// (presumably a continue or an early return) -- confirm
if ( h == h_university &&
k + 2 < nw &&
words->m_wordIds[k+2] == h_of )
// look the running hash up in the city table; no entry means keep
// extending the candidate with the next word
CityDesc *cd = (CityDesc *)g_cities.getValue(&h);
if ( ! cd ) continue;
// check for "county" (santa fe county is not a city name)
if ( k + 2 < nw && words->m_wordIds[k+2] == h_county )
return NULL;
// shortcuts
char **wptrs = words->getWords();
long *wlens = words->getWordLens();
// set the place: covers words [a,k+1) and the text from the start of
// word a through the end of word k
p.m_adm1Bits = cd->m_adm1Bits;
p.m_type = PT_CITY;
p.m_a = a;
p.m_b = k+1;
p.m_alnumA = startAlnumPos;
p.m_alnumB = alnumPos+1;
p.m_str = wptrs[a];
p.m_strlen = wptrs[k]+wlens[k]-wptrs[a];
// for a city both the place hash and the city hash are the running hash
p.m_hash = h;
p.m_cityHash = h;
p.m_bits = 0;
// if the CF_UNIQUE bit is set, fill in the two-character adm1 code
// from getStateAbbr()
if ( p.m_adm1Bits & CF_UNIQUE ) {
// get it
char *ap = getStateAbbr ( p.m_adm1Bits );
// set it
p.m_adm1[0] = ap[0];
p.m_adm1[1] = ap[1];
else {
// otherwise leave the adm1 code empty
p.m_adm1[0] = 0;
p.m_adm1[1] = 0;
// note it
retp = &p;
// see if we can beat it though: a later hit on a longer word sequence
// overwrites p
return retp;
// . tries to match a state name that starts at word index "a"
// . accumulates a running hash over successive alnum words and passes it to
//   getStateOffset() after each word
// . on a hit, fills in the function-local static Place "p" as a PT_STATE
//   and returns a pointer to it; a later hit overwrites that storage
// . returns NULL if no hit was found
// . NOTE(review): in this view of the file the closing braces, and a few
//   statements flagged below, are not visible -- confirm against the full
//   source before relying on the exact control flow
Place *getStatePlace ( long a , long alnumPos , Words *words ) {
// return this if we got one
static Place p;
// init hash to zero
long long h = 0LL;
// number of alnum words consumed so far
long count = 0;
// record start so m_alnumA can be set to where we began
long startAlnumPos = alnumPos;
// fix this
// NOTE(review): no statement follows the "fix this" comment here --
// possibly an adjustment of alnumPos; confirm
// shortcut
long nw = words->getNumWords();
// loop over words starting at "a", bounded by nw and the count limit below
for ( long k = a ; k < nw ; k++ ) {
// skip if not alnum
if ( ! words->isAlnum(k) ) continue;
// count it
// NOTE(review): nothing increments alnumPos in the visible code even
// though m_alnumB below is derived from it -- confirm
// only up to 3 words "district of columbia"
if ( ++count >= 4 ) break;
// get the hash of potential place name
long long wid = words->m_wordIds[k];
// shortcut
long wlen = words->m_wordLens[k];
char *wptr = words->m_words[k];
// if it ended in apostrophe s then fix that
if ( wlen > 2 &&
wptr[wlen-2]=='\'' &&
to_lower_a(wptr[wlen-1]) == 's' )
// hash the word without the 's
wid = hash64Lower_utf8(wptr,wlen-2);
// mix it up: shift the running hash left one bit ...
h <<= 1;
// ... then xor this word's hash into it
h ^= wid; // words->m_wordIds[k];
// get the index of this hash in the state table
long pos = getStateOffset ( &h );
// a negative offset means not a state; try extending with the next word
if ( pos < 0 ) continue;
// shortcuts
char **wptrs = words->getWords();
long *wlens = words->getWordLens();
// otherwise, set it. the adm1 bit vector has the single bit "pos" set.
long long stateBit = 1LL << pos;
p.m_adm1Bits = stateBit;
p.m_type = PT_STATE;
// covers words [a,k+1) and the text from the start of word a through
// the end of word k
p.m_a = a;
p.m_b = k+1;
p.m_alnumA = startAlnumPos;
p.m_alnumB = alnumPos+1;
p.m_str = wptrs[a];
p.m_strlen = wptrs[k]+wlens[k]-wptrs[a];
// set adm1 code from the s_states[] entry at the same index
StateDesc *sd = &s_states[pos];
p.m_adm1[0] = sd->m_adm1[0];
p.m_adm1[1] = sd->m_adm1[1];
// for a state the place hash is the adm1 bit vector, not the word hash
p.m_hash = p.m_adm1Bits;;
p.m_bits = 0;
return &p;
// no state name matched
return NULL;
// . returns -1 and sets g_errno on error
// . returns false if not city/state combo, true otherwise
long Addresses::isCityState3 ( long long h1 , long long h2 ) {
long long nh1 = h1;
long long nh2 = h2;
// we now put the aliases into g_cities as if they were their own
// cities!
// convert aliases -- only for cities methinks
//long long *ah1 = (long long *) g_aliases.getValue(&h1);
//if ( ah1 ) nh1 = *ah1;
//long long *ah2 = (long long *) g_aliases.getValue(&h2);
//if ( ah2 ) nh2 = *ah2;
// get the places
bool c1 = g_cities.isInTable ( &nh1 );
bool c2 = g_states.isInTable ( & h1 );
if ( ! c1 && ! c2 ) return false;
bool d1 = g_cities.isInTable ( &nh2 );
bool d2 = g_states.isInTable ( & h2 );
if ( ! d1 && ! d2 ) return false;
// "Coutrnyside Mobile Home Park" is a PPL (popluated place) in MN
// so we assume it to be a city. then it is mentioned on the new mexico
// page in new mexico.
// so make sure the city is in that state i guess...
if ( d1 && c2 ) {
CityDesc *cd = (CityDesc *)g_cities.getValue(&nh2);
uint64_t sb = getStateBitFromHash ( &h1 );
if ( ! ( (cd->m_adm1Bits) & sb ) ) { d1 = false; c2 = false; }
if ( d2 && c1 ) {
CityDesc *cd = (CityDesc *)g_cities.getValue(&nh1);
uint64_t sb = getStateBitFromHash ( &h2 );
if ( ! ( (cd->m_adm1Bits) & sb ) ) { d2 = false; c1 = false; }
if ( c1 && d2 ) return true;
if ( c2 && d1 ) return true;
return false;
// . returns true if the alnum words in the word range [a,b) hash to a key
//   that is in g_cities
// . the hash starts at zero; for each alnum word it is shifted left one
//   bit and the word id is xor-ed in. non-alnum words are ignored.
// . no alias translation is done here
bool Addresses::isCityName ( long a , long b ) {
	long long cityHash = 0LL;
	for ( long i = a ; i < b ; i++ ) {
		// only alnum words contribute to the hash
		if ( m_words->isAlnum(i) )
			cityHash = ( cityHash << 1 ) ^ m_wids[i];
	}
	// is the accumulated hash a known city?
	return g_cities.isInTable ( &cityHash );
}
// . returns true if a state name starts at word #a
// . scans forward from word #a (up to m_nw), hashing alnum words the same
//   way isCityName() does, and checks g_states after each one
// . gives up once a fourth alnum word is reached, since the longest name
//   is three words ("district of columbia")
bool Addresses::isStateName ( long a ) {
	long long stateHash = 0LL;
	// how many more alnum words we may still hash
	long alnumsLeft = 3;
	for ( long i = a ; i < m_nw ; i++ ) {
		// punctuation and tags do not count
		if ( ! m_words->isAlnum(i) ) continue;
		// fourth alnum word without a match: not a state
		if ( alnumsLeft-- == 0 ) return false;
		// fold this word into the running hash
		stateHash = ( stateHash << 1 ) ^ m_wids[i];
		// done as soon as the words so far form a state name
		if ( g_states.isInTable ( &stateHash ) ) return true;
	}
	// ran out of words
	return false;
}
// . returns true if section "si" is a "city, state" pair that ends exactly
//   on the section's last word
// . sections whose first/last word positions are 20 or more apart, or not
//   in increasing order, are rejected without looking at the words
// . used by Events.cpp to demote title score
bool Addresses::isCityState ( Section *si ) {
	// distance between the first and last word positions
	long span = si->m_lastWordPos - si->m_firstWordPos;
	if ( span < 1 || span > 19 ) return false;
	// try to match city+state in [m_a, m_lastWordPos]
	long matchEnd = isCityState2 ( si->m_a , si->m_lastWordPos + 1 );
	// a non-positive result counts as no match; otherwise the match
	// must stop right at the section's last word
	return matchEnd > 0 && matchEnd == si->m_lastWordPos;
}
// . returns 1 if isCityState2() finds a city followed by its state
//   starting at word #a and looking as far as m_nw, 0 otherwise
// . isCityState2() signals "no match" with a negative value
long Addresses::cityAdm1Follows ( long a ) {
	return ( isCityState2 ( a , m_nw ) < 0 ) ? 0 : 1;
}
// . looks for a city name followed by a state name in the word range [a,b)
// . skips forward from "a" to the first word with a word id, asks
//   getCityPlace() for a city there, then skips forward from the end of the
//   city to the next word with a word id and asks getStatePlace() for a state
// . the state's adm1 bit must be one of the city's m_adm1Bits
// . returns the index of the last word of the state on success, -1 otherwise
long Addresses::isCityState2 ( long a , long b ) {
// m must lie on a punt word or tag
for ( ; a < b ; a++ ) {
// breathe
// NOTE(review): no statement follows "breathe" in the code visible here
// stop on wid
if ( m_wids[a] ) break;
// bail if no wid
if ( a >= b ) return -1;
// the alnum position passed to getCityPlace() is 0 here
Place *cp = getCityPlace ( a , 0 , m_words );
if ( ! cp ) return -1;
// point to start of state
long sta = cp->m_b;
for ( ; sta < b ; sta++ ) {
// breathe
// need a wid
if ( m_wids[sta] ) break;
// bail if no room
if ( sta >= b ) return -1;
// otherwise, see if its a state
Place *sp = getStatePlace ( sta , cp->m_alnumB , m_words );
// skip if not
if ( ! sp ) return -1;
// now we make sure city supports state
if ( ! ( sp->m_adm1Bits & cp->m_adm1Bits ) ) return -1;
// return last word we match otherwise
// (m_b is one past the last word of the state, hence the -1)
return sp->m_b - 1;
// . clears AF_AMBIGUOUS on every address in m_am, then walks runs of
//   addresses that share the same street start word (m_street->m_a) and
//   sets AF_AMBIGUOUS (and possibly AF2_BADCITYSTATE in m_flags3) on them
// . NOTE(review): several statements this function depends on (counter
//   increments, bodies of some "if"s) are not visible in this copy of the
//   code -- confirm the details against the full file
void Addresses::setAmbiguousFlags ( ) {
// clear those flags first
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
Address *ai = (Address *)m_am.getPtr(i);
ai->m_flags &= ~AF_AMBIGUOUS;
// . loop over the addresses we got
// . determine which addresses we want to add to placedb and namedb
// . placedb key is based on street address, city, adm1,crid(ctry),name
// . namedb key is based on name, city, adm1, crid
// . only add in addresses that are definitive
// . must have zip code, must not have another address with the same
// street address
for ( long i = 0 ; i < m_am.getNumPtrs() - 1 ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// get it
Address *a = (Address *)m_am.getPtr(i);
// do not do fake street names
if ( a->m_street->m_a < 0 ) continue;
// reset verified counts
// one counter per AF_VERIFIED_* flag tested in the scan below
long verified1 = 0;
long verified2 = 0;
long verified3 = 0;
long verified4 = 0;
// count dups, addresses using the same street
long dups = 0;
// do we have other verified addresses using this street?
// the scan starts at i itself and runs while the street start word matches
for ( long j = i ; j < m_am.getNumPtrs() ; j++ ) {
// get one before us
Address *b = (Address *)m_am.getPtr(j);
// stop when street is different
if ( b->m_street->m_a != a->m_street->m_a ) break;
// count dups
// NOTE(review): no increment of "dups" is visible here
// is "b" verified?
// NOTE(review): the bodies of these four tests are not visible here;
// presumably each bumps the matching verifiedN counter -- confirm
if ( b->m_flags & AF_VERIFIED_STREET )
if ( b->m_flags & AF_VERIFIED_STREET_NUM )
if ( b->m_flags & AF_VERIFIED_PLACE_NAME_1 )
if ( b->m_flags & AF_VERIFIED_PLACE_NAME_2 )
// loop over all the dups
// this second pass only runs when at least two addresses share the street
for ( long j = i ; dups >= 2 && j < m_am.getNumPtrs() ; j++ ) {
// get one before us
Address *b = (Address *)m_am.getPtr(j);
// stop when street is different
if ( b->m_street->m_a != a->m_street->m_a ) break;
// if we are the only verified, we are not ambiguous
if((b->m_flags&AF_VERIFIED_STREET )&&verified1==1)
// otherwise, we are!
b->m_flags |= AF_AMBIGUOUS;
// this now too only if some street made it through
if ( verified2 ) b->m_flags3 |= AF2_BADCITYSTATE;
// . one synonym pair of C strings
// . s_synList is an array of these, and getSynonymWord() hashes m_s1 and
//   m_s2 and stores hash(m_s1) -> hash(m_s2) in the s_syn table
class SynTwin {
// the word as it may appear in a place name
char *m_s1;
// the word that m_s1 is mapped to
char *m_s2;
// map the place name synonyms here
// each entry is {m_s1,m_s2}; getSynonymWord() maps the hash of the first
// string to the hash of the second
// NOTE(review): only two entries are visible in this copy of the table, and
// the first one begins with a comma -- confirm against the full file
static SynTwin s_synList[] = {
,{"4th","fourth"} // North 4th Arts Center, Abq NM
,{"theatre","theater"} // Kimo Theatre
// smith elementary should equal smith elementary school
// how about road stuff
// from
// cat usps_abbreviations.html | grep -v "*" | grep -v "back to" | awk '{print ",{\""$2"\",\""$1"\"}"}' > foo
// cat usps_abbreviations.html | grep -v "*" | grep -v "back to" | awk '{print ",{\""$3"\",\""$1"\"}"}' >> foo
// cat foo | sort | uniq >> Address.cpp
// . cities and states
// . helps with "abq square dance center" i guess
// . "abq jump" --> "albuquerque jump"
// hash table filled from s_synList the first time getSynonymWord() runs
static HashTableX s_syn;
// set to true once s_syn has been stocked
static bool s_synInit = false;
// . normalize some words in the place name
// . synonyms
// . 4th --> fourth
// . theatre --> theater
// . school --> {0}
// . "h" points to the hash of the current word, "prevId" to the hash of the
//   word before it
// . returns a pointer to the hash to use instead: a pointer to a static
//   zero when the word should be ignored, the value pointer from the s_syn
//   table when the word has a synonym, or "h" itself when it has none
// . on the first call it stocks s_syn from s_synList and sets the static
//   word hashes; it deliberately dereferences NULL (core dump) if the table
//   cannot be set up
long long *getSynonymWord ( long long *h, long long *prevId, bool isStreet ) {
static long long h_cafeteria;
static long long h_auditorium;
static long long h_school;
static long long h_library;
static long long h_zero;
static long long h_the;
// set syn table?
if ( ! s_synInit ) {
// init it
if ( ! s_syn.set ( 8,8,1024,NULL,0,false,0,"syntbl")){
// core dump if this fails
char *xx=NULL;*xx=0;}
// stock it
long n = (long)sizeof(s_synList)/ sizeof(SynTwin);
for ( long i = 0 ; i < n ; i++ ) {
// breathe
//QUICKPOLL ( m_niceness );
char *s1 = s_synList[i].m_s1;
char *s2 = s_synList[i].m_s2;
long len1 = gbstrlen ( s1 );
long len2 = gbstrlen ( s2 );
long long sh1 = hash64Lower_utf8 ( s1 , len1 );
long long sh2 = hash64Lower_utf8 ( s2 , len2 );
// skip if the same
if ( sh1 == sh2 ) continue;
// sanity check
if ( sh1 == 0 ) { char *xx=NULL;*xx=0; }
// core on failure here, this is critical
if ( ! s_syn.addKey (&sh1,&sh2)){char *xx=NULL;*xx=0;}
// set these
h_cafeteria = hash64b ( "cafeteria" );
h_auditorium = hash64b ( "auditorium" );
h_school = hash64b ( "school" );
h_library = hash64b ( "library" );
h_the = hash64b ( "the" );
h_zero = 0LL;
// only call once
s_synInit = true;
// the blanking rules below are for place names, not street names
// NOTE(review): where this "if" scope ends is not visible in this copy of
// the code -- confirm which of the following tests it covers
if ( ! isStreet ) {
// . fix for "Grant Middle School Cafeteria"
// . blank out "school cafeteria"
if ( *h==h_cafeteria && *prevId == h_school ) return &h_zero;
// blank out "school auditorium"
if ( *h==h_auditorium && *prevId == h_school ) return &h_zero;
// try for "Loma Colorado Main Library Auditorium"?
if ( *h==h_auditorium && *prevId == h_library ) return &h_zero;
// smith elementary should equal smith elementary school
if ( *h==h_school ) return &h_zero;
// TODO: uncomment this later and replace h_the logic above
// blank out "the" when no word came before it (previous id is zero)
if ( *h == h_the && *prevId == 0LL ) return &h_zero;
long long *p = (long long *)s_syn.getValue64 ( *h );
// check city aliases table. we no longer store city aliases
// in the synonym list
// . no! might have "SF Smith" not "Santa Fe Smith"
//if ( ! p ) {
// long long *ah1 = (long long *) g_aliases.getValue(h);
// if ( ah1 ) return ah1;
// return what we had if not in syn table
if ( ! p ) return h;
// . if *p is 0, that means to ignore it!
// . return the mapped guy otherwise
return p;
// . writes an HTML report of the parsed places and addresses into "pbuf"
// . prints the places in m_sm and m_pm, then a table of the addresses in
//   m_am that are NOT considered valid, then a "shortdisplay" div holding a
//   QA checkbox and a table of the valid addresses
// . an address counts as valid if it is inlined (unless its street has
//   PLF2_IS_NAME set) or if any of AF_VERIFIED_PLACE_NAME_1,
//   AF_VERIFIED_PLACE_NAME_2 or AF_VERIFIED_STREET is set
// . "uh64" is printed into the checkbox's onclick handler and handed to
//   print2() for the valid addresses
// . NOTE(review): parts of the format strings and the end of this function
//   are not visible in this copy of the code
void Addresses::print ( SafeBuf *pbuf , long long uh64 ) {
// print the streets first
printPlaces( &m_sm , pbuf , m_sections , NULL);//&m_addresses[0] );
// print NAMES then
printPlaces( &m_pm , pbuf , m_sections , NULL);//&m_addresses[0] );
// shared header for both address tables; it is passed the table title
// NOTE(review): no %s conversion is visible in the string as shown here
char *hdrFormat =
"<table cellpadding=3 border=1>\n"
"<td colspan=40>"
// table header row
"<td><b><nobr>start word</nobr></b></td>"
"<td><nobr><b>place name 1</b></nobr></td>"
"<td><nobr><b>place name 2</b></nobr></td>"
"</tr>\n" ;
// print address table header
pbuf->safePrintf ( hdrFormat , "Invalid Addresses" );
// print the final winning addresses
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
// get it
Address *aa = (Address *)m_am.getPtr(i);
// is inlined or verified?
bool valid = false;
if ( aa->m_flags & AF_INLINED ) valid = true;
// but unverified streetisname is not good
if ( aa->m_street && (aa->m_street->m_flags2 & PLF2_IS_NAME))
valid = false;
// any verification flag makes it valid again
if ( aa->m_flags & AF_VERIFIED_PLACE_NAME_1 ) valid = true;
if ( aa->m_flags & AF_VERIFIED_PLACE_NAME_2 ) valid = true;
if ( aa->m_flags & AF_VERIFIED_STREET ) valid = true;
// we are only printing INvalids in this table
if ( valid ) continue;
// print to page parser pbuf
// "ai" is the same m_am entry as "aa" above
Address *ai = (Address *)m_am.getPtr(i);
ai->print2 ( i,pbuf , 0 );
pbuf->safePrintf("<a name=events>\n");
// Spider.cpp when storing parse.* file will also store an
// abbreviate file called parse-shortdisplay.* consisting only
// of these div tags for rendering within the qa.html file! that
// way the qa person can easily check/uncheck all the checkboxes
// right in the qa.html file
pbuf->safePrintf("<div class=shortdisplay>\n");
// print checkbox to indicate if events are wrong
pbuf->safePrintf ( "<!--ignore-->" // ignore for Test.cpp diff
// light blue background
"<span class=validated "
"<input type=checkbox "
"onclick=\"senddiv(this,'%lli');\" "
"<div class=validated style=display:inline>"
" Has <b>address</b> parsing issue. Flag to fix."
"<br>\n" ,
uh64 );
// print address table header
pbuf->safePrintf ( hdrFormat , "Inlined and Verified Addresses" );
// . first print only the INLINED (valid) addresses
// . i guess if they are verified that is considered valid too!
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// get it
Address *aa = (Address *)m_am.getPtr(i);
// is inlined or verified?
// same validity test as in the loop for the invalid table
bool valid = false;
if ( aa->m_flags & AF_INLINED ) valid = true;
// but unverified streetisname is not good
if ( aa->m_street && (aa->m_street->m_flags2 & PLF2_IS_NAME))
valid = false;
if ( aa->m_flags & AF_VERIFIED_PLACE_NAME_1 ) valid = true;
if ( aa->m_flags & AF_VERIFIED_PLACE_NAME_2 ) valid = true;
if ( aa->m_flags & AF_VERIFIED_STREET ) valid = true;
// this table holds only the valid ones
if ( ! valid ) continue;
// print to page parser pbuf
aa->print2 ( i,pbuf , uh64 );//&m_addresses[0]);
pbuf->safePrintf("</div class=shortdisplay>\n");
pbuf->safePrintf("<i>NOTE: a name must be VERIFIED before it will "
"be a KEY in placedb. So you generally need two "
"places inlining the same name before that will "
2014-02-10 06:09:44 +04:00
2013-08-03 00:12:24 +04:00
// . looks up each word/phrase in our table of known places
// . table incudes cities, countries, states (adm1), counties, zipcodes
// . scans the words in [a,b) and, for each capitalized start word j, hashes
//   the phrase starting there (looking at most 6 words ahead) and looks the
//   hash up in g_cities, and for single words also in g_zips
// . every hit is written into places[np] with PLF_INFILE and "flags" set
//   in m_bits; nothing more is stored once np reaches maxPlaces
// . "alnumPos" is the base for the m_alnumA/m_alnumB positions stored
// . if "forcedEnd" is >= 0, phrases ending before that word are skipped
// . returns np
// . NOTE(review): several statements (counter increments, bodies of some
//   "if"s, closing braces) and two truncated expressions are not visible in
//   this copy of the code -- confirm the details against the full file
long Addresses::addProperPlaces ( long a ,
long b ,
long maxAlnumCount ,
Place *places ,
long maxPlaces ,
long np ,
pbits_t flags ,
// this count excludes "a"?
long alnumPos ,
long forcedEnd ) {
// shortcuts
Words *ww = m_words;
long nw = ww->getNumWords();
long long *wids = ww->getWordIds();
char **wptrs = ww->getWordPtrs();
long *wlens = ww->getWordLens();
nodeid_t *tids = ww->getTagIds();
// "4 miles" and "miles" does not mean "miles, california", the city
long long h_miles = hash64 ( "miles",5);
long long h_mi = hash64 ( "mi",2);
long long h_kilometers= hash64 ( "kilometers",10);
long long h_km = hash64 ( "km",2);
// reset this count again
long alnumCount = 0;
// after the street is an optional city
for ( long j = a ; j<b && alnumCount<maxAlnumCount ; j++ ) {
// breathe
QUICKPOLL ( m_niceness );
// skip if not alnum
if ( ! wids[j] ) continue;
// count alnums
// NOTE(review): no increment of alnumCount is visible here
// skip "miles" in "4 miles"
// each test checks that the word two positions back starts with a digit
// NOTE(review): the bodies of these four tests are not visible here
if ( wids[j] == h_miles && j-2>= 0 && is_digit(wptrs[j-2][0]))
if ( wids[j] == h_mi && j-2>= 0 && is_digit(wptrs[j-2][0]))
if ( wids[j] == h_km && j-2>= 0 && is_digit(wptrs[j-2][0]))
if ( wids[j] == h_kilometers&&j-2>=0&&is_digit(wptrs[j-2][0]))
// . skip if only one char
// . no! might be like "N. M." to be "new mexico"
//if ( wlens[j] == 1 ) continue;
// . skip if two chars and not capitalized
// . no! misses "123 main st, albuquerque, nm"
//if ( wlens[j] == 2 && ! is_upper_utf8(wptrs[j]) ) continue;
// try just doing caps only for now
if ( is_lower_utf8(wptrs[j]) ) continue;
// do not skip too far
long max = j + 6;
// truncate?
if ( max > nw ) max = nw;
// init hash
long long h = 0LL;
// the alnumcount for this
long subcount = 0;
// scan for city/adm1/zip after this street address
for ( long k = j ; k < max ; k++ ) {
// stop if tag
if ( tids[k] ) {
// skip non-breaking tags
if ( !isBreakingTagId(tids[k]) ) continue;
// allow <br> too since microsoft front page
// inserts those to break a line
if ( tids[k] == TAG_BR ) continue;
// other tags, stop us
// is it punct?
if ( ! wids[k] ) {
// . big punct is a show stopper
// . no, we had "New\n Mexico"
//if ( wlens[k] >= 4 ) break;
// just skip otherwise
// count it
// NOTE(review): no increment of subcount is visible here
// mix it up
h <<= 1;
// hash it into our ongoing hash
h ^= wids[k];
// look it up
long slot = g_cities.getSlot(&h);
// length
// byte length from the start of word j to the end of word k
long plen = (wptrs[k] + wlens[k]) - wptrs[j];
// skip otherwise
if ( forcedEnd >= 0 && k < forcedEnd ) continue;
// clear this
//long cityCount = 0;
// init
Place *pp;
// multiple places might have this hash
for ( ; slot>=0 ; slot=g_cities.getNextSlot(slot,&h)){
// get the place
PlaceDesc *pd =(PlaceDesc *)g_cities.
// it might be an alias to another slot!
long slot2 = -1;
if ( pd->m_bits & PLF_ALIAS ) {
// get the slot we alias
slot2 = pd->getSlot();
// sanity check
if ( slot2 < 0 ) {char *xx=NULL;*xx=0;}
// re-get
pd=(PlaceDesc *)g_cities.getValueFromSlot(slot2);
// skip if not a recognized place
if ( pd->m_type != PT_CITY &&
pd->m_type != PT_STATE &&
//pd->m_type != PT_ZIP &&
pd->m_type != PT_CTRY )
// city count
//if(pd->m_type == PT_CITY) cityCount++;
// skip if full
if ( np >= maxPlaces ) continue;
// point to the right place to store into
pp = &places[np];
// sanity check
if ( ! h ) { char *xx=NULL;*xx=0; }
// make a place
// word range is [j,k+1)
pp->m_a = j;
pp->m_b = k+1;
pp->m_alnumA = alnumPos + alnumCount;
pp->m_alnumB = alnumPos + alnumCount+subcount;
pp->m_type = pd->m_type;
pp->m_str = wptrs[j];
pp->m_strlen = plen;
pp->m_hash = h;
// . use the aliased city, etc. if we had it
// . that way when we lookup this place in
// placedb it will use the right hash
if ( slot2 >= 0 )
pp->m_hash = *(long long *)g_cities.getKeyFromSlot(slot2);
pp->m_adm1[0] = pd->m_adm1[0];
pp->m_adm1[1] = pd->m_adm1[1];
pp->m_crid = pd->m_crid;
pp->m_bits = PLF_INFILE | flags ;
// we use these for zip codes mostly
pp->m_cityHash= 0;//pd->m_cityHash;
// inc it
// NOTE(review): no increment of np is visible here
// sanity check
if ( np >= maxPlaces ) {char*xx=NULL;*xx=0;}
// only one word for zip code
if ( k != j ) continue;
// check if zip code
// look it up
slot = g_zips.getSlot(&h);
// multiple places might have this hash
for ( ; slot>=0 ; slot=g_zips.getNextSlot(slot,&h)){
// get the place
ZipDesc *zd =(ZipDesc *)g_zips.
// skip if full
if ( np >= maxPlaces ) continue;
// point to the right place to store into
pp = &places[np];
// sanity check
if ( ! h ) { char *xx=NULL;*xx=0; }
// make a place
pp->m_a = j;
pp->m_b = k+1;
pp->m_alnumA = alnumPos + alnumCount;
pp->m_alnumB = alnumPos + alnumCount+subcount;
pp->m_type = PT_ZIP;
pp->m_str = wptrs[j];
pp->m_strlen = plen;
pp->m_hash = h;
pp->m_adm1[0] = zd->m_adm1[0];
pp->m_adm1[1] = zd->m_adm1[1];
pp->m_crid = zd->m_crid;
pp->m_bits = PLF_INFILE | flags ;
// we use these for zip codes mostly
// unlike the g_cities hits above, a zip carries its city hash and a
// city string located at an offset into g_cityBuf
pp->m_cityHash= zd->m_cityHash;
pp->m_cityStr = g_cityBuf + zd->m_cityOffset;
// inc it
// sanity check
if ( np >= maxPlaces ) {char*xx=NULL;*xx=0;}
return np;
// . makes a 32-bit city id from a 64-bit city hash and a two-letter adm1
//   (state) code
// . the two adm1 letters are lower-cased so the id does not depend on the
//   case of "adm1Str"; only adm1Str[0] and adm1Str[1] are read
// . if the id is a key in g_aliases the value stored there is returned
//   instead, so that an abbreviation like "SF, CA" yields the same id as
//   "San Francisco, CA" (and "SF, NM" the same as "Santa Fe, NM")
// . NOTE(review): the two letters are read back through a uint16_t pointer,
//   so the id depends on the byte order of the machine
uint32_t getCityId32 ( uint64_t cityHash64, char *adm1Str ) {
	// lower-cased, NUL-terminated copy of the adm1 code
	char lowered[3];
	for ( long i = 0 ; i < 2 ; i++ )
		lowered[i] = to_lower_a ( adm1Str[i] );
	lowered[2] = '\0';
	// treat the two letters as one 16-bit number
	uint16_t *packed = (uint16_t *)lowered;
	uint32_t stateHash32 = (uint32_t)*packed;
	// combine the low 32 bits of the city hash with the state
	uint32_t cityId = hash32h ( (uint32_t)cityHash64 , stateHash32 );
	// map an aliased city/state id to the id of the real city
	uint32_t *canonical = (uint32_t *)g_aliases.getValue ( &cityId );
	return canonical ? *canonical : cityId;
}
// . make all possible addresses from Places in that section
// . use the Address class
// . only keep the address with maximum score/probability
// . record the section it was found in as well via the Section ptr
// . assign an address probability/score from 0 to 1.0
// . allow inheriting of city or adm1 from title tag or tagdb rec
// (consider other inheritable places and areas later)
// . must have agreeing street,placeName,adm1 and city
// . zip is optional
// . base score is .20
// . then add streetScore*0.30 + placeScore*0.30
// . add .10 if we got a valid agreeable zip code
// . add .03 if we got a valid suite
// . add (20-X)/20 * .07 where X is the avg # of alnum words between
// all possible pairs of the places involved. do not consider
// inherited Places in this calculation. actually weight the distance
// involving the place name half as much as other pairs since
// place name is often in a subtitle...
// . if first section's m_numOccurences > 1, stop... otherwise...
// . get parent section of that first section
// . and repeat as if it were the first section
// . "startAlnum" is where we expect the city to be in order to set the
// AF_INLINED bit for this address
// . zip code does NOT override a non-zip code address if the city or adm1
// are derived from the zip code! or from title or tag!
bool Addresses::addAddress ( Place *name1 ,
Place *name2 ,
Place *suite ,
Place *street ,
Place *city ,
Place *adm1 ,
Place *zip ,
Place *ctry ,
Section *addrSec ,
// where we expect the city to be in an inlined
// address. because we can have multiple streets
// for one place name we need this to be
// after all such streets.
// "abq conv ctr 401 2nd st nw po box 1293 abq nm"
// too
long startAlnum ,
char flags3 ,
Address **retAddr ) {
if ( retAddr ) *retAddr = NULL;
if ( flags3 & AF2_LATLON ) {
// assume to store the new address here, the destination
Address *dst = NULL;
if ( ! dst ) dst = (Address *)m_am.getMem(sizeof(Address));
if ( ! dst ) return false;
if ( retAddr ) *retAddr = dst;
dst->m_hash = 0;
dst->m_score2 = 0;
// now just do ptrs
dst->m_name1 = name1;
dst->m_name2 = name2;
dst->m_suite = suite;
dst->m_street = street;
dst->m_city = city;
dst->m_adm1 = adm1;
dst->m_zip = zip;
dst->m_placedbNames = NULL;
dst->m_alias = NULL;
dst->m_latitude = NO_LATITUDE; // 999.0;
dst->m_longitude = NO_LONGITUDE; // 999.0;
dst->m_latLonScore = 0;
dst->m_latLonDist = 9999999;
// reset this for the geocoder lookup
dst->m_geocoderLat = NO_LATITUDE;
dst->m_geocoderLon = NO_LONGITUDE;
// make placedbkey
//dst->m_placedbKey = dst->makePlacedbKey(m_docId,false,false);
dst->m_bestPlacedbName = NULL;
// sanity check
//if ( dst->m_placedbKey.n1 == 0LL ) { char *xx=NULL;*xx=0; }
// force this to true
dst->m_flags = AF_INLINED;
dst->m_replyFlags = 0;
dst->m_domHash32 = m_domHash32;
dst->m_ip = m_ip;
dst->m_section = NULL;
dst->m_flags3 = flags3;
dst->m_importedLatitude = NO_LATITUDE;
dst->m_importedLongitude = NO_LONGITUDE;
dst->m_importedVotes = -1;
return true;
// no room left?
//if ( m_na >= MAX_ADDRESSES ) {
// // note it
// if ( ! m_firstBreach ) return true;
// m_firstBreach = false;
// log("addr: got address breach for %s",m_url->getUrl());
// return true;
// char *xx=NULL; *xx=0;
// return true;
// maybe we should try to speed up msg2c by quickly validating
// whether the street is in that city/state using zak's db... but
// i don't think truncating the addresses is the right approach
if ( m_am.getNumPtrs() >= 10000 ) {
// note it
if ( ! m_firstBreach ) return true;
m_firstBreach = false;
m_breached = true;
log("addr: got address breach for %s",m_url->getUrl());
return true;
// if we have a city and the zip does not agree and the
// zip is after the city, the nuke the zip
//if ( city && zip && zip->m_cityHash != city->m_hash &&
// zip->m_a > city->m_a )
// zip = NULL;
// skip if zip does not agree with state
if ( adm1 && zip && zip->m_adm1Bits != adm1->m_adm1Bits )
return true;
// or agree with city
if ( city && zip && ! (zip->m_adm1Bits & city->m_adm1Bits ) )
return true;
static bool hset = false;
static long long h_zip;
static long long h_code;
static long long h_postal;
static long long h_zipcode;
static long long h_usa;
if ( ! hset ) {
hset = true;
h_zip = hash64n("zip");
h_code = hash64n("code");
h_postal = hash64n("postal");
h_zipcode = hash64n("zipcode");
h_zipcode = hash64n("usa");
// set zipAlnumA
long zipAlnumA ;
if ( zip ) zipAlnumA = zip->m_alnumA;
// scan to left of zip to change zipAlnumA to allow for acceptable
// words in between it
long zipa = -1; if ( zip ) zipa = zip->m_a - 1;
long mini = zipa - 10;
if ( mini < 0 ) mini = 0;
long count = 0;
for ( long i = zipa ; i >= mini ; i-- ) {
if ( ! m_wids[i] ) continue;
if ( m_wids[i] == h_zip ) count++;
else if ( m_wids[i] == h_code ) count++;
else if ( m_wids[i] == h_postal ) count++;
else if ( m_wids[i] == h_zipcode ) count++;
else if ( m_wids[i] == h_usa ) count++;
else break;
//if ( count > 0 )
// log("hey");
// adjust it to allow for words in between
zipAlnumA -= count;
// if zip and no state or city,do not allow unless right next to street
if ( zip && ! adm1 && ! city && zipAlnumA != startAlnum )
return true;
// or if no state, but we have a city and zip, then zip must follow
// the city or the street
if ( zip && ! adm1 && city &&
zipAlnumA != startAlnum &&
zipAlnumA != city->m_alnumB &&
zip->m_alnumB != city->m_alnumA )
return true;
// or if a state and no city...
if ( zip && adm1 && ! city &&
zipAlnumA != startAlnum &&
zipAlnumA != adm1->m_alnumB &&
zip->m_alnumB != adm1->m_alnumA )
return true;
// set cityhash immediately
uint64_t cityHash = 0;
if ( city ) cityHash = city->m_hash;
else if ( zip ) cityHash = zip->m_cityHash;
if ( ! cityHash ) return true;
// set these
uint64_t adm1Bits;
char *adm1Str = NULL;
if ( adm1 ) {
adm1Bits = adm1->m_adm1Bits;
adm1Str = adm1->m_adm1;
else if ( zip ) {
adm1Bits = zip->m_adm1Bits;
adm1Str = zip->m_adm1;
//else if ( city && (city->m_adm1Bits & CF_UNIQUE ) )
// adm1Bits = city->m_adm1Bits;
return true;
// zip cannot be suite #
if ( suite && zip && zip ->intersects ( suite ) ) return true;
if ( suite && zip && suite->intersects ( zip ) ) return true;
bool inlined = true;
// . are we an inlined address? that means the city and adm1 (state)
// are right after the street address
// . therefore we are not inlined if we inherited the city or the
// adm1 (state) from a tag or the title of the doc
pbits_t flags = PLF_FROMTAG | PLF_FROMTITLE;
// do not use PLF_FROMTITLE if street is in title too though
if ( m_sections &&
(m_sections->m_sectionPtrs[street->m_a]->m_flags & SEC_IN_TITLE) )
flags = PLF_FROMTAG;
bool cityOut = false;
bool adm1Out = false;
bool zipOut = false;
if ( ! city ) cityOut = true;
if ( ! adm1 ) adm1Out = true;
if ( ! zip ) zipOut = true;
if ( city && ( city->m_bits & flags ) ) cityOut = true;
if ( adm1 && ( adm1->m_bits & flags ) ) adm1Out = true;
if ( zip && ( zip ->m_bits & flags ) ) zipOut = true;
// if we have a suite to the right of the street, it must be
// RIGHT after the street for now (TODO: allow colon)
if ( suite && suite->m_a > street->m_a && startAlnum !=suite->m_alnumA)
inlined = false;
bool cityInline = false;
// what is between street and city.
if ( city && ! suite && startAlnum == city->m_alnumA )
cityInline = true;
// suite to the right of street
if ( city && suite && suite->m_alnumB == city->m_alnumA )
cityInline = true;
// suite to the left of street
if ( city && suite &&
suite->m_a < street->m_a &&
startAlnum == city->m_alnumA )
cityInline = true;
// or if a colon is before city
// "Street: 4904 4th St NW \nCity/Town: Albuquerque, NM"
bool gotColon = false;
bool gotWord = false;
long x;
if ( city ) x = city->m_a - 1;
// only loop if city not inlined from above
for ( ; city && ! cityInline && x >= street->m_alnumB ; x-- ) {
// skip if tag
if ( m_tids[x] ) {
// just ignore
if ( ! gotColon ) continue;
// must have had a word
if ( ! gotWord ) continue;
// we need a breaking tag now!
if ( ! isBreakingTagId ( m_tids[x] ) ) continue;
// all done!
cityInline = true;
// stop
// alnum word???
if ( m_wids[x] ) {
// if got alnum word before getting colon, no good!
if ( ! gotColon ) break;
// mark this
gotWord = true;
// otherwise ignore
// got colon?
if ( m_words->hasChar(x,':') ) gotColon = true;
// assume we have no city right after the street...
x = 0;
long xend = -1;
char c = 0;
// also allow something like "123 main st (downtown mall) las cruces"
// to fix
// detail&eID=22180&year=2011&month=01
if ( city &&
city->m_a >= 0 &&
city->m_a > street->m_b &&
city->m_a - street->m_b < 20 ) {
x = street->m_b;
xend = city->m_a;
// loop from end of street to beginning of city looking for '('
for ( ; x < xend ; x++ ) {
// skip if tag
if ( m_tids[x] ) continue;
// stop on word!
if ( m_wids[x] ) {
// unless in parens!
if ( c ) continue;
// crap... Msg13.cpp when it sets the dates does not
// filter out html entites for speed, so watch
// out for crap after an ampersand or &#. this
// was causing some urls to
// not get their address inlined!
if (x>0 && m_wptrs[x][-1] =='&' ) continue;
if (x>1 && m_wptrs[x][-1] =='#'&&m_wptrs[x][-2]=='&' )
// otherwise, really stop
// check for '(' or '['
char *p = m_wptrs[x];
char *pend = p + m_wlens[x];
for ( ; p < pend ; p++ ) {
// breathe
// check for ( or [
if ( *p=='(' ) c = '(';
if ( *p=='[' ) c = '[';
if ( *p==')' ) c = 0;
if ( *p==']' ) c = 0;
// if we scanned all the way through, that's great, we are inlined
if ( x == xend ) cityInline = true;
// turn it off
if ( city && ! cityInline ) inlined = false;
// this restriction was inspidered by "The TAVERN, 4701 Menaul,
// between Washington and Carlisle..." making gb think that it is
// in the city of Carlisle in Washington...
if (city && adm1 && city->m_alnumB != adm1->m_alnumA )
// but if city is "unique" like albuquerque, we allow it
//!(city->m_adm1Bits & CF_UNIQUE) )
inlined = false;
// . wow, "less than 1 mile away from Abq NM" inspired me to require
// that the street be adjacent to the city now!
// . but i am seeing more false positives, so restrict things more
//if ( ! suite && street->m_alnumB != city->m_alnumA )
if ( ! suite && startAlnum != city->m_alnumA )
inlined = false;
// if we have a suite, and it is left of the street, that is ok too
if ( suite && suite->m_a < street->m_a &&
//street->m_alnumB != city->m_alnumA )
startAlnum != city->m_alnumA )
inlined = false;
if ( suite && suite->m_a > street->m_a &&
suite->m_alnumB != city->m_alnumA )
inlined = false;
// if you got a zip, must follow adm1 immediately
// fixes
//if ( zip && adm1 && adm1->m_alnumB != zipAlnumA )
// inlined = false;
bool zipInline = false;
// . zip right after street is good
// . but the city/adm1 must in title or tag, not after the zip
// otherwise we end up inlining bad cities after the zip like
// "house, nm"
if ( zip ) {
if ( startAlnum == zipAlnumA ) zipInline = true;
if ( suite && suite->m_alnumB == zipAlnumA ) zipInline = true;
// . or if zip follows city where city is tight
// . "114 Coronado Road, Corrales, 87048"
if ( city && city->m_alnumB == zipAlnumA ) zipInline = true;
if ( adm1 && adm1->m_alnumB == zipAlnumA ) zipInline = true;
// turn it off
if ( ! zipInline ) inlined = false;
// set this
bool adm1Inline = false;
if ( adm1 ) {
if ( adm1->m_alnumA == street->m_alnumB )
adm1Inline = true;
if ( city && adm1->m_alnumA == city->m_alnumB )
adm1Inline = true;
if ( ! adm1Inline ) inlined = false;
// fix for
// obstetrics_and_gynecology/Seattle.html
// 1959 NE Pacific St
// University Washington Medical Center
// Seattle, WA 98195
// gets "University" as a city in "Washington" state!
if ( adm1 ) {
long ab = adm1->m_b;
long long *wids = m_words->getWordIds();
char **wptrs = m_words->getWordPtrs();
long *wlens = m_words->getWordLens();
nodeid_t *tids = m_words->getTagIds();
long nw = m_words->getNumWords();
if ( inlined && ab-1>= 0 && ab+1 < nw && ! tids[ab] &&
! wids[ab] &&
wlens[ab]==1 &&
// this was hurting
// "195 Crystie Street, Suite 20<br>\nNew York, NY USA"
// so i added this constraint
wlens[ab-1] == 1 &&
is_wspace_utf8(wptrs[ab]) &&
is_upper_utf8(wptrs[adm1->m_a]) &&
is_upper_utf8(wptrs[ab+1]) )
inlined = false;
//if ( city && (city->m_flags2 & PLF2_REQUIRED) )
// inlined = true;
// set the address hash (combo of street,city,adm1)
uint64_t ch = getAddressHash ( street, city, adm1, zip );
// do not add it if street name is lower case and adm1 and city
// are inlined and upper. should fix "4 barrack Oakland CA" and
// "3 spacios - Seattle WA" for
//if ( inlined &&
// ! (street->m_bits & PLF_HAS_UPPER ) &&
// ! (street->m_flags2 & PLF2_HAD_INDICATOR ) &&
// is_upper_utf8(wptrs[adm1->m_a]) &&
// is_upper_utf8(wptrs[city->m_a]) )
// return true;
// . now compare to other address with this same street
for ( long i = m_am.getNumPtrs() - 1 ; i >= 0 ; i-- ) {
// breathe
// get it
Address *prev = (Address *)m_am.getPtr(i);
// if not our street, bail!
if ( prev->m_street->m_a != street->m_a ) break;
// if he is inlined and we are not!
if ( prev->m_flags & AF_INLINED ) {
// if we are not, bail, do not add us
if ( ! inlined ) return true;
// if he is not inlined and we are, we overwrite him
else if ( inlined ) {
// overwrite him
//dst = prev;
// kill him
m_am.rewind ( 1 );
// print him
//log("DELETING the following address 1:");
// try to kill more
// ok, we are not inlined and previous got isn't either...
// . now for the remaining address with this same street, they are
// all, including ourselves, either inlined or not inlined
// . assign a score to each address for a particular street
// . the address with the highest score wins and the others
// are removed. in the case of a tie we keep all of them.
// . we only do this comparison to addresses that have the same
// address hash,
long score = 0;
// inlining always trumps all others
//if ( inlined ) score += 10000;
// and then if all else is equal, having a zip is better than just
// a city because it is more specific
if ( zip ) score += 1000;
// having a valid adm1 is good (might not have one explicity if city
// is unique to a particular state)
if ( adm1 ) score += 100;
// prefer city over no city
if ( city ) score += 10;
// sanity check
if ( score <= 0 ) { char *xx=NULL;*xx=0; }
Address *dst = NULL;
// now compare to other address with this same address hash
for ( long i = m_am.getNumPtrs() - 1 ; i >= 0 ; i-- ) {
// breathe
// get it
Address *prev = (Address *)m_am.getPtr(i);
// stop if for a different street
if ( prev->m_street->m_a != street->m_a ) break;
if ( prev->m_street->m_b != street->m_b ) break;
// skip if should not compare
if ( prev->m_hash != ch ) continue;
// do not add us if he is higher score
if ( prev->m_score2 > score ) return true;
// if a tie, that is strange!
if ( prev->m_score2 == score ) return true;
// overwrite him
dst = prev;
// an undo for the m_na down below
//m_am.rewind ( 1 );
// one at a time
// print him
//log("DELETING the following address 1:");
// try to kill more
// assume to store the new address here, the destination
if ( ! dst ) dst = (Address *)m_am.getMem(sizeof(Address));
if ( ! dst ) return false;
if ( retAddr ) *retAddr = dst;
//dst->m_cityHash = cityHash;
//dst->m_adm1Bits = adm1Bits;
dst->m_cityId32 = getCityId32 ( cityHash , adm1Str );
dst->m_hash = ch;
dst->m_score2 = score;
// now just do ptrs
dst->m_name1 = name1;
dst->m_name2 = name2;
dst->m_suite = suite;
dst->m_street = street;
dst->m_city = city;
dst->m_adm1 = adm1;
dst->m_zip = zip;
dst->m_placedbNames = NULL;
// nuke this for comparing for setting AF_AMBIGUOUS bit
//if ( zip ) dst->m_zip->m_hash = 0;
// reset this too
dst->m_alias = NULL;
dst->m_latitude = NO_LATITUDE; // 999.0;
dst->m_longitude = NO_LONGITUDE; // 999.0;
dst->m_latLonScore = 0;
dst->m_latLonDist = 9999999;
// reset this for the geocoder lookup
dst->m_geocoderLat = NO_LATITUDE;
dst->m_geocoderLon = NO_LONGITUDE;
// make placedbkey
dst->m_placedbKey = dst->makePlacedbKey ( m_docId , false, false );
dst->m_bestPlacedbName = NULL;
// the address voting table key is based on the placedb key but needs
// to be unique for each address! there are often times the same
// street address with a different place name, and since the placedb
// key does not even take the place name into account, we need to
// for this...
//dst->m_avtKey = dst->makeAddressVotingTableKey ( );
// need these
//if ( ! tmp->m_name ) { char *xx=NULL;*xx=0; }
if ( ! street ) { char *xx=NULL;*xx=0; }
if ( ! city && ! zip ) { char *xx=NULL;*xx=0; }
// unique cities like Albuquerque imply a state
if ( ! adm1 && ! zip && ! city->m_adm1[0] ) { char *xx=NULL;*xx=0; }
// sanity check
if ( ! street->m_hash ) { char *xx=NULL;*xx=0; }
//if ( ! street->m_streetNumHash ) { char *xx=NULL;*xx=0; }
if ( city && ! city->m_hash ) { char *xx=NULL;*xx=0; }
if ( adm1 && ! adm1->m_adm1Bits ) { char *xx=NULL;*xx=0; }
// sanity check
if ( dst->m_placedbKey.n1 == 0LL ) { char *xx=NULL;*xx=0; }
// reset flags
dst->m_flags = 0;
dst->m_replyFlags = 0;
if ( inlined ) dst->m_flags |= AF_INLINED;
// . HACK! if our m_str referenced our m_adm1, fix that!
// . see "HACK" above to where we did this
//if ( adm1->m_str == adm1->m_adm1 )
// dst->m_adm1->m_str = dst->m_adm1->m_adm1;
// set m_b for the address so we can use it when as a boundary
// for harvesting place names for following addresses above
long max = -1;
if ( dst->m_street->m_b > max ) max = dst->m_street->m_b;
if ( dst->m_adm1->m_b > max && inlined ) max = dst->m_adm1->m_b;
if ( dst->m_city->m_b > max && inlined ) max = dst->m_city->m_b;
// do not require inlineness for a zip!
if ( zip && dst->m_zip->m_b > max ) max = dst->m_zip->m_b;
// or for a suite!
if ( suite && suite->m_b > max ) max = suite->m_b;
dst->m_b = max;
// and the left most point not including place name
dst->m_a = dst->m_street->m_a;
// suite might be before street sometimes
if ( suite && suite->m_a < dst->m_a ) dst->m_a = suite->m_a;
// add these in
dst->m_domHash32 = m_domHash32;
dst->m_ip = m_ip;
// get the section containing all components
long a = dst->m_street->m_a;
long b = dst->m_street->m_b;
// increase address range?
if ( suite && suite->m_a < a ) a = suite->m_a;
if ( suite && suite->m_b > b ) b = suite->m_b;
// sometimes the city/adm1/zip is in the title or something
// so only use it if within reach!!
if ( ! cityOut && city && city->m_b > b && city->m_b < b + 20 )
b = city->m_b;
if ( ! adm1Out && adm1 && adm1->m_b > b && adm1->m_b < b + 20 )
b = adm1->m_b;
if ( ! zipOut && zip && zip ->m_b > b && zip->m_b < b + 20 )
b = zip->m_b;
//if ( ! cityOut && city && city->m_a < a ) a = city->m_a;
//if ( ! adm1Out && adm1 && adm1->m_a < a ) a = adm1->m_a;
//if ( ! zipOut && zip && zip ->m_a < a ) a = zip->m_a;
if ( a < 0 ) { char *xx=NULL;*xx=0; }
// get section
Section *as = NULL;
if ( m_sections ) as = m_sections->m_sectionPtrs[a];
// telescope up until contains all inlined things in address
//for ( ; as ; as = as->m_parent )
// // stop if contained
// if ( as->m_a <= a && as->m_b >= b ) break;
// store that
dst->m_section = as;
dst->m_flags3 = flags3;
//dst->m_latitude = latitude;
//dst->m_longitude = longitude;
// reset the imported lat/lon
dst->m_importedLatitude = NO_LATITUDE;
dst->m_importedLongitude = NO_LONGITUDE;
dst->m_importedVotes = -1;
// advance m_na iff we did not overwrite a previous address
//log("addr: u=%s addr # = %li",m_url->m_url,m_na-1);
// uncomment this for debug to the log
//dst->print ( );
return true;
// . compute the hash that identifies a street + city/adm1 combination
// . xors the street's name, number and indicator hashes with the 32-bit
//   city id that getCityId32() derives from the city hash and adm1 string
// . "street" must be non-NULL
// . the adm1 string comes from adm1, else zip, else the city's own m_adm1
// . the city hash comes from city, else zip
// . cores on purpose (like the other sanity checks in this file) if no
//   adm1 string or no city hash can be determined
uint64_t getAddressHash ( Place *street ,
			  Place *city   ,
			  Place *adm1   ,
			  Place *zip    ) {
	// start with the street itself
	uint64_t ch = 0;
	ch ^= street->m_hash;
	ch ^= street->m_streetNumHash;
	ch ^= street->m_streetIndHash;
	// adm1
	char *adm1Str = NULL;
	if      ( adm1 ) adm1Str = adm1->m_adm1;
	else if ( zip  ) adm1Str = zip->m_adm1;
	else if ( city && city->m_adm1[0] ) adm1Str = city->m_adm1;
	else { char *xx=NULL;*xx=0; }
	// and city hash
	uint64_t cityHash = 0;
	if      ( city ) cityHash = city->m_hash;
	else if ( zip  ) cityHash = zip->m_cityHash;
	if ( ! cityHash ) { char *xx=NULL;*xx=0; }
	// . do not xor in the raw adm1/city hashes, use the city id instead
	// . it will convert "SF,CA" to "San Francisco"
	// . use a special adm1 bit in the bit vector to indicate its an alias
	// . if its an alias we check the g_aliases table to see what the
	//   cityHash64 should really be
	uint32_t cid32 = getCityId32(cityHash,adm1Str);
	ch ^= cid32;
	return ch;
}
// . fill in Address "a" from the ';'-delimited string "s"
// . field order, as listed further down:
//   name1;name2;suite;street;city;adm1;zip;ctry;domhash;ip;origurl;lat;lon;addrHash
// . a Place is allocated from "pm" for each non-empty place field (0-6)
//   and its m_str points INTO "s", it is not copied
// . "flags" is stored into each Place's m_bits (usually PLF_FROMTAG)
// . returns false if Words::set() fails, true otherwise
// . NOTE(review): this copy of the function is missing closing braces and
//   some statements (see the notes below), so the exact nesting of the
//   per-field branches can not be determined from here
bool setFromStr ( Address *a, char *s, pbits_t flags , 
PlaceMem *pm ,
long niceness ) {
// clear it up
// NOTE(review): no statement follows the comment above; presumably the
// Address was cleared here -- confirm against the full file
// shortcuts
//Place *city = NULL;
//Place *adm1 = NULL;
a->m_latitude = NO_LATITUDE;
a->m_longitude = NO_LONGITUDE;
a->m_geocoderLat = NO_LATITUDE;
a->m_geocoderLon = NO_LONGITUDE;
// ctry is always empty, because its always the US
// name1;name2;suite;street;city;adm1;zip;ctry;domhash;ip;origurl;lat;lon;addrHash\0
// . loop it
// . "i" is the field index into the list above
for ( long i = 0 ; i <= 13 ; i++ , s++ ) {
// stop if no more fields
if ( ! *s ) break;
// save it
char *start = s;
// advance s to ;
//while ( *s && *s != ';' && *s !='(' ) s++;
while ( *s && *s != ';' ) s++;
// site hash?
if ( i == 8 ) {
a->m_domHash32 = 0;
// panic if none!
if ( *start == ';' ) { char *xx=NULL;*xx=0;}//continue;
a->m_domHash32 = (unsigned long)atoll(start);
// ip?
if ( i == 9 ) {
a->m_ip = 0;
if ( *start == ';' ) continue;
a->m_ip = atoip(start,s-start);
// 0 -1 not allowed
if ( a->m_ip==0 || a->m_ip==-1) {char *xx=NULL;*xx=0;}
// skip orig url
if ( i == 10 ) {
// skip if empty
if ( *start == ';' ) continue;
// latitude
if ( i == 11 ) {
// skip if empty
if ( *start == ';' ) continue;
// set it
a->m_latitude = atod2 (start,s-start);
// longitude
if ( i == 12 ) {
// skip if empty
if ( *start == ';' ) continue;
// set it
a->m_longitude = atod2 (start,s-start);
// skip semicolon
if ( ! *s ) break;
// addrHash
if ( i == 13 ) {
// skip if empty
if ( *start == ';' ) continue;
// must be digit
//if ( is_digit(*p) ) 
a->m_hash = strtoull(start,NULL,10);//atoll(p);
// skip semicolon
// timezone offset
//if ( i == 13 ) {
// // skip if empty
// if ( *start == ';' ) continue;
// // set it
// a->m_timeZoneOffset= atol2 (start,s-start);
// // skip semicolon
// if ( *s && *s == ';' ) s++;
// continue;
// ptr to a place
//Place *p = NULL;
// get length of place
long slen = s - start;
// skip if empty
if ( slen <= 0 ) continue;
// do not breach
//if ( *np >= maxPlaces ) { char *xx=NULL;*xx=0; }
// ok, add this entry
Place *p = (Place *)pm->getMem(sizeof(Place));//&places[*np];
if ( ! p ) { char *xx=NULL;*xx=0; }
// advance np
//*np = *np + 1;
// pt = "place type"
long pt;
if ( i == 0 ) { a->m_name1 = p; pt = PT_NAME_1;}
if ( i == 1 ) { a->m_name2 = p; pt = PT_NAME_2;}
if ( i == 2 ) { a->m_suite = p; pt = PT_SUITE;}
if ( i == 3 ) { a->m_street = p; pt = PT_STREET;}
if ( i == 4 ) { a->m_city = p; pt = PT_CITY;}
if ( i == 5 ) { a->m_adm1 = p; pt = PT_STATE;}
if ( i == 6 ) { a->m_zip = p; pt = PT_ZIP; }
// the country field is skipped entirely, no Place is kept for it
if ( i == 7 ) { continue; }// p = a->m_ctry; pt = PT_CTRY;}
// clear it
// set it
// the negative word positions are placeholders, this Place did not
// come from the words of a document
p->m_type = pt;
p->m_a = -7;
p->m_b = -6;
p->m_alnumA = -5;
p->m_alnumB = -4;
p->m_str = start;
p->m_strlen = slen;
p->m_bits = 0;
// set adm1 bits if adm1
if ( pt == PT_STATE ) {
p->m_adm1Bits = getAdm1Bits ( start );
// set the state two-letter abbr as well
p->m_adm1[0] = start[0];
p->m_adm1[1] = start[1];
// we got a parenthetical?
char *parens = NULL;
// skip semicolon
// NOTE(review): the scan above only stops on ';' or \0, so *s can
// not be '(' here as the code is shown; the branch below cores if
// it is ever entered
if ( *s && *s == '(' ) {
// what is this from now?
char *xx=NULL;*xx=0;
// skip parens
// mark it
parens = s;
// skip to end
for ( ; *s && *s != ';' ; s++ );
// skip semicolon
//if ( *s && *s == ';' ) s++;
// store it in Address class if not NULL
if ( ! p->m_str ) continue;
// incorporate the flags. usually PLF_FROMTAG
p->m_bits = flags;
// clear these
p->m_flags2 = 0;
// two letter country code in parentheses
//if ( i == 7 && parens && parens[2] == ')' )
// p->m_crid = getCountryId ( parens );
// . two letter admin code in parentheses
// . usually only city names and zip codes have this
//if ( i != 7 && parens && parens[2] == ')' ) {
// p->m_adm1[0] = parens[0];
// p->m_adm1[1] = parens[1];
// and make the word non-overlappable
p->m_a = -3;
p->m_b = -2;
// null it out
p->m_hash = 0LL;
p->m_streetIndHash = 0LL;
p->m_streetNumHash = 0LL;
// set m_streetHash, m_streetIndHash, m_streetNumHash of
// this Place, p
setHashes ( p , NULL , niceness );
// do not take streets from tag, must be on the page itself
if ( i == 3 && (flags & PLF_FROMTAG) ) continue;
// do not take name from tag either!
if ( i == 0 && (flags & PLF_FROMTAG) ) continue;
if ( i == 1 && (flags & PLF_FROMTAG) ) continue;
// nor suite
if ( i == 2 && (flags & PLF_FROMTAG) ) continue;
// and make the word non-overlappable
//p->m_a = -3;
//p->m_b = -2;
// save these
//if ( i == 4 ) city = p;
//if ( i == 5 ) adm1 = p;
// if we are a city OR a zip code, we must set m_hash since
// addAddress() uses it to check for dups!
// NOTE(review): the test below is for fields 4 and 5, which were
// mapped to city and adm1 above, not zip (field 6)
if ( i == 4 || i == 5 ) {
Words w;
// i guess just use "version" of 0
if ( ! w.set (p->m_str , p->m_strlen,0,true,niceness))
return false;
// shortcut
long long *wids = w.getWordIds();
// zero out the hash
long long h = 0LL;
// loop em
for ( long j = 0 ; j < w.m_numWords ; j++ ) {
// skip if not alnum
if ( ! wids[j] ) continue;
// mix it up
h <<= 1;
// xor it in
h ^= wids[j];
// set that hash
p->m_hash = h;
// update crid
// NOTE(review): "adm1" and "city" are only declared in commented-out
// lines above, and field 7 already hit "continue" earlier, so this
// branch looks like disabled legacy code -- confirm
if ( i == 7 ) {
// get numeric id
uint8_t crid = getCountryId(p->m_str);
// set it
p->m_crid = crid;
// and for adm1
adm1->m_crid = crid;
// and city
city->m_crid = crid;
// if it was a lat/lon only contact address it will not have a
// city, so this is NULL. perhaps, just give up on that?
// this is not the case any more since we insert the foreign
// country and state and city sometimes
if ( ! a->m_city || ! a->m_adm1 )
a->m_flags3 |= AF2_LATLON;
// set adm1 bits last from the two character string code
if ( a->m_city && a->m_adm1 ) {
a->m_city->m_adm1Bits = a->m_adm1->m_adm1Bits;
a->m_city->m_adm1[0] = a->m_adm1->m_adm1[0];
a->m_city->m_adm1[1] = a->m_adm1->m_adm1[1];
if ( a->m_zip ) {
a->m_zip->m_adm1Bits = a->m_adm1->m_adm1Bits;
a->m_zip->m_adm1[0] = a->m_adm1->m_adm1[0];
a->m_zip->m_adm1[1] = a->m_adm1->m_adm1[1];
// require ip
if ( a->m_ip == 0 || a->m_ip == -1 ) { char *xx=NULL;*xx=0; }
// do we need this?
a->m_cityId32 = 0;
// adm1
// same precedence as getAddressHash(): adm1, then zip, then city
char *adm1Str = NULL;
if ( a->m_adm1 )
adm1Str = a->m_adm1->m_adm1;
else if ( a->m_zip )
adm1Str = a->m_zip->m_adm1;
else if ( a->m_city && a->m_city->m_adm1[0] )
adm1Str = a->m_city->m_adm1;
else { char *xx=NULL;*xx=0; }
// use city hash
// NOTE(review): a->m_city is dereferenced unconditionally here even
// though the AF2_LATLON check above allows it to be NULL -- confirm
a->m_cityId64 = getCityId64 ( a->m_city->m_hash , adm1Str );
// update "m_crid" member on all relevant places
return true;
// . helper for setFromStr2()
// . returns the start of the ';'-delimited field at *cursor and advances
//   *cursor to the start of the next field
// . stops at the terminating \0 (and leaves *cursor on it) so a short or
//   malformed string can never make us run off the end of the buffer
// . if "terminate" is true the field's ';' is overwritten with \0
static char *nextAddrField ( char **cursor , bool terminate ) {
	char *start = *cursor;
	char *p     = start;
	while ( *p && *p != ';' ) p++;
	if ( *p == ';' ) {
		if ( terminate ) *p = '\0';
		p++;
	}
	*cursor = p;
	return start;
}

// . split a serialized address string into its string fields and lat/lon
// . format:
//   name1;name2;suite;street;city;adm1;zip;country;domhash;ip;origurl;lat;lon...
// . any output ptr may be NULL if the caller does not want that field
// . "addr" itself is NOT modified: it is copied into a static buffer and
//   parsed there, so the returned strings point into that static buffer
//   and are only valid until the next call (not re-entrant)
// . string outputs are NULL and lat/lon are 999.0 if addr is NULL or too
//   big to parse. a field missing from a short string comes back as ""
void setFromStr2 ( char  *addr   ,
		   char **name1  ,
		   char **name2  ,
		   char **suite  ,
		   char **street ,
		   char **city   ,
		   char **adm1   ,
		   char **zip    ,
		   char **country,
		   double *lat   ,
		   double *lon   ) {
	// use this
	static char s_addr[2048];
	// assume nothing found
	if ( name1  ) *name1  = NULL;
	if ( name2  ) *name2  = NULL;
	if ( suite  ) *suite  = NULL;
	if ( street ) *street = NULL;
	if ( city   ) *city   = NULL;
	if ( adm1   ) *adm1   = NULL;
	if ( zip    ) *zip    = NULL;
	if ( country) *country= NULL;
	if ( lon    ) *lon    = 999.00;
	if ( lat    ) *lat    = 999.00;
	// nothing to parse
	if ( ! addr ) return;
	// breach check
	long len = gbstrlen(addr);
	if ( len + 1 > (long)sizeof(s_addr) ) {
		log("addr: address is too big to parse");
		// must bail here or the memcpy below overflows s_addr
		return;
	}
	// copy into our static buffer
	memcpy ( s_addr , addr , len+1 );
	// parse it in our static buffer so we do not destroy "addr"
	char *p = s_addr;
	char *f;
	// the eight string fields get null terminated in place
	f = nextAddrField ( &p , true ); if ( name1  ) *name1  = f;
	f = nextAddrField ( &p , true ); if ( name2  ) *name2  = f;
	f = nextAddrField ( &p , true ); if ( suite  ) *suite  = f;
	f = nextAddrField ( &p , true ); if ( street ) *street = f;
	f = nextAddrField ( &p , true ); if ( city   ) *city   = f;
	f = nextAddrField ( &p , true ); if ( adm1   ) *adm1   = f;
	f = nextAddrField ( &p , true ); if ( zip    ) *zip    = f;
	f = nextAddrField ( &p , true ); if ( country) *country= f;
	// these we do not return
	nextAddrField ( &p , false ); // domhash
	nextAddrField ( &p , false ); // ip
	nextAddrField ( &p , false ); // orig url
	// . an empty or missing field leaves the 999.0 default in place
	// . atof() stops at the ';' that ends the field
	if ( lat && *p && *p != ';' ) *lat = atof(p);
	nextAddrField ( &p , false );
	if ( lon && *p && *p != ';' ) *lon = atof(p);
}
// . year is like "2011" or whatever
// . assume we are in greenwich england (timezone=+0)
// . BUT apply the american daylight start/end times
// . currently in effect from 2nd sunday in march to first sunday in nov @ 2am
// . *a is set to the start of the interval and *b to its end, each being
//   whatever getDOWStart() returns for that sunday plus two hours
void getDSTInterval ( long year , long *a , long *b ) {
	// 2am, as an offset in seconds
	const long twoAM = 2*3600;
	// find the 2nd sunday in march for this year
	*a = getDOWStart ( year, 3, 1, 2); // 3=march 1=sunday, 2=2nd
	*a += twoAM;
	// the end point now, the 1st sunday in november
	*b = getDOWStart ( year, 11, 1, 1); // 11=nov 1=sunday 1=1st
	*b += twoAM;
}
// . nowUTC is # secs elapsed since epoch in UTC (no DST)
// . timezone2 is the zone's offset from UTC in whole hours (-13 to 13) or
//   UNKNOWN_TIMEZONE, in which case nowUTC is used unshifted
// . returns true if american daylight saving time is in effect then
// . currently in effect from 2nd sunday in march to first sunday in nov @ 2am
// . NOTE(review): the 2am cutoffs are applied to the zone's standard time,
//   so the november switch is taken at 2am standard rather than 2am
//   daylight time -- left as it was
bool getIsDST ( long nowUTC , char timezone2 ) {
	// mod the time
	long mod = nowUTC ;
	// add if known
	if ( timezone2 != UNKNOWN_TIMEZONE ) {
		// sanity check, make sure its the offset, not in seconds
		if ( timezone2 >  13 ) { char *xx=NULL;*xx=0; }
		if ( timezone2 < -13 ) { char *xx=NULL;*xx=0; }
		mod += timezone2*3600;
	}
	// gmtime() takes a time_t ptr, which need not be a long
	time_t tt = (time_t)mod;
	struct tm *timeStruct = gmtime ( &tt );
	if ( ! timeStruct ) { char *xx=NULL;*xx=0; }
	// certain months are always dst. jan = 0. goes from 0 to 11.
	long mon = timeStruct->tm_mon;
	// feb=1,mar=2,apr=3,may=4,jun=5,jul=6,aug=7,sep=8,oct=9,nov=10,dec=11
	if ( mon >= 3 && mon <= 9 ) return true;
	// not in dec
	if ( mon == 11 ) return false;
	// not in jan or feb
	if ( mon >= 0 && mon <= 1 ) return false;
	// get dow. 0 to 6. 0 being sunday.
	long dow  = timeStruct->tm_wday;
	// day of month, 1 to 31
	long mday = timeStruct->tm_mday;
	// . day of the month of the first sunday, 1 to 7
	// . the old "1 + mday/7" count could not tell which side of the
	//   switch-over sunday a day fell on. it said march 8th 2021 (a
	//   monday) was already in dst when dst started on march 14th
	long firstSunday = 1 + ( mday - dow + 6 ) % 7;
	// march: dst starts on the 2nd sunday at 2am
	if ( mon == 2 ) {
		long secondSunday = firstSunday + 7;
		if ( mday < secondSunday ) return false;
		if ( mday > secondSunday ) return true;
		// if before 2nd sunday at 2am, not yet summer time
		return ( timeStruct->tm_hour >= 2 );
	}
	// november: dst ends on the 1st sunday at 2am
	if ( mon == 10 ) {
		if ( mday < firstSunday ) return true;
		if ( mday > firstSunday ) return false;
		// if before 1st sunday at 2am, it is still summer time
		return ( timeStruct->tm_hour < 2 );
	}
	// how did we get here?
	char *xx=NULL;*xx=0;
	return false;
}
// . the value stored for each city in the g_timeZones table
// . read by getNearestCityId(), latcmp() and the getTimeZone*() functions
class CityStateDesc {
 public:
	// location of the city
	float m_latitude;
	float m_longitude;
	// offset from UTC in hours. the readers sanity check it is in [-13,13]
	char  m_timeZoneOffset;
	// 1 if the city uses daylight saving time, 0 if not
	char  m_useDST;
	//uint8_t m_crid;
	// id within that country
	//uint8_t m_stateId;
};
// . look up the lat/lon of the city named in the serialized address "addr"
// . an empty "addr" is looked up as city id 0
// . returns whatever getLatLon() returns for that city id. the original
//   comment says that is false if the city is not found
bool getCityLatLonFromAddrStr ( char *addr , double *lat , double *lon ) {
	// get city from string, 0 if the string is empty
	uint32_t cid32 = 0;
	if ( addr[0] ) cid32 = getCityIdFromAddr ( addr );
	// now get lat lon of that city
	return getLatLon ( cid32 , lat , lon );
}
uint32_t getCityIdFromAddr ( char *addr ) {
// get city and adm1 from address
char *p = addr;
long semiCount = 0;
char *adm1 = NULL;
char *city = NULL;
for ( ; ; p++ ) {
// skip if not border
if ( *p != ';' ) continue;
// inc it
// city?
if ( semiCount == 4 ) {
city = p + 1;
if ( semiCount == 5 ) {
adm1 = p + 1;
if ( semiCount != 6 ) continue;
// if no city try lat/lon
if ( city[0] == ';' ) {
double lat = 0.0;
double lon = 0.0;
getLatLonFromStr ( addr , &lat , &lon );
float distInMilesSquared = 0.0;
uint32_t cid32 = getNearestCityId ( lat , lon , 0,
if ( distInMilesSquared > 1000 )
cid32 = 0;
// how can this be 0?
//if ( cid32 == 0 ) { char *xx=NULL;*xx=0; }
return cid32;
// ok, we got both now
char *semi1 = adm1 - 1;
char *semi2 = p;
// temp null term
*semi1 = '\0';
*semi2 = '\0';
// fix Denver's so we do not return unknown timezone
if ( semi1[-1]=='s' && semi1[-2]=='\'' ) semi1[-2]='\0';
// get city hash
long long h = getWordXorHash(city);
// TODO: make state into two letter abbr?
//if ( gbstrlen(adm1) != 2 ) { char *xx=NULL;*xx=0; }
// use this now
uint32_t cid32 = (uint64_t)getCityId32(h,adm1);
// put back
*semi1 = ';';
*semi2 = ';';
// put apostrophe back if we stripped it
if ( ! semi1[-2] ) semi1[-2] = '\'';
return cid32;
// . parse the lat/lon out of the serialized address "addr" and return the
//   PlaceDesc of the nearest city as found by getNearestCity_new()
// . returns NULL if that city's squared distance is 1000 or more
PlaceDesc *getCityPlaceDescFromAddrLatLon_new ( char *addr ) {
	double lat = 0.0;
	double lon = 0.0;
	getLatLonFromStr ( addr , &lat , &lon );
	float distInMilesSquared = 0.0;
	PlaceDesc *pd = getNearestCity_new (lat,lon,0,&distInMilesSquared);
	// too far away to count as being in that city
	if ( distInMilesSquared >= 1000 ) return NULL;
	return pd;
}
char getTimeZoneFromAddr ( char *addr , char *useDST ) {
// . try this new function
// . if no city explicitly, use lat/lon to get nearest city?
// . returns NULL if no nearby city
PlaceDesc *pd = getCityPlaceDescFromAddrLatLon_new ( addr );
if ( pd && useDST ) {
*useDST = 0;
if ( pd->m_flags & PDF_USE_DST ) *useDST = 1;
if ( pd ) return pd->m_timeZoneOffset;
// i guess we choose not to store the lat/lon for US cities
// because we can look them up by name here...
uint32_t cid32 = getCityIdFromAddr ( addr );
// if it had a city specified, or its lat/lon was nearby a city,
// then use that city id to get the timezone
if ( cid32 ) return getTimeZone3 ( cid32 , useDST );
// if doesn't have a city or the specified lat/lon is not close
// to a city in our list then let's use the lat lon to get the
// timezone
double lat = 0.0;
double lon = 0.0;
getLatLonFromStr ( addr, &lat, &lon );
if ( lat == NO_LATITUDE ) return UNKNOWN_TIMEZONE;
if ( lon == NO_LATITUDE ) return UNKNOWN_TIMEZONE;
if ( useDST ) *useDST = 1;
return (char)(long)(lon / (360.0/24.0));
// . hash city and state together then lookup in g_timeZones table
// . name1;name2;suite;street;city;adm1;zip;domhash;ip;origurl;lat;lon\0
// . uint32_t getCityHash32 ( char *addr , uint32_t *adm1Hash ) {
// . NOTE(review): this has the same name and parameters as the
//   getTimeZoneFromAddr() defined just before it, so presumably it is the
//   older version and was commented out in the full file -- confirm
// . NOTE(review): it null terminates the city and adm1 fields inside
//   "addr" for the getTimeZone2() call and puts the ';'s back afterwards
// . NOTE(review): the statement after "inc it" and the closing braces are
//   missing from this copy
char getTimeZoneFromAddr ( char *addr , char *useDST ) {
// get city and adm1 from address
char *p = addr;
long semiCount = 0;
char *adm1 = NULL;
char *city = NULL;
for ( ; ; p++ ) {
// skip if not border
if ( *p != ';' ) continue;
// inc it
// city?
if ( semiCount == 4 ) {
city = p + 1;
if ( semiCount == 5 ) {
adm1 = p + 1;
if ( semiCount != 6 ) continue;
// ok, we got both now
char *semi1 = adm1 - 1;
char *semi2 = p;
// temp null term
*semi1 = '\0';
*semi2 = '\0';
// fix Denver's so we do not return unknown timezone
if ( semi1[-1]=='s' && semi1[-2]=='\'' ) semi1[-2]='\0';
char tzoff = getTimeZone2 ( city , adm1 , useDST );
// put back
*semi1 = ';';
*semi2 = ';';
// put apostrophe back if we stripped it
if ( ! semi1[-2] ) semi1[-2] = '\'';
return tzoff;
char getTimeZone2 ( char *city , char *state , char *useDST ) {
// get the words
//Words ww; ww.set3 ( city );
// shortcut
//long long *wids = ww.m_wordIds;
// limit hash
//long count = 0;
// get city hash
long long h = getWordXorHash(city);
// TODO: make state into two letter abbr?
2013-11-25 07:48:47 +04:00
// crap, if state is taken from class ZipDesc it is only
// 2 letters and has no \0 in it
//if ( gbstrlen(state) != 2 ) { char *xx=NULL;*xx=0; }
2013-08-03 00:12:24 +04:00
// use this now
uint32_t cid32 = (uint64_t)getCityId32(h,state);
// and call this
return getTimeZone3 ( cid32 , useDST );
char getTimeZone3 ( uint32_t cid32 , char *useDST ) {
// now lookup timezone
long slot = g_timeZones.getSlot ( &cid32 );//&cityStateHash );
// return 0 if not found
if ( slot < 0 ) {
log("addr: gettimezone3: unknown timezone");
// Denver Art Museum;;;100 West 14th Avenue Parkway;Denver's;
// co;;;1993583704;;;;
// otherwise, set m_timeZoneOffset appropriately
CityStateDesc *csd=(CityStateDesc *)g_timeZones.getValueFromSlot(slot);
*useDST = csd->m_useDST;
// sanity corruption check
if ( *useDST != 0 && *useDST != 1 ) { char *xx=NULL;*xx=0; }
char tz = csd->m_timeZoneOffset;
if ( tz < -13 || tz > 13 ) { char *xx=NULL;*xx=0; }
return tz;
// . for now just get the closest city to the user and use that timezone
// . this is not 100% accurate but should be like 99.9%
// . no, just use the GeoCityLite.dat call, that returns the city/state already
char getTimeZoneFromUserIP ( long uip , long niceness , char *useDST ) {
double lat;
double lon;
double radius;
char *city,*state,*ctry;
// use this by default
//long ip = r->m_userIP;
// ip for testing?
//long iplen;
//char *ips = r->getString("uip",&iplen);
//if ( ips ) ip = atoip(ips);
// returns true if found in db
char buf[128];
getIPLocation ( uip ,
&lat ,
&lon ,
&city ,
&state ,
&ctry ,
buf ,
128 ) ;
// 999 means unknown timezone offset
if ( ! city || ! state ) {
log("addr: got unknown timezone for user");
// get timezone offset from this
return getTimeZone2 ( city , state , useDST );
// . used by SearchInput.cpp to get timezone of the user from user's lat/lon
// . finds the nearest city with getNearestCityId() and looks up that city
//   with getTimeZone3()
// . if the nearest city's squared distance exceeds 1000 we look up city
//   id 0 instead
char getTimeZoneFromLatLon ( float lat,float lon,long niceness,char *useDST ) {
	// get nearest city/state
	float distInMilesSquared = 0.0;
	uint32_t cid32 = getNearestCityId ( lat , lon , niceness ,
					    &distInMilesSquared );
	// too far away to be that city
	if ( distInMilesSquared > 1000 ) cid32 = 0;
	// then its easy
	return getTimeZone3 ( cid32 , useDST );
}
// . s_latList is an array of g_timeZones slot numbers, filled in and
//   sorted with latcmp() by initCityLists(), read by getNearestCityId()
static long *s_latList = NULL;
// size in bytes of the s_latList allocation
static long s_latListSize = 0;
//static long *s_lonList = NULL;
// number of entries in s_latList
static long s_ni = 0;
// . we need a list of the city ids sorted by lat, and a list sorted by lon
// . then we do b-stepping on each list
// . bstep down to a 20 mile by 20 mile box
// . then intersect using a hashtable
// . if empty, then increase to 30 by 30 mile box, etc.
// . there are 123k US cities in cities.dat
// . these 2 lists should be about 2MB then
// . then lookup cityid in g_timezones to get timezone
// . NOTE(review): the code below only uses the latitude list. it bsteps
//   to an entry near "lat", widens that to the entries within "radius"
//   of "lat", then scans those for the smallest lat/lon distance
// . returns the g_timeZones key (city id) of the closest entry found
// . if "distInMilesSquared" is not NULL it is set to the winning value of
//   latDiff*latDiff + lonDiff*lonDiff
// . NOTE(review): that sum is in the squared units of m_latitude and
//   m_longitude, presumably degrees rather than miles -- confirm
// . NOTE(review): this copy is missing closing braces and the "tryagain"
//   label that the goto near the bottom refers to
uint32_t getNearestCityId ( float lat , 
float lon , 
long niceness ,
float *distInMilesSquared ) {
// radius is 5 miles, put miles into degrees
float radius = 5.0 / 69.0;
CityStateDesc *csd;
long step = s_ni / 2;
// get lat boundaries using bstep
long start = s_ni / 2;
// do the bstepping
for ( ; ; ) {
// get that city
long citySlot = s_latList[start];
// get csd
csd = (CityStateDesc *)g_timeZones.getValueFromSlot(citySlot);
if ( ! csd ) { char *xx=NULL;*xx=0; }
// increase resolution for next round
step /= 2;
//if ( step <= 0 ) step = 1;
// step it down?
if ( lat < csd->m_latitude ) start -= step;
// use " - radius" here as well to avoid infinite loop?
else if ( lat > csd->m_latitude ) start += step;
// ok, we are in range, done
else break;
// avoid breaching!
if ( start < 0 ) { start = 0 ; break; }
if ( start >= s_ni ) { start = s_ni-1; break; }
// stop if we hit steps of 0
if ( step <= 0 ) break;
// if step was 0 and we failed, than need to increase radius
//if ( step > 0 ) continue;
// ok, we failed, we will increase radius below and try again
// increase stripe width
//radius += 5.0;
// try again
//goto tryagain;
//getCityRange ( s_latList , lat , radius , &lata , &latb );
//getCityRange ( s_lonList , lon , radius , &lona , &lonb );
// now take intersection of the ranges
//long numCities = lata - latb;
//HashTableX ih;
//if(! ih.set ( 4 , 0 , numCities , ihbuf, 3000 , false , niceness )){
// char *xx=NULL;*xx=0; }
// [lata,latb] will be the range of s_latList entries to scan
long lata = start;
long latb = start;
long count = 0;
// TODO: do b-step on these too, takes like 3500 iterations for
// both of these loops
// adjust lata/latb until just out of range
for ( ; lata > 0 ; lata-- ) {
// get csd
long slot = s_latList[lata];
csd = (CityStateDesc *)g_timeZones.getValueFromSlot(slot);
if ( csd->m_latitude < lat - radius ) break;
for ( ; latb < s_ni ; latb++ ) {
// get csd
long slot = s_latList[latb];
csd = (CityStateDesc *)g_timeZones.getValueFromSlot(slot);
if ( csd->m_latitude > lat + radius ) break;
// smallest squared distance seen so far and the slot that has it
float min = -1.0;
long minSlot = -1;
// add in the lat cities
for ( long i = lata ; i <= latb ; i++ ) {
// break?
if ( i >= s_ni ) break;
// breathe
// get that city
long citySlot = s_latList[i];
// get cd
CityStateDesc *csd;
csd = (CityStateDesc *)g_timeZones.getValueFromSlot(citySlot);
// just compute distance
float latDiff = csd->m_latitude - lat;
float lonDiff = csd->m_longitude - lon;
// add up
float dist = latDiff*latDiff + lonDiff*lonDiff;
// min?
// the first candidate is always taken since minSlot is -1 then
if ( dist > min && minSlot >= 0 ) continue;
// set it
min = dist;
minSlot = citySlot;
// must have one
if ( minSlot == -1 ) {
// note it
log("addr: what the hell.");
// increase stripe width
radius += 10.0;
// try again
goto tryagain;
if ( distInMilesSquared ) *distInMilesSquared = min;
uint32_t *cidp = (uint32_t *)g_timeZones.getKeyFromSlot(minSlot);
// get that then
return *cidp;
// . sort comparator for s_latList, used by initCityLists()
// . each element is a g_timeZones slot number stored as a long
// . orders the slots by ascending m_latitude of their CityStateDesc
int latcmp ( const void *arg1 , const void *arg2 ) {
	// the slots, do not cast away the const
	const long slot1 = *(const long *)arg1;
	const long slot2 = *(const long *)arg2;
	// get the city descriptions
	CityStateDesc *cd1;
	CityStateDesc *cd2;
	cd1 = (CityStateDesc *)g_timeZones.getValueFromSlot(slot1);
	cd2 = (CityStateDesc *)g_timeZones.getValueFromSlot(slot2);
	// simple compare
	if ( cd1->m_latitude < cd2->m_latitude ) return -1;
	if ( cd1->m_latitude > cd2->m_latitude ) return  1;
	return 0;
}
//int loncmp ( const void *arg1 , const void *arg2 ) {
// // get the addresses
// CityDesc *cd1 = *(CityDesc **)arg1;
// CityDesc *cd2 = *(CityDesc **)arg2;
// // simple compare
// return ( cd1->m_longitude - cd2->m_longitude );
// . our data is used by getNearestCityId
// . builds s_latList, the list of occupied g_timeZones slots sorted by
//   the latitude of the city in each slot
// . about 123k cities
// . sets s_ni to the number of entries and s_latListSize to the number of
//   bytes allocated
// . returns false if the allocation fails
bool initCityLists ( ) {
	// scan city table
	long ns = g_timeZones.m_numSlots;
	// need this
	long used = g_timeZones.m_numSlotsUsed;
	// . how much space to alloc?
	// . one long per city. this used to assume a long was 4 bytes,
	//   which breaches the buffer where a long is 8 bytes
	long need = used * (long)sizeof(long);
	// alloc it
	char *space = (char *)mmalloc(need,"latlist");
	if ( ! space ) return false;
	s_latList     = (long *)space;
	s_latListSize = need;
	//s_lonList = (CityDesc **)p;
	// reset
	s_ni = 0;
	// scan the slots
	for ( long i = 0 ; i < ns ; i++ ) {
		// skip empties
		if ( ! g_timeZones.m_flags[i] ) continue;
		// sanity, never write more than we allocated
		if ( s_ni >= used ) { char *xx=NULL;*xx=0; }
		// add to the list
		s_latList[s_ni++] = i;
	}
	// now sort the list. element size must match the array type
	gbqsort ( s_latList , s_ni , sizeof(long) , latcmp , 0 );
	//gbqsort ( s_lonList , s_ni , 4 , loncmp , 0 );
	return true;
}
// . get the timezone offset, in hours, of this address
// . "useDST" is optional. if given it is set to the city's m_useDST, or
//   to 1 when we have to guess or fail
// . looks up m_cityId32 in g_timeZones. if that fails and we have a city
//   string (only set on the zip path below) we retry with getTimeZone2()
// . a lat/lon only address with no m_cityId32 gets a guess from its
//   longitude, 15 degrees per hour
// . returns UNKNOWN_TIMEZONE if nothing works
// . cores on purpose if the address has no adm1, zip, city adm1 or
//   AF2_LATLON flag
// . NOTE(review): the return of the not-found path was missing here.
//   UNKNOWN_TIMEZONE is what this function tests getTimeZone2() against
char Address::getTimeZone ( char *useDST ) {
	// need this
	char *adm1Str = NULL;
	char *cityStr = NULL;
	if ( m_adm1 ) {
		adm1Str = m_adm1->m_adm1;
	}
	else if ( m_zip ) {
		cityStr = m_zip->m_cityStr;
		adm1Str = m_zip->m_adm1;
	}
	else if ( m_city && m_city->m_adm1[0] ) {
		adm1Str = m_city->m_adm1;
	}
	else if ( (m_flags3 & AF2_LATLON) && m_cityId32 ) {
		// m_cityId32 is the nearest city to the lat/lon, use that
	}
	// if we failed to set city id because no city was nearby
	// then just guess based on lat/lon
	else if ( m_flags3 & AF2_LATLON ) {
		if ( useDST ) *useDST = 1;
		char timeZone = (char)(long)(m_longitude / (360.0/24.0));
		if ( timeZone < -12 || timeZone > 12 ) { char *xx=NULL;*xx=0;}
		return timeZone;
	}
	else { char *xx=NULL;*xx=0; }
	// now lookup timezone
	long slot = g_timeZones.getSlot ( &m_cityId32 );
	// not found?
	if ( slot < 0 ) {
		// note it
		if ( cityStr && adm1Str ) {
			log("addr: could not find timezone in g_timezones, "
			    "trying to call getTimeZone2");
			char tzoff = getTimeZone2 ( cityStr, adm1Str, useDST );
			if ( tzoff != UNKNOWN_TIMEZONE )
				return tzoff;
		}
		log("addr: got unknown timezone for addr");
		// useDST is optional, see the lat/lon branch above
		if ( useDST ) *useDST = 1;
		return UNKNOWN_TIMEZONE;
	}
	// otherwise, use the city's description
	CityStateDesc *csd;
	csd = (CityStateDesc *)g_timeZones.getValueFromSlot(slot);
	char tzoff = csd->m_timeZoneOffset;
	if ( tzoff < - 13 || tzoff > 13 ) { char *xx=NULL;*xx=0; }
	if ( useDST ) *useDST = csd->m_useDST;
	return tzoff;
}
// . add every inlined address we hold to the tag rec "gr"
// . addresses without AF_INLINED set are skipped
// . all args are passed through to Address::addToTagRec()
// . returns false as soon as adding one of them fails, true otherwise
bool Addresses::addToTagRec ( TagRec *gr , long ip , long timestamp ,
			      char *origUrl , long maxAddrBytes ,
			      char *tagName ) {
	long na = m_am.getNumPtrs();
	for ( long i = 0 ; i < na ; i++ ) {
		// get it
		Address *ai = (Address *)m_am.getPtr(i);
		// do not add this to tagdb if not inlined!
		if ( ! ( ai->m_flags & AF_INLINED ) ) continue;
		// add address #i
		if ( ! ai->addToTagRec (gr,ip,timestamp,origUrl,
					maxAddrBytes,tagName) )
			return false;
	}
	return true;
}
// . can xmldoc use this for venue addresses?
// . serialize this address, with ';' as the delimiter, and add it to the
//   tag rec "gr" as a tag named "tagName"
// . if "maxAddrBytes" is >= 0 we first scan the tags of that type already
//   in "gr": if one matches us up to its last ';' we are a dup and do not
//   add, and if adding us would put the total data size of those tags
//   over maxAddrBytes we do not add either
// . returns true if added or deliberately not added
// . returns false if serialize() or TagRec::addTag() fails. the original
//   comments say g_errno is set then
bool Address::addToTagRec ( TagRec *gr , long ip , long timestamp ,
			    char *origUrl , long maxAddrBytes ,
			    char *tagName ) {
	// use ; as delimiter
	char buf[5003];
	// . size includes the terminating \0
	// . include the Address::m_hash for deduping in XmlDoc.cpp
	long size = serialize ( buf , 5000 , origUrl , false , true );
	// returns -1 and sets g_errno on error
	if ( size < 0 ) return false;
	// . back up to the last ';' so the trailing field is left out of
	//   the dup comparison below
	// . NOTE(review): the original comment says this excludes the
	//   origUrl -- confirm against the field order serialize() writes
	char *end1 = buf + size - 1;
	for ( ; end1 > buf && *end1 != ';' ; end1-- ) ;
	// the length without that
	long len1 = end1 - buf;
	// how many address bytes are we using currently? only need to
	// compute this if we have a limit, i.e. "maxAddrBytes" >= 0
	long used = 0;
	if ( maxAddrBytes >= 0 ) {
		// our tag type
		long tt = getTagTypeFromStr ( tagName );//"contactaddress" );
		// loop over all the tags in the TagRec
		// taken from TagRec::getTag() function
		for ( Tag *tag = gr->getFirstTag() ;
		      tag ;
		      tag = gr->getNextTag(tag) ) {
			// skip if not our kind of tag
			if ( tag->m_type != tt ) continue;
			// count those bytes
			used += tag->m_dataSize;
			// back up to its last ';' just like we did for us
			char *end2 = tag->m_data + tag->m_dataSize - 1;
			for ( ; end2 > tag->m_data && *end2 != ';' ; end2-- ) ;
			// get lengths
			long len2 = end2 - tag->m_data;
			// is it a dup?
			if ( len1 != len2 ) continue;
			if ( memcmp(tag->m_data, buf, len1 ) ) continue;
			// it was a dup!
			return true;
		}
	}
	// can we fit it? if not, do not add it
	if ( maxAddrBytes >= 0 && used + size > maxAddrBytes ) return true;
	// store it
	// returns false and sets g_errno on error
	return gr->addTag (tagName,timestamp,"xmldoc",ip,buf,size);
}
// . extract the 64-bit address hash from a serialized address string
// . format is:
//   name1;name2;suite;street;city;adm1;zip;country;domhash;ip;origurl;lat;lon;hash\0
//   so the hash is the decimal number right after the 13th semicolon
// . "addr" must be NUL-terminated
// . deliberately cores (null write) if there is no 13th semicolon or if
//   the character following it is not a decimal digit
uint64_t getHashFromAddr ( char *addr ) {
	char *p = addr;
	long semiCount = 0;
	for ( ; *p ; p++ ) {
		// skip if not border
		if ( *p != ';' ) continue;
		// inc it
		semiCount++;
		// hash?
		if ( semiCount != 13 ) continue;
		// got it
		break;
	}
	// none?
	if ( ! *p ) { char *xx=NULL;*xx=0; }
	// skip semi
	p++;
	// must be digit (plain ascii range check)
	if ( *p < '0' || *p > '9' ) { char *xx=NULL;*xx=0; }
	// get that value
	uint64_t ah = strtoull(p,NULL,10);//atoll(p);
	// that's what we want
	return ah;
}
// . used by Address::serialize
// . copies "bytes" bytes of utf8 text from "src" to "dst" while filtering:
//   - html tags are dropped; every tag except <b> and <i> is replaced by
//     a single space
//   - whitespace and ';' become a single space, as does ',' when
//     "filterCommas" is true
//   - back to back spaces are collapsed and the output never starts
//     with a space
// . if "dstMaxBytes" is >= 0 we stop before writing more than that many
//   bytes into "dst"; a negative value means no limit
// . does NOT write a terminating \0
// . returns the number of bytes written to "dst"
long memcpy2 ( char *dst , char *src , long bytes , bool filterCommas ,
	       long dstMaxBytes ) {
	char *srcEnd = src + bytes;
	// do not start with a space, so set this to 1
	char lastWasSpace = 1;
	char *dstStart = dst;
	// the extra char we treat like a space. ' ' is a no-op since
	// spaces are handled as whitespace anyway.
	char fc = ' ';
	if ( filterCommas ) fc = ',';
	bool inTag = false;
	char *dstEnd = NULL;
	if ( dstMaxBytes >= 0 ) dstEnd = dstStart + dstMaxBytes;
	char cs ;
	for ( ; src < srcEnd ; src += cs ) {
		// size in bytes of the utf8 char at "src"
		cs = getUtf8CharSize(src);
		// remove tags
		if ( *src == '<' ) {
			inTag = true;
			// only peek at the two following bytes if they are
			// really part of the input
			bool canPeek = ( src + 2 < srcEnd );
			// skip if bold tag
			if ( canPeek &&
			     to_lower_a(src[1])=='b' && src[2]=='>' ) continue;
			// skip if italic
			if ( canPeek &&
			     to_lower_a(src[1])=='i' && src[2]=='>' ) continue;
			// skip if already had printed space
			if ( lastWasSpace ) continue;
			// stop if would breach
			if ( dstEnd && dst + 1 > dstEnd ) break;
			// otherwise print the space
			*dst++ = ' ';
			// and set this flag
			lastWasSpace = true;
			continue;
		}
		if ( *src == '>' ) { inTag = false; continue;}
		if ( inTag ) continue;
		// . when serializing address semicolons have special meaning
		// . deal special with spaces. treat comma as a space too now!
		if ( is_wspace_utf8 (src) || *src == fc || *src == ';' ) {
			// stop if would breach
			if ( dstEnd && dst + 1 > dstEnd ) break;
			if ( ! lastWasSpace ) *dst++ = ' ';//*src;
			lastWasSpace = 1;
			continue;
		}
		// reset
		lastWasSpace = 0;
		// stop if would breach
		if ( dstEnd && dst + cs > dstEnd ) break;
		// everything else
		if( cs == 1 ) { *dst++ = *src; continue; }
		// otherwise characters is > 1 byte
		memcpy ( dst , src , cs );
		dst += cs;
	}
	// return bytes written
	return dst - dstStart;
}
// . returns the number of buffer bytes Address::serialize() budgets for
//   this address; serialize() refuses to run if its buffer is smaller
// . "ulen" is the number of bytes of the origUrl that will be stored,
//   including the "..." of truncated urls
// . "includeHash" is not consulted here: room for the address hash is
//   always counted
long Address::getStoredSize ( long ulen , bool includeHash ) {
	// how much buffer space do we need?
	long need = 0;
	// each present place: its string plus the ';' delimiter
	if ( m_name1  ) need += m_name1 ->m_strlen + 1;
	if ( m_name2  ) need += m_name2 ->m_strlen + 1;
	if ( m_suite  ) need += m_suite ->m_strlen + 1;
	if ( m_street ) need += m_street->m_strlen + 1;
	if ( m_city   ) need += m_city  ->m_strlen + 1;
	if ( m_zip    ) need += m_zip   ->m_strlen + 1;
	if ( m_adm1   ) need += m_adm1  ->m_strlen + 1;
	//if ( m_ctry ) need += m_ctry  ->m_strlen + 1;
	// if city our adm1 or country is NULL, guess because it
	// will be looked up and supplied based on lat/lon
	if ( ! m_city ) need += 64 + 1;
	if ( ! m_adm1 ) need += 2 + 1;
	// country!
	need += 3;
	// domainhash
	need += 10 + 1;
	// ip string
	need += 16;
	// this includes the "..." of truncated urls
	need += ulen;
	// latitude
	need += 12;
	// longitude
	need += 12;
	// address hash -- printing out a uint64_t in ascii
	// 18446744073709551615LL = 20 digits + semicolon before it
	need += 21;
	// null term
	need++;
	// timezoneoffset
	//need += 4;
	return need;
}
// . append the serialized form of this address to "sb"
// . calls serialize() with no origUrl, verifiedOnly=true and
//   includeHash=false
// . the appended bytes include the terminating \0 written by serialize()
// . returns false if "sb" could not reserve the space or if serialize()
//   reported an error, true otherwise
bool Address::serializeVerified ( SafeBuf *sb ) {
	// get min # of bytes needed
	long need = getStoredSize ( 0 , false );
	// make room
	if ( ! sb->reserve ( need ) ) return false;
	// store it here
	char *buf = sb->getBuf();
	// do it
	long written = serialize ( buf , need , NULL , true , false );
	// serialize() returns -1 on error. do not hand a negative
	// length to the SafeBuf.
	if ( written < 0 ) return false;
	// sanity check
	if ( written > need ) { char *xx=NULL;*xx=0; }
	// update it
	sb->incrementLength ( written );
	// success
	return true;
}
// . returns -1 and sets g_errno on error
// . name1;name2;suite;street;city;adm1;zip;country;domHash32;ipStr;url;lat;lon;addHash
// . setfromstr() above
long Address::serialize ( char *buf , long bufSize , char *origUrl ,
bool verifiedOnly , bool includeHash ) {
char *p = buf;
// sanity check. these should be filtered out
//if ( m_score <= 0.0 ) { char *xx=NULL;*xx=0; }
// also truncate at semicolon in urls since that is our delimeter
char *o = origUrl;
for ( ; o && *o && *o !=';' ; o++ );
// truncate this if we should
long olen = o - origUrl; // gbstrlen(origUrl);
bool trunc = false;
if ( olen > 128 ) { olen = 96; trunc = true; }
// if a semicolon kicked us out, we were truncated as well
else if ( o && *o == ';' ) trunc = true;
// include ...
long extra = 0;
if ( trunc ) extra = 3;
// how much buffer space do we need?
long need = getStoredSize( olen + extra , includeHash );
// silenty ignore overflow errors
if ( need > bufSize ) return -1;
PlaceDesc *pd = NULL;
// guess the city/state names if we got lat/lon only
if ( m_flags3 & AF2_LATLON ) {
float distInMilesSquared = 0.0;
pd = getNearestCity_new ( m_latitude ,
m_longitude ,
0 , // niceness
&distInMilesSquared );
if ( distInMilesSquared >= 1000 ) pd = NULL;
Place *d ;
char flags = m_flags;
if ( ! verifiedOnly ) flags |= AF_VERIFIED_PLACE_NAME_1;
if ( ! verifiedOnly ) flags |= AF_VERIFIED_PLACE_NAME_2;
d = m_name1;
if ( d && (flags & AF_VERIFIED_PLACE_NAME_1) ) {
// bytes written may be different than d->m_strlen since
// memcpy2() filters out back-to-back spaces
// should also remove semicolons
p += memcpy2(p,d->m_str,d->m_strlen,false);
*p++ = ';';
d = m_name2;
if ( d && (flags & AF_VERIFIED_PLACE_NAME_2) ) {
// bytes written may be different than d->m_strlen since
// memcpy2() filters out back-to-back spaces
p += memcpy2(p,d->m_str,d->m_strlen,false);
*p++ = ';';
d = m_suite;
if ( d ) {
// bytes written may be different than d->m_strlen since
// memcpy2() filters out back-to-back spaces
p += memcpy2(p,d->m_str,d->m_strlen,true);
*p++ = ';';
d = m_street;
if ( d ) {
// bytes written may be different than d->m_strlen since
// memcpy2() filters out back-to-back spaces
p += memcpy2(p,d->m_str,d->m_strlen,true);
*p++ = ';';
d = m_city;
if ( d ) {
// bytes written may be different than d->m_strlen since
// memcpy2() filters out back-to-back spaces
p += memcpy2(p,d->m_str,d->m_strlen,true);
// append the adm1 code
//if ( d->m_adm1[0] ) {
// *p++ = '(';
// memcpy(p,d->m_adm1,2);
// p += 2;
// *p++ = ')';
// if city is NULL it must be implied from zip code
else if ( m_zip ) {
char *cs = m_zip->m_cityStr;
if ( gbstrlen(cs) == 0 ) { char *xx=0;*xx=0; }
p += memcpy2(p,cs,gbstrlen(cs),true);
else if ( m_flags3 & AF2_LATLON ) {
if ( pd ) {
char *str = pd->m_officialNameOffset + g_pbuf;
long slen = gbstrlen(str);
// limit to 64 since that is getStoredSize() number
if ( slen > 64 ) slen = 64;
memcpy ( p , str ,slen );
p += slen;
// otherwise, we have an issue, it must be impliable
else {
char *xx=NULL;*xx=0;
*p++ = ';';
// mdw mdw
d = m_adm1;
if ( d ) {
// bytes written may be different than d->m_strlen since
// memcpy2() filters out back-to-back spaces
//p += memcpy2(p,d->m_str,d->m_strlen,true);
// to save space use two letter abbr
p += memcpy2(p,d->m_adm1,2,true);
// append the adm1 code
//if ( d->m_adm1[0] ) {
// *p++ = '(';
// memcpy(p,d->m_adm1,2);
// p += 2;
// *p++ = ')';
// if city is NULL it must be implied from zip code
else if ( m_zip ) {
p += memcpy2(p,m_zip->m_adm1,2,true);
// imply from city if city is unique
//else if ( m_city && (m_city->m_adm1Bits & CF_UNIQUE) ) {
// p += memcpy2(p,m_city->m_adm1,2,true);
else if ( m_flags3 & AF2_LATLON ) {
// this is the nearest city's state based on our lat/lon
if ( pd && pd->m_adm1[0] && pd->m_adm1[1] ) {
memcpy ( p , pd->m_adm1 ,2 );
p += 2;
// otherwise, we have an issue, it must be impliable
else {
char *xx=NULL;*xx=0;
*p++ = ';';
d = m_zip;
if ( d ) {
// bytes written may be different than d->m_strlen since
// memcpy2() filters out back-to-back spaces
p += memcpy2(p,d->m_str,d->m_strlen,true);
// append the adm1 code
//if ( d->m_adm1[0] ) {
// *p++ = '(';
// memcpy(p,d->m_adm1,2);
// p += 2;
// *p++ = ')';
*p++ = ';';
// use country code from "crid"
//char *cn = (char *)g_countryCode.getAbbr(m_adm1->m_crid-1);
//if ( cn ) {
// memcpy(p,cn,gbstrlen(cn));
// p += gbstrlen(cn);
if ( m_flags3 & AF2_LATLON ) {
if ( pd && pd->m_crid ) {
char *cc = getCountryCode(pd->m_crid);
memcpy ( p , cc , 2 );
p += 2;
*p++ = ';';
// sanity check
if ( m_domHash32 == 0 ) { char *xx=NULL;*xx=0; }
// serialize 32-bit domain hash
p += sprintf( p , "%lu", m_domHash32 );
*p++ = ';';
// sanity check
if ( m_ip == 0 || m_ip == -1 ) { char *xx=NULL;*xx=0;}
// serialize ip string
p += sprintf( p , "%s", iptoa(m_ip));
*p++ = ';';
if ( origUrl ) {
// bytes written may be different than d->m_strlen since
// memcpy2() filters out back-to-back spaces
p += memcpy2(p,origUrl,olen,false);
if ( trunc ) p += memcpy2 (p,"...",3,false);
*p++ = ';';
// then latitude
if ( m_latitude != NO_LATITUDE && m_latitude != AMBIG_LATITUDE )
p += sprintf(p,"%f",m_latitude);
*p++ = ';';
// then longitude
if ( m_longitude != NO_LONGITUDE && m_longitude != AMBIG_LONGITUDE )
p += sprintf(p,"%f",m_longitude);
if ( includeHash ) {
*p++ = ';';
// finally the address hash in ascii
p += sprintf ( p , "%llu" , m_hash );
// . then timezone off, a single signed byte really
// . we add 100 to this to signify that it does NOT use DST
//p += sprintf(p,"%li", (long)m_timeZoneOffset);
*p++ = '\0';
// count the semicolons to make sure data did not insert extra ones
char *s = buf;
long semiCount = 0;
long semiNeed = 12;
if ( includeHash ) semiNeed++;
for ( ; *s ; s++ ) if ( *s == ';' ) semiCount++;
if ( semiCount != semiNeed ) { char *xx=NULL;*xx=0; }
long size = p - buf;
// sanity check
if ( size > bufSize ) { char *xx=NULL;*xx=0; }
// all done
return size;
// . convenience wrapper: print2() with index 0, no SafeBuf and no uh64
// . returns whatever print2() returns
long Address::print ( ) {
	return print2 ( 0,NULL,0);
}
// . debug dump of this address
// . if "pbuf" is non-NULL the address is printed into it as html table
//   cells (and 1 is returned), otherwise a text summary is built in a
//   local SafeBuf, handed to logf(LOG_DEBUG,...) and 1 is returned
// . "i" is printed as the address "offset" instead of a real pointer
//   offset so that test qa run diffs stay stable
// . "uh64" is passed on to printEssentials()
// . NOTE(review): this copy of the function looks incomplete. closing
//   braces, "else" lines, and several calls/arguments that owned the
//   format strings below seem to be missing. confirm against the
//   original file before relying on the flow as shown.
long Address::print2 ( long i , SafeBuf *pbuf , long long uh64 ) {
// print out each candidate for debug
SafeBuf sb;
//bool validAddr = ( (m_flags) & AF_INLINED );
// old sanity checker to ensure div ids were unique
//static bool s_init = false;
//static HashTableX ht;
//if ( ! s_init ) {
// s_init = true;
// ht.set ( 4 , 4 , 128 , NULL , 0 , false , 2 );
//if ( validAddr ) {
// if ( ht.isInTable ( &m_divId) ) { char *xx=NULL;*xx=0; }
// ht.addKey ( &m_divId );
// print out to a table?
if ( pbuf ) {
// dump it
// . for the sake of doing delta diffs in Test.cpp
// eliminate the number!
//pbuf->safePrintf ( "<td>%li/%li</td>", num ,m_street.m_a);
//if ( m_street.m_a >= 0 )
// pbuf->safePrintf ( "<td>%li</td>", m_street.m_a);
// word positions of name1 and the street, -1 if we have none
long napos = -1;
if ( m_name1 ) napos = m_name1->m_a;
long stra = -1;
if ( m_street ) stra = m_street->m_a;
pbuf->safePrintf ( "<td>%li/%li</td>", napos,stra );
//pbuf->safePrintf ( "<td>%.06f</td>", m_score );
//pbuf->safePrintf("<td>0x%lx</td>", m_section->m_tagHash);
printEssentials ( pbuf , false , uh64 );
// print flags
//if ( (m_flags) & AF_IGNORE )
// pbuf->safePrintf("ignore ");
if ( m_flags & AF_VENUE_DEFAULT )
pbuf->safePrintf("venueaddress ");
// NOTE(review): presumably an "else" sat between these two prints
if ( (m_flags) & AF_INLINED )
pbuf->safePrintf("inlined ");
pbuf->safePrintf("notinlined ");
// NOTE(review): the argument for the %li below is not visible here
if ( m_alias )
pbuf->safePrintf("alias[a=%li] ",
if ( m_flags3 & AF2_HAS_REQUIRED_CITY )
pbuf->safePrintf("requiredcity ");
if ( m_flags3 & AF2_HAS_REQUIRED_STATE )
pbuf->safePrintf("requiredstate ");
if ( m_street && (m_street->m_flags2 & PLF2_COLLISION) )
pbuf->safePrintf("streetcollision ");
// means that we are inlined and the city FOLLOWS the state
//if ( (m_flags) & AF_BADORDER )
// pbuf->safePrintf("badorder ");
if ( (m_flags) & AF_AMBIGUOUS )
pbuf->safePrintf("ambig ");
if ( (m_flags3) & AF2_BADCITYSTATE )
pbuf->safePrintf("badcitystate ");
if ( (m_flags) & AF_VERIFIED_STREET )
pbuf->safePrintf("verifiedstreet ");
if ( (m_flags) & AF_VERIFIED_STREET_NUM )
pbuf->safePrintf("verifiedstreetnum ");
if ( (m_flags) & AF_VERIFIED_PLACE_NAME_1 )
pbuf->safePrintf("verifiedplacename1 ");
if ( (m_flags) & AF_VERIFIED_PLACE_NAME_2 )
pbuf->safePrintf("verifiedplacename2 ");
if ( m_street &&(m_street->m_flags3 & PLF3_SUPPLANTED))
pbuf->safePrintf("<b>supplanted</b> ");
if ( m_street &&(m_street->m_flags3 & PLF3_LATLONDUP))
pbuf->safePrintf("<b>latlondup</b> ");
if ( m_street &&(m_street->m_flags2 & PLF2_INTERSECTION) )
pbuf->safePrintf("intersection ");
if ( m_street &&(m_street->m_flags2 & PLF2_IS_NAME ))
pbuf->safePrintf("streetisname ");
if ( m_street &&(m_street->m_flags2 & PLF2_AFTER_AT) )
pbuf->safePrintf("afterat ");
if ( m_street &&(m_street->m_flags2 & PLF2_TICKET_PLACE) )
pbuf->safePrintf("ticketplace ");
// when the event hours are not "store hours" we flag the
// place name so as to avoid it as the event title in
// Events.cpp
//if ( m_name1 && (m_name1->m_flags2 & PLF2_STORE_NAME) )
// pbuf->safePrintf("storename ");
//if ( (m_flags) & AF_VERIFIED_STREET_IND )
// pbuf->safePrintf("verifiedstreetind ");
// NOTE(review): what is printed when no flag is set is not visible here
if ( !(m_flags) )
// print the address ptr, but make it an offset so
// it doesn't show up on the test qa run diffs
//long offset = this - base;
long offset = i;
// NOTE(review): the arguments for this format are not visible here
pbuf->safePrintf("<td><nobr>0x%llx (%li)</nobr></td>",
// print placedb names
// m_placedbNames..m_placedbNamesEnd holds records of 4 bytes (the
// "score" skipped below) followed by a \0 terminated name
char *s = m_placedbNames;
char *send = m_placedbNamesEnd;
// scan them
for ( ; s && s < send ; ) {
// skip score
s += 4;
// empty? strange...
if ( ! *s ) { char *xx=NULL;*xx=0; }
// true for every name but the first one
if ( s > m_placedbNames + 4 )
// print that
// skip that and the \0
s += gbstrlen(s) + 1;
// adm1
// two letter state code from m_adm1, else from m_zip. stays "\0\0"
// for lat/lon-only addresses. anything else cores.
char *adm1Str = "\0\0";
if ( m_adm1 ) adm1Str = m_adm1->m_adm1;
else if ( m_zip ) adm1Str = m_zip->m_adm1;
//else if ( m_city && m_city->m_adm1[0] )
// adm1Str = m_city->m_adm1;
else if ( m_flags3 & AF2_LATLON );
else { char *xx=NULL;*xx=0; }
// city
// same fallback order for the city hash
long long cityHash = 0LL;
if ( m_city ) cityHash = m_city->m_hash;
else if ( m_zip ) cityHash = m_zip->m_cityHash;
else if ( m_flags3 & AF2_LATLON );
else { char *xx=NULL;*xx=0; }
uint32_t cityId = getCityId32(cityHash,adm1Str);
// ripped from XmlDoc.cpp placedb logic
key128_t *k2 = &m_placedbKey;
long long bigHash = g_placedb.getBigHash ( k2 );
long long docId = g_placedb.getDocId ( k2 );
long snh = g_placedb.getStreetNumHash ( k2 );
long long nh1 = 0;
long long nh2 = 0;
if ( m_name1 ) nh1 = m_name1->m_hash;
if ( m_name2 ) nh2 = m_name2->m_hash;
long long strh = 0LL;
if ( m_street ) strh = m_street->m_hash;
// NOTE(review): the call owning this format string and several of its
// arguments are not visible here. also "0x08%lx" below looks like a
// typo for "0x%08lx" -- confirm.
"k.n1=0x%16llx n0=0x%16llx "
//"addrhash=0x%llx "
"bigHash64=0x%016llx "
"docId=%llu "
"streetNumHash25=0x%08lx "
"cityHash=0x%016llx "
"cityId=0x08%lx "
"streetHash=0x%016llx "
"adm1Hash=0x%04lx "
"name1Hash=0x%016llx "
"name2Hash=0x%016llx "
k2->n1 , k2->n0 ,
snh ,
strh, // m_street->m_hash,
(long)*(uint16_t *)adm1Str,
// "yes" or a blank cell for each of the verified flags
// NOTE(review): where b1/b2/b3 get printed is not visible here
char *b1 = "&nbsp;";
char *b2 = "&nbsp;";
char *b3 = "&nbsp;";
if ( m_flags & AF_VERIFIED_STREET ) b1 = "yes";
if ( m_flags & AF_VERIFIED_STREET_NUM ) b2 = "yes";
if ( m_flags & AF_VERIFIED_PLACE_NAME ) b3 = "yes";
// wrap up the table row
pbuf->safePrintf ( "</tr>\n");
return 1;
// from here on: no pbuf, so build a text line in "sb" for the log
// NOTE(review): the calls that append each place's string are not
// visible here
if ( m_name1 ) {
if ( m_name2 && m_name2->m_str ) {
sb.safePrintf(" name2=");
if ( m_street ) {
sb.safePrintf(" street[%li]=",m_street->m_a);
//if ( m_zip ) {
// sb.safePrintf(" zip=");
// sb.safeMemcpy(m_zip->m_str,m_zip->m_strlen);
if ( m_suite ) {
sb.safePrintf(" suite=");
if ( m_city ) {
sb.safePrintf(" city[%li]=",m_city->m_a);
if ( m_adm1 ) {
sb.safePrintf(" adm1[%li]=",m_adm1->m_a);
if ( m_zip ) {
sb.safePrintf(" zip=");
//if ( m_adm2 && m_adm2->m_str ) {
// sb.safePrintf(" adm2=");
// sb.safeMemcpy(m_adm2->m_str,m_adm2->m_strlen);
//if ( m_ctry->m_str ) {
// sb.safePrintf(" country=");
// sb.safeMemcpy(m_ctry->m_str,m_ctry->m_strlen);
sb.safePrintf(" score2=%li",m_score2);
sb.safePrintf(" flags=");
// NOTE(review): presumably an "else" sat between these two prints
if ( (m_flags) & AF_INLINED )
sb.safePrintf("inlined ");
sb.safePrintf("notinlined ");
// means that we are inlined and the city FOLLOWS the state
//if ( (m_flags) & AF_BADORDER )
// sb.safePrintf("badorder ");
if ( (m_flags) & AF_AMBIGUOUS )
sb.safePrintf("ambig ");
if ( (m_flags) & AF_VERIFIED_STREET )
sb.safePrintf("verifiedstreet ");
if ( (m_flags) & AF_VERIFIED_STREET_NUM )
sb.safePrintf("verifiedstreetnum ");
if ( (m_flags) & AF_VERIFIED_PLACE_NAME_1 )
sb.safePrintf("verifiedplacename1 ");
if ( (m_flags) & AF_VERIFIED_PLACE_NAME_2 )
sb.safePrintf("verifiedplacename2 ");
if ( m_street && (m_street->m_flags2 & PLF2_INTERSECTION ))
sb.safePrintf("intersection ");
if ( m_street && (m_street->m_flags2 & PLF2_IS_NAME ))
sb.safePrintf("streetisname ");
if ( m_street && (m_street->m_flags2 & PLF2_AFTER_AT ))
sb.safePrintf("afterat ");
//sb.safePrintf(" a=%li b=%li",m_a,m_b);
// null term
sb.safeMemcpy ( "\0",1 );
//sb.safePrintf(" =");
//logf(LOG_DEBUG,"events: addr score=%.06f %s",
logf(LOG_DEBUG,"events: %s",
sb.getBufStart() );
return 1;
// . print the main fields of this address as html into "pbuf"
// . if "uh64" is non-zero an extra "validated" span with a checkbox and
//   a hidden div is printed as well
// . if "forEvents" is true, name1/name2 are blanked unless their
//   AF_VERIFIED_PLACE_NAME_1/2 flag is set, and if m_alias is set the
//   alias's places are used instead of ours
// . missing fields are printed as "&nbsp;"
// . NOTE(review): this copy of the function looks incomplete. closing
//   braces and most of the statements that actually print the prepared
//   strings seem to be missing. confirm against the original file.
void Address::printEssentials ( SafeBuf *pbuf , bool forEvents ,
long long uh64 ) {
pbuf->safePrintf ( "<td><nobr>");
// . this is for XmlDoc::validateOutput()
// . we use javascriptEncode() to convert &'s to &amp; since
// the javascript escape() function does that before
// converting into a url encoded character for some
// reason, which is very annoying!!!! maybe tagInner
// does that! yeah, probably, it returns normalized output
// as i've seen it reorganize the attributes of html tags.
if ( uh64 ) {
// NOTE(review): the call owning this format string and the argument
// for its %lli are not visible here
"<!--ignore-->" // ignore for Test.cpp diff
"<span class=validated>"
"<input type=checkbox "
"onclick=\"senddiv(this,'%lli');\" "
"unchecked> "
"<div class=validated style=\"display:none\">",
// this must be unsigned
//char *p = pbuf->getBuf();
// map utf8 characters into &#xxxx entites because
// the senddiv() function maps all utuf8 chars to
// crap like "%u2019" for the apostrophe for instance
// NOTE(review): the statement of each "if" below is not visible here
if ( m_name1 )
if ( m_name2 )
if ( m_suite )
if ( m_street )
// city must come from m_city, m_zip or lat/lon, otherwise we core
if ( m_city )
else if ( m_zip )
else if ( m_flags3 & AF2_LATLON );
else { char *xx=NULL;*xx=0; }
// now print adm1 abbr
char *as = NULL;
long aslen = 2;
// mdw mdw
if ( m_adm1 )
as = m_adm1->m_adm1;
else if ( m_zip )
as = m_zip->m_adm1;
//else if ( m_city && (m_city->m_adm1Bits & CF_UNIQUE) )
// as = m_city->m_adm1;
else if ( m_flags3 & AF2_LATLON );
else { char *xx=NULL;*xx=0; }
if ( as ) pbuf->javascriptEncode(as,aslen);
if ( m_zip )
//if ( m_ctry->m_str )
// pbuf->javascriptEncode(m_ctry->m_str,m_ctry->m_strlen);
// now we include lat and long, but only if we got both valid
// NOTE(review): m_latitude is compared against NO_LONGITUDE here,
// NO_LATITUDE looks like what was meant -- confirm
if ( m_longitude != NO_LONGITUDE &&
m_latitude != NO_LONGITUDE ) {
// now also check the lat/lon we import
if ( m_importedLatitude != NO_LATITUDE )
if ( m_importedLongitude != NO_LONGITUDE )
//char *pend = pbuf->getBuf();
pbuf->safePrintf ("\n</div>" );
pbuf->safePrintf ("</span>" );
// set these
// . each field below follows the same pattern: take the string from our
//   own place, override it from m_alias for events, and fall back to
//   "&nbsp;" if we have nothing
// . NOTE(review): the alias overrides dereference m_alias->m_name1,
//   m_alias->m_name2, etc. no NULL checks on those are visible here.
long nameLen1 = 0;
char *name1 = NULL;
if ( m_name1 ) {
name1 = m_name1->m_str;
nameLen1 = m_name1->m_strlen;
if ( forEvents && !(m_flags & AF_VERIFIED_PLACE_NAME_1) )
name1 = NULL;
if ( forEvents && m_alias ) {
name1 = m_alias->m_name1->m_str;
nameLen1 = m_alias->m_name1->m_strlen;
if ( ! name1 ) {
name1 = "&nbsp;";
nameLen1 = gbstrlen(name1);
if ( m_alias && forEvents ) {
pbuf->safePrintf("(alias = ");
// this will have STREET_IS_NAME set so use the street
// not name 1
pbuf->safePrintf(") ");
long nameLen2 = 0;
char *name2 = NULL;
if ( m_name2 ) {
nameLen2 = m_name2->m_strlen;
name2 = m_name2->m_str;
if ( forEvents && !(m_flags & AF_VERIFIED_PLACE_NAME_2) )
name2 = NULL;
if ( forEvents && m_alias ) {
name2 = m_alias->m_name2->m_str;
nameLen2 = m_alias->m_name2->m_strlen;
if ( ! name2 ) {
name2 = "&nbsp;";
nameLen2 = gbstrlen(name2);
long suiteLen = 0;
char *suite = NULL;
if ( m_suite ) {
suiteLen = m_suite->m_strlen;
suite = m_suite->m_str;
if ( forEvents && m_alias ) {
suite = m_alias->m_suite->m_str;
suiteLen = m_alias->m_suite->m_strlen;
if ( ! suite ) {
suite = "&nbsp;";
suiteLen = gbstrlen(suite);
long streetLen = 0;
char *street = NULL;
if ( m_street ) {
streetLen = m_street->m_strlen;
street = m_street->m_str;
if ( forEvents && m_alias ) {
street = m_alias->m_street->m_str;
streetLen = m_alias->m_street->m_strlen;
if ( ! street ) {
street = "&nbsp;";
streetLen = gbstrlen(street);
// print it right. niceness = 0
pbuf->htmlEncode ( street,streetLen, true,0);
long cityLen = 0;
char *city = NULL;
if ( m_city ) {
cityLen = m_city->m_strlen;
city = m_city->m_str;
if ( forEvents && m_alias ) {
city = m_alias->m_city->m_str;
cityLen = m_alias->m_city->m_strlen;
if ( ! city ) {
city = "&nbsp;";
cityLen = gbstrlen(city);
// adm1 is printed as its two letter code, not the full name
long adm1Len = 0;
char *adm1 = NULL;
if ( m_adm1 ) {
adm1Len = 2;//m_adm1->m_strlen;
adm1 = m_adm1->m_adm1;//str;
if ( forEvents && m_alias ) {
adm1 = m_alias->m_adm1->m_adm1;//str;
adm1Len = 2;//m_alias->m_adm1->m_strlen;
if ( ! adm1 ) {
adm1 = "&nbsp;";
adm1Len = gbstrlen(adm1);
long zipLen = 0;
char *zip = NULL;
if ( m_zip ) {
zipLen = m_zip->m_strlen;
zip = m_zip->m_str;
if ( forEvents && m_alias ) {
zip = m_alias->m_zip->m_str;
zipLen = m_alias->m_zip->m_strlen;
if ( ! zip ) {
zip = "&nbsp;";
zipLen = gbstrlen(zip);
// ctry is special
// NOTE(review): m_ctry is only referenced from commented-out code
// elsewhere in this file, and "&m_adm1" takes the address of a member
// that is used as a pointer everywhere else. this section may have
// been compiled out in the original -- confirm.
char *ctry = m_ctry->m_str;
if ( forEvents && m_alias ) ctry = m_alias->m_ctry->m_str;
if ( ! ctry ) {
Place *cp = &m_adm1;
char *cn = (char *)g_countryCode.getName(cp->m_crid-1);
if ( cn ) pbuf->safeMemcpy ( cn,gbstrlen(cn) );
else pbuf->safePrintf("unknown");
// three lat/lon pairs follow: geocoder, our own, then imported
// NOTE(review): the prints for the first two pairs are not visible here
double lat = m_latitude;
double lon = m_longitude;
// geocoder lat/long
lat = m_geocoderLat;
lon = m_geocoderLon;
if ( lat != NO_LATITUDE && lat != AMBIG_LATITUDE )
if ( lon != NO_LONGITUDE && lon != AMBIG_LONGITUDE )
// then lat/long
lat = m_latitude;
lon = m_longitude;
if ( lat != NO_LATITUDE && lat != AMBIG_LATITUDE )
if ( lon != NO_LONGITUDE && lon != AMBIG_LONGITUDE )
// IMPORTED lat/long
lat = m_importedLatitude;
lon = m_importedLongitude;
// imported values are printed with their vote count
if ( lat != NO_LATITUDE && lat != AMBIG_LATITUDE )
pbuf->safePrintf("%f (%li)",lat,m_importedVotes);
if ( lon != NO_LONGITUDE && lon != AMBIG_LONGITUDE )
pbuf->safePrintf("%f (%li)",lon,m_importedVotes);
// . debug dump of all the Places in "pm"
// . if "pbuf" is non-NULL an html table with one row per place is printed
//   into it, otherwise each place is sent to logf(LOG_DEBUG,...)
// . "sections" is used to look up the section containing each place's
//   first word (pi->m_a)
// . "base" is only referenced from commented-out code in the visible part
// . while a place is being printed its string is temporarily \0
//   terminated IN PLACE and restored afterwards, so the text the Places
//   point into must be writable
// . NOTE(review): this copy of the function looks incomplete. closing
//   braces, parts of the format strings and some arguments seem to be
//   missing. confirm against the original file.
void printPlaces ( PlaceMem *pm , SafeBuf *pbuf , Sections *sections,
Address *base ) {
// table header
if ( pbuf ) pbuf->safePrintf ( "<table cellpadding=3 border=1>"
"<td><b>simple place</b></td>"
"<td><b><nobr>place hash"
"<td><b><nobr>address ptr"
"<td><b><nobr>word a</nobr></b></td>"
"<td><b><nobr>word b</nobr></b></td>"
"<td><b><nobr>alnum word a</nobr>"
"<td><b><nobr>alnum word b</nobr>"
//"<td><b><nobr>section #</nobr></b></td>"
//"<td><b><nobr>parent section #</nobr>"
"<td><b><nobr>section tagHash</nobr>"
"</tr>\n" );
// just streets really, or fake streets
for ( long i = 0 ; i < pm->getNumPtrs() ; i++ ) { // np
Place *pi = (Place *)pm->getPtr(i);
// temporarily \0 terminate the place's string. "c" holds the byte
// we overwrite so it can be put back.
char *p = pi->m_str;
char *pend = p + pi->m_strlen;
char c = *pend;
*pend = 0;
long flags = pi->m_bits;
// "fbuf" collects the type and flag names of this place as text
// NOTE(review): the sprintf()s into fbuf are not bounds checked
char fbuf[1000];
char *f = fbuf;
// skip if filtered out from the city/adm1 loop above
if ( ! pi->m_type ) { *pend = c; continue; }
f += sprintf ( f , "type=" );
if ( pi->m_type == PT_SCH )
f += sprintf ( f , "school " );
if ( pi->m_type == PT_PRK )
f += sprintf ( f , "park " );
if ( pi->m_type == PT_CITY )
f += sprintf ( f , "city " );
if ( pi->m_type == PT_STATE )
f += sprintf ( f , "adm1 " );
if ( pi->m_type == PT_ADM2 )
f += sprintf ( f , "adm2 " );
if ( pi->m_type == PT_ADM3 )
f += sprintf ( f , "adm3 " );
if ( pi->m_type == PT_ADM4 )
f += sprintf ( f , "adm4 " );
if ( pi->m_type == PT_CTRY )
f += sprintf ( f , "ctry " );
if ( pi->m_type == PT_ZIP )
f += sprintf ( f , "zip " );
if ( pi->m_type == PT_SUITE )
f += sprintf ( f , "suite " );
if ( pi->m_type == PT_NAME_1 )
f += sprintf ( f , "name1 " );
if ( pi->m_type == PT_NAME_2 )
f += sprintf ( f , "name2 " );
// NOTE(review): as shown, city/state/name/suite/zip types are tested
// both above and in this else-if chain, and types only handled above
// (e.g. PT_SCH) would reach the final coring "else". presumably one of
// the two lists was compiled out in the original -- confirm.
if ( pi->m_type == PT_STREET )
f += sprintf ( f , "street " );
else if ( pi->m_type == PT_CITY )
f += sprintf ( f , "city " );
else if ( pi->m_type == PT_STATE )
f += sprintf ( f , "state " );
else if ( pi->m_type == PT_NAME_1 )
f += sprintf ( f , "name1 " );
else if ( pi->m_type == PT_NAME_2 )
f += sprintf ( f , "name2 " );
else if ( pi->m_type == PT_SUITE )
f += sprintf ( f , "suite " );
else if ( pi->m_type == PT_ZIP )
f += sprintf ( f , "zip " );
else if ( pi->m_type == PT_LATLON )
f += sprintf ( f , "latlon " );
else { char *xx=NULL;*xx=0; }
f += sprintf ( f , "flags=" );
// remember where the flag names start so we can tell if none printed
char *of = f;
//if ( flags & PLF_HAS_UPPER )
// f += sprintf ( f , "hasupper " );
//if ( flags & PLF_ALT )
// f += sprintf ( f , "alt " );
//if ( flags & PLF_IGNORE )
// f += sprintf ( f , "ignore " );
//if ( flags & PLF_PARTIAL )
// f += sprintf ( f , "partial " );
//if ( flags & PLF_AMBIGUOUS )
// f += sprintf ( f , "ambig " );
if ( pi->m_flags2 & PLF2_COLLISION )
f += sprintf(f,"streetcollision ");
if ( pi->m_flags2 & PLF2_REQUIRED )
f += sprintf(f,"requiredplace ");
if ( pi->m_flags2 & PLF2_TICKET_PLACE )
f += sprintf(f,"ticketplace ");
if ( pi->m_flags2 & PLF2_INTERSECTION )
f += sprintf(f,"intersection ");
if ( pi->m_flags2 & PLF2_IS_NAME )
f += sprintf(f,"streetisname ");
if ( pi->m_flags2 & PLF2_AFTER_AT )
f += sprintf(f,"afterat ");
if ( pi->m_flags2 & PLF2_IS_POBOX )
f += sprintf(f,"ispobox ");
if ( pi->m_address )
f += sprintf(f,"inaddress ");
if ( pi->m_unverifiedAddress )
f += sprintf(f,"inunverifiedaddress ");
// NOTE(review): the argument for the %li below is not visible here
if ( pi->m_alias )
f += sprintf(f,"alias[a=%li] ",
if ( flags & PLF_INFILE )
f += sprintf ( f , "infile " );
//if ( flags & PLF_INHERITED )
// f += sprintf ( f , "inherited " );
//if ( flags & PLF_FROMZIP )
// f += sprintf ( f , "fromzip ");
if ( flags & PLF_FROMTAG )
f += sprintf ( f , "fromtag " );
if ( flags & PLF_FROMTITLE )
f += sprintf ( f , "fromtitle " );
if ( flags & PLF_ABBR )
f += sprintf ( f , "abbr " );
//if ( f == of ) *f++ = ' ';
//else f[-1] = ' ';
// no flag printed? then use a blank so the cell is not empty
if ( f == of )
f += sprintf(f,"&nbsp;");
if ( flags & IND_NAME )
f += sprintf ( f , "ind_name " );
if ( flags & IND_SUITE )
f += sprintf ( f , "ind_suite " );
if ( flags & IND_STREET )
f += sprintf ( f , "ind_street " );
if ( flags & IND_DIR )
f += sprintf ( f , "ind_dir " );
//if ( flags & IND_BITS )
// f += sprintf ( f , "ind_bits " );
// add state
//if ( pi->m_adm1[0] && pi->m_adm1[1] )
// f += sprintf(f,"adm1=%c%c ",
// pi->m_adm1[0],pi->m_adm1[1]);
// add country
//if ( pi->m_crid )
// f += sprintf(f,"ctry=%s ",
// g_countryCode.getName(pi->m_crid-1) );
*f = '\0';
// shortcut
Section **sp = sections->m_sectionPtrs;
// get section
// the section of the place's first word, if it has a word position
Section *sn = NULL;
if ( pi->m_a >= 0 ) sn = sp [ pi->m_a ];
long depth = -1;
if ( sn ) depth = sn->m_depth;
// sectio number
long secNum = -1;
long parentSecNum = -1;
if ( sn ) secNum = (long)(sn - sp[0]);
Section *parent = NULL;
if ( sn ) parent = sn->m_parent;
if ( parent ) parentSecNum = (long)(parent - sp[0]);
long secHash = 0;
if ( sn ) secHash = sn->m_turkTagHash32;
// NOTE(review): the next line is not code. it looks like a leftover
// date stamp from wherever this copy of the file was taken -- remove.
2013-08-03 00:12:24 +04:00
// print the address we are in or the address we alias
// the alias wins if both are set
Address *myaddr = NULL;
if ( pi->m_address ) myaddr = pi->m_address;
if ( pi->m_alias ) myaddr = pi->m_alias;
// make it relative so qa test run diff is ok
// MDW: might need to store the off in m_addressOff/m_aliasOff
// or something.. keep an eye on this
long myoff = i;//myaddr - base;
if ( myaddr == NULL ) myoff = -1;
// sanity check
// no, we now allow a full address like
// "14th and curtis, denver co" to be an alias to a non
// intersection address "1000 14th street, denver co"
// as in
//if ( pi->m_address && pi->m_alias ) {char *xx=NULL;*xx=0;}
if ( pbuf ) {
pbuf->safePrintf ( "<tr>"
"<td><nobr>" );
// print it right. niceness = 0
pbuf->htmlEncode ( p , gbstrlen(p) , true,0);
// NOTE(review): most of this format string and some of its arguments
// are not visible here
pbuf->safePrintf ("</nobr></td>"
"</tr>\n" ,
fbuf ,
pi->m_hash ,//m_hash
(long)pi->m_a ,
(long)pi->m_b ,
(long)pi->m_alnumA ,
(long)pi->m_alnumB ,
//(long)depth ,
// NOTE(review): same for this log call, presumably the no-pbuf path
logf(LOG_DEBUG,"events: place #%li \"%s\" "
"flags=%s alnuma=%li alnumb=%li "
pi->m_alnumA ,
// put char back
*pend = c;
// sanity
if ( ! ( pi->m_type ) ) { char *xx=NULL;*xx=0; }
if ( pbuf ) pbuf->safePrintf ( "</table><br>\n" );
// THINK ABOUT: discard phrases with number at end, no "suite" indicator, and
// has US as the country (do this last)
// "... be eligible to play. AYSO Region 1447 offers a fun..."
// "Sunday 9 . 6, Tuesday 10 - 4; " --> no street called "Tuesday"!
// . one row of the s_cityList[] city alias table
// . members are public and in the order used by the brace initializers
//   in s_cityList[]
class AliasDesc {
 public:
	// the alias, like "sf"
	char *m_s1;
	// the city name the alias stands for, like "san francisco"
	char *m_s2;
	// two letter abbr of the state that city is in
	char *m_adm1;
	// two letter abbr of the state an alias should map to by default
	// when several rows share the same alias
	char *m_mostPopStateAbbr;
	// these are relative to the aliases as far as computing the best/
	// default state that contains it. right now we just set santa fe
	// down to 99 so that "sf" maps to "san francisco" by default.
	long m_pop;
};
// . hard-coded city aliases (abbreviations, airport codes, short forms)
// . each row is: alias, city name, state abbr of that city, default
//   state abbr for the alias, relative popularity
// . an alias may appear more than once ("sf"); the row with the larger
//   popularity is meant to be the default
static AliasDesc s_cityList[] = {
	{"ny","new york city","ny","ny",1000}
	,{"nyc","new york city","ny","ny",1000}
	,{"n y c","new york city","ny","ny",1000}
	,{"la","los angeles","ca","ca",1000}
	,{"lax","los angeles","ca","ca",1000}
	,{"sa","san antonio","tx","tx",1000}
	,{"sd","san diego","ca","ca",1000}
	,{"sj","san jose","ca","ca",1000}
	,{"sf","san francisco","ca","ca",1000}
	,{"san fran","san francisco","ca","ca",1000}
	,{"sf","santa fe","nm","ca",99}
	,{"fw","fort worth","tx","tx",1000}
	,{"ft worth","fort worth","tx","tx",1000}
	,{"ept","El Paso","tx","tx",1000}
	,{"elp","El Paso","tx","tx",1000} // airport
	,{"bos","Boston","ma","ma",1000} // airport
	,{"lv","Las Vegas","nv","nv",1000} // postal
	,{"las","Las Vegas","nv","nv",1000} // airport
	,{"okc","Oklahoma City","ok","ok",1000}
	,{"lbc","Long Beach","ca","ca",1000}
	,{"lb","Long Beach","ca","ca",1000}
	,{"smf","Sacramento","ca","ca",1000} // airport
	,{"kc","Kansas City","ks","ks",1000}
	,{"vab","Virginia Beach","va","va",1000}
	,{"stl","Saint Louis","mo","mo",1000}
	,{"st louis","saint louis","mo","mo",1000}
	,{"sna","Santa Ana","ca","ca",1000}
	,{"aoc","Anaheim","ca","ca",1000} // anaheim orange county
	,{"cctx","Corpus Christi","tx","tx",1000}
	,{"cor chr","Corpus Christi","tx","tx",1000}
	,{"stpaul","Saint Paul","mn","mn",1000}
	,{"st paul","Saint Paul","mn","mn",1000}
	,{"fwa","Fort Wayne","in","in",1000} // airport
	//,{"ftw","Fort Wayne","","",1000}
	,{"ft wayne","Fort Wayne","in","in",1000} // airport
	,{"st petersburg","saint petersburg","fl","fl",1000}
	,{"jc","Jersey City","nj","nj",1000}
	,{"br","Baton Rouge","la","la",1000}
	,{"b ham","Birmingham","al","al",1000}
	,{"no","New Orleans","la","la",1000}
	,{"north hempstead","Town of North Hempstead","ny","ny",1000}
	,{"n hempstead","Town of North Hempstead","ny","ny",1000}
	,{"n hemp","Town of North Hempstead","ny","ny",1000}
	,{"north hemp","Town of North Hempstead","ny","ny",1000}
	,{"cv","Chula Vista","ca","ca",1000}
};
// . record that the city whose name hashes to "ch64" exists in state "adm1"
// . "pop" is compared against the value stored under ch64 in "maxPops";
//   if an entry exists and "pop" is larger, the entry is raised to "pop"
// . if g_cities already has ch64, this state's bits are OR'ed into its
//   m_adm1Bits, and m_mostPopularState is switched to this state when
//   "pop" beats the previously recorded value (-1 if none was recorded)
// . otherwise a new CityDesc is added to g_cities under ch64
// . every return statement visible here returns true
// . an adm1 that maps to zero bits triggers a deliberate NULL-write crash
// . NOTE(review): the braces do not balance in this copy of the function
//   (the "if ( cdp && pop > lastPop ) {" block and the function body are
//   never closed) -- confirm the structure against the full file
bool addCity ( uint64_t ch64 ,
char *adm1 ,
long pop ,
HashTableX *maxPops ) {
// see if this city name hash is already in the g_cities table
CityDesc *cdp = (CityDesc *)g_cities.getValue(&ch64);
// if contending with another state that has this
// same city name, check his city pop
// get the max population recorded so far under this city name hash
long *v=(long *)maxPops->getValue(&ch64);
// save it into "lastPop" in case *v changes (it may be overwritten below)
long lastPop = -1;
if ( v ) lastPop = *v;
// update the recorded max with ours if bigger
if ( v && pop > *v ) *v = pop;
// map the state abbreviation to its bit(s); zero means unknown state
uint64_t adm1Bits = getAdm1Bits ( adm1 );
// sanity check: crash on purpose if the state is not recognized
if ( ! adm1Bits ) { char *xx=NULL;*xx=0; }
// if there, or it in
if ( cdp ) cdp->m_adm1Bits |= adm1Bits;
//if ( ch64==2443313629685134902LL && adm1Bits==2147483648 ) {
//	log("hey");
// get our state
// NOTE(review): sd is not checked for NULL before the pointer arithmetic
// below; presumably a non-zero adm1Bits implies getStateDesc() succeeds
// -- confirm
StateDesc *sd = getStateDesc ( adm1 );
// get our state index (offset of sd within the s_states array)
long stateIndex = sd - s_states;
// update most popular state index? only when the city already exists
// and our pop exceeds the value saved in lastPop
if ( cdp && pop > lastPop ) {
// change it to our state
cdp->m_mostPopularState = stateIndex;
return true;
// already there? then skip
if ( cdp ) return true;
// otherwise, add the pop for the first time
// NOTE(review): no statement writing to maxPops is visible under the
// comment above in this copy -- confirm whether a line is missing
// now this is CityDesc
CityDesc cd;
cd.m_adm1Bits = adm1Bits;
cd.m_mostPopularState = stateIndex;
// otherwise, just add it (the addKey() return value is not checked)
g_cities.addKey ( &ch64 , &cd ) ; // adm1Bits );
return true;
// . ch64 is the 64bit hash of the original city name
// . "alias" is the alias name of the city
// . adm1Str is the state it is in
// . "pop" and "maxPops" are only passed through to addCity()
// . maps hash(alias,state) to the cityId32 of the original city in
//   g_aliases, then also registers the alias itself via addCity()
// . every return statement visible here returns true; failed sanity
//   checks crash on purpose with a NULL write
// . NOTE(review): braces do not balance in this copy of the function
//   (the "if ( test && ... ) {" block and the function body are never
//   closed) -- confirm the structure against the full file
bool addAlias ( char *alias ,
char *adm1Str ,
uint64_t ch64 ,
long pop ,
HashTableX *maxPops ) {
// sanity check: the first two chars of the state code must not be
// upper case
if ( is_upper_a(adm1Str[0]) ) { char *xx=NULL;*xx=0; }
if ( is_upper_a(adm1Str[1]) ) { char *xx=NULL;*xx=0; }
// get "hash" of state: the first two bytes of adm1Str read as a uint16
uint32_t adm1Hash32 = (uint32_t)(*(uint16_t *)adm1Str);
// get hash of city name alias
uint64_t ah = getWordXorHash ( alias );
// nothing? a zero hash means there is nothing to add
if ( ! ah ) return true;
// debug point
// NOTE(review): the statement this "if" is meant to guard is not visible
// in this copy; as written the condition would attach to the adm1Bits
// declaration below -- confirm against the full file
if ( !strcmp(alias,"sf") )
// get the bits
uint64_t adm1Bits = getAdm1Bits ( adm1Str );
// if already in g_cities for this state, do not add as alias!
CityDesc *test = (CityDesc *) g_cities.getValue(&ah);
if ( test && (test->m_adm1Bits & adm1Bits ) ) {
// no! strange... how is this happening...
return true;
// hash city name alias and adm1 together (only the low 32 bits of the
// alias hash are used)
uint32_t aliasStateHash = hash32h ( (uint32_t)ah , adm1Hash32 );
// now that maps to the proper cityId32
uint32_t cid32 = getCityId32 ( ch64 , adm1Str ) ;
// must be a proper city name: the original city has to be in g_cities
CityDesc *cd = (CityDesc *)g_cities.getValue(&ch64);
if ( ! cd ) { char *xx=NULL;*xx=0; }
// make sure the city we are an alias for is in our state!
if ( !(cd->m_adm1Bits & adm1Bits) ) { char *xx=NULL;*xx=0; }
// add to alias table; crash if the add fails
if (!g_aliases.addKey (&aliasStateHash,&cid32)){char*xx=NULL;*xx=0;}
// sanity check -- verify the cityId works out (must be in g_timeZones)
if ( ! g_timeZones.isInTable(&cid32) ) { char *xx=NULL;*xx=0;}
// then add to city table under the alias hash (return value ignored)
addCity ( ah , adm1Str , pop , maxPops );
return true;
bool initPlaceDescTable ( ) {
// sanity check
if ( s_init ) { char *xx=NULL;*xx=0; }
// bail if not indexing events
//if ( ! g_conf.m_indexEventsOnly ) return true;
return true;
// . make this table
// . has words that can be lower case in a place name
//s_lc.set ( 8 , 0 , 0 , s_lcbuf , 2000 , false , 0 ,"plnametbl");
// stock the table (StopWords.cpp function)
if ( ! initWordTable ( &s_lc , s_lcWords , sizeof(s_lcWords),
char *xx=NULL;*xx=0; }
// we are init now
s_init = true;
// init indicator table
g_indicators.set ( 6 , // keySize
sizeof(IndDesc) , // dataSize
0 , // initial # slots
NULL , // initial buf
0 , // initial buf size
false , // allowDup keys?
0 , // niceness
"indictbl" );
// load indicator table
//bool loadedIndicators = false;
if ( g_indicators.load ( g_hostdb.m_dir , "indicators.dat" ) ) {
loadedIndicators = true;
long long h = hash64 ( "highway" , 7 );
// test the indicators
if ( g_indicators.getSlot ( &h ) < 0 ){char *xx=NULL;*xx=0; }
// test the indicators
h = hash64Lower_a ( "N" , 1 );
if ( g_indicators.getSlot ( &h ) < 0 ){char *xx=NULL;*xx=0; }
// fix it
//loadedIndicators = true;
// keep these separate so we do not have to recompute any time we
// add or subtract to/from this list
addIndicator ( "airport" , IND_NAME , 1.0 );
addIndicator ( "airstrip" , IND_NAME , 1.0 );
addIndicator ( "area" , IND_NAME , 1.0 );
addIndicator ( "arena" , IND_NAME , 1.0 );
addIndicator ( "arroyo" , IND_NAME , 1.0 );
addIndicator ( "bank" , IND_NAME , 1.0 );
addIndicator ( "banks" , IND_NAME , 1.0 );
addIndicator ( "bar" , IND_NAME , 1.0 );
addIndicator ( "pub" , IND_NAME , 1.0 );
addIndicator ( "brewpub" , IND_NAME , 1.0 );
addIndicator ( "atrium" , IND_NAME , 1.0 );
addIndicator ( "base" , IND_NAME , 1.0 );
addIndicator ( "basin" , IND_NAME , 1.0 );
addIndicator ( "bay" , IND_NAME , 1.0 );
addIndicator ( "beach" , IND_NAME , 1.0 );
addIndicator ( "bluff" , IND_NAME , 1.0 );
addIndicator ( "bog" , IND_NAME , 1.0 );
addIndicator ( "boundary" , IND_NAME , 1.0 );
addIndicator ( "branch" , IND_NAME , 1.0 );
addIndicator ( "bridge" , IND_NAME , 1.0 );
addIndicator ( "brook" , IND_NAME , 1.0 );
addIndicator ( "building" , IND_NAME , 1.0 );
addIndicator ( "bunker" , IND_NAME , 1.0 );
addIndicator ( "burro" , IND_NAME , 1.0 );
addIndicator ( "butte" , IND_NAME , 1.0 );
addIndicator ( "cabin" , IND_NAME , 1.0 );
addIndicator ( "camp" , IND_NAME , 1.0 );
addIndicator ( "campground" , IND_NAME , 1.0 );
addIndicator ( "campgrounds" , IND_NAME , 1.0 );
addIndicator ( "campus" , IND_NAME , 1.0 );
addIndicator ( "canal" , IND_NAME , 1.0 );
addIndicator ( "canyon" , IND_NAME , 1.0 );
addIndicator ( "casa" , IND_NAME , 1.0 );
addIndicator ( "castle" , IND_NAME , 1.0 );
addIndicator ( "cathedral" , IND_NAME , 1.0 );
addIndicator ( "cave" , IND_NAME , 1.0 );
addIndicator ( "cemetery" , IND_NAME , 1.0 );
addIndicator ( "center" , IND_NAME , 1.0 );
addIndicator ( "centre" , IND_NAME , 1.0 );
// "channel 13 news"?
//addIndicator ( "channel" , IND_NAME , 1.0 );
addIndicator ( "chapel" , IND_NAME , 1.0 );
addIndicator ( "church" , IND_NAME , 1.0 );
// "bible study circle"
//addIndicator ( "circle" , IND_NAME , 1.0 );
addIndicator ( "cliffs" , IND_NAME , 1.0 );
addIndicator ( "clinic" , IND_NAME , 1.0 );
addIndicator ( "college" , IND_NAME , 1.0 );
addIndicator ( "company" , IND_NAME , 1.0 );
addIndicator ( "complex" , IND_NAME , 1.0 );
addIndicator ( "corner" , IND_NAME , 1.0 );
addIndicator ( "cottage" , IND_NAME , 1.0 );
addIndicator ( "course" , IND_NAME , 1.0 ); // golf
addIndicator ( "courthouse" , IND_NAME , 1.0 );
addIndicator ( "courtyard" , IND_NAME , 1.0 );
addIndicator ( "cove" , IND_NAME , 1.0 );
addIndicator ( "creek" , IND_NAME , 1.0 );
addIndicator ( "dam" , IND_NAME , 1.0 );
addIndicator ( "den" , IND_NAME , 1.0 );
addIndicator ( "department" , IND_NAME , 1.0 );
addIndicator ( "depot" , IND_NAME , 1.0 );
addIndicator ( "dome" , IND_NAME , 1.0 );
addIndicator ( "downs" , IND_NAME , 1.0 );
addIndicator ( "fair" , IND_NAME , 1.0 );
addIndicator ( "fairgrounds" , IND_NAME , 1.0 );
addIndicator ( "fairground" , IND_NAME , 1.0 );
addIndicator ( "falls" , IND_NAME , 1.0 );
addIndicator ( "farm" , IND_NAME , 1.0 );
addIndicator ( "farms" , IND_NAME , 1.0 );
addIndicator ( "field" , IND_NAME , 1.0 );
addIndicator ( "fields" , IND_NAME , 1.0 );
addIndicator ( "flat" , IND_NAME , 1.0 );
addIndicator ( "flats" , IND_NAME , 1.0 );
addIndicator ( "forest" , IND_NAME , 1.0 );
addIndicator ( "fort" , IND_NAME , 1.0 );
addIndicator ( "fountain" , IND_NAME , 1.0 );
addIndicator ( "garden" , IND_NAME , 1.0 );
addIndicator ( "gardens" , IND_NAME , 1.0 );
addIndicator ( "gate" , IND_NAME , 1.0 );
addIndicator ( "glacier" , IND_NAME , 1.0 );
addIndicator ( "graveyard" , IND_NAME , 1.0 );
addIndicator ( "gulch" , IND_NAME , 1.0 );
addIndicator ( "gully" , IND_NAME , 1.0 );
addIndicator ( "hacienda" , IND_NAME , 1.0 );
addIndicator ( "hall" , IND_NAME , 1.0 );
addIndicator ( "halls" , IND_NAME , 1.0 );
addIndicator ( "harbor" , IND_NAME , 1.0 );
addIndicator ( "harbour" , IND_NAME , 1.0 );
addIndicator ( "hatchery" , IND_NAME , 1.0 );
addIndicator ( "headquarters" , IND_NAME , 1.0 );
addIndicator ( "heights" , IND_NAME , 1.0 );
addIndicator ( "heliport" , IND_NAME , 1.0 );
addIndicator ( "hill" , IND_NAME , 1.0 );
addIndicator ( "hillside" , IND_NAME , 1.0 );
addIndicator ( "hilton" , IND_NAME , 1.0 );
addIndicator ( "historical" , IND_NAME , 1.0 );
addIndicator ( "historic" , IND_NAME , 1.0 );
addIndicator ( "holy" , IND_NAME , 1.0 );
addIndicator ( "home" , IND_NAME , 1.0 );
addIndicator ( "homestead" , IND_NAME , 1.0 );
addIndicator ( "horn" , IND_NAME , 1.0 );
addIndicator ( "hospital" , IND_NAME , 1.0 );
addIndicator ( "hotel" , IND_NAME , 1.0 );
addIndicator ( "house" , IND_NAME , 1.0 );
addIndicator ( "howard" , IND_NAME , 1.0 ); // johnson's
addIndicator ( "inlet" , IND_NAME , 1.0 );
addIndicator ( "inn" , IND_NAME , 1.0 );
addIndicator ( "institute" , IND_NAME , 1.0 );
addIndicator ( "international" , IND_NAME , 1.0 );
addIndicator ( "isla" , IND_NAME , 1.0 );
addIndicator ( "island" , IND_NAME , 1.0 );
addIndicator ( "isle" , IND_NAME , 1.0 );
addIndicator ( "islet" , IND_NAME , 1.0 );
addIndicator ( "junction" , IND_NAME , 1.0 );
addIndicator ( "knoll" , IND_NAME , 1.0 );
addIndicator ( "lagoon" , IND_NAME , 1.0 );
addIndicator ( "laguna" , IND_NAME , 1.0 );
addIndicator ( "lake" , IND_NAME , 1.0 );
addIndicator ( "landing" , IND_NAME , 1.0 );
addIndicator ( "ledge" , IND_NAME , 1.0 );
addIndicator ( "lighthouse" , IND_NAME , 1.0 );
addIndicator ( "lodge" , IND_NAME , 1.0 );
addIndicator ( "lookout" , IND_NAME , 1.0 );
addIndicator ( "mall" , IND_NAME , 1.0 ); // added
addIndicator ( "manor" , IND_NAME , 1.0 );
addIndicator ( "marina" , IND_NAME , 1.0 );
addIndicator ( "meadow" , IND_NAME , 1.0 );
addIndicator ( "mine" , IND_NAME , 1.0 );
addIndicator ( "mines" , IND_NAME , 1.0 );
addIndicator ( "monument" , IND_NAME , 1.0 );
addIndicator ( "motel" , IND_NAME , 1.0 );
addIndicator ( "museum" , IND_NAME , 1.0 );
addIndicator ( "office" , IND_NAME , 1.0 );
addIndicator ( "outlet" , IND_NAME , 1.0 );
addIndicator ( "palace" , IND_NAME , 1.0 );
addIndicator ( "park" , IND_NAME , 1.0 );
addIndicator ( "peaks" , IND_NAME , 1.0 );
addIndicator ( "peninsula" , IND_NAME , 1.0 );
addIndicator ( "pit" , IND_NAME , 1.0 );
addIndicator ( "place" , IND_STREET , 1.0 ); // leroy place
addIndicator ( "pl" , IND_STREET , 1.0 ); // place
addIndicator ( "plains" , IND_NAME , 1.0 );
addIndicator ( "plant" , IND_NAME , 1.0 );
addIndicator ( "plantation" , IND_NAME , 1.0 );
addIndicator ( "plateau" , IND_NAME , 1.0 );
addIndicator ( "playa" , IND_NAME , 1.0 );
addIndicator ( "plaza" , IND_NAME , 1.0 );
addIndicator ( "point" , IND_NAME , 1.0 );
addIndicator ( "pointe" , IND_NAME , 1.0 );
addIndicator ( "pond" , IND_NAME , 1.0 );
addIndicator ( "port" , IND_NAME , 1.0 );
addIndicator ( "ramada" , IND_NAME , 1.0 );
addIndicator ( "ranch" , IND_NAME , 1.0 );
addIndicator ( "rancho" , IND_NAME , 1.0 );
addIndicator ( "range" , IND_NAME , 1.0 );
addIndicator ( "reef" , IND_NAME , 1.0 );
addIndicator ( "refure" , IND_NAME , 1.0 );
addIndicator ( "reserve" , IND_NAME , 1.0 );
addIndicator ( "reservoir" , IND_NAME , 1.0 );
addIndicator ( "residence" , IND_NAME , 1.0 );
addIndicator ( "resort" , IND_NAME , 1.0 );
//addIndicator ( "rio" , IND_NAME , 1.0 );
//addIndicator ( "river" , IND_NAME , 1.0 );
//addIndicator ( "riverside" , IND_NAME , 1.0 );
//addIndicator ( "riverview" , IND_NAME , 1.0 );
// was getting "rock bands"
//addIndicator ( "rock" , IND_NAME , 1.0 );
addIndicator ( "sands" , IND_NAME , 1.0 ); // added
addIndicator ( "sawmill" , IND_NAME , 1.0 );
addIndicator ( "school" , IND_NAME , 1.0 );
// try to fix hadcolon algo for
// The+Webb+Schools:+Calendars+...
addIndicator ( "schools" , IND_NAME , 1.0 );
addIndicator ( "schoolhouse" , IND_NAME , 1.0 );
addIndicator ( "shore" , IND_NAME , 1.0 );
addIndicator ( "spa" , IND_NAME , 1.0 );
addIndicator ( "spring" , IND_NAME , 1.0 );
addIndicator ( "springs" , IND_NAME , 1.0 );
addIndicator ( "stadium" , IND_NAME , 1.0 );
addIndicator ( "station" , IND_NAME , 1.0 );
addIndicator ( "strip" , IND_NAME , 1.0 );
addIndicator ( "suites" , IND_NAME , 1.0 );
addIndicator ( "temple" , IND_NAME , 1.0 );
addIndicator ( "terrace" , IND_NAME , 1.0 );
addIndicator ( "tower" , IND_NAME , 1.0 );
//addIndicator ( "trail" , IND_NAME , 1.0 );
addIndicator ( "travelodge" , IND_NAME , 1.0 );
addIndicator ( "triangle" , IND_NAME , 1.0 );
addIndicator ( "tunnel" , IND_NAME , 1.0 );
addIndicator ( "university" , IND_NAME , 1.0 );
//addIndicator ( "valley" , IND_NAME , 1.0 );
addIndicator ( "wall" , IND_NAME , 1.0 );
addIndicator ( "ward" , IND_NAME , 1.0 );
addIndicator ( "waterhole" , IND_NAME , 1.0 );
addIndicator ( "waters" , IND_NAME , 1.0 );
addIndicator ( "well" , IND_NAME , 1.0 );
addIndicator ( "wells" , IND_NAME , 1.0 );
addIndicator ( "wilderness" , IND_NAME , 1.0 );
addIndicator ( "windmill" , IND_NAME , 1.0 );
addIndicator ( "woodland" , IND_NAME , 1.0 );
addIndicator ( "woods" , IND_NAME , 1.0 );
// good stuff i added
// some from
addIndicator ( "gallery" , IND_NAME , 1.0 );
addIndicator ( "theater" , IND_NAME , 1.0 );
addIndicator ( "theatre" , IND_NAME , 1.0 );
addIndicator ( "playhouse" , IND_NAME , 1.0 );
addIndicator ( "saloon" , IND_NAME , 1.0 );
addIndicator ( "nightclub" , IND_NAME , 1.0 );
addIndicator ( "lounge" , IND_NAME , 1.0 );
addIndicator ( "ultralounge" , IND_NAME , 1.0 );
addIndicator ( "brewery" , IND_NAME , 1.0 );
addIndicator ( "chophouse" , IND_NAME , 1.0 );
addIndicator ( "tavern" , IND_NAME , 1.0 );
addIndicator ( "company" , IND_NAME , 1.0 );
addIndicator ( "rotisserie" , IND_NAME , 1.0 );
addIndicator ( "bistro" , IND_NAME , 1.0 );
addIndicator ( "parlor" , IND_NAME , 1.0 );
addIndicator ( "studio" , IND_NAME , 1.0 );
addIndicator ( "studios" , IND_NAME , 1.0 );
// albuquerque publishing co., short for "company"
addIndicator ( "co" , IND_NAME , 0.9 );
addIndicator ( "bureau" , IND_NAME , 1.0 );
addIndicator ( "estates" , IND_NAME , 1.0 );
addIndicator ( "dockyard" , IND_NAME , 1.0 );
addIndicator ( "gym" , IND_NAME , 1.0 );
addIndicator ( "synagogue" , IND_NAME , 1.0 );
addIndicator ( "shrine" , IND_NAME , 1.0 );
addIndicator ( "mosque" , IND_NAME , 1.0 );
addIndicator ( "store" , IND_NAME , 1.0 );
addIndicator ( "mercantile" , IND_NAME , 1.0 );
addIndicator ( "mart" , IND_NAME , 1.0 );
addIndicator ( "amphitheatre" , IND_NAME , 1.0 );
addIndicator ( "kitchen" , IND_NAME , 1.0 );
addIndicator ( "casino" , IND_NAME , 1.0 );
addIndicator ( "diner" , IND_NAME , 1.0 );
addIndicator ( "eatery" , IND_NAME , 1.0 );
addIndicator ( "shop" , IND_NAME , 1.0 );
addIndicator ( "inc" , IND_NAME , 1.0 ); // incorporated
addIndicator ( "incorporated" , IND_NAME , 1.0 );
addIndicator ( "corporation" , IND_NAME , 1.0 );
addIndicator ( "limited" , IND_NAME , 1.0 );
addIndicator ( "llc" , IND_NAME , 1.0 );
addIndicator ( "foundation" , IND_NAME , 1.0 );
addIndicator ( "warehouse" , IND_NAME , 1.0 );
addIndicator ( "roadhouse" , IND_NAME , 1.0 );
addIndicator ( "foods" , IND_NAME , 1.0 );
addIndicator ( "cantina" , IND_NAME , 1.0 );
addIndicator ( "steakhouse" , IND_NAME , 1.0 );
addIndicator ( "smokehouse" , IND_NAME , 1.0 );
addIndicator ( "deli" , IND_NAME , 1.0 );
addIndicator ( "enterprises" , IND_NAME , 1.0 );
addIndicator ( "repair" , IND_NAME , 1.0 );
addIndicator ( "service" , IND_NAME , 1.0 );
addIndicator ( "services" , IND_NAME , 1.0 );
addIndicator ( "systems" , IND_NAME , 1.0 );
addIndicator ( "salon" , IND_NAME , 1.0 );
addIndicator ( "boutique" , IND_NAME , 1.0 );
addIndicator ( "preschool" , IND_NAME , 1.0 );
addIndicator ( "galleries" , IND_NAME , 1.0 );
addIndicator ( "bakery" , IND_NAME , 1.0 );
addIndicator ( "factory" , IND_NAME , 1.0 );
addIndicator ( "llp" , IND_NAME , 1.0 );
addIndicator ( "attorney" , IND_NAME , 1.0 );
addIndicator ( "association" , IND_NAME , 1.0 );
addIndicator ( "solutions" , IND_NAME , 1.0 );
addIndicator ( "facility" , IND_NAME , 1.0 );
addIndicator ( "cannery" , IND_NAME , 1.0 );
addIndicator ( "mill" , IND_NAME , 1.0 );
addIndicator ( "quarry" , IND_NAME , 1.0 );
addIndicator ( "monastery" , IND_NAME , 1.0 );
addIndicator ( "observatory" , IND_NAME , 1.0 );
addIndicator ( "nursery" , IND_NAME , 1.0 );
addIndicator ( "pagoda" , IND_NAME , 1.0 );
addIndicator ( "pier" , IND_NAME , 1.0 );
addIndicator ( "prison" , IND_NAME , 1.0 );
addIndicator ( "post" , IND_NAME , 1.0 );
addIndicator ( "ruin" , IND_NAME , 1.0 );
addIndicator ( "ruins" , IND_NAME , 1.0 );
addIndicator ( "storehouse" , IND_NAME , 1.0 );
addIndicator ( "square" , IND_NAME , 1.0 );
addIndicator ( "tomb" , IND_NAME , 1.0 );
addIndicator ( "wharf" , IND_NAME , 1.0 );
addIndicator ( "zoo" , IND_NAME , 1.0 );
addIndicator ( "mesa" , IND_NAME , 1.0 );
addIndicator ( "pass" , IND_NAME , 1.0 );
addIndicator ( "passage" , IND_NAME , 1.0 );
addIndicator ( "peak" , IND_NAME , 1.0 );
addIndicator ( "vineyard" , IND_NAME , 1.0 );
addIndicator ( "grove" , IND_NAME , 1.0 );
//addIndicator ( "" , IND_NAME , 1.0 );
// maple street dance space
addIndicator ( "space" , IND_NAME , 1.0 );
addIndicator ( "library" , IND_NAME , 1.0 );
addIndicator ( "school" , IND_NAME , 1.0 );
addIndicator ( "church" , IND_NAME , 1.0 );
addIndicator ( "park" , IND_NAME , 1.0 );
addIndicator ( "house" , IND_NAME , 1.0 );
// markets are sometimes more of events than place names
addIndicator ( "market" , IND_NAME , 0.5 );
addIndicator ( "marketplace" , IND_NAME , 0.75 );
addIndicator ( "university" , IND_NAME , 1.0 );
addIndicator ( "center" , IND_NAME , 1.0 );
addIndicator ( "restaurant" , IND_NAME , 1.0 );
//addIndicator ( "bar" , IND_NAME , 1.0 );
addIndicator ( "grill" , IND_NAME , 1.0 );
addIndicator ( "grille" , IND_NAME , 1.0 );
addIndicator ( "cafe" , IND_NAME , 1.0 );
addIndicator ( "cabana" , IND_NAME , 1.0 );
addIndicator ( "shack" , IND_NAME , 1.0 );
addIndicator ( "shoppe" , IND_NAME , 1.0 );
addIndicator ( "collesium" , IND_NAME , 1.0 );
addIndicator ( "colliseum" , IND_NAME , 1.0 );
addIndicator ( "pavilion" , IND_NAME , 1.0 );
// cafe with accent mark
char tmp[64];
sprintf(tmp,"caf"); tmp[3]=0xc3; tmp[4]=0xa9; tmp[5]=0;
addIndicator ( tmp , IND_NAME , 1.0 );
// Less effective place name indicators
addIndicator ( "club" , IND_NAME , 0.5 );
// . now add some more indicators to g_cities hash table
// . TODO: get these in other languages. use wikipedia page!
addIndicator ( "suite" , IND_SUITE , 1.0 );
addIndicator ( "ste" , IND_SUITE , 1.0 );
addIndicator ( "room" , IND_SUITE , 1.0 );
addIndicator ( "pier" , IND_SUITE , 1.0 );
addIndicator ( "department" , IND_SUITE , 0.5 );
addIndicator ( "rm" , IND_SUITE , 1.0 );
addIndicator ( "floor" , IND_SUITE , 1.0 );
addIndicator ( "bldg" , IND_SUITE , 1.0 );
addIndicator ( "bld" , IND_SUITE , 1.0 );
addIndicator ( "building" , IND_SUITE , 1.0 );
addIndicator ( "apartment" , IND_SUITE , 1.0 );
addIndicator ( "apt" , IND_SUITE , 1.0 );
addIndicator ( "po" , IND_SUITE , 1.0 );
addIndicator ( "pobox" , IND_SUITE , 1.0 );
//addIndicator("p.o. box" , IND_SUITE , 1.0 );
addIndicator ( "box" , IND_SUITE , 1.0 );
addIndicator ( "postbus" , IND_SUITE , 1.0 ); // european
addIndicator ( "post" , IND_SUITE , 1.0 ); // european
addIndicator ( "bus" , IND_SUITE , 1.0 ); // european
addIndicator ( "private" , IND_SUITE , 1.0 ); // australia
addIndicator ( "box" , IND_SUITE , 1.0 ); // australia
// TODO: get these in other languages. use wikipedia page!
addIndicator ( "north" , IND_DIR , 1.0 );
addIndicator ( "east" , IND_DIR , 1.0 );
addIndicator ( "south" , IND_DIR , 1.0 );
addIndicator ( "west" , IND_DIR , 1.0 );
addIndicator ( "northeast" , IND_DIR , 1.0 );
addIndicator ( "northwest" , IND_DIR , 1.0 );
addIndicator ( "southeast" , IND_DIR , 1.0 );
addIndicator ( "southwest" , IND_DIR , 1.0 );
addIndicator ( "north" , IND_DIR , 1.0 );
addIndicator ( "east" , IND_DIR , 1.0 );
addIndicator ( "south" , IND_DIR , 1.0 );
addIndicator ( "west" , IND_DIR , 1.0 );
addIndicator ( "n" , IND_DIR , 1.0 );
addIndicator ( "s" , IND_DIR , 1.0 );
addIndicator ( "e" , IND_DIR , 1.0 );
addIndicator ( "w" , IND_DIR , 1.0 );
addIndicator ( "ne" , IND_DIR , 1.0 );
addIndicator ( "nw" , IND_DIR , 1.0 );
addIndicator ( "se" , IND_DIR , 1.0 );
addIndicator ( "sw" , IND_DIR , 1.0 );
// TODO: get in other languages
addIndicator ( "highway" , IND_STREET , 1.0 );
addIndicator ( "hghway" , IND_STREET , 1.0 );
addIndicator ( "hiway" , IND_STREET , 1.0 );
addIndicator ( "hway" , IND_STREET , 1.0 );
addIndicator ( "hwy" , IND_STREET , 1.0 );
// county road
//addIndicator ( "cr" , IND_STREET , 1.0 );
// state route
//addIndicator ( "route" , IND_STREET , 1.0 );
addIndicator ( "avenue" , IND_STREET , 1.0 );
addIndicator ( "ave" , IND_STREET , 1.0 );
addIndicator ( "drive" , IND_STREET , 1.0 );
addIndicator ( "dr" , IND_STREET , 1.0 );
addIndicator ( "ln" , IND_STREET , 1.0 );
addIndicator ( "lane" , IND_STREET , 1.0 );
addIndicator ( "blvd" , IND_STREET , 1.0 );
addIndicator ( "boulevard" , IND_STREET , 1.0 );
addIndicator ( "street" , IND_STREET , 1.0 );
addIndicator ( "st" , IND_STREET , 1.0 );
addIndicator ( "circle" , IND_STREET , 1.0 );
addIndicator ( "place" , IND_STREET , 1.0 );
addIndicator ( "parkway" , IND_STREET , 1.0 );
addIndicator ( "pkway" , IND_STREET , 1.0 );
addIndicator ( "pkwy" , IND_STREET , 1.0 );
addIndicator ( "straße", IND_STREET , 1.0 ); //!test this!
addIndicator ( "strasse" , IND_STREET , 1.0 );
addIndicator ( "sr" , IND_STREET , 1.0 ); // state route
addIndicator ( "trail" , IND_STREET , 1.0 );
// 80 mosby's run
addIndicator ( "run" , IND_STREET , 1.0 );
addIndicator ( "entrada" , IND_STREET , 1.0 );
// these were taken from
addIndicator ( "Autobahn" , IND_STREET , 1.0 );
addIndicator ( "Auto-estrada" , IND_STREET , 1.0 );
addIndicator ( "Autoroute" , IND_STREET , 1.0 );
addIndicator ( "Autostrada" , IND_STREET , 1.0 );
addIndicator ( "Autostrasse" , IND_STREET , 1.0 );
addIndicator ( "Byway" , IND_STREET , 1.0 );
addIndicator ( "Expressway" , IND_STREET , 1.0 );
addIndicator ( "Freeway" , IND_STREET , 1.0 );
addIndicator ( "Motorway" , IND_STREET , 1.0 );
addIndicator ( "Pike" , IND_STREET , 1.0 );
addIndicator ( "Avenue" , IND_STREET , 1.0 );
addIndicator ( "Boulevard" , IND_STREET , 1.0 );
addIndicator ( "Road" , IND_STREET , 1.0 );
addIndicator ( "rd" , IND_STREET , 1.0 );
addIndicator ( "Street" , IND_STREET , 1.0 );
addIndicator ( "Alley" , IND_STREET , 1.0 );
addIndicator ( "Bay" , IND_STREET , 1.0 );
addIndicator ( "Drive" , IND_STREET , 1.0 );
addIndicator ( "Fairway" , IND_STREET , 1.0 );
addIndicator ( "Gardens" , IND_STREET , 1.0 );
addIndicator ( "Gate" , IND_STREET , 1.0 );
addIndicator ( "Grove" , IND_STREET , 1.0 );
addIndicator ( "Heights" , IND_STREET , 1.0 );
addIndicator ( "Highlands" , IND_STREET , 1.0 );
addIndicator ( "Knoll" , IND_STREET , 1.0 );
addIndicator ( "Lane" , IND_STREET , 1.0 );
addIndicator ( "Manor" , IND_STREET , 1.0 );
addIndicator ( "Mews" , IND_STREET , 1.0 );
addIndicator ( "Passage" , IND_STREET , 1.0 );
addIndicator ( "Pathway" , IND_STREET , 1.0 );
addIndicator ( "Place" , IND_STREET , 1.0 );
addIndicator ( "Row" , IND_STREET , 1.0 );
addIndicator ( "Terrace" , IND_STREET , 1.0 );
addIndicator ( "Trail" , IND_STREET , 1.0 );
addIndicator ( "View" , IND_STREET , 1.0 );
addIndicator ( "Way" , IND_STREET , 1.0 );
addIndicator ( "Close" , IND_STREET , 1.0 );
addIndicator ( "Court" , IND_STREET , 1.0 );
addIndicator ( "Cove" , IND_STREET , 1.0 );
addIndicator ( "Croft" , IND_STREET , 1.0 );
addIndicator ( "Garth" , IND_STREET , 1.0 );
addIndicator ( "Green" , IND_STREET , 1.0 );
addIndicator ( "Lawn" , IND_STREET , 1.0 );
addIndicator ( "Nook" , IND_STREET , 1.0 );
addIndicator ( "Place" , IND_STREET , 1.0 );
addIndicator ( "Circle" , IND_STREET , 1.0 );
addIndicator ( "Crescent" , IND_STREET , 1.0 );
addIndicator ( "Loop" , IND_STREET , 1.0 );
addIndicator ( "Lp" , IND_STREET , 1.0 ); // abbreviation for loop
addIndicator ( "Oval" , IND_STREET , 1.0 );
addIndicator ( "Quadrant" , IND_STREET , 1.0 );
addIndicator ( "Square" , IND_STREET , 1.0 );
addIndicator ( "Canyon" , IND_STREET , 1.0 );
addIndicator ( "Causeway" , IND_STREET , 1.0 );
addIndicator ( "Grade" , IND_STREET , 1.0 );
addIndicator ( "Hill" , IND_STREET , 1.0 );
addIndicator ( "Mount" , IND_STREET , 1.0 );
addIndicator ( "Parkway" , IND_STREET , 1.0 );
addIndicator ( "Rise" , IND_STREET , 1.0 );
addIndicator ( "Vale" , IND_STREET , 1.0 );
addIndicator ( "Approach" , IND_STREET , 1.0 );
addIndicator ( "Bypass" , IND_STREET , 1.0 );
addIndicator ( "Esplanade" , IND_STREET , 1.0 );
addIndicator ( "Frontage road" , IND_STREET , 1.0 );
addIndicator ( "Parade" , IND_STREET , 1.0 );
addIndicator ( "Park" , IND_STREET , 1.0 );
addIndicator ( "Plaza" , IND_STREET , 1.0 );
addIndicator ( "Promenade" , IND_STREET , 1.0 );
addIndicator ( "Quay" , IND_STREET , 1.0 );
addIndicator ( "Stravenue" , IND_STREET , 1.0 );
// was matching intersection "8k run and walk"
//addIndicator ( "Walk" , IND_STREET , 1.0 );
// italy?
addIndicator ( "via" , IND_STREET , 1.0 );
// try to load places.dat. the new junk first
if ( ! loadPlaces ( ) ) return false;
// we do zips separate now! use wordId as the key
if ( ! g_zips.set ( 8,sizeof(ZipDesc),0,NULL,0,true,0,"tbl-zipcodes")){
char *xx=NULL;*xx=0; }
// zip codes reference city strings stored in this buffer
char *cityBuf = NULL;
long cityBufSize = 0;
// load zip code table
bool loadedZips = false;
if ( g_zips.load ( g_hostdb.m_dir,"zips.dat",&cityBuf,&cityBufSize)) {
// sanity check
//if ( g_zips.m_numSlotsUsed != 89471 ) { char*xx=NULL;*xx=0;}
if ( g_zips.m_numSlotsUsed != 43595 ) { char*xx=NULL;*xx=0;}
loadedZips = true;
long long h = hash64 ( "87109" , 5 );
// test the zips table
if ( g_zips.getSlot ( &h ) < 0 ){char *xx=NULL;*xx=0; }
// . assign it
// . ZipDesc::m_cityOffset reference this buffer
g_cityBuf = cityBuf;
g_cityBufSize = cityBufSize;
// . quickly set the states
// . map each name of a state to its index into s_states[] array
g_states.set ( 8 , 4 , 256 , NULL , 0 , false , 0 ,"adm1tbl");
long size = sizeof(s_states);
// item count
long n = (long)size/ sizeof(StateDesc);
for ( long i = 0 ; i < n ; i++ ) {
// get it
StateDesc *sd = &s_states[i];
// get hash of abbr
long long h = hash64n ( sd->m_adm1 );
// make the value
//long val = 0;
// shift up
//val <<= 8;
// or in the position
//val |= i;
// no dups
if ( g_states.isInTable ( &h ) ) { char *xx=NULL;*xx=0; }
// store it
if ( ! g_states.addKey ( &h , &sd ) ) { char*xx=NULL;*xx=0; }
// stop if done
if ( ! sd->m_name1 ) continue;
// then the second name
h = getWordXorHash ( sd->m_name1 );
// must be there
if ( ! h ) { char *xx=NULL;*xx=0; }
// flag it
//val = 1;
// shift up
//val <<= 8;
// or in the position
//val |= i;
// no dups
if ( g_states.isInTable ( &h ) ) { char *xx=NULL;*xx=0; }
// store it
if ( ! g_states.addKey ( &h , &sd ) ) { char*xx=NULL;*xx=0; }
// and the second name
if ( ! sd->m_name2 ) continue;
// then the second name
h = getWordXorHash ( sd->m_name2 );
// must be there
if ( ! h ) { char *xx=NULL;*xx=0; }
// flag it as second name
//val = 2;
// shift up
//val <<= 8;
// or in the position
//val |= i;
// no dups
if ( g_states.isInTable ( &h ) ) { char *xx=NULL;*xx=0; }
// store it
if ( ! g_states.addKey ( &h , &sd ) ) { char*xx=NULL;*xx=0; }
// . timezone table
// . hash of city and adm1 is the key
// . maps to a one byte timezone offset, usually negative
g_timeZones.set ( 4 ,
sizeof(CityStateDesc),// 1 byte date timezone offset
0 ,
0 ,
false , // dups?
0 , // niceness
"tbl-tzs" );
if ( loadedZips && !g_timeZones.load(g_hostdb.m_dir,"timezones.dat")){
log("places: failed to load timezones.dat");
loadedZips = false;
long vv = 185747;
if ( g_timeZones.m_numSlotsUsed && g_timeZones.m_numSlotsUsed!=vv){
log("places: bad timezones.dat file %li != %li",
return false;
// sanity
if ( g_timeZones.m_numSlotsUsed ) {
char udst;
char tzoff;
tzoff = getTimeZone2 ( "houston", "tx", &udst );
if ( tzoff == UNKNOWN_TIMEZONE ) { char *xx=NULL;*xx=0; }
if ( tzoff != -5 ) { char *xx=NULL;*xx=0; }
tzoff = getTimeZone2 ( "woods hole", "ma", &udst );
if ( tzoff == UNKNOWN_TIMEZONE ) { char *xx=NULL;*xx=0; }
tzoff = getTimeZone2 ( "albuquerque", "nm", &udst );
if ( tzoff == UNKNOWN_TIMEZONE ) { char *xx=NULL;*xx=0; }
// map a cityHash/state of an aliased city name to a normalized cityId
if ( ! g_aliases.set(4,4,128,NULL,0,false,0,"aliastab") )
return false;
// load the aliases
if ( loadedZips && g_aliases.load ( g_hostdb.m_dir , "aliases.dat")){
// match this
long na = 11663;//11462;
// sanity check
if ( g_aliases.m_numSlotsUsed != na){char*xx=NULL;*xx=0;}
// . init the hash table
// . use an 8-byte hash for the key
// . xor the wids together for quick lookups
// . all subphrases that include the first word of the place name will
// be hashed, that way we know if we should hash further
// . also, we should allow dups!
// . use a 6 byte key (truncated wordId) to use up less space!
g_cities.set ( 8 , // keySize
sizeof(CityDesc) , // adm1 bit vector + mostpopcity
0 , // initial # slots
NULL , // initial buf
0 , // initial buf size
true , // allowDup keys?
0 , // niceness
"tbl-places" );
// try to load the binary hash table first
if ( loadedZips && g_cities.load ( g_hostdb.m_dir , "cities.dat" ) ) {
// sanity check
long nc = 123347; // 123141;
if ( g_cities.m_numSlotsUsed != nc){char*xx=NULL;*xx=0;}
// another test
char *str;
//char *str = "nm";
//str = "madrid";
//long long h = hash64 (str,gbstrlen(str));
long long h = 0;
//h = hash64 ("santa",5);
//h ^= hash64 ("n",1);
h = hash64n ("jemez");
h <<= 1;
//h ^= hash64 ("fe",2);
//h ^= hash64 ("m",1);
h ^= hash64n("springs");
//str = "santa fe";
//str = "n.m.";
str = "jemez springs";
//h = hash64 ( "abq",3);
//str = "abq";
//h = hash64 ( "alb",3);
//str = "alb";
//str = "albuquerque";
//h = hash64 ( str,gbstrlen(str) );
str = "new york";
h = getWordXorHash ( str );
// make sure we got madrid nm
//long slot = g_cities.getSlot ( &h );
//if ( slot < 0 ) { char *xx=NULL;*xx=0; }
CityDesc *cd = (CityDesc *)g_cities.getValue(&h);
if ( ! cd ) { char *xx=NULL;*xx=0; }
uint64_t abits = getAdm1Bits ( "ny" );
if ( ! ( cd->m_adm1Bits & abits ) ) { char *xx=NULL;*xx=0;}
// check city ids
long long abqh1 = getWordXorHash("abq");
long long abqh2 = getWordXorHash("albuquerque");
uint32_t cid1 = getCityId32(abqh1,"nm");
uint32_t cid2 = getCityId32(abqh2,"nm");
if ( cid1 != cid2 ) { char *xx=NULL;*xx=0; }
// get nm
long long hnm = getWordXorHash("new mexico");
// get state descriptor
long pos = getStateOffset ( &hnm );
// sanity
if ( pos < 0 ) { char *xx=NULL;*xx=0; }
// make bit mask
uint64_t mask = 1LL << pos;
// and in nm
if ( ! ((cd->m_adm1Bits) & mask) ) { char *xx=NULL;*xx=0;}
// a nested loop
for ( ; slot >= 0 ; slot = g_cities.getNextSlot(slot,&h)) {
// get the place
pd = (PlaceDesc *)g_cities.getValueFromSlot(slot);
// map to alias?
if ( pd->m_bits & PLF_ALIAS )
pd=(PlaceDesc *)g_cities.getValueFromSlot(pd->getSlot());
if ( ! is_ascii(pd->m_adm1[0]) ||
! is_ascii(pd->m_adm1[1]) ) {
char *xx=NULL;*xx=0; }
// print it
log("places: h=%s adm1=%c%c ctry=%s",
// now hash for zip code
//h = hash64Lower_a("BC",2);
//long long h1 = hash64("n",1);
//long long h2 = hash64("m",1);
//long long h3 = (h1<<1LL) ^ h2;
char *zstr = "87102";
h = hash64 ( zstr,gbstrlen(zstr));
//h = hash64 ("78404",5);
//slot = g_cities.getSlot ( &h );
long slot = g_zips.getSlot ( &h );
//char *city="Corpus Christi";
char *city="Albuquerque";
long long ch = hash64Lower_utf8(city,gbstrlen(city));
//long ch = (long)(th64&0xffffffff);
log("places: %s hash = %llu",city,ch);
// a nested loop
for ( ; slot >= 0 ; slot = g_zips.getNextSlot(slot,&h)) {
// get the place
ZipDesc *zd;
zd = (ZipDesc *)g_zips.getValueFromSlot(slot);
// convert adm1 bit to adm1 code
StateDesc *sd = getStateDescFromBits(zd->m_adm1Bits);
// must be there
if ( ! sd ) { char *xx=NULL;*xx=0; }
//if(!is_ascii(zd->m_adm1[0]) ) {char *xx=NULL;*xx=0;}
// print it
log("places: h=%s cityhash=%llu adm1=%s "//adm1=%c%c "
if ( zd->m_cityHash != ch ) { char*xx=NULL;*xx=0; }
// exit until we get "nm" and "bc" for british columbia!!!
//log("hey hey!!!!!!!!!!!!!!!!! fix me you");
// otherwise, we passed
//if ( loadedIndicators ) return true;
return true;
//loadedCities = true;
// let them know that we are creating it
logf(LOG_INFO,"places: creating cities.dat");
// init with 8M slots
//g_cities.set ( 6,sizeof(PlaceDesc),6950000,NULL,0,true,0);
// 1M since doing USA only now. now cities.dat is only 12MB not 100MB
// uses 731k slots
//g_cities.set ( 8,sizeof(PlaceDesc),100000,NULL,0,true,0,"placestbl");
// this now maps just a city to the state/adm1 bit vector of the states
// it is in... AND the one byte timezone offset
g_cities.set ( 8,sizeof(CityDesc),100000,NULL,0,false,0,"placestbl");
// we do zips separate now! use wordId as the key (89k used)
if ( ! g_zips.set ( 8,sizeof(ZipDesc),10000,NULL,0,true,0,"zipstbl")) {
char *xx=NULL;*xx=0; }
if (!g_timeZones.set(4,sizeof(CityStateDesc),100000,NULL,0,false,0,
"tbl99")){ char *xx=NULL;*xx=0;}
// map a cityHash/state of an aliased city name to a normalized cityId
if ( ! g_aliases.set(4,4,128,NULL,0,false,0,"aliastab") )
return false;
// keep track of max population for each city name and the state
// in which that max population occurs
HashTableX maxPops;
maxPops.set (8,4,100000,NULL,0,false,0,"poptbl");
// LOAD THE allCountries.txt file
// geonameid : integer id of record in geonames database
// name : name of geographical point (utf8) varchar(200)
// asciiname : name of geographical point in plain ascii
// characters, varchar(200)
// alternatenames : alternatenames, comma separated varchar(4000)
// (varchar(5000) for SQL Server)
// latitude : latitude in decimal degrees (wgs84)
// longitude : longitude in decimal degrees (wgs84)
// feature class : see,
// char(1)
// feature code : see,
// varchar(10)
// country code : ISO-3166 2-letter country code, 2 characters
// cc2 : alternate country codes, comma separated,
// ISO-3166 2-letter country code, 60 characters
// admin1 code : fipscode (subject to change to iso code),
// isocode for the us and ch, see file
// admin1Codes.txt for display names of this code;
// varchar(20)
// admin2 code : code for the second administrative division, a
// county in the US, see file admin2Codes.txt;
// varchar(80)
// admin3 code : code for third level administrative division,
// varchar(20)
// admin4 code : code for fourth level administrative division,
// varchar(20)
// population : bigint (4 byte int)
// elevation : in meters, integer
// gtopo30 : average elevation of 30'x30' (ca 900mx900m)
// area in meters, integer
// timezone : the timezone id (see file timeZone.txt)
// modification date : date of last modification in yyyy-MM-dd format
// . make the filename to open
// . downloaded from ?
// . sample line =
// 3038840 Serrat de Ventader Serrat de Ventader 42.4833333
// 1.4333333 T MT AD 00
char ff[1024];
sprintf ( ff , "%sallCountries.txt", g_hostdb.m_dir );
// places.txt is just the United States
//sprintf ( ff , "%splaces.txt", g_hostdb.m_dir );
logf(LOG_INFO,"places: reading %s",ff);
FILE *fd = fopen ( ff, "r" );
if ( ! fd )
return log("places: failed to open %s: %s",ff,strerror(errno));
// count how many times we see each word for purposes of establishing
// the most common indicators of a place. i.e. "center", "square",...
//HashTableX ct;
// init with 8M places too
//ct.set ( 8 , 4 , 9300000,NULL,0,false,0 ,"addrcmmn");
// similar to "ct" but we incorporate latitude/longitude to restrict
// voting in order to remove "local words", like Edisto!
//HashTableX gvt;
//gvt.set ( 8 , 0 , 30000 ,NULL,0,false,0,"addrgvt" );
HashTableX popTable;
popTable.set ( 4,4,30000,NULL,0,false,0,"poptab");
long badEntry = 0;
long line = 0;
//long MAX = 0;
// . go through the places in allCountries.txt
// . format described in /gb/geo/geonames/readme.txt
char buf[10000];
// for debugging
char *dbuf = buf;
//char topBuf[1000000];
//char *topBufPtr = topBuf;
// map a wid to a string ptr with this table, "st"
HashTableX st;
st.set ( 8 , 4 , 30000 , NULL,0,false,0 ,"addrst");
while ( fgets ( buf , 10000 , fd ) ) {
// tmp debug for postalCodes.txt
// length of line, including the terminating \n
long wlen = gbstrlen(buf) ;
// sanity check
if ( wlen >= 9000 ) { char *xx=NULL;*xx=0; }
// skip if empty
if ( wlen <= 0 ) continue;
// null terminate it, instead of \n
// debug point
//char *poo = strstr(buf,"Town of North Hempstead" ); if (poo)
// log("hey");
// log it
if ( (line % 10000) == 0 )
log(LOG_INFO,"places: read line #%li out of "
"6,900,574 (%li places added)",line,
// country id
uint8_t crid = 0;
// country code
char cc[3];
cc[0] = 0;
cc[1] = 0;
// admin1code
char a1[2];
// admin2code
//char a2[2];
// reset
a1[0] = a1[1] = 0;
// descriptive bits
//pbits_t bits = 0;
// place type
placetype_t ptype = 0;
// official name of the place
char *name = NULL;
// the ascii version
char *ascii = NULL;
// comma-separated abbreviations and alternative names
char *alt = NULL;
// stop after this char ptr
char *stop = NULL;
double latitude = 0.0;
double longitude = 0.0;
// population of the city/place
long pop = 0;
// count tabs
long tabs = 0;
// point to the beginning of the line
char *p = buf;
char tzoff = 0;
char useDST; // daylight savings time
// debug point
//if ( strncmp(buf,"2241297\t", 8) ==0 )
//if ( strncmp(buf,"3856157\t", 8) ==0 )
// log("gotit");
// parse out the tab-delimited fields from the line
for ( ; *p ; p++ ) {
// skip if no tab
if ( *p != '\t' ) continue;
// count tabs
// point "s" to right after the tab
char *s = p + 1;
// done?
if ( ! *s ) break;
// after first tab is the official place name
if ( tabs == 1 ) name = s;
// then the name in ascii
if ( tabs == 2 ) ascii = s;
// then comma-separated list of alternative names
if ( tabs == 3 ) alt = s;
// the latitude
if ( tabs == 4 ) {
// a stopping point for "alt"
stop = s;
// get it
latitude = atof(s);
// the longitude
if ( tabs == 5 ) {
// get it
longitude = atof(s);
// . the category of place is after the 6th tab
// . the specific type of place is after the 7th tab
// . see
// . to save mem, only hash certain types...
if ( tabs == 7 ) {
// this is usually a state in the U.S.
if ( ! strncmp(s,"ADM1",4) )
ptype = 0;//PT_STATE;
// this is usually a county in the U.S.
else if ( ! strncmp(s,"ADM2",4) )
ptype = 0;//PT_ADM2;
// this is usually a county in the U.S.
else if ( ! strncmp(s,"ADM3",4) )
ptype = 0;//PT_ADM3;
// this is usually a county in the U.S.
else if ( ! strncmp(s,"ADM4",4) )
ptype = 0;//PT_ADM4;
// populated place = city
else if ( ! strncmp(s,"PPL" ,3) )
ptype = PT_CITY;
// town of, township, etc.
// town of north hempstead
// . crap! this gets a different san jose!
else if ( ! strncmp(s,"ADMD" ,4) )
ptype = PT_CITY;
// locality
else if ( ! strncmp(s,"LCTY" ,4) )
ptype = PT_CITY;
// independent political entity
else if ( ! strncmp(s,"PCLIX" ,4) )
ptype = PT_CITY;
else if ( ! strncmp(s,"P\t" ,2) )
ptype = PT_CITY;
// independent political entity = country
else if ( ! strncmp(s,"PCLI",4) )
ptype = PT_COUNTRY;
// allow schools (popular meeting place)
else if ( ! strncmp(s,"SCH",3) )
ptype = 0;//PT_SCH;
// and parks (popular meeting place)
else if ( ! strncmp(s,"PRK",3) )
ptype = 0;//PT_PRK;
// . country code (two letters)
// . sometimes things like a gulf of aden has no
// associated country code!
if ( tabs == 8 && s[0] != '\t' ) {
cc[0] = to_lower_a(s[0]);
cc[1] = to_lower_a(s[1]);
cc[2] = 0;
crid = getCountryId ( cc );
// sanity check
if ( s[2]!='\t'&&s[2]) { char *xx=NULL;*xx=0;}
// alternate country code (two letters)
if ( tabs == 9 && ! crid && s[0] != '\t' ) {
cc[0] = to_lower_a(s[0]);
cc[1] = to_lower_a(s[1]);
cc[2] = 0;
crid = getCountryId ( cc );
// . admin1 code (two letters)
// . readme.txt says varchar(20) but
// /gb/geo/admin1Codes.txt seems to say 2 chars
// . actually i have seen 3 letter ones... but they
// if truncated to two chars would be unique in their
// respective country. i.e. GB.ENG, GB.NIR, ...
// . BUT for GR.ESYE11 through GR.ESYE14, ... just use
// the last two chars!
if ( tabs == 10 ) {
// usually these 2 chars are digits!
a1[0] = to_lower_a(s[0]);
a1[1] = to_lower_a(s[1]);
// panic!
if ( s[2] == '\t' ) continue;
// watch out for GReece
if ( cc[0] != 'g' ) continue;
if ( cc[1] != 'r' ) continue;
// and its "states" (admin1 codes)
if ( a1[0] != 'e' ) continue;
if ( a1[1] != 's' ) continue;
// use the last two for this guy!
s += 4;
if ( ! is_digit(s[0]) ) continue;
if ( ! is_digit(s[1]) ) continue;
a1[0] = s[0];
a1[1] = s[1];
// pop is timezone - 3
if ( tabs == 14 ) {
// get it
pop = atol(s);
// timezone
if ( tabs == 17 ) {
char *tzname = p + 1;
// assume we use daylights savings time
useDST = 1;
// assume not found
tzoff = 0;
// find the end: a tab, i guess, or other whitespace
char *e = tzname;
for ( ; *e && ! is_wspace_a(*e) ; e++ );
// temp null term
char saved = *e;
*e = '\0';
// convert to timezone offset
if ( ! strcmp(tzname,"America/Chicago") )
tzoff = -6;
else if ( ! strcmp(tzname,"America/Anchorage"))
tzoff = -9;
else if ( ! strcmp(tzname,"America/Indiana/Knox"))
tzoff = -5;
else if ( ! strcmp(tzname,"America/Kentucky/Monticello"))
tzoff = -5;
else if ( ! strcmp(tzname,"America/Boise"))
tzoff = -7;
else if ( ! strcmp(tzname,"America/Indiana/Indianapolis"))
tzoff = -5;
else if ( ! strcmp(tzname,"America/Indiana/Marengo"))
tzoff = -5;
else if ( ! strcmp(tzname,"America/Indiana/Petersburg"))
tzoff = -6;
else if ( ! strcmp(tzname,"America/Indiana/Tell_City"))
tzoff = -6;
else if ( ! strcmp(tzname,"America/Indiana/Vevay"))
tzoff = -5;
else if ( ! strcmp(tzname,"America/Indiana/Vincennes"))
tzoff = -5;
else if ( ! strcmp(tzname,"America/Indiana/Winamac"))
tzoff = -5;
else if ( ! strcmp(tzname,"America/Juneau"))
tzoff = -9;
else if ( ! strcmp(tzname,"America/Kentucky/Louisville"))
tzoff = -5;
else if ( ! strcmp(tzname,"America/Menominee"))
tzoff = -6;
else if ( ! strcmp(tzname,"America/Nome"))
tzoff = -9;
else if ( ! strcmp(tzname,"America/North_Dakota/Center"))
tzoff = -6;
else if ( ! strcmp(tzname,"America/North_Dakota/New_Salem"))
tzoff = -6;
else if ( ! strcmp(tzname,"America/Shiprock"))
tzoff = -7;
else if ( ! strcmp(tzname,"America/Yakutat"))
// could not find this - guessing
tzoff = -9;
else if ( ! strcmp(tzname,"America/Detroit"))
tzoff = -5;
else if ( !strcmp(tzname,"America/St_Thomas")){
tzoff = -4;
useDST = 0;
else if ( ! strcmp(tzname,"Pacific/Kwajalein"))
tzoff = -12;
else if ( ! strcmp(tzname,"America/Adak"))
tzoff = -10;
else if ( ! strcmp(tzname,"America/Phoenix")){
tzoff = -7; useDST = 0; }
else if ( ! strcmp(tzname,"America/Denver"))
tzoff = -7;
else if (!strcmp(tzname,"America/Los_Angeles"))
tzoff = -8;
else if ( ! strcmp(tzname,"America/New_York"))
tzoff = -5;
else if ( ! strcmp(tzname,"Pacific/Honolulu")){
tzoff = -10; useDST = 0; }
// amchitka in the alaskan aleutian islands...
else if ( ! tzname[0] )
tzoff = 0;
else {
char *xx=NULL;*xx=0; }
// restore
*e = saved;
// break point
//if ( name && strncasecmp(name,"Madrid\t",7)==0 )
// log("hey");
// skip if not a place we are interested in
//if ( ! bits )
// continue;
if ( ! crid ) {
log("places: bad country for "
"for %s",dbuf);
// must have all 4 things here:
if ( !a1[0] || ! name ) {
//log("places: %s does not have country of adm1",name);
// skip all NON-USA places now that we are specializing
// no, now we had facebook events from all over, if they
// have a lat/lon! yeah, so let foreign cities through...
//if ( crid != CRID_US )continue;
// only store cities for now
if ( ! ptype ) continue;
// sanity check
if ( ! is_ascii(a1[0]) || ! is_ascii(a1[1]) ) {
//log("places: bad %s",name);
// what is this???? i see "00"
if ( is_digit(a1[0]) ) continue;
uint64_t h_washington = hash64n ("washington");
uint64_t h_dc = hash64n ("dc");
uint64_t h_d = hash64n ("d");
uint64_t h_c = hash64n ("c");
uint64_t h_wdc = h_washington;
h_wdc <<= 1;
h_wdc ^= h_dc;
uint64_t h_wdc2 = h_washington;
h_wdc2 <<= 1;
h_wdc2 ^= h_d;
h_wdc2 <<= 1;
h_wdc2 ^= h_c;
// set nameEnd/asciiEnd/altEnd
char *nameEnd = name;
for (;nameEnd;nameEnd++)
if(*nameEnd ==','||*nameEnd=='\t'||!*nameEnd ) break;
char *asciiEnd = ascii;
for (;asciiEnd;asciiEnd++)
if(*asciiEnd ==','||*asciiEnd=='\t'||!*asciiEnd)break;
char *altEnd = alt;
for ( ; altEnd ; altEnd++ )
if (*altEnd==','||*altEnd=='\t'||!*altEnd) break;
// null-terminate the name, ascii and alt fields in place
*nameEnd = '\0';
*asciiEnd = '\0';
*altEnd = '\0';
// ok, now we need to grab the place id in the file and
// use that to reference the alt names table we hashed up
// top. because that includes the language code of the
// altname!!!
// then we need to make a string like
// and store that into a buffer for each place. then the
// city desc needs to references that buffer. we also hash
// every alt name to point to the same CityDesc or CountryDesc
// or StateDesc whichever type of place it is...
uint64_t h = getWordXorHash ( name );
// hashes we added, to dedup
//HashTableX dt;
//char buf[10000];
//dt.set ( 6,0,100,buf,10000,false,0);
// do not add "washington, dc" as a city, treat
// dc as a state!!
if ( h == h_wdc )
if ( h == h_wdc2 )
// no dups!
//if ( dt.isInTable(&h ) ) continue;
// add it
//if ( ! dt.addKey(&h) ) { char *xx=NULL;*xx=0; }
// normalize this
char adm1[3];
adm1[0] = to_lower_a(a1[0]);
adm1[1] = to_lower_a(a1[1]);
adm1[2] = 0;
// use this now
uint32_t cid32 = (uint32_t)getCityId32(h,a1);
// we add 100 to the timeZoneOffset to indicate it
// does not use DST
//if ( useDST == 0 ) tzoff += 100;
// already in there?
long slot = g_timeZones.getSlot ( &cid32 );
if ( slot >= 0 ) {
CityStateDesc *csd ;
csd = (CityStateDesc *)g_timeZones.
char tv = csd->m_timeZoneOffset;
if ( tv != tzoff ) {
log("places: bad city timezone "
"csh=%lu z: %s",
(unsigned long)cid32,
//char *xx=NULL;*xx=0; }
// get the pop from this
long cpop = *(long *)popTable.getValue ( &cid32 );
// if already in there, and this has more pop,
// then use it!
if ( pop > cpop ) {
csd->m_latitude = latitude;
csd->m_longitude = longitude;
popTable.addKey ( &cid32, &pop );
// timezone table maps city/state pair to a tzoffset
else {
// for each city/state pair we must store its
// lat/lon now too
CityStateDesc csd;
csd.m_timeZoneOffset = tzoff;
csd.m_useDST = useDST;
csd.m_latitude = latitude;
csd.m_longitude = longitude;
g_timeZones.addKey ( &cid32 , &csd );
popTable.addKey ( &cid32, &pop );
// add city name to the temporary hashtable of CityDescriptors.
// later we will serialize it into g_cityDescBuf and make
// the g_city hash table map ptrs into that. i think
// we can save it in cities.dat because HashTableX provides
// the mechanism for that.
addCity ( h , adm1 , pop , &maxPops );
// if the ascii hash is different, add as alias
addAlias ( ascii, adm1, h,pop, &maxPops );
// and the alt hash
addAlias ( alt, adm1, h,pop, &maxPops );
// now add the alternate names of this city
// as aliases, not just to g_cities, but also to
// g_aliases
long len = gbstrlen(name);
if ( strncmp(name,"Township of ",12) == 0 )
addAlias ( name + 12,adm1,h,pop,&maxPops);
if ( strncmp(name,"Town of ",8) == 0 )
addAlias ( name + 8 ,adm1,h,pop,&maxPops );
if ( strncmp(name,"City of ",7) == 0 )
addAlias ( name + 7 ,adm1,h,pop,&maxPops );
if ( strncmp(ascii,"Township of ",12) == 0 )
addAlias ( ascii + 12,adm1,h,pop,&maxPops);
if ( strncmp(ascii,"Town of ",8) == 0 )
addAlias ( ascii + 8 ,adm1,h,pop,&maxPops );
if ( strncmp(ascii,"City of ",7) == 0 )
addAlias ( ascii + 7 ,adm1,h,pop,&maxPops );
// "New York City" equals "New York"
char *tail = name+len-5;
if ( len >=6 && strncmp(tail," City",5)==0) {
*tail = '\0';
addAlias ( name ,adm1,h,pop,&maxPops );
*tail = ' ';
tail = ascii+len-5;
if ( len >=6 && strncmp(tail," City",5)==0) {
*tail = '\0';
addAlias ( ascii ,adm1,h,pop,&maxPops );
*tail = ' ';
// now scan each city in g_cities and set their CF_SINGLE_STATE
// flag if they only have one state
for ( long i = 0 ; i < g_cities.m_numSlots ; i++ ) {
// skip empty slots
if ( ! g_cities.m_flags[i] ) continue;
// get the data value
uint64_t *bv = (uint64_t *)g_cities.getValueFromSlot(i);
// count bits on
long nb = getNumBitsOn(*bv);
// sanity check
if ( nb == 0 ) { char *xx=NULL;*xx=0; }
// if only 1 set this flag
if ( nb == 1 ) *bv |= CF_UNIQUE;
// close that file
//logf(LOG_INFO,"places: allCountries.txt had %li bad entries.",
logf(LOG_INFO,"places: places.txt had %li bad entries.",
// reset for this file
badEntry = 0;
// LOAD THE postalCodes.txt file
//country code :iso country code, 2 characters
//postal code :varchar(10)
//place name :varchar(180)
//admin name1 :1. order subdivision (state) varchar(100)
//admin code1 :1. order subdivision (state) varchar(20)
//admin name2 :2. order subdivision (county/province) varchar(100
//admin code2 :2. order subdivision (county/province) varchar(20)
//admin name3 :3. order subdivision (community) varchar(100)
//latitude :estimated latitude (wgs84)
//longitude :estimated longitude (wgs84)
//accuracy :accuracy of lat/lng from 1=estimated to 6=centroid
// crap canadian state abbreviations are not in allCountries.txt
// so use the "admin code1" in the postalCodes.txt file!
// . now read in the zip codes
// . make the filename to open
sprintf ( ff , "%spostalCodes.txt", g_hostdb.m_dir );
logf(LOG_INFO,"places: reading %s",ff);
fd = fopen ( ff, "r" );
if ( ! fd )
return log("places: failed to open %s: %s",ff,strerror(errno));
// make the city buf
SafeBuf sb;
line = 0;
// . go through the postal codes in postalCodes.txt
// . format described in /gb/geo/geonames/readme.txt
while ( fgets ( buf , 10000 , fd ) ) {
// length of line, including the terminating \n
long wlen = gbstrlen(buf) ;
// sanity check
if ( wlen >= 9000 ) { char *xx=NULL;*xx=0; }
// skip if empty
if ( wlen <= 0 ) continue;
// null terminate it, instead of \n
// log it
if ( (line % 10000) == 0 )
log(LOG_INFO,"places: read postal line #%li out of "
"848,226 (%li places added)",line,
// country id
uint8_t crid = 0;
// admin1code
char a1[2];
// reset
a1[0] = a1[1] = 0;
// count tabs
long tabs = 0;
// point to the beginning of the line
char *p = buf;
// isolate the zip code
char *zip = NULL;
char *cityName = NULL;
char *a1name = NULL;
char *a2name = NULL;
//char *zipEnd = NULL;
// parse out the tab-delimited fields from the line
for ( ; *p ; p++ ) {
// a temp var
char *s = p;
// put country code here
char cc[3];
// first is country code
if ( p == buf ) {
cc[0] = to_lower_a(s[0]);
cc[1] = to_lower_a(s[1]);
cc[2] = 0;
// sanity check
if ( s[2] != '\t' ) { char *xx=NULL;*xx=0;}
// to id
crid = getCountryId ( cc );
// must be valid
//if ( ! crid ) { char *xx=NULL;*xx=0; }
// there is a "gg" in there!
if ( ! crid ) break;
// skip if no tab
if ( *p != '\t' ) continue;
// count tabs
// after first tab is the POSTAL CODE
if ( tabs == 1 ) {
zip = p + 1;
if ( tabs == 2 ) {
// terminate zip for Words::set() below
*p = '\0';
cityName = p + 1;
if ( tabs == 3 ) {
// terminate for cityName
*p = '\0';
a1name = p + 1;
// . after 4th tab is admin code1
// . admin1 code (two letters)
// . readme.txt says varchar(20) but
// /gb/geo/admin1Codes.txt seems to say 2 chars
// . actually i have seen 3 letter ones... but they
// if truncated to two chars would be unique in their
// respective country. i.e. GB.ENG, GB.NIR, ...
// . BUT for GR.ESYE11 through GR.ESYE14, ... just use
// the last two chars!
if ( tabs == 4 ) {
// terminate for a1name
*p = '\0';
// usually these 2 chars are digits!
a1[0] = to_lower_a(p[1]);
a1[1] = to_lower_a(p[2]);
// one letter province/state code?
if ( p[2] == '\t' ) {
a1[1] = 0;
// panic!
if ( p[3] == '\t' ) continue;
// watch out for GReece
if ( cc[0] != 'g' ) continue;
if ( cc[1] != 'r' ) continue;
// and its "states" (admin1 codes)
if ( a1[0] != 'e' ) continue;
if ( a1[1] != 's' ) continue;
// use the last two for this guy!
s += 4;
if ( ! is_digit(s[0]) ) continue;
if ( ! is_digit(s[1]) ) continue;
a1[0] = s[0];
a1[1] = s[1];
if ( tabs == 5 ) {
// terminate for cityName
//*p = '\0';
a2name = p + 1;
if ( tabs == 6 ) {
// terminate for a2name
*p = '\0';
// if we got an illegit adm1 code try convert the admin 1 name
bool legit = true;
if ( !a1[0] )
legit = false;
if ( !is_ascii(a1[0]) )
legit = false;
if ( !is_ascii(a1[1]) )
legit = false;
// empty is NULL
if ( a1name && ! *a1name ) a1name = NULL;
if ( a2name && ! *a2name ) a2name = NULL;
if ( cityName && ! *cityName ) cityName = NULL;
//if ( is_ascii(a1[0])&&is_ascii(a1[1])&&is_ascii(a1[2]) )
// legit = false;
// do we got this?
//if ( ! legit && ! a1name ) continue;
// not a chance to save ourselves if no adm1 name given
if ( ! legit && ! a1name && ! a2name && ! cityName ) {
// now we must have a valid a1name because as we have found
// the adm1 code in postalCodes.txt does not always correspond
// to those in allCountries.txt. like "british columbia" is
// "02" in allCountries.txt and "bc" in postalCodes.txt.
if ( ! a1name ) {
// skip all NON-USA places now that we are specializing
if ( crid != CRID_US )
// try to convert it
PlaceDesc *tpd ;
long ss;
long long th;
long long *twids;
Words tw;
// make a city hash that would match Place::m_hash
//long long cityHash = hashStringXor ( cityName );
//long long tmpHash ;
//tmpHash = hash64Lower_utf8 ( cityName , gbstrlen(cityName) ) ;
//long cityHash = (long)(ch & 0xffffffff);
//if ( strncmp(cityName,"Budlake",7)==0 )
// log("hey");
if ( ! legit ) {
char *use = NULL;
if ( ! use ) use = a2name;
if ( ! use ) use = a1name;
if ( ! use ) use = cityName;
if ( ! use ) { char *xx=NULL;*xx=0; }
// hash each alnum word in there
if ( ! use ) { char *xx=NULL;*xx=0; }
// hash the name
long long uh = hashStringXor ( use );
// see if we got it
City *c = (City *) g_cities.getValue ( &uh );
// set adm1 i guess
if ( c ) {
legit = true;
adm1Bits = c->m_adm1Bits;
// a nested loop
for ( ; ss >= 0 ; ss = g_cities.getNextSlot(ss,&th)) {
// get the place
tpd=(PlaceDesc *)g_cities.getValueFromSlot(ss);
// must be our ctry
if ( tpd->m_crid != crid ) continue;
// got it
a1[0] = tpd->m_adm1[0];
a1[1] = tpd->m_adm1[1];
legit = true;
// if still not found, try the other
if ( ! legit && use && use == a2name && a1name ) {
use = a1name;
goto redo;
if ( ! legit && use && use == a1name && cityName ) {
use = cityName;
goto redo;
static long s_printed = 0;
// sanity check
if ( ! legit ) {
if ( ++s_printed < 100 )
log("places: bad adm1 for "
"zip=\"%s\" cityName=\"%s\" "
"adm1Name=\"%s\" adm2Name=\"%s\"",
zip, cityName,a1name,a2name);
// the two-letter adm1 in postalCodes.txt sometimes differs
// from those in allCountries.txt. like, for example,
// British Columbia has adm1 code of "02" in allCountries.txt
// but it is "bc" in postalCodes.txt.
// so let's hash the full adm1 name in postalCodes.txt in order
// to get the proper adm1 from allCountries.txt.
if ( ! a1name ) continue;
// hash the proper name of the adm1
long long HH = getWordXorHash ( a1name );
// skip if empty
if ( HH == 0 ) continue;
// now get state
long pos = getStateOffset ( &HH );
// skip if could not match it to an adm1 in allCountries.txt
// by the full name of the adm1
if ( pos < 0 ) { char *xx=NULL;*xx=0; }//continue;
// set it
ZipDesc zd;
//zd.m_crid = crid;
// set the state's bit. each state has its own unique bit
zd.m_adm1Bits = 1LL << pos;
zd.m_adm1[0] = a1[0];
zd.m_adm1[1] = a1[1];
zd.m_cityHash = getWordXorHash ( cityName );
// centroid lat/lon now
zd.m_latitude = 999.0;
zd.m_longitude = 999.0;
// sanity check
if ( ! zd.m_cityHash ) { char *xx=NULL;*xx=0; }
// offset to current position
long cityOffset = sb.length();
// store it
long cityNameLen = gbstrlen(cityName);
sb.safeMemcpy ( cityName , cityNameLen );
sb.safeMemcpy ( "\0", 1 ); // null terminate
// update zd
zd.m_cityOffset = cityOffset;
long long zh = getWordXorHash ( zip );
// skip if bad
if ( ! zh ) { badEntry++; continue; }
// sanity check
//if ( g_zips.isInTable ( &zh ) ) {
// // both willowbrook,Il and hinsdale,IL have the
// // same zip code!
// //char *xx=NULL;*xx=0; }
// continue;
// debug point
//if ( zh == 70799779105646092LL )
// log("hey");
if ( ! g_zips.addKey ( &zh , &zd ) ) return false;
// close that file
// now open zipcode.csv and add the lat/lon of each zip code
// from
sprintf ( ff , "%szipcode.csv", g_hostdb.m_dir );
logf(LOG_INFO,"places: reading %s",ff);
fd = fopen ( ff, "r" );
if ( ! fd )
return log("places: failed to open %s: %s",ff,strerror(errno));
line = 0;
// go through the zipcodes in zipcode.csv, one per line
while ( fgets ( buf , 10000 , fd ) ) {
// length of line, including the terminating \n
long wlen = gbstrlen(buf) ;
// sanity check
if ( wlen >= 9000 ) { char *xx=NULL;*xx=0; }
// skip if empty
if ( wlen <= 0 ) continue;
// null terminate it, instead of \n
// log it
if ( (line % 10000) == 0 )
log(LOG_INFO,"places: read line #%li",line);
// for debug
char *p = buf;
// lat is after 7th quote, lon is after 9th quote
long qcount = 0;
float latitude = 999.0;
float longitude = 999.0;
char *zip = NULL;
for ( ; *p ; p++ ) {
if ( *p == '\"' ) qcount++;
else continue;
if ( qcount == 1 ) zip = p+1;
if ( qcount == 7 ) latitude = atof (p+1);
if ( qcount == 9 ) longitude = atof (p+1);
if ( ! zip ) continue;
// must be numeric (disregard line 1 that has "zip")
if ( ! is_digit(zip[0]) ) continue;
// null term
if ( zip[6] != '\"' ) zip[6] = '\0';
else { char *xx=NULL;*xx=0; }
// look it up
long long zh = getWordXorHash ( zip );
// skip if bad
ZipDesc *zd = (ZipDesc *)g_zips.getValue ( &zh );
// must be there
if ( ! zd ) {
logf(LOG_INFO,"places: could not find zip %s",zip);
// set it
zd->m_latitude = latitude;
zd->m_longitude = longitude;
// scan all zips and make sure all have lat/lon
long missed = 0;
for ( long i = 0 ; i < g_zips.m_numSlotsUsed ; i++ ) {
// skip if empty bucket
if ( ! g_zips.m_flags[i] ) continue;
// get it
ZipDesc *zd = (ZipDesc *)g_zips.getValueFromSlot(i);
// check it
if ( zd->m_latitude == 999.0 ||
zd->m_longitude == 999.0 )
logf(LOG_INFO,"places: missed lat/lon for %li zipcodes",missed);
logf(LOG_INFO,"places: postalCodes.txt had %li bad entries.",
// convert the indicator count table into g_indicators for IND_NAME
// and add them into g_indicators now
for ( long i = 0 ; i < ct.m_numSlots ; i++ ) {
// skip if empty
if ( ct.m_flags[i] == 0 ) continue;
// this is a count table
long count = *(long *)ct.getValueFromSlot ( i );
// skip if not popular
if ( count < MIN_POP_COUNT ) continue;
// skip for now
// make into score
//float boost = 1.0 + (9.0 * (float)count / (float)MAX);
//float boost = 1.00;
// increment for every count
//for ( long j = 10 ; j < count ; j++ )
// boost *= 1.002;
// limit it to 1.5 for now...
//if ( boost > 1.5 ) boost = 1.5;
// get wid
//long long *wid = (long long *)ct.getKey ( i );
// . add it
// . use a boost of just 0.25 for now
//if(! addIndicator ( *wid , IND_NAME , 0.25 ) ) // boost ) )
// return log("places: failed to make indicators.");
// debug
//char *str = *(char **)st.getValue ( wid );
// show it
//logf (LOG_DEBUG,"events: top place %s boost=%.02f",
// str,boost);
// add the aliases
logf(LOG_INFO,"places: making aliases.dat");
// . abbreviations for popular cities
// . now we use the s_cityList array
long ncl = (long)sizeof(s_cityList)/ sizeof(AliasDesc);
for ( long i = 0 ; i < ncl ; i++ ) {
char *s1 = s_cityList[i].m_s1;
char *s2 = s_cityList[i].m_s2;
// use this now
uint64_t h1 = getWordXorHash(s1);
uint64_t h2 = getWordXorHash(s2);
// skip if the same
if ( h1 == h2 ) continue;
// sanity check
if ( h1 == 0 ) { char *xx=NULL;*xx=0; }
if ( h2 == 0 ) { char *xx=NULL;*xx=0; }
// get it
CityDesc *cdp2 = (CityDesc *)g_cities.getValue ( &h2 );
// must be there
if ( ! cdp2 ) { char *xx=NULL;*xx=0; }
// . add it as an alias for h2
// . will add to g_aliases table which maps our
// cityHash and adm1Str to the normalized cityHash
// . also adds to g_cities which maps a normalized city
// hash to a bit vector of states that contain a city
// by that name
addAlias ( s1 , s_cityList[i].m_adm1,h2,
// you know addAlias() now adds this junk to g_cities...!
// get our special cdp
CityDesc *cdp1 = (CityDesc *)g_cities.getValue ( &h1 );
// if not there, add one
if ( ! cdp1 ) {
// make CityDesc to add
CityDesc cd;
// . we choose most pop state for this alias
// . so "SF" has two entries in s_cityList and the
// "mostPopState" is "ca" for both
char *ss = s_cityList[i].m_mostPopStateAbbr;
// get this
StateDesc *tsd = getStateDesc(ss);
// convert to index
long si = tsd - &s_states[0];
// sanity
if ( si < 0 ) { char *xx=NULL;*xx=0; }
// store it
cd.m_mostPopularState = si;
// and the bits indicating states we are in
cd.m_adm1Bits = cdp2->m_adm1Bits;
if ( ! g_cities.addKey(&h1,&cd) ){ char*xx=NULL;*xx=0;}
// flag it as an alias so getCityId32() knows to
// look it up special...
//cd.m_adm1Bits |= 0x8000000000000000LL;
// then update bits
cdp1->m_adm1Bits |= cdp2->m_adm1Bits;
// save it
logf(LOG_INFO,"places: saving timezones.dat");
if ( ! ( g_hostdb.m_dir , "timezones.dat" ) )
return log("places: failed to save timezones.dat");
// save it
logf(LOG_INFO,"places: saving cities.dat");
if ( ! ( g_hostdb.m_dir , "cities.dat" ) )
return log("places: failed to save cities.dat");
logf(LOG_INFO,"places: saving aliases.dat");
if ( ! ( g_hostdb.m_dir , "aliases.dat" ) )
return log("places: failed to save aliases.dat");
logf(LOG_INFO,"places: saving zips.dat");
char *tbuf = sb.getBufStart();
long tbufSize = sb.length();
if ( ! ( g_hostdb.m_dir , "zips.dat",tbuf,tbufSize ) )
return log("places: failed to save zips.dat");
// let this memlose
g_cityBuf = tbuf;
g_cityBufSize = tbufSize;
// do not let "sb" free it
//sb.m_buf = NULL;
//if ( ! ( g_hostdb.m_dir, "indicators.dat" ) )
// return log("places: failed to save indicators.dat");
// LOAD THE planet-090421.osm file to get street names
// init indicator table
g_streets.set ( 7 , // keySize
0 ,
0 , // initial # slots
NULL , // initial buf
0 , // initial buf size
false , // allowDup keys?
0 ); // niceness
// load indicator table
if ( g_streets.load ( g_hostdb.m_dir , "streetnames.dat" ) )
return true;
// . open the unholy planet-090421.osm file to create streetnames.dat
// . see to
// explain a bit about this xml file
// .
// .
// .
sprintf ( ff , "%splanet-090421.osm", g_hostdb.m_dir );
logf(LOG_INFO,"places: reading %s",ff);
FILE *fd = fopen ( ff, "r" );
if ( ! fd )
return log("places: failed to open %s: %s",ff,strerror(errno));
return true;
// . string-keyed convenience wrapper: hashes "s" with hash64Lower_utf8()
//   and hands the hash to the hash-keyed addIndicator() overload
// . "bit" is the indicator type bit to set for this word
// . "indScore" is how much to boost the Place's score by if it has this
//   indicator
// . returns whatever the hash-keyed addIndicator() returns
bool addIndicator ( char *s , char bit , float indScore ) {
	// hash it
	long long h = hash64Lower_utf8 ( s , gbstrlen(s) );
	// register it under that hash
	return addIndicator ( h , bit , indScore );
}
// . register the indicator hash "h" in g_indicators with type bit "bit"
// . if "h" is already in the table we only OR "bit" into its existing
//   m_bit (its stored m_indScore is left untouched) and return true
// . otherwise we add a new IndDesc holding "bit" and "indScore" and return
//   whatever g_indicators.addKey() returns
bool addIndicator ( long long h , char bit , float indScore ) {
	// plaza is two types of indicator, street and name
	IndDesc *pid = (IndDesc *)g_indicators.getValue (&h);
	// if there, augment the bits
	if ( pid ) {
		pid->m_bit |= bit;
		return true;
	}
	// not in the table yet, so make a descriptor of our own
	IndDesc id;
	// set bit, should only be one
	id.m_bit      = bit;
	id.m_indScore = indScore;
	// add it. addKey() is handed a ptr to "id" (presumably it copies it)
	return g_indicators.addKey ( &h , &id ) ;
}
// "baseScore" should be event id
// . currently a no-op stub: none of the arguments are read and it always
//   returns true
bool Address::hash ( long         baseScore ,
		     HashTableX  *dt        ,
		     uint32_t     date      ,
		     Words       *words     ,
		     Phrases     *phrases   ,
		     SafeBuf     *pbuf      ,
		     HashTableX  *wts       ,
		     SafeBuf     *wbuf      ,
		     long         version   ,
		     long         niceness  ) {
	// nothing is hashed for now
	return true;
}
// . returns false and sets g_errno on error
//   (g_errno is presumably set by HashTableX::addKey(); it is not touched
//   directly in this function)
// . for every "good" address in m_am, serialize it into a 512 byte buffer
//   and add it to "dt" keyed by its placedb key(s)
// . "dt" must have 16 byte keys and 512 byte data values
// . "docId" and "siteHash32" are only used to form the venue key "k3";
//   the per-address keys are made from m_docId
// . "ip" is not referenced anywhere in the visible body
// . NOTE(review): the closing braces of the two "place name" if-bodies, of
//   the loop and of the function are not present in this text -- confirm
//   against the original file
bool Addresses::hashForPlacedb ( long long docId ,
long siteHash32 ,
long ip ,
HashTableX *dt ) {
// sanity check: deliberately core (null write) if the table's data size
// or key size is not what we write below
if ( dt->m_ds != 512 ) { char *xx=NULL;*xx=0; }
if ( dt->m_ks != 16 ) { char *xx=NULL;*xx=0; }
// ensure we allow dups because some streets are repeated on
// the page, but with different place names. see
//if ( ! dt->m_allowDups ) { char *xx=NULL;*xx=0; }
// now create the meta rdb list
for ( long i = 0 ; i < m_am.getNumPtrs() ; i++ ) {
// breathe
QUICKPOLL ( dt->m_niceness );
// get it
Address *a = (Address *)m_am.getPtr(i);
// skip if lat/lon
if ( a->m_flags3 & AF2_LATLON ) continue;
// is it good?
bool good = false;
// being inlined is awesome
if ( a->m_flags & AF_INLINED ) good = true;
// if the street is verified, add the whole thing too!
// even if the street num and place name are not verified.
if ( a->m_flags & AF_VERIFIED_STREET ) good = true;
// sometimes a street can exist in two cities or states
if ( a->m_flags & AF_AMBIGUOUS ) good = false;
// do not add addresses that have no street per se
if ( a->m_street->m_flags2 & PLF2_IS_NAME ) good = false;
// no intersections
if ( a->m_street->m_flags2 & PLF2_INTERSECTION ) good = false;
// . skip if not good
// . we no longer add non-inlined addresses cuz those are
// not as accurate. many pages have the street address
// too far from the city and state, and we use one from the
// tag and it ain't right.
// 4007 Menaul NE ~
// Between Washington and Carlisle ~
// 87110 ~
// with the tag:
// New Mexico Music Commission;;PO Box 1450;Santa Fe(nm);...
// caused it to get "Santa Fe" as the city
if ( ! good ) continue;
// not if ambiguous
//if ( a->m_flags & AF_AMBIGUOUS ) good = false;
// . skip if no zip
// . hmmm, a lot seem to be missing zip, so forget about it
//if ( ! a->m_zip ) continue;
// serialize into "buf"
char buf[513];
// reset it to all 0s
memset ( buf , 0 , 513 );
// convert to semicolon format, using at most 511 bytes of "buf"
long size = a->serialize ( buf , 511 , NULL , false , false);
// skip on error, probably > 511 bytes!
if ( size < 0 ) continue;
// make the key for this address (note: uses m_docId, not "docId")
key128_t k = a->makePlacedbKey ( m_docId , false,false );
// store it for getNamedbData() to use
// (this line only verifies that a->m_placedbKey already equals "k"
// and cores if it does not)
if ( a->m_placedbKey != k ) { char *xx=NULL; *xx=0; }
// if key already added, skip. assume the first one is better.
// has two different place names for Kimo Theater street addr
// will add the entire 512 bytes of buffer to this hash table
// so it is really up to XmlDoc::addTable128() to fix that
// when it creates the corresponding meta list. it will need
// to shrink that list
if ( ! dt->isInTable (&k) &&
! dt->addKey ( (char *)&k , buf ) ) return false;
// now if the name is verified, then use the hash of the
// name in place of the street hash
if ( a->m_flags & AF_VERIFIED_PLACE_NAME_1 ) {
// use that
key128_t k2 = a->makePlacedbKey ( m_docId,true,false);
// add again (no isInTable() check here, unlike for "k")
if ( ! dt->addKey ( (char *)&k2 , buf ) ) return false;
// same with place name 2
if ( a->m_flags & AF_VERIFIED_PLACE_NAME_2 ) {
// use that
key128_t k2 = a->makePlacedbKey ( m_docId,false,true);
// add again
if ( ! dt->addKey ( (char *)&k2 , buf ) ) return false;
// . skip if not a venue location for this venue website
if ( ! ( a->m_flags & AF_VENUE_DEFAULT ) ) continue;
// . do not do this now... key formation is setting del bit!
// . we do not really use this right now...
// . NOTE(review): the two lines above read as if the code below is
//   meant to be disabled, but no statement skipping it is visible
//   here -- confirm against the original file
// . add the address of the website itself!! a venue website!!
// . use siteHash32 as the top key
// . make the key: siteHash32 goes in the upper 32 bits of n1 and
//   docId, shifted up one bit, goes in n0
key128_t k3;
k3.n0 = 0LL;
k3.n1 = 0LL;
k3.n1 = siteHash32;
k3.n1 <<= 32;
k3.n0 = (docId<<1);
// add it
if ( ! dt->addKey((char *)&k3,buf)) return false;
return true;
#include "Placedb.h"
// . H = 48 bit hash of (streetname,ctryId,adm1,city)
// N = 16 bit hash of streetnum
// . placedb key format:
// H (48 bits) | N (16 bits) |docId(38bits) | delbit(1)
// . NOTE(review): the bit widths above do not match the code below, which
//   stores the full 64-bit hash "h" in n1 and builds n0 as
//   (25-bit street num hash << 39) | (docId << 1) | 1 -- confirm and
//   update whichever is stale
// . data = serialized address ( see setFromStr() function)
// . "streetname" should exclude any indicators
// . we determine the group responsible for this key by the 64 bit hash (H)
// alone... see Hostdb::getGroupId()
// . useName1/useName2: hash place name 1/2 instead of the street name
key128_t Address::makePlacedbKey (long long docId,bool useName1,bool useName2){
// the key we are setting
key128_t k;
// sanity check, must be 8 bits or less
//if ( m_adm1->m_crid > 255 ) { char *xx=NULL;*xx=0; }
// sanity
if ( m_cityId32 == 0 ) { char *xx=NULL;*xx=0; }
// save for sanity check. mask it to 25 bits
long snh = m_street->m_streetNumHash & 0x01ffffff;
// add in street name (not including indicators)
long long h = m_street->m_hash;
// . use place name 1 instead of street name?
// . we use this for when "Tingley Colesium" is given and no street!
if ( useName1 || useName2 ) {
// use the name hash in place of the street hash!!! HACK
// (if both flags are set, name 2 wins since it is assigned last)
if ( useName1 ) h = m_name1->m_hash;
if ( useName2 ) h = m_name2->m_hash;
// anytime we use a name as the street hash we have to
// xor in this to prevent a place name from matching
// a street name (see above)
h ^= 0x123456;
// and incorporate the street hash into the snh so that
// sendBackAddress() function's life is easier
snh ^= m_street->m_hash;
// mask it
snh &= 0x01ffffff;
// country id
//h = hash64 ( (long long)m_adm1.m_crid , h );
// adm1
// get the two-letter state abbreviation code (nm = new mexico)
// NOTE(review): the bare timestamp lines in this function are not C++;
// they look like stray annotation text and should be removed
2013-08-09 19:52:15 +04:00
char *adm1Str = NULL;
2013-08-03 00:12:24 +04:00
if ( m_adm1 ) adm1Str = m_adm1->m_adm1;
else if ( m_zip ) adm1Str = m_zip->m_adm1;
// unique cities like "Albuquerque" imply a state
//else if ( m_city && m_city->m_adm1[0] ) adm1Str = m_city->m_adm1;
else { char *xx=NULL;*xx=0; }
// fold the first two bytes of the adm1 string into the hash
h = hash64 ( (long long)(*(uint16_t *)adm1Str) , h );
// city
2013-08-09 19:52:15 +04:00
long long cityHash = 0LL;
2013-08-03 00:12:24 +04:00
if ( m_city ) cityHash = m_city->m_hash;
else if ( m_zip ) cityHash = m_zip->m_cityHash;
else { char *xx=NULL;*xx=0; }
// use the *city id* to deal with aliases of the same city
uint64_t cid64 = (uint64_t)getCityId32 ( cityHash , adm1Str );
// incorporate that into "h"
h = hash64 ( cid64 , h );
// store that in most significant long long
k.n1 = h;
// street num hash
long long n0 = snh;
// shift up for docid
n0 <<= 38;
// sanity
if ( (long)NUMDOCIDBITS != 38 ) { char *xx=NULL;*xx=0; }
// put that in
n0 |= docId;
// empty bit for del bit
n0 <<= 1;
// set the del bit to indicate a positive key
n0 |= 0x01;
// set
k.n0 = n0;
// sanity checks: the Placedb accessors must return what we packed
if ( g_placedb.getBigHash (&k) != h ) { char *xx=NULL;*xx=0; }
if ( g_placedb.getStreetNumHash(&k) != snh ) { char *xx=NULL;*xx=0; }
if ( g_placedb.getDocId (&k) != docId ) { char *xx=NULL;*xx=0; }
// return
return k;
// . similar to Address::serialize()
// . chains hash64() over the strings of the address's places, in this
//   order: name1, name2, suite, street, city, zip, adm1, ctry
// . for city, zip and adm1 the place's 2-byte m_adm1 is hashed in as well
// . a place whose m_str is NULL contributes nothing
// . returns the resulting 64-bit hash (0 if every place string is NULL)
long long Address::makeAddressVotingTableKey ( ) {
long long h = 0LL;
Place *d = NULL;
// incorporate place name into the hash
d = &m_name1;
if ( d->m_str ) h = hash64 ( d->m_str , d->m_strlen , h );
// and secondary name
d = &m_name2;
if ( d->m_str ) h = hash64 ( d->m_str , d->m_strlen , h );
// incorporate suite into the hash
d = &m_suite;
if ( d->m_str ) h = hash64 ( d->m_str , d->m_strlen , h );
// incorporate street into the hash
d = &m_street;
if ( d->m_str ) h = hash64 ( d->m_str , d->m_strlen , h );
// incorporate city into the hash
d = &m_city;
if ( d->m_str ) h = hash64 ( d->m_str , d->m_strlen , h );
// adm1 of the city
if ( d->m_str ) h = hash64 ( d->m_adm1 , 2 , h );
// incorporate zip into the hash
d = &m_zip;
if ( d->m_str ) h = hash64 ( d->m_str , d->m_strlen , h );
// adm1 as well
if ( d->m_str ) h = hash64 ( d->m_adm1 , 2 , h );
// incorporate adm1 into the hash
d = &m_adm1;
if ( d->m_str ) h = hash64 ( d->m_str , d->m_strlen , h );
// adm1 as well
if ( d->m_str ) h = hash64 ( d->m_adm1 , 2 , h );
// incorporate adm2 into the hash
//d = &m_adm2;
//if ( d->m_str ) h = hash64 ( d->m_str , d->m_strlen , h );
// adm1 as well
//if ( d->m_str ) h = hash64 ( d->m_adm1 , 2 , h );
// incorporate ctry into the hash
d = &m_ctry;
if ( d->m_str ) h = hash64 ( d->m_str , d->m_strlen , h );
return h;
// Msg2c : for verifying all the places/addresses
// . constructor: zeroes the reply/request counters and marks the
//   multicast array as not yet initialized
Msg2c::Msg2c() {
m_replies = 0;
m_requests = 0;
m_initializedInUse = false;
#include "Process.h"
// destructor: deliberately crashes if requests are still outstanding
// (m_replies != m_requests) unless the process is exiting
Msg2c::~Msg2c () {
// no destroying if still awaiting replies
if ( m_replies != m_requests && ! g_process.m_exiting ) {
char *xx=NULL;*xx=0; }
// . zeroes the reply counter and, if the multicasts were ever initialized,
//   loops over all MAX_ADDR_REQUESTS_OUT of them
// . NOTE(review): the body of the loop is not visible in this copy
void Msg2c::reset() {
m_replies = 0;
// all done if never initialized the multicasts
if ( ! m_initializedInUse ) return;
// shortcut
long max = (long)MAX_ADDR_REQUESTS_OUT;
// call DEstructors on multicasts
for ( long i = 0 ; i < max ; i++ ) {
// . sets Address::m_verified to 1 if verified
// . returns false if blocked
// . returns true and sets g_errno on error
// . and also sets the "avt" address verification table which we serialize
// into the TitleRec for re-parsing purposes later on, so we consistently
// re-parse
// . saves the arguments into members, resets the counters and address
//   cursor (m_i), then hands off to launchRequests()
// . "callback" is called with "state" once all replies are in, if we blocked
bool Msg2c::verifyAddresses ( Addresses *aa ,
//char *coll ,
collnum_t collnum ,
2013-08-03 00:12:24 +04:00
long domHash32 ,
long ip ,
long niceness ,
void *state ,
void (* callback)(void *state ) ) {
m_niceness = niceness;
m_addresses = aa;
m_collnum = collnum;
2013-08-03 00:12:24 +04:00
m_domHash32 = domHash32;
m_ip = ip;
m_callback = callback;
m_state = state;
// reset
m_errno = 0;
m_requests = 0;
m_replies = 0;
m_doneLaunching = false;
// reset address ptr
m_i = 0;
// all done if no addresses!
if ( m_addresses->m_am.getNumPtrs() == 0 ) return true;
// sanity check: the reply buffer must start out empty
if ( aa->m_sb.length() != 0 ) { char *xx=NULL; *xx=0; }
// . launch the requests
// . returns false if we are waiting for replies to come in
if ( ! launchRequests() ) return false;
// fill the m_sb buf with all replies
// did not block and all replies are in
return true;
// keep tabs on total out
static long s_totalOut = 0;
// . returns false if blocked, true otherwise
// . sets g_errno and returns true on error
// . sends one 0x2c request per address in m_addresses->m_am, keeping at most
//   maxOut requests outstanding; each request uses one of the
//   MAX_ADDR_REQUESTS_OUT request buffers / Multicast objects
// . request layout: placedbKey(16) | domHash32(4) | ip(4) | niceness(1) |
//   isName(1) | collnum | serialized address string
// . NOTE(review): the "loop:" label targeted by "goto loop", the increment
//   of m_i and the updates of m_requests/s_totalOut are not visible in this
//   copy of the function
bool Msg2c::launchRequests ( ) {
// clear it
g_errno = 0;
// how many max can be out?
long maxOut = (long)MAX_ADDR_REQUESTS_OUT;
// but be careful: throttle to one at a time once 200 are out overall
if ( s_totalOut >= 200 ) maxOut = 1;
// we are only built for one at a time since request buffer is static
//if ( (long)MAX_ADDR_REQUESTS != 1 ) { char *xx=NULL;*xx=0; }
// all done?
if ( m_i == m_addresses->m_am.getNumPtrs() )
m_doneLaunching = true;
// return true if nothing to launch
if ( m_doneLaunching )
return (m_requests == m_replies);
// don't bother if already got an error
if ( m_errno )
return (m_requests == m_replies);
// limit max to 5ish
if (m_requests-m_replies >= maxOut ) // MAX_ADDR_REQUESTS_OUT)
return (m_requests==m_replies);
// . limit total requests for better performance
// . does like 500,000 lookups. it would take
// like 30 seconds on a single test server. limiting to 50,000
// lookups it still takes 10 seconds on titan.
// . this limit doesn't affect any other pages in urls.txt - 11/18/11
if ( m_requests > 50000 ) {
if ( m_requests == m_replies )
// NOTE(review): the argument for %s is not visible in this copy
log("addr: limiting msg2c requests to 50000 for %s",
return (m_requests==m_replies);
// take a breath
Address *a = (Address *)m_addresses->m_am.getPtr(m_i);
// skip it
// assume not verified
a->m_replyFlags = 0;
// . skip if it is like "call for location"
// . no no no this is messing up "at the filling station" for
// 88688960-sea-the-invalid-mariner
//if ( a->m_street->m_flags2 & PLF2_AFTER_AT ) {
// // might be done
// if ( m_i == m_addresses->m_na ) m_doneLaunching = true;
// // try the next one
// goto loop;
// max size of request
//long max = 1024;
// request is startKey,endKey,pihash,niceness,coll
//char *requestBuf = a->m_requestBuf;
// prepare to get a request buf if we haven't already
if ( ! m_initializedInUse ) {
long max = (long)MAX_ADDR_REQUESTS_OUT;
// call constructors on multicasts
for ( long i = 0 ; i < max ; i++ ) {
// do not repeat
m_initializedInUse = true;
// get a request buf, assume none (-1)
long reqBufNum = -1;
// scan what we got
for ( long i = 0 ; i < MAX_ADDR_REQUESTS_OUT ; i++ ) {
// breathe
// skip if in use
if ( m_inUse[i] ) continue;
// and let caller know which one
reqBufNum = i;
// and stop
// panic! how did this happen?
if ( reqBufNum == -1 ) { char *xx=NULL;*xx=0; }
// claim it
m_inUse[reqBufNum] = 1;
// point to the junk
char *requestBuf = m_bigBuf[reqBufNum];
// store requestbuf # we did get
a->m_reqBufNum = reqBufNum;
// and store addr # (subtract one since we increment m_i above)
a->m_addrNum = m_i - 1;
// point to this
Multicast *m = &m_mcasts[reqBufNum];
// store it
char *p = requestBuf;
// store placedbKey
*(key128_t *)p = a->m_placedbKey; p += sizeof(key128_t);
// site hash
*(long *)p = m_domHash32; p += 4;
*(long *)p = m_ip ; p += 4;
// niceness, 1 byte
*(char *)p = m_niceness; p += 1;
// is the street really a place name in disguise? ("Tingley Colesium")
// NOTE(review): the masked flag is narrowed to a char here; this
// presumably relies on PLF2_IS_NAME fitting in the low 8 bits -- confirm
char isName = ( a->m_street->m_flags2 & PLF2_IS_NAME ) ;
*(char *)p = isName ; p += 1;
// collection
//long collSize = gbstrlen(m_coll) + 1;
//memcpy ( p , m_coll , collSize );
//p += collSize;
*(collnum_t *)p = m_collnum;
p += sizeof(collnum_t);
2013-08-03 00:12:24 +04:00
// end of it
char *pend = requestBuf + REQBUFSIZE; // s_requestBuf + max;
// . then the address string, semicolon separated, null terminated
// . like ";;5815 Wyoming Blvd NE;Albuquerque;nm;87109;;..."(see below)
// . returns -1 and sets g_errno on error
// . returns # of bytes written, including null terminator
long written = a->serialize ( p , pend - p , NULL , false , false );
// error?
if ( written == -1 ) {
m_errno = g_errno;
// unclaim
m_inUse[reqBufNum] = 0;
return (m_requests == m_replies);
// update our ptr
p += written;
// must be there
if ( written == 0 ) { char *xx=NULL;*xx=0; }
// ensure null terminated
if ( p[-1] != '\0' ) { char *xx=NULL;*xx=0; }
// size of it
long requestSize = p - requestBuf;
// sanity check for breach
if ( requestSize > REQBUFSIZE ) { char *xx=NULL;*xx=0; }
// . get group to handle it
// . each group is responsible for a specific streetname/ctry/city/adm1
// . Hostdb.cpp::getGroupId()
//unsigned long gid = getGroupId(RDB_PLACEDB,(char *)&a->m_placedbKey);
unsigned long shardNum;
shardNum = getShardNum (RDB_PLACEDB,(char *)&a->m_placedbKey);
2013-08-03 00:12:24 +04:00
// . pick a host within that group based on docid
// . base that on streetname hash i guess
// . but i would like to cache this using a biased cache
// . so we need to divide based on streetname hash
// . that is the most significant 16 bits of the placedb key
long numHosts = g_hostdb.getNumHostsPerShard();
2013-08-03 00:12:24 +04:00
// NOTE(review): if m_hash can be negative this modulo yields a negative
// index -- confirm m_hash's type/sign
long hostNum = a->m_street->m_hash % numHosts;
Host *group = g_hostdb.getShard ( shardNum );
2013-08-03 00:12:24 +04:00
// get host # "hostNum" in group "group" to send our request to
Host *h = &group [ hostNum ];
//long addrNum = m_i - 1;
// launch it
//Multicast *m = &m_mcast;
// this returns false and sets g_errno on error
if ( ! m->send ( requestBuf ,
requestSize ,
0x2c , // msgType
false , // multicast own request?
shardNum, // gid ,
2013-08-03 00:12:24 +04:00
false , // send to whole group?
0 , // key for selecting host (not used)
this , // state
(void *)a , // state2
gotMsg2cReplyWrapper ,
180 , // total timeout
m_niceness ,
false , // realtime udp
h->m_hostId ,
NULL,//&a->m_replyFlags , // replyBuf
0,//1 , // replyBufMaxSize
false )) { // freeReplyBuf?
// note it
m_errno = g_errno;
// return false if we are waiting on replies
return (m_requests == m_replies);
// keep tabs
// successfully launched
// launch another
goto loop;
// . Multicast callback for one 0x2c reply
// . state is the Msg2c, state2 is the Address the request was sent for
// . releases the request buffer, appends the reply to the Addresses reply
//   buffer, tries to launch more requests and, when none are outstanding,
//   calls the Msg2c callback with g_errno set from m_errno
// . NOTE(review): the reply/out counter updates announced by the first
//   comments are not visible in this copy
void gotMsg2cReplyWrapper ( void *state , void *state2 ) {
Msg2c *THIS = (Msg2c*)state;
// we got one
// back
// error?
if ( g_errno ) {
THIS->m_errno = g_errno;
log("addr: msg2c reply: %s",mstrerror(g_errno));
// cast this
Addresses *aa = THIS->m_addresses;
// point to the address we were working for
Address *a = (Address *)state2;
// what address # was it matching?
long addrNum = a->m_addrNum;
// and the reply buffer num for making available again
long reqBufNum = a->m_reqBufNum;
// sanity
if ( reqBufNum<0 || reqBufNum>=MAX_ADDR_REQUESTS_OUT ) {
char *xx=NULL; *xx=0; }
// make it available again
THIS->m_inUse[reqBufNum] = 0;
// test it
Multicast *m = &THIS->m_mcasts[reqBufNum];
long replySize , replyMaxSize; bool freeIt;
char *r = m->getBestReply (&replySize,&replyMaxSize,&freeIt);
// store reply into our cache (only if the request itself had no error)
if ( ! g_errno && ! aa->addToReplyBuf (r,replySize,addrNum)){
// sanity check
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
// set this
THIS->m_errno = g_errno;
// free that memory to stop the mem leak
// NOTE(review): "freeIt" returned by getBestReply() is not consulted
// before freeing -- confirm ownership of "r"
mfree ( r , replyMaxSize , "umsg2c" );
// test it
//if ( r && replySize != 1 ) { char *xx=NULL; *xx=0; }
// show it
//log("addr: got reply=%li replyaddr=0x%lx",(long)*r,(long)r);
// launchGetRequests() returns false if still waiting for replies...
if ( ! THIS->launchRequests() ) return;
// set g_errno for the callback
if ( THIS->m_errno ) g_errno = THIS->m_errno;
// fill the table
//THIS->allDone ( );
// otherwise, call callback
THIS->m_callback ( THIS->m_state );
// we then call Addresses::updateAddresses() to modify our m_addresses[]
// array with these replies!
// . appends one record to m_sb: addrNum (long) | replySize (long) | reply
// . a 0-byte reply is not stored at all
// . returns false if the buffer could not be grown or written to
bool Addresses::addToReplyBuf ( char *reply , long replySize , long addrNum ) {
// if nothing found in placedb lookup we get a 0 byte reply
if ( replySize == 0 ) return true;
// sanity
if ( addrNum < 0 || addrNum >= m_am.getNumPtrs()){char *xx=NULL;*xx=0;}
// if no room, make it 1.5 times bigger
// (the +4+4 accounts for the two longs pushed below)
if ( m_sb.m_length + replySize+4+4 > m_sb.m_capacity &&
! m_sb.reserve ( (long)(m_sb.m_capacity * 1.5 + 1000 ) ) ) {
log("addr: addtoreplybuf: %s",mstrerror(g_errno));
return false;
// store the address # this reply is for
if ( ! m_sb.pushLong ( addrNum ) ) return false;
// then reply stuff
if ( ! m_sb.pushLong ( replySize ) ) return false;
if ( ! m_sb.safeMemcpy ( reply , replySize ) ) return false;
return true;
// per-request state for the 0x2c handler: created in handleRequest2c(),
// used by gotList2c() / sendBackAddress(), which also delete it
class State2c {
// slot to send the reply on
UdpSlot *m_slot;
// reads the placedb list into m_list
Msg5 m_msg5;
// vote counters, turned into AF_VERIFIED_* reply flags in gotList2c()
long m_votesForStreet;
long m_votesForStreetNum;
long m_votesForPlaceName1;
long m_votesForPlaceName2;
// placedb records for the requested key range
RdbList m_list;
// requester's domain hash and ip, as copied from the request
long m_domHash32;
long m_ip;
// placedb key copied from the request
key128_t m_placedbKey;
long m_niceness;
// is the street really a place name in disguise? (Tingley Colesium)
char m_isName;
// point to the serialized Address (semicolon separated, null term'd)
char *m_addrStr;
// . handler for msg type 0x2c
// . parses the request written by Msg2c::launchRequests(), allocates a
//   State2c and reads the matching placedb list with Msg5; gotList2c() is
//   called with the state when the list is ready (directly if the read did
//   not block)
// . NOTE(review): the size guard only requires 4 bytes, but the code below
//   reads a key128_t, two longs, a flag byte, a collnum_t and a string from
//   the request -- a short request would be read out of bounds
void handleRequest2c ( UdpSlot *slot , long nicenessWTF ) {
// get the request
char *request = slot->m_readBuf;
long requestSize = slot->m_readBufSize;
// overflow protection for corrupt requests
if ( requestSize < 4 ) {
// NOTE(review): nothing visible here sets g_errno before it is sent
g_udpServer.sendErrorReply ( slot , g_errno );
// parse the request
char *p = request;
// do the lookup on disk (hopefully in cache or ssd!)
// make a new Msg5
State2c *st;
try { st = new (State2c); }
catch ( ... ) {
g_errno = ENOMEM;
// NOTE(review): sizeof() yields size_t but is printed with %i
log("msg2c: new(%i): %s", sizeof(State2c), mstrerror(g_errno));
return g_udpServer.sendErrorReply ( slot, g_errno );
mnew ( st , sizeof(State2c) , "hndl2c" );
// save slot for sending reply
st->m_slot = slot;
// extract placedb key from request
st->m_placedbKey = *(key128_t *)p; p += sizeof(key128_t);
// get key range
key128_t startKey = st->m_placedbKey;
key128_t endKey = st->m_placedbKey;
// sanity check
if ( startKey.n1 == 0LL ) { char *xx=NULL;*xx=0; }
if ( endKey.n1 == 0LL ) { char *xx=NULL;*xx=0; }
// now we also mask out the street num hash
// NOTE(review): this clears/sets the low 16 bits of n1, whereas
// Address::makePlacedbKey() stores the street num hash in n0 -- confirm
startKey.n1 &= 0xffffffffffff0000LL;
// and or that in for the endKey
endKey.n1 |= 0x000000000000ffffLL;
// mask out all but n1
startKey.n0 = 0x0000000000000000LL;
// or in lower bits for the endKey
endKey .n0 = 0xffffffffffffffffLL;
// domhash
st->m_domHash32 = *(long *)p; p += 4;
st->m_ip = *(long *)p; p += 4;
// get niceness
//long niceness = *(char *)p; p++;
// skip still though!!
// NOTE(review): the statement that skips the niceness byte is not
// visible in this copy
// this was messing up our niceness conversion algo
long niceness = slot->m_niceness;
// is the street really a place name in disguise? (Tingley Colesium)
st->m_isName = *(char *)p; p++;
// save it
st->m_niceness = niceness;
// get coll
//char *coll = p; p += gbstrlen(p) + 1;
collnum_t collnum = *(collnum_t *)p;
p += sizeof(collnum_t);
2013-08-03 00:12:24 +04:00
// the address string, semicolon separated, NULL terminated
st->m_addrStr = p; p += gbstrlen(p) + 1;
// . get from msg5, return if it blocked
// . will probably not block since in the disk page cache a lot
if ( ! st->m_msg5.getList ( RDB_PLACEDB ,
collnum ,
2013-08-03 00:12:24 +04:00
&st->m_list ,
(char *)&startKey ,
(char *)&endKey ,
100000 , // minRecSizes
true , // include tree?
false , // addtocache?
0 , // maxcacheage
0 , // startfilenum
-1 , // numFiles
st ,
gotList2c ,
niceness ,
true ))// do err correction?
// it did not block...
gotList2c( st , NULL , NULL );
// . called when the placedb list for a 0x2c request has been read
// . "state" is the State2c; the list/msg5 arguments are not used
// . if the request was for a place name (m_isName) it hands off to
//   sendBackAddress(); otherwise it scans the records from other sites/ips,
//   counts votes for our place names, and builds the reply:
//   numVotes (long) | lat (double) | lon (double) | flags (1 byte) |
//   zero or more [ score (long) | place name string with \0 ]
// . NOTE(review): several statements the comments refer to (record
//   skipping, the hadError label, table setup, vote counting for the
//   street/street number, updates of "need", and the final send) are not
//   visible in this copy of the function
void gotList2c ( void *state , RdbList *xxx , Msg5 *yyy ) {
// cast our state class
State2c *st = (State2c *)state;
// get this
UdpSlot *slot = st->m_slot;
// return right away if error getting the rec
if ( g_errno ) {
// loop back up here on error below as well
// all done with this
mdelete ( st , sizeof(State2c),"msg2cfr");
delete (st);
g_udpServer.sendErrorReply ( slot,g_errno );
// assume not good
st->m_votesForStreet = 0;
st->m_votesForStreetNum = 0;
st->m_votesForPlaceName1 = 0;
st->m_votesForPlaceName2 = 0;
// if request was looking up a *place name* and not a street
// then we do some different logic
if ( st->m_isName ) {
// caller needs a street address for the place
sendBackAddress ( st );
// get our street num hash
key128_t *pk = &st->m_placedbKey;
long long myBigHash = g_placedb.getBigHash(pk);
long myStreetNumHash = g_placedb.getStreetNumHash(pk);
// point to the place name
char *pn1 = st->m_addrStr;
// get the first semicolon
char *semi1 = pn1;
// scan for it
for ( ; *semi1 && *semi1 !=';' ; semi1++ );
// NULL term (temporarily cuts the request's address string in place)
*semi1 = '\0';
// skip leading "the"
if ( ! strncasecmp ( pn1, "the ", 4) ) pn1 += 4;
// get niceness
long niceness = st->m_niceness;
// make a vector of "longs" from the place name
long myvbuf1[50];
long mynv1 = makeSimpleWordVector ( pn1 , myvbuf1 , 50*4,niceness ) ;
if ( mynv1 == -1 ) goto hadError;
// do the same for the second name
char *pn2 = semi1 + 1;
// skip for it
char *semi2 = pn2;
// scan for it
for ( ; *semi2 && *semi2 !=';' ; semi2++ );
// NULL term
*semi2 = '\0';
// skip leading "the"
if ( ! strncasecmp ( pn2, "the ", 4) ) pn2 += 4;
// make vector of secondary place name
long myvbuf2[50];
long mynv2 = makeSimpleWordVector ( pn2 , myvbuf2 , 50*4,niceness ) ;
if ( mynv2 == -1 ) goto hadError;
//log("build: matching %s",pn1);
// each placedb record's place name in the list is hashed and
// stored in this table so we can accumulate votes. "voting table"
HashTableX vt;
char vtableBuf[5000];
// and likewise each hash has a ptr to the original string
// of the place name
HashTableX ptrTable;
char ptrBuf[5000];
// how much reply buf to allocate? need at least one byte for
// the original one byte reply of flags...
// now we also store the best lat and lon which are the two doubles,
// and the 4 bytes before for the # of votes for that lat/lon
long need = 1 + 4 + sizeof(double)*2;
// shortcut
RdbList *list = &st->m_list;
while ( ! list->isExhausted() ) {
// breathe
QUICKPOLL ( st->m_niceness );
// get it
char *data = list->getCurrentData();
// get the key
key128_t k; list->getCurrentKey(&k);
// skip it
// cast it
Address a2;
//Place places2[10];
//long np2 = 0;
PlaceMem pm;
char tmpbuf[7024];
pm.init ( 5000 ,10,10,tmpbuf,7024,0 );
// set "a"
setFromStr ( &a2, data, 0 , &pm ,st->m_niceness );
// must not be same site as us for better voting accuracy
if ( a2.m_domHash32 == st->m_domHash32 ) continue;
// and different ip from us, for better voting accuracy
if ( iptop(a2.m_ip) == iptop(st->m_ip) ) continue;
// valid ip sanity check
if ( a2.m_ip == 0 || a2.m_ip==-1 ) { char *xx=NULL; *xx=0; }
// sanity check: every record in the range must share our big hash
if (g_placedb.getBigHash(&k)!=myBigHash) {char*xx=NULL;*xx=0;}
// ok, now we have verified the street for sure
// get the street num hash of that record
long snh = g_placedb.getStreetNumHash ( &k );
// . does it match our street number?
// . i.e. the "15110" in "15110 Wyoming blvd"
if ( snh != myStreetNumHash ) continue;
// yes, another match
// build a vector for each of the two place names
// get place name
pn1 = data;
// get semi
semi1 = pn1;
// scan for it
for ( ; *semi1 && *semi1 !=';' ; semi1++ );
// NULL term
*semi1 = '\0';
// skip leading "the"
if ( ! strncasecmp ( pn1, "the ", 4) ) pn1 += 4;
// make its place name into a vector
long vbuf1[50];
long nvbuf1 ;
nvbuf1 = makeSimpleWordVector(pn1,vbuf1,50*4,st->m_niceness);
if ( nvbuf1 == -1 )
goto hadError;
// do the same for the second name
pn2 = semi1 + 1;
// skip for it
semi2 = pn2;
// scan for it
for ( ; *semi2 && *semi2 !=';' ; semi2++ );
// NULL term
*semi2 = '\0';
// skip leading "the"
if ( ! strncasecmp ( pn2, "the ", 4) ) pn2 += 4;
// make vector of secondary place name
long vbuf2[50];
long nvbuf2;
nvbuf2 = makeSimpleWordVector (pn2,vbuf2,50*4,st->m_niceness);
if ( nvbuf2 == -1)
goto hadError;
// undo the two in-place terminators written into the record
*semi1 = ';';
*semi2 = ';';
//log("build: matching %s vs %s",pn1,pn2);
// ok, compare the two vectors
float sim1 = computeSimilarity ( myvbuf1 ,
vbuf1 ,
st->m_niceness );
float sim2 = computeSimilarity ( myvbuf2 ,
vbuf2 ,
st->m_niceness );
// compare the secondary to primary, and vice versa
float sim3 = computeSimilarity ( myvbuf1 ,
vbuf2 ,
st->m_niceness );
float sim4 = computeSimilarity ( myvbuf2 ,
vbuf1 ,
st->m_niceness );
// now we also hash each word in each place name and
// store those two hashes into a table so we can score
// each place name of each placedb record. this allows us
// to ultimately set Address::m_placedbName1 and 2.
long h1 = hash32 ( (char *)vbuf1 , nvbuf1 * 4 , 0 );
long h2 = hash32 ( (char *)vbuf2 , nvbuf2 * 4 , 0 );
// . update max buf if its a new string
// . include one byte for the \0
// . include 4 bytes for preceding score
if ( h1 && ! vt.isInTable(&h1) ) {
// update what we allocate
// add to ptr table
if ( ! ptrTable.addKey ( &h1 , &pn1 ) ) goto hadError;
if ( h2 && h2!=h1 && ! vt.isInTable(&h2) ) {
// update what we allocate
// add to ptr table
if (! ptrTable.addKey ( &h2 , &pn2 ) ) goto hadError;
// add to voting table
if ( h1 && ! vt.addTerm32 ( &h1 ) ) goto hadError;
if ( h2 && h2 != h1 && ! vt.addTerm32 ( &h2 ) ) goto hadError;
// break here for now to figure it out!
//char *xx=NULL;*xx=0;
//log("build: matching sim=%.02f for %s vs %s",sim,pn1,pn2);
// skip this guy if not a match
if ( sim1 < 85.0 &&
sim2 < 85.0 &&
sim3 < 85.0 &&
sim4 < 85.0 ) continue;
// 85%+ is good enough to be a vote for
// (sim1/sim3 compare against our name 1, sim2/sim4 against our name 2)
if ( sim1 >= 85.0 ) st->m_votesForPlaceName1++;
if ( sim2 >= 85.0 ) st->m_votesForPlaceName2++;
if ( sim3 >= 85.0 ) st->m_votesForPlaceName1++;
if ( sim4 >= 85.0 ) st->m_votesForPlaceName2++;
// that is good enough
// set the reply: use the slot's tmp buf if it is big enough
char *reply = NULL;
if ( need < TMPBUFSIZE ) reply = slot->m_tmpBuf;
else reply = (char *)mmalloc ( need , "repbuf" );
if ( ! reply ) goto hadError;
char *rend = reply + need;
// reply is either 1 or 0
//char *reply = slot->m_tmpBuf;
// clear it
// NOTE(review): assumes the four AF_VERIFIED_* bits fit in 8 bits
uint8_t flags = 0;
// use flags
if ( st->m_votesForStreet ) flags |= AF_VERIFIED_STREET;
if ( st->m_votesForStreetNum ) flags |= AF_VERIFIED_STREET_NUM;
if ( st->m_votesForPlaceName1 ) flags |= AF_VERIFIED_PLACE_NAME_1;
if ( st->m_votesForPlaceName2 ) flags |= AF_VERIFIED_PLACE_NAME_2;
// sanity checks: a verified number implies a verified street, and a
// verified place name implies a verified street number
if ( (flags & AF_VERIFIED_STREET_NUM) &&
!(flags & AF_VERIFIED_STREET ) ) { char *xx=NULL;*xx=0; }
if ( (flags & AF_VERIFIED_PLACE_NAME_1) &&
!(flags & AF_VERIFIED_STREET_NUM ) ) { char *xx=NULL;*xx=0; }
if ( (flags & AF_VERIFIED_PLACE_NAME_2) &&
!(flags & AF_VERIFIED_STREET_NUM ) ) { char *xx=NULL;*xx=0; }
// point to the start of the reply buffer
char *rptr = reply ;
// now scan these placedb recs to find the most agreed upon lat/lon
// so that we do not trust the one on our page necessarily
double lat;
double lon;
long numVotes;
// need the street number hash so we only get lat/lon coords from
// addresses with the same street number as well as street
if ( ! getBestLatLon ( list , &lat, &lon , &numVotes, niceness ,
myStreetNumHash ) )
goto hadError;
// add that in
*(long *)rptr = numVotes; rptr += 4;
*(double *)rptr = lat; rptr += sizeof(double);
*(double *)rptr = lon; rptr += sizeof(double);
// then the 1 byte flag
*rptr = flags; rptr++;
// . now we store all the alternative place names and their vote count,
// as long as it was 2 or more. so scan the score table to find
// the hashes of the winners, then lookup the hashes of the winners
// in the ptr table, ptrTable, to get the string to send back.
// . we set Address::m_placedbNames to this string above when we
// process this reply
for ( long i = 0 ; i < vt.m_numSlots ; i++ ) {
// breathe
// skip empties
if ( vt.isEmpty(i) ) continue;
// get score
long score = vt.getScoreFromSlot ( i );
// skip if too small
if ( score <= 1 ) continue;
// get key
long key = *(long *)vt.getKeyFromSlot ( i );
// grab string
// NOTE(review): the result of getValue() is dereferenced before the
// null check below can run -- confirm it cannot return NULL here
char *str = *(char **)ptrTable.getValue ( &key );
// must be there
if ( ! str ) { char *xx=NULL;*xx=0; }
// skip if empty string... was it just "the "???
if ( ! *str ) continue;
// store score first
*(long *)rptr = score;
// skip it
rptr += 4;
// get length
long len = gbstrlen(str);
// store in reply buf, include \0
memcpy ( rptr , str , len + 1 );
// skip over
rptr += len + 1;
// sanity check (runs after the copy, so it detects rather than
// prevents an overrun)
if ( rptr > rend ) { char *xx=NULL;*xx=0; }
// the reply size may be less than what we allocated
long replySize = rptr - reply;
// set it
//if ( st->m_votes ) *reply = 1;
//else *reply = 0;
// all done with this
mdelete ( st , sizeof(State2c),"msg2cfr");
delete (st);
// send the 1 byte reply
// the msg2c request was asking for the address of a possible place name,
// like "Tingley Colesium", so this sends back the address
// . votes on the street-num hash of every record in st->m_list and picks
//   the record with the most votes (ties prefer one that has a lat/lon)
// . reply layout: numVotes (long) | bestLat (double) | bestLon (double) |
//   winning serialized address string with \0
// . deletes "st" on every visible exit path
// . NOTE(review): record skipping, list reset, the returns after the error
//   replies and the final send are not visible in this copy
void sendBackAddress ( State2c *st ) {
// shortcut
RdbList *list = &st->m_list;
// winning street address
char *winner = NULL;
long winnerSnh = 0;
// and max count
long max = 0;
// get this
UdpSlot *slot = st->m_slot;
// set myBigHash for comparing
key128_t *pk = &st->m_placedbKey;
long long myBigHash = g_placedb.getBigHash(pk);
// set up a little voting table
char vbuf[30000];
HashTableX vt;
vt.set ( 4 , 4 , 100 ,vbuf,30000,false,0 ,"addrvt");
while ( ! list->isExhausted() ) {
// breathe
QUICKPOLL ( st->m_niceness );
// get it
char *data = list->getCurrentData();
// get the key
key128_t k; list->getCurrentKey(&k);
// skip it
// cast it
//Address a2;
// set "a2"
//setFromStr ( &a2, data, 0 , st->m_niceness );
// must not be same site as us
//if ( a2.m_domHash32 == st->m_domHash32 ) continue;
// and different ip from us
//if ( iptop(a2.m_ip) == iptop(st->m_ip) ) continue;
// sanity check
if (g_placedb.getBigHash(&k)!=myBigHash) {char*xx=NULL;*xx=0;}
// now his key's street hash was replaced with his placename1
// hash, and (TODO) his street num hash was made to include
// his actual street name hash, so we can use this to make sure
// everyone agrees on the same street address
// get the street num hash of that record
long snh = g_placedb.getStreetNumHash ( &k );
// get his vote count, we take the max
if ( ! vt.addTerm32 ( &snh ) ) {
// all done with this
mdelete ( st , sizeof(State2c),"msg2cfr");
delete (st);
g_udpServer.sendErrorReply ( slot,g_errno );
// does this guy have a latitude/longitude in him?
char *pp = data;
// count out 10 semicolons to see
// NOTE(review): the scan has no '\0' check, so a record with fewer than
// 10 semicolons is read past its end
long scount = 0;
for ( ; scount < 10 ; pp++ )
if ( *pp == ';' ) scount++;
// check it out
bool hasLatLon = ( pp[1] != ';' );
// bad?
// NOTE(review): scount is always 10 after the loop above, so this
// check cannot fire
if ( scount < 5 ) { char *xx=NULL;*xx=0; }
// get his count
long score = vt.getScore32 ( &snh );
// new max?
if ( score < max ) continue;
// on tie, pref if has lat/lon
if ( score == max && ! hasLatLon ) continue;
// point to winning address then
winner = data;
// set this for loop below
winnerSnh = snh;
// for looping again, reset this, but only if we had a winner
double bestLat;
double bestLon;
long numVotes;
// NOTE(review): these three stay uninitialized when there is no winner,
// since getBestLatLon() is only called if "winner" is set
if ( winner && ! getBestLatLon ( list,
winnerSnh ) ) {
// all done with this
mdelete ( st , sizeof(State2c),"msg2cfr");
delete (st);
g_udpServer.sendErrorReply ( slot,g_errno );
// all done with this
// CRAP! the winner is referencing into this list which is in this
// state we are freeing!
// debug
//log("placedb: input=%s output=%s",st->m_addrStr,winner);
// if no winner, send empty reply
if ( ! winner ) {
mdelete ( st , sizeof(State2c),"msg2cfr");
delete (st);
long wlen = gbstrlen(winner);
// how can this be?
if ( wlen <= 1 ) { char *xx=NULL;*xx=0; }
// send winner back. add in extra for lat/lon
long need = wlen + 48;
// use the slot's tmp buf to hold the reply if we can
char *reply = slot->m_tmpBuf;
// make buf if we need to
if ( need > TMPBUFSIZE )
reply = (char *)mmalloc ( need , "msg2creply");
// return error on error
if ( ! reply ) {
mdelete ( st , sizeof(State2c),"msg2cfr");
delete (st);
g_udpServer.sendErrorReply ( slot,g_errno );
// now store here
char *p = reply;
*(long *)p = numVotes; p += 4;
*(double *)p = bestLat ; p += sizeof(double);
*(double *)p = bestLon ; p += sizeof(double);
// how much to copy, include \0
long bytes = wlen + 1;
// copy over all but lat and lon if there, includes last ';'
memcpy ( p , winner , bytes ); p += bytes;
// how big is reply?
long replySize = p - reply;
// sanity check
if ( replySize > need ) { char *xx=NULL;*xx=0; }
// free it last since winner points into it
mdelete ( st , sizeof(State2c),"msg2cfr");
delete (st);
// send back empty reply if no winner, strange!
// . scans "list" and finds the lat/lon pair that occurs most often among
//   the records whose street num hash equals winnerSnh (all records if
//   winnerSnh is 0)
// . *bestLat/*bestLon are left at NO_LATITUDE/NO_LONGITUDE and *numVotes
//   at 0 if no record carries both coordinates
// . *numVotes is the occurrence count of the winning pair
// . returns false and sets g_errno on error
// . NOTE(review): the list-pointer reset and the per-record skip that the
//   comments mention are not visible in this copy
bool getBestLatLon ( RdbList *list ,
double *bestLat ,
double *bestLon ,
long *numVotes ,
long niceness ,
long winnerSnh ) {
// reset ptr, since we did a loop above with it
// no best now
long bestScore = 0;
*bestLat = NO_LATITUDE;
*bestLon = NO_LONGITUDE;
*numVotes = 0;
// voting table for lat/lon
HashTableX gpsTable;
char gbuf[1024];
gpsTable.set ( 8 , 4 , 32 , gbuf , 1024 , false , niceness,"addrgps");
// now loop again looking for the best lat/lon of the winning street
while ( ! list->isExhausted() ) {
// breathe
QUICKPOLL ( niceness );
// get it
char *data = list->getCurrentData();
// need this now
//long dataSize = list->getCurrentDataSize();
// get the key
key128_t k; list->getCurrentKey(&k);
// skip it
// get the street num hash of that record
long snh = g_placedb.getStreetNumHash ( &k );
// skip if not winner
if ( winnerSnh && snh != winnerSnh ) continue;
// grab it from the string (TODO: use this for above too!)
// NOTE(review): lat/lon are not initialized here; this relies on
// getLatLonFromStr() always writing both -- confirm
double lat;
double lon;
getLatLonFromStr ( data , &lat , &lon );
// skip if either not there
if ( lat == NO_LATITUDE ) continue;
if ( lon == NO_LONGITUDE ) continue;
// sanity check
if ( sizeof(double) != 8 ) { char *xx=NULL;*xx=0; }
// get hash for them: combine the raw bit patterns of the two doubles
long long h1 = *(long long *)&lat;
long long h2 = *(long long *)&lon;
long long h = (h1<<1) ^ h2;
// add to table
if ( ! gpsTable.addTerm ( &h ) )
return false;
// get score
long score = gpsTable.getScore ( &h );
// skip if not best (ties keep the earlier winner)
if ( score <= bestScore ) continue;
// otherwise set it
*bestLat = lat;
*bestLon = lon;
bestScore = score;
*numVotes = bestScore;
return true;
// . extracts the country id from a ';'-separated address string
// . the country abbreviation is the field that follows the 7th ';'
// . an empty country field is assumed to mean the US; a string that ends
//   before the 7th ';' has no country field at all and is treated the same
//   way instead of scanning past the terminating \0
uint8_t getCountryIdFromAddrStr ( char *addr ) {
	char *p = addr;
	long scount = 0;
	// skip 7 fields, but never run off the end of the string
	for ( ; *p && scount < 7 ; p++ )
		if ( *p == ';' ) scount++;
	// truncated string: no country field present
	if ( scount < 7 ) return CRID_US;
	// empty? assume US then
	if ( *p == ';' ) return CRID_US;
	// map abbr to crid
	return getCountryId ( p );
}
// . returns a pointer to the latitude field of a ';'-separated record
// . the latitude is the field that follows the 11th ';'
// . if the string ends before the 11th ';' a pointer to its terminating \0
//   is returned instead of scanning past the end of the buffer
char *getLatLonPtrFromStr ( char *data ) {
	char *latitudePtr = data;
	long scount = 0;
	// skip the 11 leading fields
	for ( ; *latitudePtr && scount < 11 ; latitudePtr++ )
		if ( *latitudePtr == ';' ) scount++;
	// pts past that ';' (or at the \0 of a truncated record)
	return latitudePtr;
}
void getLatLonFromStr ( char *data, double *lat, double *lon ) {
// set lat long
// now point to latitude,longitude
// skip city,state,zip,something,hash,ip
char *latitudePtr = getLatLonPtrFromStr ( data );
// find end of it
char *latitudeEnd = latitudePtr;
// this may not be incremented at all if we have no latitude
for ( ; *latitudeEnd != ';' ; latitudeEnd++ );
// if we had something, then assign it
if ( *latitudePtr != ';' )
*lat = atod2(latitudePtr,latitudeEnd-latitudePtr);
// skip to l
char *longitudePtr = latitudeEnd + 1;
// need this now
//char *dataEnd = data + dataSize;
// this may not be incremented at all if we have no latitude
char *longitudeEnd = longitudePtr;
// this may not be incremented at all if we have no latitude
for ( ; *longitudeEnd && *longitudeEnd != ';' ; longitudeEnd++ );
// . this is the last item so it is already \0 terminated
// . sometimes is not \0 terminated because it is a sequence of
// replies serialized into our reply buffer, m_sb
if ( *longitudePtr && *longitudePtr != ';' )
*lon = atod2(longitudePtr,longitudeEnd-longitudePtr);
// sanity check
if ( *lon == 0.0 || *lat == 0.0 ) {
log("addr: bad 0.0 lon or lat");
// . qsort-style comparator over an array of Place pointers
// . used by Events.cpp and by Dates.cpp
// . orders places by their starting word position (m_a)
// . when two places start on the same word, the one whose m_address is set
//   sorts first (it came from an inlined or verified address), then the one
//   whose m_alias is set
// . the tie-break is symmetric: if both or neither place has the attribute
//   it does not decide the order, so cmp(a,b) and cmp(b,a) always agree
int streetcmp ( const void *arg1 , const void *arg2 ) {
	// get the places
	Place *street1 = *(Place **)arg1;
	Place *street2 = *(Place **)arg2;
	// get word position
	long a1 = street1->m_a;
	long a2 = street2->m_a;
	// tied on position?
	if ( a1 == a2 ) {
		bool addr1 = ( street1->m_address != NULL );
		bool addr2 = ( street2->m_address != NULL );
		if ( addr1 != addr2 ) return addr1 ? -1 : 1;
		bool alias1 = ( street1->m_alias != NULL );
		bool alias2 = ( street2->m_alias != NULL );
		if ( alias1 != alias2 ) return alias1 ? -1 : 1;
		return 0;
	}
	// sanity check
	if ( a1 < 0 ) { char *xx=NULL;*xx=0; }
	if ( a2 < 0 ) { char *xx=NULL;*xx=0; }
	// compare without narrowing a long difference into an int
	return ( a1 < a2 ) ? -1 : 1;
}
// . allow "store hours" addresses to telescope up without limit
// . only store streets now that have PLF2_AFTER_AT set, or are a street
// name like "404 John NE"
// . and store streets in addresses that have verified street, name1 or name2
// OR are inlined
// . fills m_sorted[] with the qualifying street Places taken from m_sm,
// sorts them by word position using streetcmp(), drops place names that
// duplicate the name of the address that follows them, and finally sets
// m_firstPlaceNum in each Section to the m_sorted[] index of the first
// place that section contains
// . returns false and sets g_errno on error
bool Addresses::setFirstPlaceNums ( ) {
// no double calls
//if ( m_sorted ) { char *xx=NULL;*xx=0; }
// a previous call's array is freed and rebuilt from scratch
if ( m_sorted ) {
mfree ( m_sorted , m_sortedSize , "asortbuf");
m_sorted = NULL;
m_sortedValid = false;
//char sbuf[10000];
// set the sorted[] array which consists of addresses
// sorted by their street position, or in the
// case if PLF2_IS_NAME addresses, their place name 1 position
//Place **sorted = (Place **)sbuf;
// how much space do we need?
// NOTE(review): each slot holds a Place * but the element size is
// hard-coded as 4 bytes here and in the gbqsort() call below; that
// presumably assumes 32-bit pointers -- confirm, or use sizeof(Place *)
long need = (m_am.getNumPtrs() + m_sm.getNumPtrs())* 4;
// alloc if we need to
m_sorted = (Place **)mmalloc(need,"getaddrtab");
if ( ! m_sorted ) return false;
m_sortedValid = true;
// store for freeing
m_sortedSize = need;
// reset count
m_numSorted = 0;
// add streets from m_streets[]
// lasta1 is the start word of the last street added, used to skip
// consecutive entries that begin on the same word
long lasta1 = -1;
for ( long i = 0 ; i < m_sm.getNumPtrs() ; i++ ) {
// give up control
// get streets #i
Place *street = (Place *)m_sm.getPtr(i);
// skip if po box. causes us to miss setting DF_STORE_HOURS
// for a date because there is a PO box as well as the
// bldg street address in the "store hours" section.
if ( street->m_flags2 & PLF2_IS_POBOX ) continue;
// is the street name really a place name?
bool isName = ( street->m_flags2 & PLF2_IS_NAME );
// assume not a good place
bool good = false;
// is our street really a place name
if ( street->m_flags2 & PLF2_AFTER_AT ) good = true;
// intersections are good
if ( street->m_flags2 & PLF2_INTERSECTION ) good = true;
// if it is a verified place name, allow it through too!
Address *aa = street->m_address;
if ( aa ) {
if ( aa->m_flags&AF_VERIFIED_PLACE_NAME_1) good = true;
if ( aa->m_flags&AF_VERIFIED_PLACE_NAME_2) good = true;
// . allow an aliases street name to be ok
// . helps fix invalid mariner url even though
// "The Filling Station" is really after an at... but we
// were not picking that up before because of another bug
// which is now fixed.
if ( street->m_alias ) good = true; // afterAt = true;
// get the address or the alias, whichever is non-NULL, if any
Address *ax = aa;
if ( ! ax ) ax = street->m_alias;
// sometimes we re-nege on our lat lon address we added because
// it was ambiguous because their were multiple lat/lon pairs
// and we didn't know which one was right. we really should
// delete them i guess up there but i am not sure they were
// last on stack? this is for addresses that are like
// after at like "at Norquay" and they have a latlon only
// flag...
if ( ax && (ax->m_flags3 & AF2_LATLON) ) {
// make sure lat/lon is not AMBIG_LATITUDE
if ( ax->m_latitude == AMBIG_LATITUDE ||
ax->m_longitude == AMBIG_LONGITUDE ||
ax->m_latitude == NO_LATITUDE ||
ax->m_longitude == NO_LONGITUDE )
// is not a name, that's good!
if ( ! isName ) good = true;
// must have address or be after at OR it must be a
// street name like "400 John NE"
if ( ! good ) continue;
// skip if it is a place to buy tickets and not really
// an actual event place
//if ( street->m_flags2 & PLF2_TICKET_PLACE ) continue;
// do add po box addresses, the above loop will just
// disqualify the event if this is the best address for it!
//if ( street->m_flags2 & PLF2_IS_POBOX ) continue;
// get the street name word range
long a1 = street->m_a;
long b1 = street->m_b;
// sanity check
if ( a1 < 0 || b1 < 0 ) { char *xx=NULL;*xx=0; }
// stop dups
if ( a1 == lasta1 ) continue;
// update
lasta1 = a1;
// add it
m_sorted[m_numSorted++] = street;
// . now sort the array by the street/name word start number
// . i.e. sort streets by their position on the page
// . in case of ties prefers the street with m_address set, because
// that indicates it came from an inlined or verified address
gbqsort ( m_sorted , m_numSorted , 4 , streetcmp , m_niceness );
// . remove duplicate places
// . fix "classes at Blue Tribe School. contact tammy.
// School 111 Maple SE Abq NM" for
// . basically an address can have a place name and a street
// and our streets array treats both kinds separately, so we
// have to detect if what we think is a different place name
// is really the place name of a street name here
// . the array is compacted in place: numSorted3 counts the survivors
long numSorted3 = 0;
for ( long i = 0 ; i < m_numSorted - 1 ; i++ ) {
// give up control
// get address #i
Place *street = m_sorted[i];
// get next
Place *next = m_sorted[i+1];
// re-add "street"
bool add = false;
// we must be after at
if ( ! ( street->m_flags2 & PLF2_AFTER_AT ) ) add = true;
// and he must be a regular street
if ( next->m_flags2 & PLF2_AFTER_AT ) add = true;
if ( next->m_flags2 & PLF2_IS_NAME ) add = true;
// and must be kinda close together
if ( next->m_alnumA - street->m_alnumA > 10 ) add = true;
// fix "Grants Middle Schoole ... 111 Easterday NE" for
// because we get two places for that one address.
// one place is "Grants Middle School" as a fake street place
// name, and the other is the address with the actual street
// which also incorporates the same "Grants Middle School" as
// its name... so stop that!
if ( next->m_address &&
next->m_address->m_name1 &&
next->m_address->m_name1->m_a == street->m_a )
add = false;
if ( next->m_address &&
next->m_address->m_name2 &&
next->m_address->m_name2->m_a == street->m_a )
add = false;
// ok, ignore us!
if ( ! add ) continue;
// re-add it
m_sorted[numSorted3++] = street;
// last one
// the loop above stops one short, so the final entry is always kept
if ( m_numSorted > 0 )
m_sorted[numSorted3++] = m_sorted[m_numSorted-1];
// replace with the smaller deduped number
m_numSorted = numSorted3;
// clear all in case of re-call
for ( long i = 0 ; i < m_sections->m_numSections ; i++ ) {
QUICKPOLL ( m_niceness );
Section *sn = &m_sections->m_sections[i];
sn->m_firstPlaceNum = -1;
// loop over streets in sorted[] and hash their sections
long lasta = -1;
for ( long i = 0 ; i < m_numSorted ; i++ ) {
// give up control
// get address #i
Place *street = m_sorted[i];
// get word position, word #a
long a = street->m_a;
if ( a == lasta ) continue;
lasta = a;
if ( a < 0 ) { char *xx=NULL;*xx=0; }
// get section
Section *sa = m_sections->m_sectionPtrs[a];
// telescope up
for ( ; sa ; sa = sa->m_parent ) {
// breathe
// stop if already has one
if ( sa->m_firstPlaceNum >= 0 ) break;
// we are the first place contained by this section
sa->m_firstPlaceNum = i;
// dbug
long b = street->m_b;
SafeBuf tmp;
char *start = m_wptrs[a];
char *end = m_wptrs[b-1]+m_words->m_wordLens[b-1];
Section **sp = m_sections->m_sectionPtrs;
long sa = -1;
long aa = -1;
if ( street->m_address ) sa = street->m_address->m_street->m_a;
if ( street->m_alias ) sa = street->m_alias->m_street->m_a;
log("dbug: (a=%li,b=%li) sec=%lx %s addr=%li alias=%li "
tmp.getBufStart() ,
return true;
// . returns false and sets g_errno on error
// . "i" is the word position of "and" or "&"
// . "alnumPos" seeds the alnum word counters (ap1/ap2) that are adjusted
// while scanning away from word #i
// . scans up to 14 words to the left and to the right of word #i looking
// for a street name on each side; if both sides qualify, one Place of
// type PT_STREET flagged PLF2_INTERSECTION and spanning both names is
// allocated from m_sm
// . returns true without adding anything when no intersection is found
bool Addresses::addIntersection ( long i , long alnumPos ) {
//if ( m_ns >= MAX_STREETS ) return true;
// shared by both scans so the two street names agree on capitalization
bool hadUpper = false;
// to the LEFT of the "and"
long good1 = -1;
long j1 = i;
long numPos1 = -1;
long lastBeforeNum1 = -1;
long routePos1 = -1;
long ap1 = alnumPos;
long dirCount1 = 0;
long wcount1 = 0;
long icount1 = 0;
bool firstWord = true;
long long lastWid1 = 0LL;
bool explicit1 = false;
bool hadPage1 = false;
bool lastWasStreetInd = false;
bool badLeftStreetEnd = false;
// do not back up past this
long minj = i - 14; if ( minj < 0 ) minj = 0;
// now back up to the left, see if that is a street
for ( long j = i - 1 ; j >= minj ; j-- ) {
// breathe
QUICKPOLL ( m_niceness );
// count it
if ( m_wids[j] ) ap1--;
// between is a total killer!
if ( m_wids[j] == h_between ) return true;
if ( m_wids[j] == h_btwn ) return true;
if ( m_wids[j] == h_bet ) return true;
// try this out
if ( ! isInStreet( j ) ) break;
// if not alnum word, keep going
if ( ! m_wids[j] ) continue;
// detect "corner of the page"
if ( m_wids[j] == h_page ) hadPage1 = true;
// lastWid1 is the alnum word to the right of word #j since we are
// scanning leftwards
if ( m_wids[j] == h_intersection && lastWid1 == h_of ) {
explicit1 = true;
// include "intersection of" so it is not in name
good1 = j;
if ( m_wids[j] == h_corner && lastWid1 == h_of ) {
// ignore "corner of the page"
if ( hadPage1 ) return true;
explicit1 = true;
// include "corner of" so it is not in name
good1 = j;
// save it
bool saved3 = lastWasStreetInd;
// reset this
lastWasStreetInd = false;
// first word we encounter must be a directional or
// street indicator
if ( firstWord ) {
firstWord = false;
IndDesc *id;
id=(IndDesc *)g_indicators.getValue(&m_wids[j]);
bool ok = false;
if ( id && (id->m_bit & IND_DIR ) &&
// must have space or comma before us to prevent
// "tom's and jerry's"
j>0 &&
( is_wspace_a(m_wptrs[j][-1]) ||
m_wptrs[j][-1]==',') ) {
ok = true;
if ( id && (id->m_bit & IND_STREET) ) {
lastWasStreetInd = true;
ok = true;
// "14th and W St. NW" for
// "i-25 & hwy 301"
if ( is_digit(m_wptrs[j][0]) &&
// fix "21+ & I.D. Required" for
is_alnum_a(m_wptrs[j][m_wlens[j]-1]) )
ok = true;
// otherwise, stop on any other word
if ( ! ok ) {
badLeftStreetEnd = true;
bool isNum = false;
// this is good "4th and 5th"
// i.e. a word starting with a digit and ending in st/nd/rd/th
if ( is_digit(m_wptrs[j][0]) && m_wlens[j] >= 3 &&
m_wptrs[j][m_wlens[j]-2] == 's' &&
m_wptrs[j][m_wlens[j]-1] == 't' )
good1 = j;
else if ( is_digit(m_wptrs[j][0]) && m_wlens[j] >= 3 &&
m_wptrs[j][m_wlens[j]-2] == 'n' &&
m_wptrs[j][m_wlens[j]-1] == 'd' )
good1 = j;
else if ( is_digit(m_wptrs[j][0]) && m_wlens[j] >= 3 &&
m_wptrs[j][m_wlens[j]-2] == 'r' &&
m_wptrs[j][m_wlens[j]-1] == 'd' )
good1 = j;
else if ( is_digit(m_wptrs[j][0]) && m_wlens[j] >= 3 &&
m_wptrs[j][m_wlens[j]-2] == 't' &&
m_wptrs[j][m_wlens[j]-1] == 'h' )
good1 = j;
// numbers not allowed unless after "route", etc.
else if ( m_words->isNum(j) ) {
numPos1 = j;
isNum = true;
// allow "79 st & shore rd" for
if ( isNum && saved3 ) good1 = j;
// record this
if ( numPos1 == -1 ) lastBeforeNum1 = j;
// this one too
if ( m_wids[j] == h_route ) routePos1 = j;
if ( m_wids[j] == h_rte ) routePos1 = j;
if ( m_wids[j] == h_rt ) routePos1 = j;
if ( m_wids[j] == h_hwy ) routePos1 = j;
if ( m_wids[j] == h_highway ) routePos1 = j;
if ( m_wids[j] == h_hiway ) routePos1 = j;
if ( m_wids[j] == h_road ) routePos1 = j;
if ( m_wids[j] == h_rd ) routePos1 = j;
// "Locatd on US 64 and New Mexico Highway X"
if ( m_wids[j] == h_us ) routePos1 = j;
if ( m_wids[j] == h_interstate ) routePos1 = j;
if ( m_wids[j] == h_i ) routePos1 = j;
// stop if word after the number is not a route
if ( ! isNum && numPos1 >= 0 && routePos1 == -1 )
// save it
lastWid1 = m_wids[j];
// no mixing caps
if ( s_lc.isInTable ( &m_wids[j] ) ) continue;
// cap?
if ( is_upper_utf8(m_wptrs[j]) ) hadUpper = true;
// do not include a lower case guy
else if ( hadUpper && is_lower_utf8(m_wptrs[j]) )
// count it
// note it
j1 = j;
// scan to left looking for "corner of" etc
long minsj = j1 - 10; if ( minsj < 0 ) minsj = 0;
bool hadOf = false;
for ( long sj = j1 - 1 ; sj > minsj ; sj-- ) {
// skip tags etc
if ( ! m_wids[sj] ) continue;
// of is ok
if ( m_wids[sj] == h_of ) { hadOf = true; continue; }
// bad if no of
if ( ! hadOf ) break;
// corner of intersection of
if ( m_wids[sj] != h_intersection &&
m_wids[sj] != h_corner )
explicit1 = true;
if ( badLeftStreetEnd && ! explicit1 ) return true;
// . return if only indicator in street name.
// . fixes "NE and NW parts of Metro Atlanta."
if ( ! explicit1 && dirCount1 == wcount1 ) return true;
// reset it to before the pure number if no "route" before number
if ( ! explicit1 && numPos1 >= 0 && routePos1 != numPos1 - 2 ) {
j1 = lastBeforeNum1;
// if negative give up!
if ( j1 < 0 ) return true;
// use good1 if we had that!
if ( good1 >= 0 && good1 < j1 )
j1 = good1;
// return if no street to the left
if ( j1 == i ) return true;
// to the right of the "and"
long icount2 = 0;
long dirCount2 = 0;
long wcount2 = 0;
long j2 = i;
bool hadStreetInd = false;
bool hadDirInd = false;
long numPos2 = -1;
long lastBeforeNum2 = -1;
long routePos2 = -1;
long ap2 = alnumPos;
bool hadCornerDirInd2 = false;
bool firstWord2 = true;
// do not exceed this
long maxj = i + 14; if ( maxj > m_nw ) maxj = m_nw;
// need a street to the right as well
for ( long j = i + 1 ; j < maxj ; j++ ) {
// breathe
QUICKPOLL ( m_niceness );
// count it
if ( m_wids[j] ) ap2++;
// try this out
if ( ! isInStreet( j ) ) break;
// skip if not alnum at this point
if ( ! m_wids[j] ) continue;
bool savedFirstWord2 = firstWord2;
if ( firstWord2 ) firstWord2 = false;
// if we hit a street indicator, only a dir can follow
IndDesc *id=(IndDesc *)g_indicators.getValue(&m_wids[j]);
if ( id && (id->m_bit & IND_STREET) && ! savedFirstWord2 ) {
hadStreetInd = true;
good2 = true;
else if ( id && (id->m_bit & IND_DIR ) ) {
hadDirInd = true;
// fix "Central Ave SE and Richmond SE Albuquerque"
if ( m_wlens[j] == 2 )
hadCornerDirInd2 = true;
good2 = true;
else if ( hadStreetInd || hadCornerDirInd2 )
// this is good "4th and 5th"
if ( is_digit(m_wptrs[j][0]) && m_wlens[j] >= 3 &&
m_wptrs[j][m_wlens[j]-2] == 's' &&
m_wptrs[j][m_wlens[j]-1] == 't' )
good2 = true;
else if ( is_digit(m_wptrs[j][0]) && m_wlens[j] >= 3 &&
m_wptrs[j][m_wlens[j]-2] == 'n' &&
m_wptrs[j][m_wlens[j]-1] == 'd' )
good2 = true;
else if ( is_digit(m_wptrs[j][0]) && m_wlens[j] >= 3 &&
m_wptrs[j][m_wlens[j]-2] == 'r' &&
m_wptrs[j][m_wlens[j]-1] == 'd' )
good2 = true;
else if ( is_digit(m_wptrs[j][0]) && m_wlens[j] >= 3 &&
m_wptrs[j][m_wlens[j]-2] == 't' &&
m_wptrs[j][m_wlens[j]-1] == 'h' )
good2 = true;
// numbers not allowed unless after "route", etc.
else if ( m_words->isNum(j) ) {
numPos2 = j;
// stop if had no route
if ( routePos2 == -1 ) break;
// fix for 14th and Curtis Denver CO
if ( cityAdm1Follows ( j ) ) {
good2 = true;
// record this
if ( numPos2 == -1 ) lastBeforeNum2 = j;
// this one too
if ( m_wids[j] == h_route ) routePos2 = j;
if ( m_wids[j] == h_rte ) routePos2 = j;
if ( m_wids[j] == h_rt ) routePos2 = j;
if ( m_wids[j] == h_hwy ) routePos2 = j;
if ( m_wids[j] == h_highway ) routePos2 = j;
if ( m_wids[j] == h_hiway ) routePos2 = j;
if ( m_wids[j] == h_road ) routePos2 = j;
if ( m_wids[j] == h_rd ) routePos2 = j;
// "Locatd on US 64 and New Mexico Highway X"
if ( m_wids[j] == h_us ) routePos2 = j;
if ( m_wids[j] == h_interstate ) routePos2 = j;
if ( m_wids[j] == h_i ) routePos2 = j;
// no mixing caps
if ( s_lc.isInTable ( &m_wids[j] ) ) continue;
// cap?
if ( is_upper_utf8(m_wptrs[j]) ) hadUpper = true;
// do not include a lower case guy
else if ( hadUpper && is_lower_utf8(m_wptrs[j]) )
// count it
// note it
j2 = j;
// reset it to before the pure number if no "route" before number
if ( numPos2 >= 0 && routePos2 != numPos2 - 2 ) {
j2 = lastBeforeNum2;
// if negative give up!
if ( j2 < 0 ) return true;
// fix "First Nations North and South" and
// "Broadway South East and North East"
if ( ! explicit1 && wcount2 == dirCount2 ) return true;
// trim after the "route x"
if ( numPos2 == routePos2 + 2 )
j2 = numPos2;
// return if no street to the right
if ( j2 == i ) return true;
// these are indicative of good street names
if ( routePos2 >= 0 ) good2 = true;
// no need for street indicator on right street if we have
// "intersection of" or whatever to left of left street
// need to have a "good" street name in there
if ( ! explicit1 && ! good2 ) return true;
// the new place covers words [a,b): from the start of the left street
// through the end of the right street
long a = j1;
long b = j2+1;
// . no starting/ending with stop word
// . i-25 is exception!
if ( m_wids[j1] != h_i && m_words->isStopWord(j1) ) return true;
if ( m_wids[j2] != h_i && m_words->isStopWord(j2) ) return true;
// count alnums from a to b
// NOTE(review): this loop variable shadows the parameter "i"
long ac = 0;
for ( long i = a ; i < b ; i++ )
if ( m_wids[i] ) ac++;
Place *street = (Place *)m_sm.getMem(sizeof(Place));
if ( ! street ) return false;
street->m_a = a;
street->m_b = b;
street->m_alnumA = ap1;
street->m_alnumB = ap1 + ac; // ap2+1;
street->m_type = PT_STREET;
street->m_str = m_wptrs[j1];
street->m_strlen = m_wptrs[j2]-m_wptrs[j1]+m_wlens[j2];
//street->m_adm1[0] = 0;
//street->m_adm1[1] = 0;
//street->m_crid = 0;
street->m_flags2 = PLF2_INTERSECTION;
street->m_bits = 0;
street->m_address = NULL;
street->m_alias = NULL;
// set its m_hash member
setHashes ( street , m_words , m_niceness );
// prevent overlap with next street
//lastb = m_street->m_b;
// . need to know this for getting place name
// . place name must also be in upper case if
// the street is...
// . TODO: do we need this???? mdw
//if ( uc == 1 ) m_street->m_bits |= PLF_HAS_UPPER;
// set some bits
// flag every word of the intersection, if we have a bits array at all
for ( long k = a ; m_bits && k < b ; k++ )
m_bits->m_bits[k] |= D_IS_IN_STREET;
// point to next street
return true;
// . returns true if word #j may be part of a street name, false if a
// street name must stop at it
// . used by addIntersection() which scans outward from an "and"/"&" and
// stops at the first word for which this returns false
// . handles three kinds of "words": html tags (m_tids[j] set), punctuation
// (m_wids[j] is zero) and alphanumeric words
bool Addresses::isInStreet ( long j ) {
// we can never contain a tag
if ( m_tids[j] ) {
// skip if <sup>
if ( m_tids[j] == TAG_SUP ) return true;
if ( m_tids[j] == (TAG_SUP|BACKBIT) ) return true;
// . crap but micorosft front page has brs
// . "intersection of Interstate 405 and Sunset <br>Boulevard"
if ( m_tids[j] == TAG_BR ) return true;
// be a little more sensitive with this since it is easier
// to have false positives because we do not have a street
// number!
return false;
// are we punctuation?
if ( ! m_wids[j] ) {
// single space is ok
if (m_wptrs[j][0]==' '&&m_wlens[j]==1) return true;
// double space is ok
if (m_wptrs[j][0]==' '&&m_wptrs[j][1]==' '&& m_wlens[j]==2)
return true;
// period only after abbreviation
if ( m_wptrs[j][0] == '.' && j > 0 &&
m_wptrs[j][1] == ' ' && m_wlens[j]==2 )
return true;
// . period after a single letter as well
// . N. M.
if ( m_wptrs[j][0] == '.' && j > 0 &&
m_wlens[j-1]==1 &&
// fix "8. wall street"
!is_digit(m_wptrs[j-1][0]) &&
m_wptrs[j][1] == ' ' &&
m_wlens[j]==2 )
return true;
// N.M.
if ( m_wptrs[j][0] == '.' && j > 0 &&
// fix 1."5 miles west"
!is_digit(m_wptrs[j-1][0]) &&
m_wlens[j-1]==1 && m_wlens[j]==1 )
return true;
// quote: The Noyes House 2525 "N" Avenue
// National
if (m_wptrs[j][0]=='\"'&&m_wptrs[j][1]==' ' &&
// 'closer to 37"' is not a street name!
return true;
if (m_wptrs[j][0]==' ' &&m_wptrs[j][1]=='\"'&&
m_wlens[j]==2) return true;
// punct mark: st. michael's drive
if (m_wptrs[j][0]=='\''&&m_wlens[j]==1) return true;
// mosby's run: utf8 apostrophe
// (the three bytes below are 0xE2 0x80 0x99 as signed chars)
if (m_wlens[j]==3&&
m_wptrs[j][0]==-30 &&
m_wptrs[j][1]==-128 &&
m_wptrs[j][2]==-103 )
return true;
// village of los ranchos growers' market
if (m_wptrs[j][0]=='\''&&m_wptrs[j][1]==' '&&
m_wlens[j]==2) return true;
// hyphens usually bad, but x-y is ok.
m_words->isAlpha(j-1)&&m_words->isAlpha(j+1))return true;
// i-25 is ok now too
if (m_wptrs[j][0]=='-'&&j>0&&m_wids[j-1]==h_i&&j+1<m_nw&&
is_digit(m_wptrs[j+1][0]) )
return true;
// fix "3650-A Hwy 528..."
// j+1<m_nw&&m_wlens[j+1]==1&&
// is_alpha_a(m_wptrs[j+1][0])) return true;
// "620-624 Central Ave SW." (El Rey)
//if ( hasRange &&j==i+1 ) return true;
// fix for 4909-15 Hawkins NE" for
// m_wlens[j+1]==2&&is_digit(m_wptrs[j+1][0])&&
// m_wlens[j-1]>=4&&is_digit(m_wptrs[j-1][0]) ) {
// hasHyphenAddress = true;
// return true;
// sequence of whitespace is ok
long k; for(k=0;k<m_wlens[j];k++)
if(!is_wspace_a(m_wptrs[j][k])) break;
if(k==m_wlens[j]) return true;
// '/' is ok if part of a fraction!
//if( j == fractionj ) return true;
// . comma allowed only b4 directional indicatr
// . "131 Monroe St, NE"
// . no because we got a false positive:
// "1024 4th street, sw corner..."
// . ok, this is back again now! BUT... need
// to make sure a tag or city name follows it
// . crap, now we got
// "5305 Gibson, S.E. <b>Albuquerque ..."
if ( m_wptrs[j][0]!=',' ) return false;
if ( m_wptrs[j][1]!=' ' ) return false;
if ( j+3>= m_nw ) return false;
// gotDir is the offset from j of the word right after the directional:
// 2 for a one-word form like "NE", 4 for a two-word form like "N.E."
char gotDir = 0;
if ( m_wids[j+1] == h_ne ) gotDir = 2;
if ( m_wids[j+1] == h_nw ) gotDir = 2;
if ( m_wids[j+1] == h_se ) gotDir = 2;
if ( m_wids[j+1] == h_sw ) gotDir = 2;
if ( m_wids[j+1] == h_n&&m_wids[j+3]==h_e)gotDir=4;
if ( m_wids[j+1] == h_n&&m_wids[j+3]==h_w)gotDir=4;
if ( m_wids[j+1] == h_s&&m_wids[j+3]==h_e)gotDir=4;
if ( m_wids[j+1] == h_s&&m_wids[j+3]==h_w)gotDir=4;
if ( ! gotDir ) return false;
// NOTE(review): only j+3 < m_nw was checked above, yet j+gotDir+1 can
// be as large as j+5 below -- confirm these reads stay in bounds
// its great if tag follows the dir indicator
if ( m_tids[j+gotDir] ) return true;
// or a punct then a tag
if ( m_tids[j+gotDir+1] ) return true;
// ok, a cap word must follow
if ( ! is_upper_utf8 (m_wptrs[j+gotDir+1])) return false;
// we are good
return true;
// skip dates, not allowed in there
if ( m_bits && (m_bits->m_bits[j] & D_IS_IN_DATE) )
return false;
// . otherwise we are alphanumeric
// . more than 10 is too many for a street
//if ( alnumsInPhrase++ >= 10 ) return false;
// stop at "at"
if ( m_wids[j] == h_at )
return false;
// stop at "and"
if ( m_wids[j] == h_and )
return false;
// stop at "between"
if ( m_wids[j] == h_between )
return false;
if ( m_wids[j] == h_btwn )
return false;
if ( m_wids[j] == h_bet )
return false;
// stop at "location"
if ( m_wids[j] == h_location )
return false;
// stop at "intersection"
if ( m_wids[j] == h_intersection )
return false;
// look ahead for the next alnum word after word #j
long long postWid = 0LL;
// NOTE(review): the clamp below tests and assigns "j", not "maxj"; it
// looks like it was meant to be "if ( maxj > m_nw ) maxj = m_nw;". as
// written maxj is never limited, so the loop can index m_wids[] at or
// beyond m_nw -- confirm and fix
long maxj = j+15; if ( j > m_nw ) j = m_nw;
for ( long pi = j + 1 ; pi < maxj ; pi++ ) {
if ( ! m_wids[pi] ) continue;
postWid = m_wids[pi];
// skip if indicator
//IndDesc *id=(IndDesc *)g_indicators.getValue(&m_wids[j]);
//if ( id && (id->m_bit & IND_STREET) ) return true;
//if ( id && (id->m_bit & IND_DIR ) ) return true;
return true;
uint64_t getAdm1Bits ( char *stateAbbr ) {
//if ( stateAbbr[2] ) { char *xx=NULL;*xx=0; }
uint64_t h64 = hash64Lower_a( stateAbbr , 2 );
StateDesc **sdp = (StateDesc **)g_states.getValue(&h64);
//uint16_t *val = (uint16_t *)g_states.getValue ( &h64 );
// this happens if we have a foreign latlon only address in the contact
// address tags and we call setFromStr() on that. obviously
// foreign states will not be in here! so allow this for now and
// do not core!
if ( ! sdp ) return 0;
// get position in the s_states[] array
long pos = (long)((*sdp) - s_states);
// that is the shifter
return (1LL << pos);
// . search for all PCLI entries in /geo/allCountries.txt
// . grep them out into countries.txt and process that into countries.dat
// . remove these prefixes/suffixes from the country names:
// "Kingdom of"
// "Republic of"
// "Democratic Republic of"
// "Oriental Republic of"
// "* Republic"
// "United Republic of"
// "Socialist Republic of"
// "Independent State of"
// "State of the" (Vatican City)
// "Federative Republic of"
// buffer holding the list of CountryDesc records described below
SafeBuf g_countryDescBuf;
// . g_countryDescBuf consists of a list of these
// . the hashtablex g_countryTable maps a country name word hash to
//   a CountryDesc pointer
// . the members are public so the lookup helpers that hand out
//   CountryDesc pointers can actually be used by their callers
class CountryDesc {
 public:
	// country id in one byte
	uint8_t m_crid;
	// . two letter, upper case country code plus the terminating \0
	// . this is a 3-byte char array, not an array of three pointers
	char m_countryCode[3];
	// country population, up to 4B
	unsigned long m_population;
	// centroid
	float m_latitude;
	float m_longitude;
	// box radius i guess
	float m_radius;
	// . the country names with their languages like:
	//   us-fi-nl=egypt,de-es=egypti,...
	// . comma separated
	// . \0 terminated
	// . declared as a trailing array of unspecified size, so the name
	//   data is laid out directly after the fixed-size members above
	char m_nameBufPtr[];
	// . get the name of the country in the designated language
	// . langAbbr is the two letter lang abbreviation (en=english,etc.)
	// . sometimes it can be 3 letters! nds, nrm, ... see
	//   /geo/geonames/iso-languagecodes.txt
	// . sometimes there are names of the place with no associated
	//   language as well, so watch out for that
	char *getCountryName ( char *langAbbr );
};
// . a huge string of all the countries and corresponding data
// . we parse this up into the g_countries table where each slot is a
// CountryDesc and CountryDesc::m_nameBufPtr references into g_countryData.
// . we need to know the language of each spelling of the country name
// so we can display that name if someone's browser says they only know
// Spanish or something, we'd say Estados instead of States or whatever.
// . well the alternateNames.txt file has the alternate names of each
// city or country or state and the language it is from, so use that...
// . make a name list like "en=Egypt,fi=Egypti" to indicate it is called
// Egypt in english and Egypti in finnish
// . currently empty
// NOTE(review): this binds a string literal to a non-const char *, so the
// data must never be written through this pointer
char *g_countryData = "";
// table of countries; setCountryTable() below is what would fill it
HashTableX g_countries;
// . placeholder for building the country table
// . does nothing right now and always reports success
bool setCountryTable ( ) {
	const bool ok = true;
	return ok;
}
// . look up a country descriptor by the hash of its name/code
// . not implemented: no lookup is performed and NULL is always returned
CountryDesc *getCountryDesc ( long long wid ) {
	CountryDesc *none = NULL;
	return none;
}
// . look up a country descriptor by its two letter country code
// . hashes the first two chars of "countryCode" case-insensitively and
//   defers to the hash-based overload
CountryDesc *getCountryDesc ( char *countryCode ) {
	return getCountryDesc ( (long long)hash64Lower_a ( countryCode , 2 ) );
}
// . would return a Place for a country name starting at word #a
// . not implemented: always returns NULL
Place *getCountryPlace ( long a , long alnumPos , Words *words ) {
	Place *none = NULL;
	return none;
}
// . returns the StateDesc for a two-letter state abbreviation
// . the abbreviation is hashed case-insensitively and looked up in g_states
// . returns NULL if the abbreviation is not in the table
StateDesc *getStateDesc ( char *stateAbbr ) {
	uint64_t abbrHash = hash64Lower_a ( stateAbbr , 2 );
	StateDesc **entry = (StateDesc **)g_states.getValue ( &abbrHash );
	return entry ? *entry : NULL;
}
// . returns entry #i of s_states[]
// . returns NULL if "i" is negative or past the end of the array
StateDesc *getStateDescByNum ( long i ) {
	// number of descriptors in the static table
	long count = (long)sizeof(s_states) / (long)sizeof(StateDesc);
	// stop breach
	if ( i >= count || i < 0 ) return NULL;
	return s_states + i;
}
inline long getStateOffset ( long long *h ) {
StateDesc **sdp = (StateDesc **)g_states.getValue(h);
if ( ! sdp ) return -1;
// return the POSITION though
return (long)((*sdp) - s_states);
// . maps a state hash to its single-bit mask
// . the bit number is the state's index as given by getStateOffset()
// . returns 0 if the hash does not belong to a known state
uint64_t getStateBitFromHash ( long long *h ) {
	long idx = getStateOffset ( h );
	return ( idx < 0 ) ? 0 : (1LL << idx);
}
// . inverse of the state bit mapping: returns the s_states[] entry whose
//   index equals the position of the single bit set in "bit"
// . the entry count is sizeof(s_states)/sizeof(StateDesc), the same formula
//   getStateDescByNum() uses, so the scan never walks past the real array
// . the scan is also capped at 64 since a uint64_t cannot name more states
//   and shifting by 64 or more is undefined
// . cores if "bit" does not correspond to exactly one known state
StateDesc *getStateDescFromBits ( uint64_t bit ) {
	// item count
	long n = (long)( sizeof(s_states) / sizeof(StateDesc) );
	if ( n > 64 ) n = 64;
	for ( long i = 0 ; i < n ; i++ ) {
		// check bits
		if ( (((uint64_t)1)<<i) == bit ) return &s_states[i];
	}
	// sanity check
	char *xx=NULL;*xx=0;
	return NULL;
}
char *getStateAbbr ( uint64_t bit ) {
// clear the unique bit
//bit &= ~ CF_UNIQUE;
// use this for speed
long pos = getBitPosLL((uint8_t *)&bit);
// must be there
return s_states[pos].m_adm1;
// . same as getWordXorHash() but for a string given as ptr + length
// . temporarily writes a \0 at s[slen] so the string can be handed to
//   getWordXorHash(), then restores the original byte
// . the buffer is therefore modified during the call and s[slen] must be
//   writable
long long getWordXorHash2 ( char *s , long slen ) {
	char *endp  = s + slen;
	// tmp save
	char  saved = *endp;
	*endp = '\0';
	long long result = getWordXorHash ( s );
	// put back
	*endp = saved;
	return result;
}
long long getWordXorHash ( char *s ) {
Words tmp;
tmp.set9 ( s , 0 );
long long *wids = tmp.m_wordIds;
uint64_t h = 0LL;
for ( long i = 0 ; i < tmp.m_numWords ; i++ ) {
if ( !wids[i] ) continue;
// make it
h <<= 1LL;
h ^= wids[i];
return h;
#include "GeoIP.h"
#include "GeoIPCity.h"
// returns "p" unchanged, or the literal "N/A" when "p" is NULL, so the
// result can always be handed to a %s format
static const char * _mk_NA( const char * p ){
	if ( p ) return p;
	return "N/A";
}
// try "geolite city" free software
// mwells@titan:~/tmp2/GeoIP-1.4.6/apps$ geoiplookup -f GeoLiteCity.dat
// GeoIP City Edition, Rev 1: US, NM, Albuquerque, N/A, 35.102501, -106.611702, 505
// i guess i can just include that library in the gb source
// . i would say just trace the code and just grab the code we need
// and re-code into gb. BUT do indeed keep the GeoLiteCity.dat file
// that is only 28MB so we should load it up at start time
// . put our api code into here down below
// . maps an ip to a location using the GeoIP library
// . *city, *state and (if ctry is non-NULL) *ctry are pointed at \0
// terminated strings that are written into "buf"
// . local ips (10.x.x.x, 192.168.x.x and 127.0.0.1) are hard-coded to
// Albuquerque, NM, US
// . returns false if the database could not be opened, the ip has no
// record, or the record lacks a country code, region or city
bool getIPLocation ( long ip ,
double *lat ,
double *lon ,
double *radius ,
char **city ,
char **state ,
char **ctry ,
char *buf ,
long bufSize ) {
//static int s_i = 0;
// assume none
*city = NULL;
*state = NULL;
if ( ctry ) *ctry = NULL;
// database handle, opened on first use and kept for later calls
static GeoIP *s_gi = NULL;
char *sip = (char *)&ip;
// if ip is local use abq, nm
if ( sip[0]==10 ||
// 192.168.x.x is local
(sip[0]==(char)192 && sip[1]==(char)168) ||
ip==(long)16777343 ) {
char *p = buf;
*city = p;
p += sprintf ( p , "Albuquerque" );
*p++ = '\0';
*state = p;
p += sprintf ( p , "NM" );
*p++ = '\0';
// NOTE(review): ctry is treated as optional above ("if ( ctry )") but
// is dereferenced unconditionally here, and lat/lon are written without
// the NULL checks used further down -- confirm callers never pass NULL
*ctry = p;
p += sprintf ( p , "US" );
// use this
*lat = 35.10438;
*lon = -106.6270;
return true;
if ( ! s_gi ) {
// make full path
char full[1024];
s_gi = GeoIP_open(full, GEOIP_STANDARD);
if ( ! s_gi ) {
log("gb: could not open %s",full);
return false;
//s_i = GeoIP_database_edition(s_gi);
// geoiplookup(gi,hostname,i);
//char hostname[64];
//uint32_t ipnum = GeoIP_lokupaddress(hostname);
// put in network byte order, host to network
long ipnum = htonl ( ip );
// temp
//ipnum = ip;
GeoIPRecord *gir = GeoIP_record_by_ipnum(s_gi, ipnum);
// false if not found
if ( ! gir ) return false;
log("geoip: "
//"%s: %s, %s, %s, %s, %f, %f, %d",
"%s, %s, %s, %s, %f, %f, %d",
//GeoIPDBDescription[(unsigned long)s_gi->databaseType],
// %d
gir->latitude, gir->longitude, //gir->metro_code,
// transfer
if ( lat ) *lat = gir->latitude;
if ( lon ) *lon = gir->longitude;
// express 20 miles in degrees... one degree is 69 miles
if ( radius ) *radius = 20.0 / 69.0;
// city and state
// buf is filled as: country code \0 region \0 city \0
char *p = buf;
long len ;
// bogus?
// NOTE(review): this and the two "bogus?" returns below leave without
// calling GeoIPRecord_delete(gir), unlike the success path at the end --
// looks like the record is leaked on these paths; confirm
if ( ! gir->country_code ) return false;
if ( ctry ) *ctry = p;
//len = gbstrlen(gir->country_code);
//memcpy ( p , gir->country_code , len + 1 );
p[0] = gir->country_code[0];
p[1] = gir->country_code[1];
p += 2;
*p++ = '\0';
*state = p;
len = 0;
if ( gir->region ) len = gbstrlen(gir->region);
// bogus?
if ( len == 0 ) return false;
//memcpy ( p , gir->region , len + 1 );
// make it all lowercase so we don't core anywhere
long written = to_lower_alnum_a(gir->region,len,p);
// sanity
if ( written != len ) { char *xx=NULL;*xx=0; }
// skip over what we stored
p += len ;
// null term
*p++ = '\0';
// get len
//long plen = gbstrlen(p);
//p += len + 1;
*city = p;
len = 0;
if ( gir->city ) len = gbstrlen(gir->city);
// bogus?
if ( len == 0 ) return false;
memcpy ( p , gir->city , len );
p += len;
*p++ = '\0';
// sanity check
// NOTE(review): bufSize is only compared after everything has already
// been written into buf, so this detects an overflow rather than
// preventing one
if ( p - buf > bufSize ) { char *xx=NULL;*xx=0; }
// free this junk too!
GeoIPRecord_delete ( gir );
//free ( gir );
return true;
// . looks up the latitude/longitude stored for "cityId" in g_timeZones
// . returns false, leaving *lat and *lon untouched, if the id is not there
// . "lat" and "lon" may each be NULL if the caller does not want that value
bool getLatLon ( uint32_t cityId , double *lat , double *lon ) {
	// the table is probed with the 32-bit city id as the key
	long slot = g_timeZones.getSlot ( &cityId );
	// a negative slot means the key is not in the table
	if ( slot < 0 ) return false;
	// the slot's value is the descriptor holding the coordinates
	CityStateDesc *csd = (CityStateDesc *)g_timeZones.getValueFromSlot(slot);
	if ( ! csd ) return false;
	if ( lat ) *lat = csd->m_latitude;
	if ( lon ) *lon = csd->m_longitude;
	return true;
}
// or numeric lat/lon
// . tries to parse a numeric latitude or longitude that starts at "p"
// . "p" must point at a digit; a '-' immediately before it (but not before
//   "bufStart") is treated as part of the number
// . scans at most 20 chars and never past "bufEnd"
// . on success sets *found to 1 and returns the value, which is always
//   within [-180,180]
// . on failure sets *found to 0 and returns 0.0
// . NOTE(review): the bodies of the digit / decimal-point tests and several
//   closing braces are not visible in this listing; presumably they bump
//   digitCount / decimalCount and continue -- confirm against the full file
float getLatLonSpecial ( char *p ,
char *bufStart,
char *bufEnd ,
char *found ) {
// assume none
*found = 0;
// must start with digit
if ( ! is_digit(*p) ) return 0.0;
// set start
char *start = p;
// negative sign?
if ( p>bufStart && p[-1] == '-' ) start--;
// reset counts
long digitCount = 0;
long decimalCount = 0;
// do not scan so far
char *pmax = p + 20;
if ( pmax > bufEnd ) pmax = bufEnd;
// scan until no digit or period
for ( ; p < pmax ; p++ ) {
// count the digits
if ( is_digit(*p) ) {
// decimal point is ok
if ( *p == '.' ) {
// stop on other crap
// give up if less than 3 digits encountered
if ( digitCount < 3 ) return 0.0;
// some pages have no period in it
// and we just have to assume the first
// 3 digits are before the period. like for
// urls
// more than one decimal point is not a number we understand
if ( decimalCount >= 2 ) return 0.0;
// convert the chars in [start,p), including any leading '-'
double dval = atod2(start,p-start);
// fix stuff which has no decimal pt
if ( decimalCount == 0 ) {
// how many digits to left of decimal
long left = 3;
// make a divisor: 10^(digitCount-left)
double ddd = 1;
for ( long vv = 0 ; vv<digitCount-left; vv++)
ddd *= 10;
// fix it
dval /= ddd;
// bail if bad
if ( dval < -180.0 || dval > 180.0 ) return 0.0;
// in the usual decimal it is
// lat from 24.450000 to 60 (juneau alask) // 47.4666666
// lon from -71.083333 to -114.1333333
//char type = 0;
//if ( dval >= 24.45 && dval <= 60.0 ) type = 1; // lat
//else if ( dval >= -140.0 && dval <= -66.1 ) type = 2; // lon
//else log("query: lat/lon point not in our scope. fix!");
//if ( type == 0 ) return 0.0;
// we no longer classify lat vs. lon here, just flag success
*found = 1;//type;
return dval;
// . sample "where" box inputs that printTesterPage() runs through
//   getLatLonFromUserInput()
// . the trailing comments say what each ambiguous entry is meant to resolve to
// . NOTE(review): the entry the "pasadena" comment refers to and the array
//   terminator are not visible in this listing -- confirm against the full file
static char *s_tests[] = {
"sf ca",
"sf nm",
"ottawa ontario",
"rio de janeiro",
"mexico city",
// pasadena texas is more popular than california!
"berlin, germany",
"paris, tx",
"paris, ky",
"paris, france",
"key west",
"santa fe",
"san francisco",
"georgia", // the country!!
"washington",// (should be the state)
"washington d.c.",
"washington dc",
"united states of america",
"georgia", // (should be the US state, not the country!)
"taste of germany",
"kimo theater", // (venue name)
"pleasant arena", // (venue name)
"barton road"// (street name test)
// . debugging aid: for every string in s_tests[] resolve it with
//   getLatLonFromUserInput() and print an <img> map tag into "sb"
// . picks the most specific lat/lon that was found (country, then state,
//   then city, then zip, then explicit user lat/lon) and a matching zoom
// . returns true
// . NOTE(review): in this listing the getLatLonFromUserInput() call passes
//   fewer arguments than the definition declares (no state/country lat/lon,
//   PlaceDesc ptrs, useDST or ipCrid) and parts of the safePrintf() calls
//   are not visible -- confirm against the full file
bool printTesterPage ( SafeBuf &sb ) {
long count = 0;
// number of test strings
long n = sizeof(s_tests)/sizeof(char *);
bool firstRow = true;
for ( long i = 0 ; i < n ; i++ ) {
// presumably starts a new row of maps every 4th entry -- TODO confirm
if ( count %4 == 0 ) {
if ( ! firstRow )
firstRow = false;
// print map
long width = 200;
long height = 200;
// get stuff
float radius;
char *where = s_tests[i];
float cityLat;
float cityLon;
float stateLat;
float stateLon;
float countryLat;
float countryLon;
float zipLat;
float zipLon;
float userLat;
float userLon;
char timeZone2;
char useDST;
// pretend the searcher is in the US
uint8_t ipCrid = CRID_US;
// 512 byte buffer, but only 500 are offered to the callee
char gbwhereBuf[512];
long gbwhereBufSize = 500;
getLatLonFromUserInput ( &radius,
where ,
&cityLat ,
&cityLon ,
&zipLat ,
&zipLon ,
&userLat ,
&userLon ,
&timeZone2 ,
gbwhereBuf ,
gbwhereBufSize ) ;
// get most accurate lat/lon
// each later test below overrides the earlier, coarser one
float lat = NO_LATITUDE;
float lon = NO_LONGITUDE;
long zoom = 0; // world
if ( countryLat != NO_LATITUDE && countryLon != NO_LONGITUDE) {
lat = countryLat;
lon = countryLon;
zoom = 3; // country?
if ( stateLat != NO_LATITUDE && stateLon != NO_LONGITUDE ) {
lat = stateLat;
lon = stateLon;
zoom = 5; // state?
if ( cityLat != NO_LATITUDE && cityLon != NO_LONGITUDE ) {
lat = cityLat;
lon = cityLon;
zoom = 7; // city?
if ( zipLat != NO_LATITUDE && zipLon != NO_LONGITUDE ) {
lat = zipLat;
lon = zipLon;
zoom = 8; // zip?
if ( userLat != NO_LATITUDE && userLon != NO_LONGITUDE ) {
lat = userLat;
lon = userLon;
zoom = 8;
// emit the map image tag, sized width x height
sb.safePrintf ( "<img src=\""
"size=%lix%li&maptype=roadmap&sensor=false" ,
width, height );
// %%7C is a url-encoded '|' and %%2C a url-encoded ','
"%%7Clabel:%c" // letter
"%%7C%.07f" // lat
"%%2C%.07f" //lon
,"red" // s_mapColors[0]
,lon );
return true;
// TODO: maybe just print out like 20 google maps for these on a page tester?
// . returns false if we could not identify a lat/lon from "where" string
// . returns false and sets g_errno on error
// . stores words NOT used for lat/lon determination into gbwhereBuf each
// word with a "gbwhere:" prefix so we can append gbwhereBuf to the query.
// . if input is just a state like new mexico, then uses gbwhere:"new mexico"
// otherwise it could be referring to a street called New Mexico Avenue...
// . you pass in the radius SearchInput::m_radius as "radius" and we may
// change it here! if its 0 and we find a lat/lon in the "where" string
// then we will change it to 100. if the *radius you pass in is non-zero
// we may change it to zero if we can't find a lat/lon...
// . overview of the passes over the words of "where":
//   1. find the (last) country name
//   2. find the (last) state, arbitrating overlaps with the country
//   3. find the (last) city, arbitrating overlaps with state and country
//   4. find a zip code in the words not claimed above
//   5. find a numeric "lat lon" pair; if both are found they replace
//      everything found by passes 1-4
//   then the lat/lon and timezone outputs are filled in and the leftover
//   words are written to gbwhereBuf
// . retCityDesc/retStateDesc/retCountryDesc may be NULL; gbwhereBuf may be
//   NULL. the other output ptrs are dereferenced unconditionally.
// . NOTE(review): this listing is missing statements the logic depends on
//   (the "redo"/"redoCity" goto targets, the alnumPos increments in the
//   first loops, several call arguments and all lone closing braces) --
//   confirm against the full file before changing any code here
bool getLatLonFromUserInput ( float *radius,
char *where ,
float *cityLat ,
float *cityLon ,
float *stateLat,
float *stateLon,
float *countryLat,
float *countryLon,
//double *radius ,
// . position of the user
// . we try to set these from the zipcode if ther
float *zipLat ,
float *zipLon ,
float *userLat ,
float *userLon ,
PlaceDesc **retCityDesc,
PlaceDesc **retStateDesc,
PlaceDesc **retCountryDesc,
char *timeZone2 ,
char *useDST,
// country of search based on ip (two letters)
uint8_t ipCrid,
char *gbwhereBuf ,
long gbwhereBufSize ) {
// convert "where" string into a cityId32 so we can convert
// to a lat/lon by calling getLatLon(cityId)
g_errno = 0;
// tokenize the where string
Words w;
if ( ! w.set3 ( where ) ) return false;
// express 20 miles in degrees... one degree is 69 miles
//*radius = 20.0 / 69.0;
// start at -1
long alnumPos = -1;
//char *adm1Str = NULL;
// . [A,B) word ranges of the candidate being examined in each pass
// . -1 means "none"
long cityA = -1;
long cityB = -1;
long stateA = -1;
long stateB = -1;
long zipA = -1;
long zipB = -1;
long countryA = -1;
long countryB = -1;
long cityAlnumA = -1;
long cityAlnumB = -1;
long stateAlnumA = -1;
long stateAlnumB = -1;
long zipAlnumA = -1;
long zipAlnumB = -1;
long countryAlnumA = -1;
long countryAlnumB = -1;
// [A,B) word ranges of the winners of each pass
long finalCityA = -1;
long finalCityB = -1;
long finalStateA = -1;
long finalStateB = -1;
long finalCountryA = -1;
long finalCountryB = -1;
long finalZipA = -1;
long finalZipB = -1;
// shortcuts
long long *wids = w.getWordIds();
char **wptrs = w.getWords();
long *wlens = w.getWordLens();
// set lastWidPos
// index of the last word with a non-zero word id
long long lastWidPos = w.m_numWords;
for ( long i = 0 ; i < w.m_numWords ; i++ )
if ( wids[i] ) lastWidPos = i;
char *bufStart = where;
char *bufEnd = where + gbstrlen(where);
// reset all
*userLat = NO_LATITUDE;
*userLon = NO_LONGITUDE;
*cityLat = NO_LATITUDE;
*cityLon = NO_LONGITUDE;
*stateLat = NO_LATITUDE;
*stateLon = NO_LONGITUDE;
*countryLat = NO_LATITUDE;
*countryLon = NO_LONGITUDE;
*zipLat = NO_LATITUDE;
//long totalAlnums = w.getNumAlnumWords ();
// for numeric entries like 58.xxxx -128.yyyy
bool hasLat = false;
bool hasLon = false;
long ignoreUntil;
//
// pass 1: country
//
// do a initial loop looking for the country to use, otherwise,
// we'll assume ipcrid. once we establish a country it will be
// easier to know what state or city is being talked about.
alnumPos = -1;
ignoreUntil = -1;
PlaceDesc *finalCountryDesc = NULL;
for ( long i = 0 ; i < w.m_numWords ; i++ ) {
// skip if punct
if ( ! wids[i] ) continue;
// alnum pos count
// fix "united states of america"
if ( i < ignoreUntil ) continue;
// country names are unique, so we can set this here
PlaceDesc *crd = NULL;
// get the last non-null country in the where box
getLongestPlaceName_new ( i,
0 , // alnumPos,
NULL, // state abbr
&countryB ,
&crd );
// record last one
if ( crd ) {
finalCountryDesc = crd;
finalCountryA = countryA;
finalCountryB = countryB;
ignoreUntil = countryB;
// assume country based on searcher's IP address
uint8_t crid = ipCrid;
// unless a country was specified in the wherebox, then use that
if ( finalCountryDesc ) crid = finalCountryDesc->m_crid;
//
// pass 2: state
//
// do a secondary loop looking for the state before the country
// or picking the last encountered state. ignore any country we
// might have found in the first loop. require state be in that
// country.
alnumPos = -1;
ignoreUntil = -1;
PlaceDesc *finalStateDesc = NULL;
for ( long i = 0 ; i < w.m_numWords ; i++ ) {
// skip if punct
if ( ! wids[i] ) continue;
// alnum pos count
// skip if already in use by us
if ( i < ignoreUntil ) continue;
// skip if its like a lat/lon
// i.e. this word starts with a digit, is followed by a '.' and
// then another digit
if ( i+2<w.m_numWords &&
is_digit(wptrs[i][0]) &&
wptrs[i][wlens[i]] == '.' &&
is_digit(wptrs[i][wlens[i]+1]) )
// skip of country words
//if ( i >= finalCountryA && i < finalCountryB ) continue;
// country names are unique, so we can set this here
PlaceDesc *srd = NULL;
// use this country id (CRID_ANY = 0)
uint8_t useCrid = CRID_ANY;
// come back up here with a non-zero crid
// . don't use the countryid to fix "new mexico"...
// . picks the most popular in case of ties
getLongestPlaceName_new ( i,
NULL, // state abbr
&stateB ,
&srd );
// if that does not overlap the country we had then
// re-do it using the country id!!!
if ( srd &&
useCrid == CRID_ANY &&
finalCountryDesc &&
stateB <= finalCountryA ) {
useCrid = finalCountryDesc->m_crid;
goto redo;
// if it is exact overlap and same country... prefer
// the state. try to fix 'georgia' which is a state and country
if ( srd &&
useCrid == CRID_ANY &&
finalCountryDesc &&
stateA == finalCountryA &&
stateB == finalCountryB &&
finalCountryDesc->m_crid == srd->m_crid &&
// if in 'mexico' searching for 'mexico' assume the
// state, and nuke the country...
ipCrid == srd->m_crid ) {
ignoreUntil = stateB;
finalCountryDesc = NULL;
finalCountryA = -1;
finalCountryB = -1;
crid = ipCrid;
// otherwise, if NOT in 'mexico' searching for 'mexico'
// assume the country, not the state in mexico
if ( srd &&
useCrid == CRID_ANY &&
finalCountryDesc &&
stateA == finalCountryA &&
stateB == finalCountryB &&
finalCountryDesc->m_crid == srd->m_crid &&
// if in 'mexico' searching for 'mexico' assume the
// state, and nuke the country...
ipCrid != srd->m_crid ) {
ignoreUntil = finalCountryB;
srd = NULL;
// if it is exact overlap and different countries,
// prefer one that is "crid", the same country as the user!
// try to fix 'georgia' which is a state and country...
// in the US we expect georgia the state.
if ( srd &&
useCrid == CRID_ANY &&
finalCountryDesc &&
stateA == finalCountryA &&
stateB == finalCountryB &&
finalCountryDesc->m_crid == ipCrid &&
srd->m_crid != ipCrid ) {
ignoreUntil = stateB;
srd = NULL;
// if the state is in the user's country but the country
// is not the user's country. kill the country descriptor.
// so 'georgia' in the US will match the state, not
// 'georgia' the country.
if ( srd &&
useCrid == CRID_ANY &&
finalCountryDesc &&
stateA == finalCountryA &&
stateB == finalCountryB &&
finalCountryDesc->m_crid != ipCrid &&
srd->m_crid == ipCrid ) {
ignoreUntil = stateB;
finalCountryDesc = NULL;
finalCountryA = -1;
finalCountryB = -1;
crid = ipCrid;
// if it does overlap the country, nuke the country then
// to fix 'new mexico' so country is not 'mexico'
if ( srd &&
useCrid == CRID_ANY &&
finalCountryDesc &&
stateB > finalCountryA ) {
finalCountryDesc = NULL;
finalCountryA = -1;
finalCountryB = -1;
crid = ipCrid;
// get the last non-null state
if ( srd ) {
finalStateDesc = srd;
finalStateA = stateA;
finalStateB = stateB;
ignoreUntil = stateB;
//
// pass 3: city
//
// do a third loop looking for the city. ignore any state or country
// we found in the first two loops. require city be in an state or
// country we found in the first two loops.
alnumPos = -1;
ignoreUntil = -1;
PlaceDesc *finalCityDesc = NULL;
for ( long i = 0 ; i < w.m_numWords ; i++ ) {
// skip if punct
if ( ! wids[i] ) continue;
// alnum pos count
// skip if already in use by us
if ( i < ignoreUntil ) continue;
// skip over country words
// no, was hurting "mexico city" because "mexico" was
// our country and should have been the city!
//if ( i >= finalCountryA && i < finalCountryB ) continue;
// . skip over the state
// . no, for 'santa fe' it was a state, but we need
// to comment this line out to contest that.
//if ( i >= finalStateA && i < finalStateB ) continue;
// state abbr?
char *stateAbbr = NULL;
//if ( finalStateDesc ) stateAbbr = finalStateDesc->m_adm1;
// country names are unique, so we can set this here
PlaceDesc *crd1 = NULL;
// picks the most popular in case of ties
getLongestPlaceName_new ( i,
&cityB ,
&crd1 );
// if none found, try not restricting to searcher's
// country then!!! should fix 'tokyo' since there is no
// 'tokyo' city in the US at all.
// crap, then this gets georgia this city in jamaica
PlaceDesc *crd2 = NULL;
long city2A;
long city2B;
long city2AlnumA;
long city2AlnumB;
getLongestPlaceName_new ( i,
&city2B ,
&crd2 );
// default to city in user's country
PlaceDesc *crd = crd1;
// use the worldly city if the local city name does not
// exist in the user's country.
if ( ! crd ) {
crd = crd2;
cityA = city2A;
cityB = city2B;
// if both existed, prefer the longer. if tied. prefer
// the local one even if its population might be smaller
if ( crd && crd2 && city2B > cityB ) {
crd = crd2;
cityA = city2A;
cityB = city2B;
// if city does NOT overlap the state re-do it using the
// stateAbbr. constrain to that state then...
if ( crd &&
! stateAbbr &&
finalStateDesc &&
cityB <= finalStateA ) {
stateAbbr = finalStateDesc->m_adm1;
goto redoCity;
// if it more than contains the country... nuke the country
// fixes "mexico city" where it thinks "mexico" is the country
if ( crd &&
! stateAbbr &&
finalCountryDesc &&
cityA == finalCountryA &&
cityB > finalCountryB ) {
ignoreUntil = cityB;
finalCountryDesc = NULL;
finalCountryA = -1;
finalCountryB = -1;
crid = ipCrid;
// do not intersect with country otherwise beyond this point
if ( i >= finalCountryA && i < finalCountryB ) continue;
// if it is exact overlap and same country... prefer state!
if ( crd &&
! stateAbbr &&
finalStateDesc &&
cityA == finalStateA &&
cityB == finalStateB &&
// the state must be in different country now to
// fix the 'kentucky' query so we do not get
// 'kentucky, arkansas'
finalStateDesc->m_crid == crd->m_crid ) {
ignoreUntil = cityB;
crd = NULL;
// if it equals the state, and we already had a finalCity
// then toss that city... it's most likely a city/state
// combo where the state is a city name somewhere as well!
// fixes 'ottawa, ontario' where it ontario is also a city
// in the US!
if ( crd &&
! stateAbbr &&
finalStateDesc &&
finalCityDesc &&
cityA == finalStateA ) {
ignoreUntil = finalStateB;
crd = NULL;
// if it is exact overlap and different countries,
// prefer one that is "crid"
if ( crd &&
! stateAbbr &&
finalStateDesc &&
cityA == finalStateA &&
cityB == finalStateB &&
// the state must be in different country now to
// fix the 'kentucky' query so we do not get
// 'kentucky, arkansas'
finalStateDesc->m_crid == crid ) {
ignoreUntil = cityB;
crd = NULL;
// if exact overlap and city is in the user's country,
// then prefer city and nuke state
// NOTE(review): this condition is identical to the previous test's,
// which already cleared crd -- as listed this branch looks
// unreachable; confirm the intended condition
if ( crd &&
! stateAbbr &&
finalStateDesc &&
cityA == finalStateA &&
cityB == finalStateB &&
// the state must be in different country now to
// fix the 'kentucky' query so we do not get
// 'kentucky, arkansas'
finalStateDesc->m_crid == crid ) {
ignoreUntil = cityB;
finalStateDesc = NULL;
finalStateA = -1;
finalStateB = -1;
// if it does overlap the state, nuke the state then
// to fix 'key west' query. it thought 'west' was a
// state in iceland!
if ( crd &&
! stateAbbr &&
finalStateDesc &&
cityB > finalStateA &&
// the state must be in different country now to
// fix the 'kentucky' query so we do not get
// 'kentucky, arkansas'
finalStateDesc->m_crid != crd->m_crid &&
// i added this so 'georgia' the city in jamaica
// did not beat out the state in the US...
crd->m_crid == crid ) {
ignoreUntil = cityB;
finalStateDesc = NULL;
finalStateA = -1;
finalStateB = -1;
// BUT kill the city if its the one in a different state
if ( crd &&
! stateAbbr &&
finalStateDesc &&
cityB > finalStateA &&
finalStateDesc->m_crid != crd->m_crid &&
finalStateDesc->m_crid == crid ) {
ignoreUntil = finalStateB;
crd = NULL;
// get the last non-null city
if ( crd ) {
finalCityDesc = crd;
finalCityA = cityA;
finalCityB = cityB;
ignoreUntil = cityB;
//
// pass 4: zip code
//
// and a 4th loop to get the zip code
alnumPos = -1;
for ( long i = 0 ; i < w.m_numWords ; i++ ) {
// skip if punct
if ( ! wids[i] ) continue;
// alnum pos count
// skip over country words
if ( i >= finalCountryA && i < finalCountryB ) continue;
// skip over the state
if ( i >= finalStateA && i < finalStateB ) continue;
// skip over city
if ( i >= finalCityA && i < finalCityB ) continue;
// we must be in the US
//if ( crid != CRID_US ) continue;
// U.S. only for now
getZip_new ( i,
&zipB ,
// skip if none
if ( *zipLat != NO_LATITUDE ) {
// set these i guess
finalZipA = zipA;
finalZipB = zipB;
//
// pass 5: numeric lat/lon
//
// loop for numeric lat/lon
alnumPos = -1;
ignoreUntil = -1;
for ( long i = 0 ; i < w.m_numWords ; i++ ) {
// skip if punct
if ( ! wids[i] ) continue;
// ignore
if ( i < ignoreUntil ) continue;
// stop if we had any of the above though!
//if ( finalCityDesc ) break;
//if ( finalStateDesc ) break;
//if ( finalCountryDesc ) break;
//if ( zipA >= 0 ) break;
// alnum pos count
char found = 0;
float ret = getLatLonSpecial(wptrs[i],
// the first number found is taken as the latitude
if ( found && !hasLat ) { // == 1
*userLat = ret;
// skip the words of this number; presumably "ddd" "." "ddd" -- TODO confirm
ignoreUntil = i + 3;
// the next one should be the lon
hasLat = true;
// the second number found is taken as the longitude
if ( found && !hasLon ) { // == 2
*userLon = ret;
ignoreUntil = i + 3;
hasLon = true;
if ( found ) {
log("query: got extra lat/lon term! ignoring.");
ignoreUntil = i + 3;
// ok, a random gbwhere: term i guess
// if we had a lat/lon toss all else out. should fix location of
// "33.83660 -116.54670" which thought the 83660 was a french city.
if ( hasLat && hasLon ) {
finalCityDesc = NULL;
finalStateDesc = NULL;
finalCountryDesc = NULL;
// nuke other lons/lats too
*cityLat = NO_LATITUDE;
*cityLon = NO_LONGITUDE;
*stateLat = NO_LATITUDE;
*stateLon = NO_LONGITUDE;
*countryLat = NO_LATITUDE;
*countryLon = NO_LONGITUDE;
*zipLat = NO_LATITUDE;
// . if we got a lat and a lon convert...
// . this was in pageevents.cpp
if ( hasLat && hasLon ) {
float distInMilesSquared;
PlaceDesc *pd;
pd = getNearestCity_new ( lat ,lon,0, &distInMilesSquared);
// only adopt the nearest city if it is close enough
if ( distInMilesSquared < 1000 ) {
finalCityDesc = pd;
finalStateDesc =
long nw = w.getNumWords();
// was it just a city name by itself?
// i.e. the city range starts at word 0 or 1 and ends at the last or
// second to last word
bool onlyCity = ( ( finalCityA == 0 || finalCityA == 1 ) &&
( finalCityB == nw || finalCityB == nw-1 ) );
// but if only a city and the city name is also a street indicator
// then cancel it? that way if they put 'avenue' in the where box
// they do not get 'avenue, maryland' city.
// they should! and this messed up 'homestead' in florida.
//if ( onlyCity && finalCityB == finalCityA+1 ) {
// IndDesc *id = (IndDesc *)g_indicators.getValue(&wids[
// finalCityA]);
// if ( id ) onlyCity = false;
// . if only a city name, nuke it if no state
// . otherwise if we enter 'avenue' into the where box it thinks
// its "Avenue, Maryland"
// . but if the whole thing is just this city, then let it fly...
if ( finalCityDesc &&
// no country
! finalCountryDesc &&
// no state
! finalStateDesc &&
// not just city
! onlyCity &&
// no zip..
finalZipA < 0 ) {
// do not lookup lat/lon...
finalCityDesc = NULL;
// nuke these
finalCityA = -1;
finalCityB = -1;
finalStateA = -1;
finalStateB = -1;
// use userlat/lon to make the bounding box. this is usually the
// city centroid otherwise.
// NOTE(review): compares against the literal 999.0 while every other
// check here uses NO_LATITUDE/NO_LONGITUDE -- confirm they are equal
if ( *userLat != 999.0 && *userLon != 999.0 )
// uses getNearestCityId() ... need to update to use
// our new foreign cities...? really only need to add them
// if they do not use dst i guess...?
*timeZone2=getTimeZoneFromLatLon(*userLat, *userLon,0,useDST);
// this is true if we had a city with a lat/lon
//bool status = false;
if ( finalCityDesc ) {
// this is easy...
*timeZone2 = finalCityDesc->m_timeZoneOffset;
*useDST = false;
if ( finalCityDesc->m_flags & PDF_USE_DST ) *useDST = true;
//status = true;
*cityLat = finalCityDesc->m_lat;
*cityLon = finalCityDesc->m_lon;
if ( finalStateDesc ) {
*stateLat = finalStateDesc->m_lat;
*stateLon = finalStateDesc->m_lon;
if ( finalCountryDesc ) {
*countryLat = finalCountryDesc->m_lat;
*countryLon = finalCountryDesc->m_lon;
// did we get a lat/lon from the "where" string?
// note that a state or country alone does not count as a centroid
bool hasCentroid = false;
if ( *cityLat != NO_LATITUDE ) hasCentroid = true;
if ( *zipLat != NO_LATITUDE ) hasCentroid = true;
if ( *userLat != NO_LATITUDE ) hasCentroid = true;
// if we got a cityLat or zipLat or userLat and
// radius is zero then we gotta make it default to 100
if ( *radius == 0 && hasCentroid ) *radius = 100;
// if no centroid...
if ( *radius && ! hasCentroid ) *radius = 0;
// bitch if no centroid
if ( ! hasCentroid && w.m_numWords )
log("query: no centroid for location in wherebox");
// with an explicit user lat/lon we are done; gbwhereBuf is not filled
if ( *userLat != NO_LATITUDE )
return true;
// reset
alnumPos = -1;
ignoreUntil = -1;
//
// write the leftover words into gbwhereBuf
//
// set the gbwherebuf if provided
char *p = gbwhereBuf;
char *pend = p + gbwhereBufSize - 1; // room for \0
// set once anything worth keeping has been written
bool gotStuff = false;
// false once the first term is written, so later terms get a leading space
bool firstOne = true;
for ( long i = 0 ; p && i < w.m_numWords ; i++ ) {
// count it?
if ( wids[i] ) alnumPos++;
// skip punct
if ( ! wids[i] ) continue;
// skip if in middle of state or city name
if ( i < ignoreUntil ) continue;
// if we had a valid city/state/zip, do not include those
// in this buffer
if ( //status &&
//(i>= finalCountryA && i <finalCountryB ) ||
//(i>= finalStateA && i <finalStateB) ||
(i>= finalCityA && i <finalCityB ) ||
(i>= finalZipA && i < finalZipB ) ) )
// breach check
// NOTE(review): this only reserves room for "gbwhere:" (8) plus the
// word, but the state branch below writes 17+2+20+2 bytes and the
// country branch 19+2 -- confirm the buffer cannot be overrun
if ( p + 8 + wlens[i] + 2 >= pend ) break;
if ( ! firstOne ) *p++ = ' ';
firstOne = false;
// now do not break up a state name like 'new mexico' into
// 'gbwhere:new gbwhere:mexico' but rather do
// 'gbwhere:newmexico' because when we hash the gbwhere:
// terms we hash the state adm1 string as 'nm' and its synonym
// 'newmexico'
// we can't do this right now because when we index foreign
// events it is always by lat/lon and we do not know the
// state it is in necessarily...
//Place *ps = getStatePlace ( i , alnumPos , &w );
// only print field header if we got something
//if ( wids[i] ) gotStuff = true;
// . if this is a state name, condense it
// . TODO: what about 'new mexico avenue' will
// Address::hash() index 'nm' for that? i would think so
// if synonyms work right... TEST!
if ( finalStateDesc &&
//finalStateDesc->m_crid == CRID_US &&
i >= finalStateA &&
i < finalStateB ) {
// if we got a city ignore though!
if ( finalCityDesc ) continue;
// or zip...
if ( finalZipA >= 0 ) continue;
// mark it
gotStuff = true;
// use gbstate:
memcpy ( p , "gbeventstatecode:", 17 );
p += 17;
// special treatment. a state abbr is always 2 chars
memcpy ( p , finalStateDesc->m_adm1 , 2 );
p += 2;
// store the country as well for that state whether
// it was entered or not! because some states are
// reduced to their numeric code like "08" and
// many countries have that same code!
char *cc = getCountryCode(finalStateDesc->m_crid);
memcpy ( p , " gbeventcountrycode:", 20 );
p += 20;
memcpy ( p , cc , 2 );
p += 2;
// also set the timezone
*timeZone2 = finalStateDesc->m_timeZoneOffset;
// and useDST
*useDST = false;
if ( finalStateDesc->m_flags&PDF_USE_DST) *useDST=true;
// ignore until end of state words
ignoreUntil = finalStateB;
// . we cover foreign states using radius logic up above now
// . when we index a foreign event we do so using the lat/lon
// only since we do not support foreign addresses yet
// . therefore we do not index gbwhere:<adm1> for it...
// so we use the radius centroid logic above
// . we could fix this by using getNearestCityId() for
// the foreign events...
//else if ( finalStateDesc &&
// finalStateDesc->m_crid != CRID_US &&
// i >= finalStateA &&
// i < finalStateB ) {
// same logic for countries
if ( finalCountryDesc &&
//finalCountryDesc->m_crid == CRID_US &&
i >= finalCountryA &&
i < finalCountryB ) {
// if we got a city ignore though!
if ( finalCityDesc ) continue;
// or zip...
if ( finalZipA >= 0 ) continue;
// mark it
gotStuff = true;
// special treatment. a country abbr is always 2 chars
char *cc = getCountryCode(finalCountryDesc->m_crid);
memcpy ( p , "gbeventcountrycode:", 19 );
p += 19;
memcpy ( p , cc , 2 );
p += 2;
ignoreUntil = finalCountryB;
// . we cover foreign countrys using radius logic up above now
// . when we index a foreign event we do so using the lat/lon
// only since we do not support foreign addresses yet
// . therefore we do not index gbwhere:<adm1> for it...
// so we use the radius centroid logic above
//else if ( finalCountryDesc &&
// finalCountryDesc->m_crid != CRID_US &&
// i >= finalCountryA &&
// i < finalCountryB ) {
// mark it
gotStuff = true;
// field header
memcpy ( p , "gbwhere:", 8 );
// advance
p += 8;
// otherwise store into buffer as is
memcpy ( p , wptrs[i] , wlens[i] );
// advance ptr cursor
p += wlens[i];
// delete?
// nothing useful was written, so hand back an empty string
if ( ! gotStuff ) p = gbwhereBuf;
// null term if provided
if ( p ) *p = '\0';
// set these
if ( retCityDesc ) *retCityDesc = finalCityDesc;
if ( retStateDesc ) *retStateDesc = finalStateDesc;
if ( retCountryDesc ) *retCountryDesc = finalCountryDesc;
return true;//status;
// . sets *lat and *lon to the coordinates of the city that "aa" is in
// . the city comes from aa->m_city, or from aa->m_zip if there is no city
// . the state comes from aa->m_adm1, or from aa->m_zip if there is no
//   state and no city
// . returns false if we lack a city hash or a state, or if getLatLon()
//   does not know the resulting city id
bool getCityLatLonFromAddress ( Address *aa , double *lat , double *lon ) {
	if ( ! aa ) return false;
	Place *city  = aa->m_city;
	Place *state = aa->m_adm1;
	Place *zip   = aa->m_zip;
	uint64_t cityHash64 = 0;
	char    *adm1Str    = NULL;
	// fall back to the zip's city/state only when no city was given
	if ( ! city && zip ) {
		cityHash64 = zip->m_cityHash;
		adm1Str    = zip->m_adm1;
	}
	// an explicit city or state always wins over the zip's
	if ( city  ) cityHash64 = city->m_cityHash;
	if ( state ) adm1Str    = state->m_adm1;
	// both must be valid
	if ( ! cityHash64 || ! adm1Str ) return false;
	// combine the two into the 32-bit id that getLatLon() takes
	uint32_t cid32 = (uint32_t)getCityId32(cityHash64,adm1Str);
	return getLatLon ( cid32 , lat , lon );
}
// . "data" is a ';'-separated address record, perhaps like
//   ";;5815 Wyoming Blvd NE;Albuquerque;87109;NM;;;" ???
// . returns a ptr to the char right after the 6th ';' in "data", which is
//   where the caller expects the zip to start
// . sets *zipLen to the number of chars from there up to, but not
//   including, the next ';' or the end of the string
// . an empty field yields *zipLen of 0
// . if "data" has fewer than six ';' we return a ptr to its terminating
//   \0 and set *zipLen to 0 rather than scanning past the end
char *getZipPtrFromStr ( char *data , long *zipLen ) {
	char *zipPtr = data;
	long  scount = 0;
	// step over the first six fields, never beyond the end of the string
	for ( ; *zipPtr && scount < 6 ; zipPtr++ )
		if ( *zipPtr == ';' ) scount++;
	// measure the field we landed on. start AT zipPtr, not one past it,
	// so an empty field is not mistaken for the start of the next one.
	char *end = zipPtr;
	for ( ; *end && *end != ';' ; end++ );
	*zipLen = end - zipPtr;
	return zipPtr;
}
bool getZipLatLon ( char *zip ,
long zipLen ,
float *zipLat ,
float *zipLon ) {
// assume none
*zipLat = NO_LATITUDE;
// only 5 digits i guess
if ( zipLen != 5 ) return false;
// hash it
long long zh = getWordXorHash2(zip,zipLen);
// get it
ZipDesc *zd = (ZipDesc *)g_zips.getValue(&zh);
// mine it
if ( ! zd ) return false;
*zipLat = zd->m_latitude;
*zipLon = zd->m_longitude;
return true;
// . pulls the zip field out of the ';'-separated record "addrStr" with
//   getZipPtrFromStr() and looks up its lat/lon with getZipLatLon()
// . returns whatever getZipLatLon() returns
bool getZipLatLonFromStr ( char *addrStr ,
			   float *zipLat ,
			   float *zipLon ) {
	long  zipLen = 0;
	char *zip    = NULL;
	// no record means no zip. getZipLatLon() rejects a zipLen of 0.
	if ( addrStr ) zip = getZipPtrFromStr ( addrStr , &zipLen );
	return getZipLatLon ( zip , zipLen , zipLat, zipLon );
}
// . looks up the lat/lon of the zip code of address "aa"
// . returns false with *zipLat = NO_LATITUDE and *zipLon = NO_LONGITUDE
//   if the address has no zip or the zip is unknown
bool getZipLatLonFromAddress ( Address *aa ,
			       float *zipLat ,
			       float *zipLon ) {
	// assume none, for BOTH outputs, in case we bail before the lookup
	*zipLat = NO_LATITUDE;
	*zipLon = NO_LONGITUDE;
	if ( ! aa ) return false;
	Place *zip = aa->m_zip;
	if ( ! zip ) return false;
	return getZipLatLon(zip->m_str, zip->m_strlen,zipLat,zipLon);
}
// . if you just want to call setStr() and have it use stack mem to
//   store up to 10 places, then init the PlaceMem with this very quickly
// . only records the configuration and zeroes the bookkeeping. nothing is
//   allocated or carved out of "stackMem" here; getMem() does that on its
//   first call.
// . poolSize         : size of each pool; getMem() cores on bigger requests
// . initNumPoolPtrs  : initial size of the pool ptr table
// . initNumPlacePtrs : initial size of the place ptr table
// . stackMem/stackMemSize : optional caller-owned buffer; if given,
//   getMem() serves everything from it and never grows
// . niceness         : stored in m_niceness
void PlaceMem::init ( long poolSize ,
		      long initNumPoolPtrs ,
		      long initNumPlacePtrs ,
		      char *stackMem ,
		      long stackMemSize ,
		      long niceness ) {
	// configuration
	m_poolSize         = poolSize;
	m_initNumPoolPtrs  = initNumPoolPtrs;
	m_initNumPlacePtrs = initNumPlacePtrs;
	m_stack            = stackMem;
	m_stackSize        = stackMemSize;
	m_niceness         = niceness;
	// nothing allocated or handed out yet
	m_numPoolPtrsAllocated  = 0;
	m_numPlacePtrsAllocated = 0;
	m_numPoolsAllocated     = 0;
	m_numPlacePtrs          = 0;
	// no current pool
	m_cursor        = NULL;
	m_cursorEnd     = NULL;
	m_cursorPoolNum = -1;
}
// . returns NULL and sets g_errno on error
// . stores ptr to the returned mem in m_placePtrs[placeNum]
// . hands out "need" bytes from the current pool, moving on to (and if
//   necessary allocating) the next pool when the current one is exhausted
// . cores (writes through a NULL ptr) if need > m_poolSize, or if a
//   caller-supplied stack buffer would have to grow
// . NOTE(review): the "top:" label targeted by the gotos, the increments of
//   m_numPlacePtrs / m_numPoolsAllocated and all lone closing braces are
//   not visible in this listing -- confirm against the full file
// . NOTE(review): table sizes are computed as count * 4, which presumably
//   assumes 4-byte pointers -- confirm before building for 64-bit
void *PlaceMem::getMem ( long need ) {
// sanity
if ( need > m_poolSize ) { char *xx=NULL;*xx=0; }
// return if we got it
if ( m_cursor && m_cursor + need <= m_cursorEnd ) {
// do we need to realloc m_placePtrs?
if ( m_numPlacePtrs + 1 > m_numPlacePtrsAllocated ) {
// the stack-backed table can not grow
if ( m_stack ) { char *xx=NULL;*xx=0; }
long oldSize =m_numPlacePtrsAllocated * 4;
// grow by 2000 ptrs at a time, but start at the size requested in init()
long newAlloc =m_numPlacePtrsAllocated + 2000;
if ( m_numPlacePtrsAllocated == 0 )
newAlloc = m_initNumPlacePtrs;
char **newPtrs = (char **)mmalloc(newAlloc*4,"pptbl");
if ( ! newPtrs ) return NULL;
// copy the old ptrs over
for ( long i = 0 ; i < m_numPlacePtrs ; i++ ) {
// breathe
newPtrs[i] = m_placePtrs[i];
// to be safe to avoid bad mem writes
m_placePtrs[i] = NULL;
//memcpy ( newPtrs, m_placePtrs , m_numPlacePtrs*4);
mfree ( m_placePtrs , oldSize , "pptbl");
m_placePtrs = newPtrs;
m_numPlacePtrsAllocated = newAlloc;
// store it
m_placePtrs[m_numPlacePtrs] = m_cursor;
// increment it
// save cursor so we can return that
char *returnPtr = m_cursor;
// increment to next place (need = sizeof(Place) usually)
m_cursor += need;
// return the mem for them to use now
return (void *)returnPtr;
// try to use stack
// first call with a caller-supplied buffer: carve it into the place ptr
// table, the pool ptr table and a single pool
if ( m_stack && m_numPoolPtrsAllocated == 0 ) {
// compute min size for stack...
long need = 0;
need += m_initNumPoolPtrs * 4 ;
need += m_initNumPlacePtrs * 4 ;
need += m_poolSize;
// make sure stack size is big enough for what they want
if ( m_stackSize < need ) { char *xx=NULL;*xx=0;}
// parse it up
char *p = m_stack;
m_placePtrs = (char **)p;
p += m_initNumPlacePtrs * 4;
m_poolPtrs = (char **)p;
// NOTE(review): the size check above budgets m_initNumPoolPtrs * 4
// bytes for this table but this advances by m_initNumPoolPtrs + 4;
// looks like a typo for "* 4" -- confirm
p += m_initNumPoolPtrs + 4;
m_poolPtrs[0] = p;
p += m_poolSize;
m_numPoolsAllocated = 1;
m_numPlacePtrsAllocated = m_initNumPlacePtrs;
m_numPoolPtrsAllocated = m_initNumPoolPtrs;
m_cursor = m_poolPtrs[0];
m_cursorEnd = m_cursor + m_poolSize;
m_cursorPoolNum = 0;
// give em that mem now i guess
goto top;
// always constrain to stack if provided to make things simple
if ( m_stack ) { char *xx=NULL;*xx=0; }
// add a new pool
// first make sure the pool ptr table has room for one more pool
if ( m_numPoolsAllocated + 1 > m_numPoolPtrsAllocated ) {
long oldSize = m_numPoolPtrsAllocated * 4;
// grow by 100 ptrs at a time, but start at the size requested in init()
long newAlloc = m_numPoolPtrsAllocated + 100;
if ( m_numPoolPtrsAllocated == 0 )
newAlloc = m_initNumPoolPtrs;
char **newPtrs = (char **)mmalloc(newAlloc*4,"pptbl2");
if ( ! newPtrs ) return NULL;
memcpy ( newPtrs , m_poolPtrs , m_numPoolsAllocated*4 );
mfree ( m_poolPtrs , oldSize , "pptbl2");
m_poolPtrs = newPtrs;
m_numPoolPtrsAllocated = newAlloc;
// if we had called setNumPtrs() or rewind() the next pool might
// already be allocated, so if that is true, use it!
long poolNum = m_cursorPoolNum + 1;
// sanity check
if ( poolNum > m_numPoolsAllocated ) { char *xx=NULL;*xx=0; }
// poolNum could be < m_numPoolsAllocated IF we did a rewind at
// somepoint so that m_cursorPoolNum was decreased in setNumPtrs().
// but we need to allocate a new pool if that was not the case.
if ( poolNum == m_numPoolsAllocated ) {
// make a new pool now
char *pool = (char *)mcalloc(m_poolSize,"pool3");
if ( ! pool ) return NULL;
m_poolPtrs [ m_numPoolsAllocated ] = pool;
// update cursor now
m_cursor = m_poolPtrs[poolNum];
m_cursorEnd = m_poolPtrs[poolNum] + m_poolSize;
m_cursorPoolNum = poolNum;
// sanity check
char *pool = m_poolPtrs[m_cursorPoolNum];
char *poolEnd = pool + m_poolSize;
if ( m_cursor < pool || m_cursor >= poolEnd ) { char *xx=NULL;*xx=0;}
// and re-try
goto top;
// . puts the object into the same empty state that init() establishes so
//   that reset() and getMem() never read an uninitialized member, even if
//   init() was never called
// . the pool size defaults to 0, so getMem() cores on any request until
//   init() supplies a real size
PlaceMem::PlaceMem() {
	// make sure reset() won't core us
	m_placePtrs = NULL;
	m_poolPtrs  = NULL;
	m_numPoolsAllocated     = 0;
	m_numPlacePtrs          = 0;
	m_numPoolPtrsAllocated  = 0;
	m_numPlacePtrsAllocated = 0;
	m_niceness = 0;
	// not using caller-supplied stack memory
	m_stack     = NULL;//false;
	m_stackSize = 0;
	// getMem() tests m_cursor before anything else, so it must start NULL
	m_cursor        = NULL;
	m_cursorEnd     = NULL;
	m_cursorPoolNum = -1;
	// nothing configured yet
	m_poolSize         = 0;
	m_initNumPoolPtrs  = 0;
	m_initNumPlacePtrs = 0;
}
// . NOTE(review): the destructor body is not visible in this listing;
//   presumably it just calls reset() to free the pools -- confirm
PlaceMem::~PlaceMem() {
void PlaceMem::reset ( ) {
// do not core
if ( m_stack ) return;
// free everything
for ( long i = 0 ; i < m_numPoolsAllocated; i++ ) {
mfree( m_poolPtrs[i] , m_poolSize, "pool3");
m_poolPtrs[i] = NULL;
// free ptrs
if ( m_placePtrs )
mfree ( m_placePtrs, m_numPlacePtrsAllocated * 4,"plptrs");
if ( m_poolPtrs )
mfree ( m_poolPtrs , m_numPoolPtrsAllocated * 4,"poptrs");
m_placePtrs = NULL;
m_poolPtrs = NULL;
m_numPoolPtrsAllocated = 0;
m_numPlacePtrsAllocated = 0;
m_cursor = NULL;
m_numPlacePtrs = 0;
m_numPoolsAllocated = 0;
// . sometimes we remove the last X Places we added above when we realized
//   something was bogus
// . "newNumPtrs" is how many Place ptrs to keep. it must be in
//   [0,m_numPlacePtrs]; a larger or negative value cores on purpose
// . the cursor is moved back to m_placePtrs[newNumPtrs], i.e. to the start
//   of the first Place being discarded, and m_cursorPoolNum/m_cursorEnd are
//   backed up to the pool that contains that address
void PlaceMem::setNumPtrs ( long newNumPtrs ) {
// return if no change requested
if ( newNumPtrs == m_numPlacePtrs ) return;
// sanity check (equality was handled above, so this only fires on growth)
if ( newNumPtrs >= m_numPlacePtrs ) { char *xx=NULL;*xx=0;};
if ( newNumPtrs < 0 ) { char *xx=NULL;*xx=0;};
// set it back
m_cursor = m_placePtrs[newNumPtrs];
// back up the pool until we are in it
for ( ; m_cursorPoolNum >= 0 ; m_cursorPoolNum-- ) {
char *pool = m_poolPtrs[m_cursorPoolNum];
char *poolEnd = pool + m_poolSize;
if ( m_cursor >= pool && m_cursor < poolEnd ) {
m_cursorEnd = poolEnd;
// NOTE(review): no statement leaving the loop is visible after the
// containing pool is found; without one m_cursorPoolNum keeps
// decrementing and the check below cores -- confirm a "break" exists
// this is weird: the cursor was not inside any pool we own
if ( m_cursorPoolNum < 0 ) { char *xx=NULL;*xx=0; }
// reset final
m_numPlacePtrs = newNumPtrs;
// Free the global lat list and the global place buffer. Each pointer is
// cleared right after it is released so a second call (or a later reader
// that tests the pointer) never touches freed memory.
void resetAddressTables ( ) {
	if ( s_latList ) mfree ( s_latList,s_latListSize,"latlist");
	s_latList = NULL;
	if ( g_pbuf ) mfree ( g_pbuf, g_pbufSize , "placbuf");
	// do not leave a dangling ptr behind; every lookup does g_pbuf+offset
	g_pbuf = NULL;
}
// Use this for the new functions:
// If user enters 'berlin': (try to get in country of m_ipCrid first)
// If user enters 'berlin, germany':
// PlaceDesc *getMostPopularCity_new ( uint64_t cityHash64,char crid)
// Algorithm: scan list of cities in that country and choose the most
// populated one in that country.
// If user enters 'berlin': (next, try to get most popular in world)
// PlaceDesc *getMostPopularCity_new ( uint64_t cityHash64 , 0 = crid );
// If user enters 'berlin, <adm1>' or 'cincinnati, ohio'.
// PlaceDesc *getCityInState_new ( uint64_t cityHash64,uint64_t stateHash64);
// Algorithm: get list of all places that are states with stateHash64, and
// record list as the two-letter state codes. Then scan the cities with
// cityHash64 and see which has one of the state codes in that list.
// If user enters 'germany' or 'republic of chad'
// PlaceDesc *getCountryPlace ( long a, long alnumPos, Words *w );
// need this
// PlaceDesc *getCountryDescFromId ( uint8_t crid );
// For getting the timezone from a lat/lon in a foreign country:
// PlaceDesc *getNearestCity_new ( float lat , float lon );
// . maps a hash of a word or phrase to a PlaceDesc ptr
// . dups are allowed - one key can map to multiple PlaceDescriptors
//HashTableX g_nameTable;
// . set up g_nameTable and fill it (plus g_pbuf/g_pbufSize) from places.dat
//   in g_hostdb.m_dir
// . proxies only get the empty table and return true right away
// . after a successful load we run a few lookups and scan the PlaceDesc
//   records as an integrity check, coring on any failure
// . if the load fails we log it and fall back to generatePlacesFile(),
//   returning whatever that returns
bool loadPlaces ( ) {
// map 64bit name hash to a place dec ptr. allowdups= true.
// niceness = 0.
g_nameTable.set ( 8 , // 64 bit key hash
4 , // placedec ptr
0 , // no initial slots
NULL , // no intiial buf
0 , // zero initial buf size
true , // allow dups?
0 , // niceness
"nametab" );
if ( g_proxy.isProxy() ) return true;
// log it
log("places: loading places.dat");
// try to load from disk
if ( g_nameTable.load ( g_hostdb.m_dir ,
"places.dat" ,
&g_pbuf ,
&g_pbufSize ) ) {
// test it out
PlaceDesc *pd = getCity2_new ( "abq", "nm", CRID_US,0);
if ( ! pd ) { char *xx=NULL;*xx=0; }
// make sure "nm" brings up new mexico
pd = getState2_new ( "nm", CRID_US,0);
if ( ! pd ) { char *xx=NULL;*xx=0; }
// scan for integrity
pd = (PlaceDesc *)g_pbuf;
//PlaceDesc *pdend = (PlaceDesc *)(g_pbuf+g_pbufSize);
for ( ; ; pd++ ) {
// stop if we enter the name buf space. the name buf always starts
// with the literal "unknown name" (see generatePlacesFile)
// NOTE(review): the statement this "if" guards is not visible here;
// presumably a "break" -- confirm against the full file
if ( ((char *)pd)[0] == 'u' &&
((char *)pd)[1] == 'n' &&
! strcmp((char *)pd,"unknown name" ) )
// sanity
// NOTE(review): latitude is checked against +/-180 like longitude;
// a valid latitude is within +/-90 -- confirm the looser bound is
// intended
if ( pd->m_lat < -180.0 ) { char *xx=NULL;*xx=0; }
if ( pd->m_lat > 180.0 ) { char *xx=NULL;*xx=0; }
if ( pd->m_lon < -180.0 ) { char *xx=NULL;*xx=0; }
if ( pd->m_lon > 180.0 ) { char *xx=NULL;*xx=0; }
return true;
// error?
log("places: failed to load places.dat: %s",mstrerror(g_errno));
// try making it
return generatePlacesFile ( );
// . hands out the global place buffer viewed as an array of PlaceDesc
// . PageEvents.cpp's getSiteMap() uses this to list the most popular cities
PlaceDesc *getPlaceDescBuf () {
	PlaceDesc *firstDesc = (PlaceDesc *)g_pbuf;
	return firstDesc;
}
// . build places.dat from the raw text dumps in g_hostdb.m_dir and then load
//   it back in through g_nameTable.load()
// . steps, in order:
//   1. read timeZones.txt into a temporary table keyed by the hash of the
//      time zone name
//   2. read allCountries.txt, keep cities/states/countries as PlaceDesc
//      records in "placeBuf" and their official names in "nameBuf", and map
//      the hash of each official/ascii name (and state abbreviation) to the
//      record's OFFSET in placeBuf via g_nameTable
//   3. read alternateNames.txt and add those name hashes too
//   4. add the hard-coded US state and city aliases
//   5. shift every m_officialNameOffset by the placeBuf length, since the
//      name buffer is stored right after the PlaceDesc records
//   6. save, then reload from disk
// . "dedup" keeps (name hash ^ record offset) pairs so the same name is not
//   added twice for one record; it also maps the exact hash of an official
//   name to that name's offset in nameBuf
// . returns false on error, otherwise the result of the final load
bool generatePlacesFile ( ) {
log("places: generating places.dat file");
char buf[10000];
// MAKE TIMEZONE TABLE for referencing
// scan allCountries.txt
char pcmd[1024];
sprintf(pcmd,"cat %s/timeZones.txt",g_hostdb.m_dir);
FILE *pf = popen ( pcmd , "r" );
if ( ! pf ) {
g_errno = errno;
return log("places: could not open timeZones.txt");
// value stored per time zone: the offset and whether it observes DST
class TZVal {
char m_tzoff;
char m_useDST;
HashTableX tztab;
tztab.set ( 8 , sizeof(TZVal),0,NULL,0,false,0,"tztab");
// read in the lines
while ( fgets ( buf , 10000 , pf ) ) {
// null terminate it, instead of \n
// parse it up. timezonestr\ttzoff1|tzoffdst
char timeZoneStr[64]; // Europe/Andorra
long off1;
long off2; // dst
// NOTE(review): "%s" has no width limit for the 64-byte buffer and the
// sscanf() result is not checked, so a short line leaves off1/off2
// unset
sscanf ( buf ,
"%s\t" // timezone name
"%li\t" // off1
"%li" // off2
, timeZoneStr
, &off1
, &off2
// make a table
long long tzh64 = getWordXorHash ( timeZoneStr );
// make the value
TZVal tzval;
tzval.m_tzoff = off1;
// a zone uses DST when its two offsets differ
if ( off1 != off2 ) tzval.m_useDST = 1;
else tzval.m_useDST = 0;
tztab.addKey ( &tzh64 , &tzval );
// NOTE(review): no pclose() of the timeZones.txt pipe is visible before
// "pf" is reused below -- confirm
// . map a geoId to ptr to the PlaceDesc in the g_placeBuf
// . a temporary table really...
HashTableX places;
places.set ( 4, 4, 5000000 , NULL ,0 , false, 0,"gpht");
// official names of each place
SafeBuf nameBuf;
nameBuf.reserve ( 10*1024*1024 );
// this is actually required and we check for it to avoid
// overruning our PlaceDesc when we scan those. we need this
// to set "pdend" for the PlaceDesc scan because we concatenate
// the nameBuf to the end of the placeBuf. so basically
// places.dat holds those two conjoined buffers ...
nameBuf.safePrintf("unknown name");
long zero = 0;
// reserve 100MB
SafeBuf placeBuf;
placeBuf.reserve ( 100*1024*1024 );
HashTableX dedup;
dedup.set ( 8,4,100000,NULL,0,false,0,"pddptb");
// this will have to be remade
// NOTE(review): the unlink command is only formatted here; no visible
// statement runs it before pcmd is overwritten below
sprintf(pcmd,"unlink %s/citylatlist.dat",g_hostdb.m_dir);
// scan allCountries.txt
sprintf(pcmd,"cat %s/allCountries.txt",g_hostdb.m_dir);
pf = popen ( pcmd , "r" );
if ( ! pf ) { g_errno = errno; return false; }
// limit g_nameTable from getting too big! otherwise places.dat
// is 550MB on disk and in memory!!! with this is it 200MB.
// otherwise it grows to 32M slots...
g_nameTable.m_maxSlots = 8388608; // 1<<23
// read in the lines
while ( fgets ( buf , 10000 , pf ) ) {
// null terminate it, instead of \n
// parse it up. id|name|lat|lon|abbr
// NOTE(review): the next nine declarations are repeated further down in
// this loop body as shown here -- confirm which set is live
long geoId;
char name[512];
float lat;
float lon;
char code [16];
char countryAbbr[32];
char stateAbbr[32];
long population = 0;
char timeZoneStr[64]; // Europe/Andorra
// convert all tabs to \0
char *p = buf;
for ( ; *p ; p++ ) if ( *p == '\t' ) *p = '\0';
// see /geo/geonames/index.html for format description
// each "p += strlen(p) + 1" below steps over one \0-terminated field
p = buf;
long geoId = atol(p); p += strlen(p) + 1;
//if ( geoId == 1850147 )
// log("hey");
char *officialName = p; p += strlen(p) + 1; // official name
char *asciiName = p; p += strlen(p) + 1; // asciname
char *altNames = p; p += strlen(p)+1; // altnames
float lat;
// sometimes allCountries.txt leaves out "altNames" field!
// so detect if this field is a latitude or not...
bool hadAlpha = false;
bool hadDigit = false;
bool hadPeriod = false;
char *tmp = altNames;
for ( ; *tmp ; tmp++ ) {
if ( is_alpha_a(*tmp) ) hadAlpha = true;
if ( is_digit (*tmp) ) hadDigit = true;
if ( *tmp == '.' ) hadPeriod = true;
// need a digit and no alphas to be a latitude
bool isLat = false;
if ( hadDigit && ! hadAlpha && hadPeriod ) isLat = true;
if ( isLat ) {
lat = atof ( altNames );
else {
lat = atof(p);
p += strlen(p) + 1;
float lon = atof ( p ); p += strlen(p) + 1;
p += strlen(p) + 1; // code class
char *code = p; p += strlen(p)+1; // code type
char *countryAbbr = p; p += strlen(p)+1;
p += strlen(p)+1; // altCountry
char *stateAbbr = p; p += strlen(p)+1;
p += strlen(p)+1; // adm2
p += strlen(p)+1; // adm3
p += strlen(p)+1; // adm4
long population = atol(p); p += strlen(p)+1;
p += strlen(p)+1; // elevation
p += strlen(p)+1; // avg elevation
char *timeZoneStr = p; p += strlen(p)+1;
p += strlen(p)+1; // moddate
// debug point
//if ( geoId == 5381396 )
// log("hey");
// skip if no timezone for now
if ( ! timeZoneStr[0] ) {
log("places: no timezone for geoid=%li name=%s",
// reserve space
//placeBuf.reserve ( 1024 );
// not allowed to grow since we use dedup table now
if ( placeBuf.getAvail() < (long)sizeof(PlaceDesc) ) {
char *xx=NULL;*xx=0;}
// make a new country desc
// the record is written in place at the end of placeBuf but is only
// committed once m_length is bumped below; a "continue" before that
// point just lets the next line overwrite it
PlaceDesc *pd = (PlaceDesc *)placeBuf.getBuf();
// see
// exceptions:
// "122 Mile House" ...
if ( ! strncmp( code,"PPLL",4)) continue;
// a basic city
if ( ! strncmp( code,"PPL",3)) pd->m_flags = PDF_CITY;
// locality
else if ( ! strcmp ( code ,"LCTY")) pd->m_flags = PDF_CITY;
// . town of, township, town of north hempstead
// . crap! this gets a different san jose!
// . avoid "City of Cincinnati" etc.. crap
// . BUT allow town of north hempstead through (5129081)
else if ( ! strcmp ( code ,"ADMD") && geoId == 5129081 )
pd->m_flags = PDF_CITY;
// independent political entity
else if ( ! strcmp ( code,"PCLIX")) pd->m_flags = PDF_CITY;
// another city i guess
else if ( ! strcmp ( code , "P" ) ) pd->m_flags = PDF_CITY;
// states
else if ( ! strcmp ( code ,"ADM1")) pd->m_flags = PDF_STATE;
// countries
else if ( ! strcmp ( code ,"PCLI")) pd->m_flags = PDF_COUNTRY;
// otherwise, skip it!
else continue;
// . sanity
// . these were messing up our raw lat/lon processing
// in searchinput.cpp because we thought that a direct
// lat/lon in the wherebox was a city name because there was
// a city name that was "35", which was our latitude entered!
if ( pd->m_flags == PDF_CITY && is_digit(officialName[0]) ){
log("places: bad city name: %s",officialName);
// a bunch of cities do not have states...
//if ( pd->m_flags != PDF_COUNTRY &&
// ( ! stateAbbr[0] || ! stateAbbr[0] ) ) {
// log("hey %s",officialName);
// continue;
// get country id
pd->m_crid = getCountryId ( countryAbbr );
// geoid for looking up in alternateNames.txt
//pd->m_geoId = geoId;
// lat and lon
pd->m_lat = lat;
pd->m_lon = lon;
pd->m_population = population;
// skip over it (not allowed to grow anymore!)
//placeBuf.advance ( sizeof(PlaceDesc) );
placeBuf.m_length += (long)sizeof(PlaceDesc);
// . point to that. we'll store <adm1>,<name> in there now
// . we need to somehow append alternate names later
//pd->m_data = placeBuf.getBuf();
// store adm1 in m_data[]
pd->m_adm1[0] = to_lower_a(stateAbbr[0]);
pd->m_adm1[1] = to_lower_a(stateAbbr[1]);
// if greece... use last two
if ( to_lower_a(countryAbbr[0]) == 'g' &&
to_lower_a(countryAbbr[1]) == 'r' &&
pd->m_adm1[0] == 'e' &&
pd->m_adm1[1] == 's' &&
is_digit(stateAbbr[4]) &&
is_digit(stateAbbr[5]) ) {
// store the last two letter's for greece
pd->m_adm1[0] = to_lower_a(stateAbbr[4]);
pd->m_adm1[1] = to_lower_a(stateAbbr[5]);
// hash timezone string
uint64_t tzh64 = getWordXorHash ( timeZoneStr );
//look it up in our table made from /geo/geonames/timeZones.txt
TZVal *tzv = (TZVal *)tztab.getValue ( &tzh64 );
// core if allCountries.txt names a zone timeZones.txt did not have
if ( ! tzv ) { char *xx=NULL;*xx=0 ;}
// from -12 to + 12 i guess
pd->m_timeZoneOffset = tzv->m_tzoff;
// now the daylightsavings time flag
if ( tzv->m_useDST ) pd->m_flags |= PDF_USE_DST;
// . add to table using the name as the key
// . i think this table is just for generation since
// we'll use the g_namesTable to map place names to
// the PlaceDesc.
// NOTE(review): no insert into "places" (geoId -> PlaceDesc) is
// visible in this loop, yet the alternateNames.txt pass below looks
// records up in it -- confirm against the full file
// store OFFSETS in nametable
long placeDescOffset = (char *)pd - placeBuf.getBufStart();
// we need to add the official name here because it's not
// always in alternateNames.txt...
uint64_t nh64a = getWordXorHash ( officialName );
uint64_t dedupKeya = nh64a ^ (unsigned long)placeDescOffset;
// skip if in there
if ( ! dedup.isInTable(&dedupKeya) ) {
// make this name's hash point to its PlaceDesc
if ( ! g_nameTable.addKey ( &nh64a, &placeDescOffset))
return false;
// do not add dup combos
dedup.addKey ( &dedupKeya , &zero );
// hmmm... we need nh64 to be ascii for adding to nameBuf...
uint64_t exactHash64 = hash64n ( officialName );
// also make this name's hash point to the
// name itself so we can convert a lat/lon into
// a place name, based on getNearestCity_new()
if ( ! dedup.isInTable ( &exactHash64 ) ) {
// nameBuf
long nameOffset = nameBuf.length();
// store it
// NOTE(review): only "olen" bytes are copied here; whether a
// terminating \0 is appended after the name is not visible -- confirm,
// since readers strcmp() against these names
long olen = gbstrlen(officialName);
nameBuf.safeMemcpy ( officialName , olen );
// store offset
pd->m_officialNameOffset = nameOffset;
// do not repeat!
dedup.addKey ( &exactHash64 , &nameOffset );
else {
// i guess we already added this name before so
// point to where we added it
long off = *(long *)dedup.getValue ( &exactHash64 );
// use that then
pd->m_officialNameOffset = off;
// also add the ascii too, it seems a lot of times that
// is not given in the alternateNames.txt file either!!!!
uint64_t nh64b = getWordXorHash ( asciiName );
uint64_t dedupKeyb = nh64b ^ (unsigned long)placeDescOffset;
// skip if in there
if ( ! dedup.isInTable(&dedupKeyb) ) {
// make this name's hash point to its PlaceDesc
if ( ! g_nameTable.addKey ( &nh64b, &placeDescOffset))
return false;
// do not add dup combos
dedup.addKey ( &dedupKeyb , &zero );
// skip if not state
if ( ! ( pd->m_flags & PDF_STATE) ) continue;
// skip if is numeric for now... strange...
//if ( is_digit(stateAbbr[0]) ) continue;
if ( ! stateAbbr[0] ) continue;
// if we are a state, add our abbreviation here as well!
// does this convert to lowercase? yes... it should
uint64_t nh64c = getWordXorHash ( stateAbbr );
// make another dedupkey
uint64_t dedupKeyc = nh64c ^ (unsigned long)placeDescOffset;
// check that as well
if ( dedup.isInTable(&dedupKeyc) ) continue;
if ( ! g_nameTable.addKey ( &nh64c , &placeDescOffset ) )
return false;
// do not add dup combos
dedup.addKey ( &dedupKeyc , &zero );
// close the pipe
// NOTE(review): the pclose() this comment refers to is not visible here
// . now scan in the alternateNames.txt
// . add to the hashtablex g_nameTable
// . key is word xor hash of the name
// . value is ptr to the PlaceDesc in placeBuf
// . allow dups since a single name can point to multiple unique places
sprintf(pcmd,"cat %s/alternateNames.txt",g_hostdb.m_dir);
pf = popen ( pcmd , "r" );
if ( ! pf ) { g_errno = errno; return false; }
// read in the lines
while ( fgets ( buf , 10000 , pf ) ) {
// null terminate it, instead of \n
// convert all tabs to \0
char *p = buf;
for ( ; *p ; p++ ) if ( *p == '\t' ) *p = '\0';
// parse it up. id|name|lat|lon|abbr
p = buf;
p += strlen(p) + 1; // some number
long geoId = atol(p); p += strlen(p) + 1;
p += strlen(p) + 1; // langIdStr
char *altName = p; p += strlen(p) + 1;
p += strlen(p) + 1; // is preferred name
p += strlen(p) + 1; // is short ?name
// now hash up that name
uint64_t nh64d = getWordXorHash ( altName );
// find the place desc for it
PlaceDesc **ppd = (PlaceDesc **)places.getValue ( &geoId );
// this won't be there if its not a city,ctry,state, etc.
// or timezone was missing above
if ( ! ppd ) continue;
// cast it otherwise
PlaceDesc *pd = *ppd;
// store OFFSETS in nametable
long placeDescOffset = (char *)pd - placeBuf.getBufStart();
// do not add dup combos
uint64_t dedupKeyd = nh64d ^ (unsigned long)placeDescOffset;
if ( dedup.isInTable ( &dedupKeyd ) ) continue;
// use that
if ( ! g_nameTable.addKey ( &nh64d , &placeDescOffset ) )
return false;
// do not add dup combos
dedup.addKey ( &dedupKeyd , &zero ) ;
// set this temporarily so getState_new() etc. works for now
g_pbuf = placeBuf.getBufStart();
// . add in state aliases for states in the US
// . "wash" = "washington" "ore = oregeon" etc.
long n = (long)sizeof(s_states)/ sizeof(StateDesc);
for ( long i = 0 ; i < n ; i++ ) {
// get it
StateDesc *sd = &s_states[i];
// skip if none
if ( ! sd->m_name2 ) continue;
// get original name
uint64_t nh64 = getWordXorHash ( sd->m_name1 );
// get the PlaceDesc. this will scan all the matches and
// get the one that is a state in the US
PlaceDesc *pd = getState_new ( nh64 , CRID_US , 0 );
// must be there
if ( ! pd ) { char *xx=NULL;*xx=0; }
// make key (d.c. colo. n.m.)
uint64_t anh64 = getWordXorHash ( sd->m_name2 );
// store OFFSETS in nametable
long offset = (char *)pd - placeBuf.getBufStart();
// add the alias
if ( ! g_nameTable.addKey ( &anh64 , &offset ) ) return false;
// add our CITY aliases i.e. "abq" or "nyc" for cities in the US
n = (long)sizeof(s_cityList)/ sizeof(AliasDesc);
for ( long i = 0 ; i < n ; i++ ) {
// get it
AliasDesc *ad = &s_cityList[i];
// get the PlaceDesc. this will scan all the matches and
// get the one that is a city in the given US state
PlaceDesc *pd = getCity2_new(ad->m_s2, ad->m_adm1 , CRID_US,0);
// must be there
if ( ! pd ) { char *xx=NULL;*xx=0; }
// make key from the alias itself ("abq", "nyc", ...)
uint64_t ach64 = getWordXorHash ( ad->m_s1 );
// store OFFSETS in nametable
long offset = (char *)pd - placeBuf.getBufStart();
// add the alias
if ( ! g_nameTable.addKey ( &ach64 , &offset ) ) return false;
// size of placeBuf
long placeBufLength = placeBuf.length();
// concatenate nameBuf to placeBuf for saving to disk
// NOTE(review): the callee appears to be missing from this expression
// as shown (presumably an append of nameBuf onto placeBuf) -- confirm
if ( ! ( nameBuf ) ) return false;
// adjust all PlaceDesc::m_officialNameOffset vars to compensate for
// this concatenation
PlaceDesc *pd = (PlaceDesc *)placeBuf.getBufStart();
PlaceDesc *pdend = (PlaceDesc *)(((char *)pd) + placeBufLength);
for ( ; pd < pdend ; pd++ )
pd->m_officialNameOffset += placeBufLength;
// test it out
PlaceDesc *pd2 = getCity2_new ( "abq", "nm", CRID_US,0);
if ( ! pd2 ) { char *xx=NULL;*xx=0; }
long long ph64 = getWordXorHash ( "Tokyo" );
pd2 = getMostPopularPlace_new ( ph64 ,CRID_ANY ,PDF_CITY,0 );
if ( ! pd2 ) { char *xx=NULL;*xx=0; }
// pasadena texas is more popular than california!
ph64 = getWordXorHash ( "Pasadena" );
pd2 = getMostPopularPlace_new ( ph64 ,CRID_US ,PDF_CITY,0 );
//if ( pd2->m_population != 144618 ) { char *xx=NULL;*xx=0; }
if ( ! pd2 ) { char *xx=NULL;*xx=0; }
// . now the g_nameTable points into the buffer of PlaceDesc, save it
// . HashTableX can save the buffer too now!
// NOTE(review): the callee appears to be missing from this expression
// as shown (presumably a save of g_nameTable with the buffer) -- confirm
if ( ! ( g_hostdb.m_dir ,
"places.dat" ,
placeBuf.getBufStart() ,
placeBuf.length() ) )
return false;
// ok, try loading now
log("places: loading generated table places.dat from disk");
return g_nameTable.load ( g_hostdb.m_dir , "places.dat" ,
&g_pbuf ,
&g_pbufSize );
// . look up a state by the hash of its name (or abbreviation)
// . "pd64" is the name hash used as the g_nameTable key
// . "crid" must equal the state's country id exactly
// . returns the first matching PlaceDesc, or NULL if there is none
PlaceDesc *getState_new ( uint64_t pd64 , uint8_t crid , long niceness ) {
	// walk every entry filed under this name hash
	long cur = g_nameTable.getSlot ( &pd64 );
	while ( cur >= 0 ) {
		// breathe
		// the table stores offsets into g_pbuf, not ptrs
		long bufOff = *(long *)g_nameTable.getValueFromSlot(cur);
		PlaceDesc *cand = (PlaceDesc *)(g_pbuf + bufOff);
		// want a state, and in the requested country
		bool isState = ( cand->m_flags & PDF_STATE ) != 0;
		if ( isState && cand->m_crid == crid ) return cand;
		// try the next dup of this key
		cur = g_nameTable.getNextSlot(cur,&pd64);
	}
	// no such state
	return NULL;
}
// . same as getState_new() but takes the state name/abbreviation as a string
// . the string is hashed with getWordXorHash() to form the lookup key
PlaceDesc *getState2_new ( char *state , uint8_t crid , long niceness ) {
	return getState_new ( getWordXorHash ( state ) , crid , niceness );
}
// . find a city by name hash within a given two-letter state code
// . "ch64" is the city name hash used as the g_nameTable key
// . "stateAbbr" must hold two lower case letters or we core
// . "crid" restricts the country unless it is CRID_ANY
// . returns the first matching PlaceDesc, or NULL if there is none
PlaceDesc *getCity_new ( uint64_t ch64 ,
			 char *stateAbbr ,
			 uint8_t crid ,
			 long niceness ) {
	// sanity: both chars of the state code must be lower case
	if ( ! is_lower_a(stateAbbr[0]) ||
	     ! is_lower_a(stateAbbr[1]) ) { char *xx=NULL;*xx=0; }
	// walk every entry filed under this name hash
	for ( long cur = g_nameTable.getSlot ( &ch64 ) ;
	      cur >= 0 ;
	      cur = g_nameTable.getNextSlot(cur,&ch64) ) {
		// breathe
		// the table stores offsets into g_pbuf, not ptrs
		long bufOff = *(long *)g_nameTable.getValueFromSlot(cur);
		PlaceDesc *cand = (PlaceDesc *)(g_pbuf + bufOff);
		// cities only
		if ( ! (cand->m_flags & PDF_CITY ) ) continue;
		// country has to agree unless the caller accepts any
		bool countryOk = ( crid == CRID_ANY || cand->m_crid == crid );
		if ( ! countryOk ) continue;
		// both letters of the state code have to agree
		bool stateOk = ( stateAbbr[0] == cand->m_adm1[0] &&
				 stateAbbr[1] == cand->m_adm1[1] );
		if ( stateOk ) return cand;
	}
	// no such city
	return NULL;
}
// . same as getCity_new() but takes the city name as a string
// . the string is hashed with getWordXorHash() to form the lookup key
PlaceDesc *getCity2_new ( char *city ,
			  char *stateAbbr ,
			  uint8_t crid ,
			  long niceness ) {
	return getCity_new ( getWordXorHash ( city ) ,
			     stateAbbr ,
			     crid ,
			     niceness );
}
// . find the city named by "ch64" that lies in the state named by
//   "stateHash64" (a full state name or an abbreviation, anything that is
//   filed in g_nameTable for that state)
// . for each candidate city we scan the places filed under "stateHash64"
//   and accept the city if one of them is a STATE in the city's own country
//   whose two-letter adm1 code equals the city's adm1 code
// . "crid" restricts the country unless it is CRID_ANY
// . returns the first matching city PlaceDesc, or NULL if there is none
PlaceDesc *getCity3_new ( uint64_t ch64 ,
			  uint64_t stateHash64,
			  uint8_t crid ,
			  long niceness ) {
	long slot1 = g_nameTable.getSlot ( &ch64 );
	// scan every place filed under the city name hash
	for ( ; slot1 >= 0 ; slot1 = g_nameTable.getNextSlot(slot1,&ch64) ) {
		// breathe
		// the table stores offsets into g_pbuf, not ptrs
		long offset1 = *(long *)g_nameTable.getValueFromSlot(slot1);
		PlaceDesc *pd1 = (PlaceDesc *)(g_pbuf + offset1);
		// skip if not a city
		if ( ! (pd1->m_flags & PDF_CITY ) ) continue;
		// skip if not right country
		if ( crid != CRID_ANY && pd1->m_crid != crid ) continue;
		// see if we got a state that matches "stateHash64" and
		// "pd1->m_adm1"
		long slot2 = g_nameTable.getSlot ( &stateHash64 );
		for ( ; slot2 >= 0 ;
		      slot2=g_nameTable.getNextSlot(slot2,&stateHash64)) {
			// breathe
			long offset2;
			offset2 = *(long *)g_nameTable.getValueFromSlot(slot2);
			PlaceDesc *pd2 = (PlaceDesc *)(g_pbuf + offset2);
			// skip if not a state. this used to test PDF_CITY,
			// which threw away the very states we are matching
			// the city against
			if ( ! (pd2->m_flags & PDF_STATE ) ) continue;
			// adm1 codes are only meaningful within one country,
			// so the state must be in the city's country. pd1
			// already passed the "crid" filter above, so this
			// also enforces "crid" when it is not CRID_ANY
			if ( pd2->m_crid != pd1->m_crid ) continue;
			// matching abbr?
			if ( pd2->m_adm1[0] != pd1->m_adm1[0] ) continue;
			if ( pd2->m_adm1[1] != pd1->m_adm1[1] ) continue;
			// it's a match!
			return pd1;
		}
	}
	// no city by that name in that state
	return NULL;
}
// . starting at word #a, grow a candidate place name one alnum word at a
//   time and look each prefix up with getPlaceDesc()
// . at most 4 alnum words and roughly 20 words overall are examined
// . "placeType", "crid" and "stateAbbr" are passed straight through to
//   getPlaceDesc() to filter the matches
// . on a match we set *placeA/*placeB (word range), *placeAlnumA/
//   *placeAlnumB (alnum position range), *placeHash64 and *pdp (the last
//   two only if non-NULL)
// . *placeHash64 is zeroed up front, and again if the match is directly
//   followed by "county"
// . every visible return path returns true
bool getLongestPlaceName_new ( long a,
long alnumPos,
Words *words,
uint8_t placeType,
uint8_t crid,
char *stateAbbr,
uint64_t *placeHash64,
long *placeAlnumA,
long *placeAlnumB,
long *placeA,
long *placeB ,
// set to most popular match
PlaceDesc **pdp ) {
// assume none
if ( placeHash64 ) *placeHash64 = 0LL;
// init hash to zero
long long h = 0LL;
// max count
long count = 0;
// record start
long startAlnumPos = alnumPos;
// fix this
// for some filtering
// the filter word hashes are computed once, on the first call
static bool s_flag = false;
static long long h_university;
static long long h_of;
// NOTE(review): "h_county" is used further down but neither declared nor
// initialized in the code visible here -- confirm against the full file
if ( ! s_flag ) {
s_flag = true;
h_university = hash64n("university");
h_of = hash64n("of");
// shortcut
long nw = words->m_numWords;
long wcount = 0;
// loop over words in [a,b)
for ( long k = a ; k < nw ; k++ ) {
// or 15 words is good enough too!
if ( ++wcount >= 20 ) break;
// skip if not alnum
if ( ! words->isAlnum(k) ) continue;
// count it
// NOTE(review): no increment of "alnumPos" is visible here even though
// *placeAlnumB is derived from it below -- confirm
// only up to 4 words in a place name
if ( ++count >= 5 ) break;
// get the hash of potential place name
long long wid = words->m_wordIds[k];
// shortcut
long wlen = words->m_wordLens[k];
char *wptr = words->m_words[k];
// if it ended in apostrophe s then fix that
if ( wlen > 2 &&
wptr[wlen-2]=='\'' &&
to_lower_a(wptr[wlen-1]) == 's' )
// hash the word without the 's
wid = hash64Lower_utf8(wptr,wlen-2);
// mix it up
h <<= 1;
// hash it into our ongoing hash
h ^= wid;
// ignore "University" if "of" follows
// NOTE(review): the statement this "if" guards is not visible here --
// confirm against the full file
if ( h == h_university &&
k + 2 < nw &&
words->m_wordIds[k+2] == h_of )
// get it. just get the most popular that matches
PlaceDesc *pd = getPlaceDesc ( h,placeType,crid,stateAbbr,0);
if ( ! pd ) continue;
// check for "county" (santa fe county is not a city name)
if ( k + 2 < nw && words->m_wordIds[k+2] == h_county ) {
// nuke it
if ( placeHash64 ) *placeHash64 = 0LL;
return true;
// shortcuts
//char **wptrs = words->getWords();
//long *wlens = words->getWordLens();
// set the place
*placeA = a;
*placeB = k+1;
*placeAlnumA = startAlnumPos;
*placeAlnumB = alnumPos+1;
if ( placeHash64 ) *placeHash64 = h;
if ( pdp ) *pdp = pd;
return true;
// . "placeType" is a mask of PDF_CITY, PDF_STATE and/or PDF_COUNTRY
// . of all the places filed under "placeHash64" that pass the filters,
//   return the one with the largest population (first one wins a tie)
// . "crid" restricts the country unless it is CRID_ANY
// . "stateAbbr", when non-NULL, must equal the place's 2-char adm1 code
// . returns NULL if nothing passes
PlaceDesc *getPlaceDesc ( uint64_t placeHash64 ,
			  uint8_t placeType ,
			  uint8_t crid,
			  char *stateAbbr,
			  long niceness ) {
	PlaceDesc *winner = NULL;
	long topPop = -1;
	// walk every entry filed under this name hash
	long cur = g_nameTable.getSlot ( &placeHash64 );
	while ( cur >= 0 ) {
		// breathe
		// the table stores offsets into g_pbuf, not ptrs
		long bufOff = *(long *)g_nameTable.getValueFromSlot(cur);
		PlaceDesc *cand = (PlaceDesc *)(g_pbuf + bufOff);
		// line up the next dup now so the filters can just bail
		cur = g_nameTable.getNextSlot(cur,&placeHash64);
		// wrong type of place?
		if ( ! (cand->m_flags & placeType ) ) continue;
		// wrong country?
		if ( crid != CRID_ANY && cand->m_crid != crid ) continue;
		// wrong state?
		if ( stateAbbr &&
		     ( cand->m_adm1[0] != stateAbbr[0] ||
		       cand->m_adm1[1] != stateAbbr[1] ) ) continue;
		// keep it only if strictly more populous than the best so far
		if ( cand->m_population > topPop ) {
			topPop = cand->m_population;
			winner = cand;
		}
	}
	return winner;
}
bool getZip_new ( long a ,
long alnumPos ,
Words *words ,
uint64_t *zipHash64 ,
uint64_t *zipCityHash64 ,
uint64_t *zipStateHash64 ,
long *zipAlnumA,
long *zipAlnumB,
long *zipA,
long *zipB,
float *zipLat,
float *zipLon ) {
// assume none
if ( zipHash64 ) *zipHash64 = 0LL;
// must be a number
if ( ! is_digit(words->m_words[a][0]) ) return true;
// make hash
long long h = 0 ^ words->m_wordIds[a];
// check for zip code
long slot = g_zips.getSlot(&h);
// skip if not
if ( slot < 0 ) return true;
// get the place
ZipDesc *zd =(ZipDesc *)g_zips.getValueFromSlot(slot);
// set state hash
if ( zipStateHash64 ) *zipStateHash64 = hash64(zd->m_adm1,2,0LL);
// and city hash
if ( zipCityHash64 ) *zipCityHash64 = zd->m_cityHash;
*zipA = a;
*zipB = a+1;
*zipAlnumA = alnumPos;
*zipAlnumB = alnumPos+1;
if ( zipHash64 ) *zipHash64 = h;
*zipLat = zd->m_latitude;
*zipLon = zd->m_longitude;
return true;
// . of all the places filed under "placeHash64" whose flags intersect
//   "placeType", return the one with the largest population (first one
//   wins a tie)
// . "crid" restricts the country unless it is CRID_ANY
// . returns NULL if nothing passes
PlaceDesc *getMostPopularPlace_new ( long long placeHash64,
				     uint8_t crid ,
				     uint8_t placeType,
				     long niceness ) {
	PlaceDesc *winner = NULL;
	long topPop = -1;
	// walk every entry filed under this name hash
	long cur = g_nameTable.getSlot ( &placeHash64 );
	while ( cur >= 0 ) {
		// breathe
		// the table stores offsets into g_pbuf, not ptrs
		long bufOff = *(long *)g_nameTable.getValueFromSlot(cur);
		PlaceDesc *cand = (PlaceDesc *)(g_pbuf + bufOff);
		// line up the next dup now so the filters can just bail
		cur = g_nameTable.getNextSlot(cur,&placeHash64);
		// wrong type of place?
		if ( ! (cand->m_flags & placeType ) ) continue;
		// wrong country?
		if ( crid != CRID_ANY && cand->m_crid != crid ) continue;
		// keep it only if strictly more populous than the best so far
		if ( cand->m_population > topPop ) {
			topPop = cand->m_population;
			winner = cand;
		}
	}
	return winner;
}
// . the new getNearestCity_new() function
// . copied from getNearestCity() function above
//static long *s_latList2 = NULL;
//static long s_latListSize2 = 0;
//static long s_ni2 = 0;
// . a packed array of 4-byte offsets into g_pbuf, one per city PlaceDesc,
//   sorted by latitude with latcmp_new()
// . built (or loaded from citylatlist.dat) by initCityLists_new() and read
//   by getNearestCity_new()
static SafeBuf s_cityLatList;
// . we need a list of the city ids sorted by lat, and a list sorted by lon
// . then we do b-stepping on each list
// . bstep down to a 20 mile by 20 mile box
// . then intersect using a hashtable
// . if empty, then increase to 30 by 30 mile box, etc.
// . there are 123k US cities in cities.dat
// . these 2 lists should be about 2MB then
// . then lookup cityid in g_timezones to get timezone
// NOTE(review): the notes above describe a lat list AND a lon list, but
// only the latitude-sorted list exists in the code below
// . return the city PlaceDesc that best matches the given lat/lon
// . we binary-step through s_cityLatList (sorted by latitude) to land near
//   "lat", widen that to the stripe of cities within "radius" degrees of
//   latitude, then make two passes over the stripe:
//   pass 1: find the plainly closest city (squared distance in degrees)
//   pass 2: among cities in the SAME state and country as the pass 1
//           winner, find the one with the smallest "squared distance in
//           miles minus a population-based city radius squared", so big
//           cities reach further than small ones
// . *distInMilesSquared, if non-NULL, gets the pass 2 score of the winner,
//   which can be negative because of the subtraction
// . longitude is never used to narrow the stripe, only to score
// NOTE(review): an empty s_cityLatList (ni == 0) is not guarded against
// in the visible code; latList[start] would then be read out of bounds
PlaceDesc *getNearestCity_new ( float lat ,
float lon ,
long niceness ,
float *distInMilesSquared ) {
// . radius is 10 miles, put miles into degrees
// . when it was 5 we did not get "Santa Fe" for an event, it
// thought it was in "Agua Fria"
float radius = 10.0 / 69.0;
PlaceDesc *pd = NULL;
// how many cities we got?
long ni = s_cityLatList.length() / 4;
long *latList = (long *)s_cityLatList.getBufStart();
long step = ni / 2;
// get lat boundaries using bstep
long start = ni / 2;
// NOTE(review): the "tryagain" label targeted by the goto near the end
// is not visible in this function as shown -- confirm where it sits
// do the bstepping
for ( ; ; ) {
// get that city
long cityOffset = latList[start];
// get PlaceDesc
pd = (PlaceDesc *)(g_pbuf + cityOffset);
// increase resolution for next round
step /= 2;
// step it down?
if ( lat < pd->m_lat ) start -= step;
// use " - radius" here as well to avoid infinite loop?
else if ( lat > pd->m_lat ) start += step;
// ok, we are in range, done
else break;
// avoid breaching!
if ( start < 0 ) { start = 0 ; break; }
if ( start >= ni ) { start = ni-1; break; }
// stop if we hit steps of 0
if ( step <= 0 ) break;
// [lata,latb] will bracket the stripe of candidate cities
long lata = start;
long latb = start;
long count = 0;
// TODO: do b-step on these too, takes like 3500 iterations for
// both of these loops
// adjust lata/latb until just out of range
for ( ; lata > 0 ; lata-- ) {
long cityOffset = latList[lata];
pd = (PlaceDesc *)(g_pbuf + cityOffset);
if ( pd->m_lat < lat - radius ) break;
for ( ; latb < ni ; latb++ ) {
long cityOffset = latList[latb];
pd = (PlaceDesc *)(g_pbuf + cityOffset);
if ( pd->m_lat > lat + radius ) break;
// first do a loop to get the absolutely closest place
// to this lat/lon regardless of population
float min1 = -1.0;
PlaceDesc *minpd1 = NULL;
// add in the lat cities
for ( long i = lata ; i <= latb ; i++ ) {
// break? (latb can equal ni if the loop above ran off the end)
if ( i >= ni ) break;
// breathe
// get that city
long cityOffset = latList[i];
pd = (PlaceDesc *)(g_pbuf + cityOffset);
// sanity check
if ( cityOffset > g_pbufSize ) { char *xx=NULL;*xx=0; }
if ( cityOffset < 0 ) { char *xx=NULL;*xx=0; }
// just compute distance (in degrees here, not miles)
float latDiff = pd->m_lat - lat;
float lonDiff = pd->m_lon - lon;
// add up
float dist = latDiff*latDiff + lonDiff*lonDiff;
// min? (the first city always becomes the min since minpd1 is NULL)
if ( dist > min1 && minpd1 ) continue;
// set it
min1 = dist;
minpd1 = pd;
// then do a second loop to find the closest place, taking population
// into account, but also keeping the state/country the same
// as in "minpd1"
float min2 = -1.0;
PlaceDesc *minpd2 = NULL;
// add in the lat cities
for ( long i = lata ; i <= latb ; i++ ) {
// break?
if ( i >= ni ) break;
// breathe
// get that city
long cityOffset = latList[i];
pd = (PlaceDesc *)(g_pbuf + cityOffset);
// just compute distance
float latDiff = pd->m_lat - lat;
float lonDiff = pd->m_lon - lon;
// convert into miles
latDiff *= 69;
lonDiff *= 69;
// must match that of minpd1's state and country
if ( pd->m_adm1[0] != minpd1->m_adm1[0] ) continue;
if ( pd->m_adm1[1] != minpd1->m_adm1[1] ) continue;
if ( pd->m_crid != minpd1->m_crid ) continue;
// but consider the radius of the city to be up to 10 miles
// for a population of 1M people...
// one degree is 69.0 miles
float pop = pd->m_population;
// restrict to 500k people
if ( pop > 500000.0 ) pop = 500000.0;
// compute the city radius, can be up to 33*33 miles
// (i.e. the squared radius scales linearly from 0 to 1000)
float cityRadiusSquared = (1000.0 * pop) / 500000.0;
// square that
//float cityRadiusSquared = cityRadius * cityRadius;
// add up
float dist = latDiff*latDiff + lonDiff*lonDiff;
// subtract
dist -= cityRadiusSquared;
//if ( dist < 200 )
// log("places: city=%s dist=%.01f rad=%.01f",
// pd->getOfficialName(),dist,cityRadiusSquared);
// min?
if ( dist > min2 && minpd2 ) continue;
// set it
min2 = dist;
minpd2 = pd;
// must have one
if ( ! minpd2 ) {
// note it
log("addr: what the hell.");
// increase stripe width
// NOTE(review): "radius" is in degrees (it starts at 10 miles / 69),
// so this adds 10 degrees, not 10 miles -- confirm that is intended
radius += 10.0;
// try again
goto tryagain;
// debug point -- undo this later
//if ( ! strcmp(minpd2->getOfficialName(),"Agua Fria") )
// log("hey");
if ( distInMilesSquared ) *distInMilesSquared = min2;
// return that then
return minpd2;
int latcmp_new ( const void *arg1 , const void *arg2 ) {
long off1 = *(long *)arg1;
long off2 = *(long *)arg2;
// get the addresses
PlaceDesc *cd1;
PlaceDesc *cd2;
cd1 = (PlaceDesc *)(g_pbuf + off1);
cd2 = (PlaceDesc *)(g_pbuf + off2);
// simple compare
if ( cd1->m_lat < cd2->m_lat ) return -1;
if ( cd1->m_lat > cd2->m_lat ) return 1;
return 0;
bool testCityList ( ) {
PlaceDesc *pd;
char *name;
pd = getNearestCity_new ( 35.596035,-106.052246,0,NULL);
if ( ! pd ) { char *xx=NULL;*xx=0; }
name = pd->m_officialNameOffset + g_pbuf;
if ( strcmp ( name , "Santa Fe" ) ) { char *xx=NULL;*xx=0; }
// try this. make sure this is albuquerque
pd = getNearestCity_new ( 35.08449 ,-106.6511,0,NULL);
if ( ! pd ) { char *xx=NULL;*xx=0; }
name = pd->m_officialNameOffset + g_pbuf;
if ( strcmp ( name , "Albuquerque" ) ) { char *xx=NULL;*xx=0; }
return true;
// . our data is used by getNearestCityId
// . about 123k cities, sort them by lat in one list, lon in the other
// . 4 bytes per entry, we are talking 1.2MB for both lists
// . fills s_cityLatList with the g_pbuf offsets of every city PlaceDesc,
//   sorted by latitude, loading citylatlist.dat when it already exists
// . returns false only if the list could not be allocated
bool initCityLists_new ( ) {
// bail if not indexing events
//if ( ! g_conf.m_indexEventsOnly ) return true;
// NOTE(review): as shown this return is unconditional, which makes the
// rest of the function unreachable and leaves s_cityLatList empty --
// confirm that disabling the list is intended
return true;
log ("places: loading citylatlist.dat");
// first try to load the list of city offsets into g_pbuf
// which are pre-sorted
if ( s_cityLatList.fillFromFile(g_hostdb.m_dir,"citylatlist.dat")>=1) {
// test it out right quick
return true;
// scan the buffer of placeDescriptors
PlaceDesc *pd = (PlaceDesc *) g_pbuf;
PlaceDesc *pdend ;//= (PlaceDesc *)(g_pbuf + g_pbufSize);
// find the real end of it!
for ( pdend = pd ; ; pdend++ ) {
// stop if we enter the name buf space, which always begins with the
// literal "unknown name"
// NOTE(review): the statement this "if" guards is not visible here;
// presumably a "break" -- confirm against the full file
if ( ((char *)pdend)[0] == 'u' &&
((char *)pdend)[1] == 'n' &&
! strcmp((char *)pdend,"unknown name" ) )
// count how many cities we got
long cityCount = 0;
for ( ; pd < pdend ; pd++ )
if ( pd->m_flags & PDF_CITY ) cityCount++;
// . alloc for the "ptrs" which will really be offsets into g_pbuf
// . use offsets so we can save/load to/from disk easily
long need = cityCount * 4;
// alloc it
if ( ! s_cityLatList.reserve ( need ) ) return false;
// point into it so we can fill it up
long *latList = (long *)s_cityLatList.getBufStart();
long nc = 0;
pd = (PlaceDesc *)g_pbuf;
// scan the cities again
for ( ; pd < pdend ; pd++ ) {
// skip if not city
if ( ! (pd->m_flags & PDF_CITY ) ) continue;
// get offset
long cityOffset = ((char *)pd) - g_pbuf;
// add to the list
latList[nc++] = cityOffset;
// sanity: the fill pass must see exactly what the count pass saw
if ( cityCount != nc ) { char *xx=NULL;*xx=0; }
// now sort each list
gbqsort ( latList , nc , 4 , latcmp_new , 0 );
// update length
s_cityLatList.m_length = nc * 4;
// test it out right quick
log ("places: saving citylatlist.dat");
// save it
// NOTE(review): the statement that writes the file is not visible here
return true;