# AWSTATS SEARCH ENGINES DATABASE
#------------------------------------------------------------------------------
# If you want to add a Search Engine to extend AWStats database detection capabilities,
# you must add an entry in SearchEnginesSearchIDOrder, SearchEnginesHashID and in
# SearchEnginesHashLib.
# An entry if known in SearchEnginesKnownUrl is also welcome.
#------------------------------------------------------------------------------
# $Revision: 1.45 $ - $Author: eldy $ - $Date: 2007/09/11 18:51:20 $
# 2005-08-19 Sean Carlos http://www.antezeta.com/awstats.html
# added minor italian search engines
# arianna http://arianna.libero.it/
# supereva http://search.supereva.com/
# kataweb http://kataweb.it/
# corrected uk looksmart
# 'askuk','ask=', 'bbc','q=', 'freeserve','q=', 'looksmart','key=',
# to
# 'askuk','ask=', 'bbc','q=', 'freeserve','q=', 'looksmartuk','key=',
# corrected spelling
# internationnal -> international
# added 'google\.'=>'mail\.google\.', to NotSearchEnginesKeys in order to
# avoid counting gmail referrals as search engine traffic
# 2005-08-21 Sean Carlos http://www.antezeta.com/awstats.html
# avoid counting babelfish.altavista referrals as search engine traffic
# avoid counting translate.google referrals as search engine traffic
# 2005-11-20 Sean Carlos
# added missing 'tiscali','key=', entry. Check order
# 2005-11-22 Sean Carlos
# added Google Base & Froogle. Froogle not tested.
# 2006-04-18 Sean Carlos http://www.antezeta.com/awstats.html
# added biglotron.com (France)
# added blingo http://www.blingo.com/
# added Clusty & Vivisimo
# added eniro.no (Norway) [https://sourceforge.net/forum/message.php?msg_id=3134783]
# added GPU p2p search http://search.centraldatabase.org/
# added mail.tiscali to "not search engines list" [https://sourceforge.net/forum/message.php?msg_id=3166688]
# added Ask group's "mysearch"
# added sify.com (India)
# added sogou.com (China) [https://sourceforge.net/forum/message.php?msg_id=3501603]
# Ask changes:
# - added Ask Japan (ask.jp)
# - break out Ask new country level variants (DE, ES, FR, IT, NL)
# - updated Ask name from Ask Jeeves
# - added Ask q= parameter - many recent searches probably not recognized; [https://sourceforge.net/forum/message.php?msg_id=3465444]
# - updated Ask uk (new uk.ask.com added to older ask.co.uk)
# updated voila kw|rdata parameter [https://sourceforge.net/forum/message.php?msg_id=3373912]
# for each new engine, added link to Search Engine. This serves to document engine. Done for major & Italian engines as well. Requires patch
# to AWStats to allow untranslated html. Otherwise html will appear instead of link.
# reviewed mnoGoSearch (http://www.mnogosearch.org/); the search engine mentioned no longer
# exists https://sourceforge.net/forum/message.php?msg_id=3025426
# 2006-05-13 Sean Carlos http://www.antezeta.com/awstats.html
# added 10 Chello European broadband portals (Austria, Belgium, Czech Republic, France, Hungary, The Netherlands, Norway, Poland, Slovakia, Sweden)
# added Alice Internal Search (blends data with Google?) search.alice.it.master:10005
# added detection of google cache views from IPs 66.249.93.104 72.14.203.104 72.14.207.104
# To do: add more extensive IP list; keywords not yet detected.
# added icerocket.com blog search http://www.icerocket.com/
# added live.com (msn) http://www.live.com/
# added Meta motor kartoo. Note: Kartoo does not provide search words in referrers, thus the engine will appear in the
# search engine list but the actual search words are not available.
# added netluchs.de http://www.netluchs.de/
# added sphere.com blog search http://www.sphere.com/
# added wwweasel.de http://wwweasel.de
# added Yahoo Mindset! http://mindset.research.yahoo.com/
# updated Mirago query parameter recognition (qry=); added breakout for each country (France, Germany, Spain, Italy, Norway, Sweden, Denmark, Netherlands, Belgium, Switzerland)
# 2006-05-13 Sean Carlos http://www.antezeta.com/awstats.html
# added Google cache IPs 64.233.183.104 & 66.102.7.104
# 2006-05-20 Sean Carlos http://www.antezeta.com/awstats.html
# anzwers.com.au
# schoenerbrausen.de http://www.schoenerbrausen.de/
# added Google cache IP 216.239.59.104
# answerbus http://www.answerbus.com/ (does not provide keywords)
# 2006-05-23 Sean Carlos http://www.antezeta.com/awstats.html
# added Google cache IP 66.102.9.104, 64.233.161.104
# 2006-06-23 Sean Carlos http://www.antezeta.com/awstats.html
# added Alice Search search.alice.it
# added GoodSearch http://www.goodsearch.com/ (does not provide keywords) "a Yahoo-powered search engine that donates money to your favorite charity or school each time you search the web"
# added googlee.com, variant of Google
# added gotuneed http://www.gotuneed.com/ Italian search engine, in beta
# added icq.com
# added logic to parse Google Cache search keywords. Seems to work for alpha but not numeric cache IDs, i.e. search?q=cache:lWVLmnuGJswJ: is recognized but q=cache:Yv5qxeJNuhgJ: is not recognized. The URL triggering the keywords will also appear. The URLs are probably too varied to parse out?
# added Nusearch http://www.nusearch.com/
# added Polymeta www.polymeta.hu (does not provide keywords)
# added scroogle http://www.scroogle.org/ (does not always provide keywords)
# added Tango http://tango.hu/search.php?st=0&q=jeles+napok
# Changed Google Cache notation 64\.233\.(161|167|179|183|187)\.104 to 64\.233\.1[0-9]{2}\.104
# 72\.14\.(203|205|207|209|221)\.104 to 72\.14\.2[0-9]{2}\.104
# 216\.239\.(51|59)\.104 to 216\.239\.5[0-9]\.104
# 66\.102\.(7|9)\.104 to 66\.102\.[1-9]\.104
# 2006-06-27 Sean Carlos http://www.antezeta.com/awstats.html
# added Onet.pl http://szukaj.onet.pl/
# corrected name "Wirtualna Polska" from "Szukaj" (search); added link http://szukaj.wp.pl/
# 2006-06-30 Sean Carlos http://www.antezeta.com/awstats.html
# Additional Polish Search Engines:
# added Dodaj.pl http://www.dodaj.pl/
# added Gazeta.pl http://szukaj.gazeta.pl/
# added Gery.pl http://szukaj.gery.pl/
# added Hoga.pl http://www.hoga.pl/
# added Interia.pl http://www.google.interia.pl/
# added Katalog.Onet.pl http://katalog.onet.pl/
# added NetSprint.pl http://www.netsprint.pl/
# added o2.pl http://szukaj2.o2.pl/
# added Polska http://szukaj.polska.pl/
# added Szukacz http://www.szukacz.pl/
# added Wow.pl http://szukaj.wow.pl/
# added Sagool http://sagool.jp/
# 2006-08-25 Social Bookmarks
# International
# added del.icio.us/search - for now, just search referrer. To do: consider /tag/(tagname) referrer?
# added stumbleupon.com - No keywords supplied.
# added swik.net
# added digg. Keywords sometimes supplied.
# Italy
# added segnalo.alice.it - No keywords supplied.
# added ineffabile.it - No keywords supplied.
# added filter for google groups. Attempt to parse group name as keyword.
# 2006-09-14
# added Eniro Sverige http://www.eniro.se/
# added MyWebSearch http://search.mywebsearch.com/
# added Teecno http://www.teecno.it/ Italian Open Source Search Engine
#package AWSSE;
# 2006-09-25 (Gabor Moizes)
# added 4-counter (Google alternative) http://4-counter.com/
# added Googlecom (Google alternative) http://googlecom.com/
# added Goggle (Google alternative) http://goggle.co.hu/
# added Comet toolbar http://as.starware.com
# added new IP for Yahoo: 216.109.125.130
# added Ledix http://ledix.net/
# added AT&T search (powered by Google) http://www.att.net/
# added Keresolap (Hungarian search engine) http://www.keresolap.hu/
# added Mozbot (French search engine) http://www.mozbot.fr/
# added Zoznam (Slovak search engine) http://www.zoznam.sk/
# added sapo.pt (Portuguese search engine) http://www.sapo.pt/
# added shaw.ca (powered by Google) http://start.shaw.ca/
# added Searchalot http://www.searchalot.com/
# added Copernic http://www.copernic.com/
# added 216.109.125.130 to Yahoo
# added 66.218.69.11 to Yahoo
# added Avantfind http://www.avantfind.com/
# added Steadysearch http://www.steadysearch.com/
# added Steadysearch http://www.steady-search.com/
# modified 216\.239\.5[0-9]\.104/search to 216\.239\.5[0-9]\.104
# SearchEnginesSearchIDOrder
# It contains all matching criteria to search for in log fields. This list is
# used to know in which order to search Search Engines IDs.
# The most frequent engines are in list1, used when LevelForSearchEnginesDetection is 1 or more
# Minor engines are in list2, used when LevelForSearchEnginesDetection is 2 or more
# Note: Regex IDs are in lower case and ' ' and '+' are changed into '_'
#------------------------------------------------------------------------------
@SearchEnginesSearchIDOrder_list1 = (
    # Major international search engines.
    # Specific sub-services are listed ahead of the generic pattern for the
    # same engine (e.g. 'images\.google\.' before 'google\.').

    # Google services and look-alike domains
    'base\.google\.',
    'froogle\.google\.',
    'groups\.google\.',
    'images\.google\.',
    'google\.',
    'googlee\.',
    'googlecom\.com',
    'goggle\.co\.hu',

    # Google hosts referenced by IP address
    '216\.239\.(35|37|39|51)\.100',
    '216\.239\.(35|37|39|51)\.101',
    '216\.239\.5[0-9]\.104',
    '64\.233\.1[0-9]{2}\.104',
    '66\.102\.[1-9]\.104',
    '66\.249\.93\.104',
    '72\.14\.2[0-9]{2}\.104',

    # MSN / Live / Voila
    'msn\.',
    'live\.com',
    'voila\.',

    # Yahoo (by name, then by IP address)
    'mindset\.research\.yahoo',
    'yahoo\.',
    '(66\.218\.71\.225|216\.109\.117\.135|216\.109\.125\.130|66\.218\.69\.11)',

    # Other major engines
    'search\.aol\.co',
    'tiscali\.',
    'lycos\.',
    'alexa\.com',
    'alltheweb\.com',
    'altavista\.',
    'a9\.com',
    'dmoz\.org',
    'netscape\.',
    'search\.terra\.',
    'www\.search\.com',
    'search\.sli\.sympatico\.ca',
    'excite\.',
);
# SearchEnginesSearchIDOrder_list2
# Regex search IDs of the minor engines. The IDs are searched in list order,
# so a specific pattern must be listed before any more generic pattern that
# would also match the same host (see the catch-all entries below).
# Every ID listed here must also be a key of %SearchEnginesHashID.
#------------------------------------------------------------------------------
@SearchEnginesSearchIDOrder_list2=(
# Minor international search engines
'4\-counter\.com',
'att\.net',
'bungeebonesdotcom',
'northernlight\.',
'hotbot\.',
'kvasir\.',
'webcrawler\.',
'metacrawler\.',
'go2net\.com',
'(^|\.)go\.com',
'euroseek\.',
'looksmart\.co\.uk', # UK variant; must come before the generic 'looksmart\.' or it is never reached
'looksmart\.',
'spray\.',
'nbci\.com\/search',
'de\.ask\.com', # break out Ask country specific engines. (.jp is in Japan section)
'es\.ask\.com',
'fr\.ask\.com',
'it\.ask\.com',
'nl\.ask\.com',
'uk\.ask\.com',
'(^|\.)ask\.com',
'atomz\.',
'overture\.com', # Replace 'goto\.com','Goto.com',
'teoma\.',
'findarticles\.com',
'infospace\.com',
'mamma\.',
'dejanews\.',
'dogpile\.com',
'wisenut\.com',
'ixquick\.com',
'search\.earthlink\.net',
'i-une\.com',
'blingo\.com',
'centraldatabase\.org',
'clusty\.com',
'mysearch\.',
'vivisimo\.com',
'kartoo\.com',
'icerocket\.com',
'sphere\.com',
'ledix\.net',
'start\.shaw\.ca',
'searchalot\.com',
'copernic\.com',
'avantfind\.com',
'steadysearch\.com',
'steady-search\.com',
# Chello Portals
'chello\.at',
'chello\.be',
'chello\.cz',
'chello\.fr',
'chello\.hu',
'chello\.nl',
'chello\.no',
'chello\.pl',
'chello\.se',
'chello\.sk',
'chello', # required as catchall for new countries not yet known
# Mirago
'mirago\.be',
'mirago\.ch',
'mirago\.de',
'mirago\.dk',
'es\.mirago\.com',
'mirago\.fr',
'mirago\.it',
'mirago\.nl',
'no\.mirago\.com',
'mirago\.se',
'mirago\.co\.uk',
'mirago', # required as catchall for new countries not yet known
'answerbus\.com',
'icq\.com\/search',
'nusearch\.com',
'goodsearch\.com',
'scroogle\.org',
'questionanswering\.com',
'mywebsearch\.com',
'as\.starware\.com',
# Social Bookmarking Services
'del\.icio\.us',
'digg\.com',
'stumbleupon\.com',
'swik\.net',
'segnalo\.alice\.it',
'ineffabile\.it',
# Minor Australian search engines
'anzwers\.com\.au',
# Minor brazilian search engines
'engine\.exe', 'miner\.bol\.com\.br',
# Minor chinese search engines
'\.baidu\.com', # baidu search portal
'\.vnet\.cn', # powered by MSN
'\.soso\.com', # powered by Google
'\.sogou\.com', # powered by Sohu
'\.3721\.com', # powered by Yahoo!
'iask\.com', # powered by Sina
'\.accoona\.com', # Accoona
'\.163\.com', # powered by Google
'\.zhongsou\.com', # zhongsou search portal
# Minor czech search engines
'atlas\.cz','seznam\.cz','quick\.cz','centrum\.cz','jyxo\.(cz|com)','najdi\.to','redbox\.cz',
# Minor danish search-engines
'opasia\.dk', 'danielsen\.com', 'sol\.dk', 'jubii\.dk', 'find\.dk', 'edderkoppen\.dk', 'netstjernen\.dk', 'orbis\.dk', 'tyfon\.dk', '1klik\.dk', 'ofir\.dk',
# Minor dutch search engines
'ilse\.','vindex\.',
# Minor english search engines ('looksmart\.co\.uk' is listed next to 'looksmart\.' above)
'(^|\.)ask\.co\.uk','bbc\.co\.uk/cgi-bin/search','ifind\.freeserve','splut\.','spotjockey\.','ukdirectory\.','ukindex\.co\.uk','ukplus\.','searchy\.co\.uk',
# Minor finnish search engines
'haku\.www\.fi',
# Minor french search engines
'recherche\.aol\.fr','ctrouve\.','francite\.','\.lbb\.org','rechercher\.libertysurf\.fr', 'search[\w\-]+\.free\.fr', 'recherche\.club-internet\.fr',
'toile\.com', 'biglotron\.com',
'mozbot\.fr',
# Minor german search engines
'sucheaol\.aol\.de',
'fireball\.de','infoseek\.de','suche\d?\.web\.de','[a-z]serv\.rrzn\.uni-hannover\.de',
'suchen\.abacho\.de','brisbane\.t-online\.de','allesklar\.de','meinestadt\.de',
'212\.227\.33\.241',
'(161\.58\.227\.204|161\.58\.247\.101|212\.40\.165\.90|213\.133\.108\.202|217\.160\.108\.151|217\.160\.111\.99|217\.160\.131\.108|217\.160\.142\.227|217\.160\.176\.42)',
'wwweasel\.de',
'netluchs\.de',
'schoenerbrausen\.de',
# Minor Hungarian search engines
'heureka\.hu','vizsla\.origo\.hu','lapkereso\.hu','goliat\.hu','index\.hu','wahoo\.hu','webmania\.hu','search\.internetto\.hu',
'tango\.hu',
'keresolap\.hu',
'polymeta\.hu',
# Minor Indian search engines
'sify\.com',
# Minor Italian search engines
'virgilio\.it','arianna\.libero\.it','supereva\.com','kataweb\.it','search\.alice\.it\.master','search\.alice\.it','gotuneed\.com',
'godado','jumpy\.it','shinyseek\.it','teecno\.it',
# Minor Japanese search engines
'ask\.jp','sagool\.jp',
# Minor Norwegian search engines
'sok\.start\.no', 'eniro\.no',
# Minor Polish search engines
'szukaj\.wp\.pl','szukaj\.onet\.pl','dodaj\.pl','gazeta\.pl','gery\.pl','hoga\.pl','netsprint\.pl','interia\.pl','katalog\.onet\.pl','o2\.pl','polska\.pl','szukacz\.pl','wow\.pl',
# Minor russian search engines
'ya(ndex)?\.ru', 'aport\.ru', 'rambler\.ru', 'turtle\.ru', 'metabot\.ru',
# Minor Swedish search engines
'evreka\.passagen\.se','eniro\.se',
# Minor Slovak search engines
'zoznam\.sk',
# Minor Portuguese search engines
'sapo\.pt',
# Minor swiss search engines
'search\.ch', 'search\.bluewin\.ch'
);
# SearchEnginesSearchIDOrder_listgen
# Generic fallback pattern; it has its own entry ('search') in
# SearchEnginesHashID.
#------------------------------------------------------------------------------
@SearchEnginesSearchIDOrder_listgen=(
# Generic search engines
'search\..*\.\w+'
);
# NotSearchEnginesKeys
# If a search engine key is found, we check its exclude list to know if it's
# really a search engine.
# Key   = search engine ID (same string as in the SearchEnginesSearchIDOrder lists)
# Value = regex of hosts that must NOT be counted as that search engine.
# Each key may appear only once (a repeated hash key silently replaces the
# earlier value), so several exclusions for one engine are combined with '|'.
#------------------------------------------------------------------------------
%NotSearchEnginesKeys=(
'altavista\.'=>'babelfish\.altavista\.',
'google\.'=>'(mail|translate)\.google\.', # gmail and translate referrals are not searches
'msn\.'=>'hotmail\.msn\.',
'tiscali\.'=>'mail\.tiscali\.',
'yahoo\.'=>'mail\.yahoo\.',
'ya(ndex)?\.ru'=>'direct\.yandex\.', # keyed by the actual Yandex search ID
'yandex\.'=>'direct\.yandex\.' # legacy key, kept for backward compatibility
);
# SearchEnginesHashID
# Each Search Engine Search ID is associated to an AWStats id string.
# Keys must be the exact strings used in the SearchEnginesSearchIDOrder
# lists; several search IDs may share one AWStats id.
#------------------------------------------------------------------------------
%SearchEnginesHashID = (
# Major international search engines
'base\.google\.','google_base',
'froogle\.google\.','google_froogle',
'groups\.google\.','google_groups',
'images\.google\.','google_image',
'google\.','google',
'googlee\.','google',
'googlecom\.com','google',
'goggle\.co\.hu','google',
'216\.239\.(35|37|39|51)\.100','google_cache',
'216\.239\.(35|37|39|51)\.101','google_cache',
'216\.239\.5[0-9]\.104','google_cache',
'64\.233\.1[0-9]{2}\.104','google_cache',
'66\.102\.[1-9]\.104','google_cache',
'66\.249\.93\.104','google_cache',
'72\.14\.2[0-9]{2}\.104','google_cache',
'msn\.','msn',
'live\.com','live',
'voila\.','voila',
'mindset\.research\.yahoo','yahoo_mindset',
'yahoo\.','yahoo','(66\.218\.71\.225|216\.109\.117\.135|216\.109\.125\.130|66\.218\.69\.11)','yahoo',
'lycos\.','lycos',
'alexa\.com','alexa',
'alltheweb\.com','alltheweb',
'altavista\.','altavista',
'a9\.com','a9',
'dmoz\.org','dmoz',
'netscape\.','netscape',
'search\.terra\.','terra',
'www\.search\.com','search.com',
'tiscali\.','tiscali',
'search\.aol\.co','aol',
'search\.sli\.sympatico\.ca','sympatico',
'excite\.','excite',
# Minor international search engines
'4\-counter\.com','google4counter',
'att\.net','att',
'bungeebonesdotcom','bungeebonesdotcom',
'northernlight\.','northernlight',
'hotbot\.','hotbot',
'kvasir\.','kvasir',
'webcrawler\.','webcrawler',
'metacrawler\.','metacrawler',
'go2net\.com','go2net',
'(^|\.)go\.com','go',
'euroseek\.','euroseek',
'looksmart\.','looksmart',
'spray\.','spray',
'nbci\.com\/search','nbci',
'de\.ask\.com','askde', # break out Ask country specific engines.
'es\.ask\.com','askes',
'fr\.ask\.com','askfr',
'it\.ask\.com','askit',
'nl\.ask\.com','asknl',
'uk\.ask\.com','askuk',
'(^|\.)ask\.co\.uk','askuk',
'(^|\.)ask\.com','ask',
'atomz\.','atomz',
'overture\.com','overture', # Replace 'goto\.com','Goto.com',
'teoma\.','teoma',
'findarticles\.com','findarticles',
'infospace\.com','infospace',
'mamma\.','mamma',
'dejanews\.','dejanews',
'dogpile\.com','dogpile',
'wisenut\.com','wisenut',
'ixquick\.com','ixquick',
'search\.earthlink\.net','earthlink',
'i-une\.com','iune',
'blingo\.com','blingo',
'centraldatabase\.org','centraldatabase',
'clusty\.com','clusty',
'mysearch\.','mysearch',
'vivisimo\.com','vivisimo',
'kartoo\.com','kartoo',
'icerocket\.com','icerocket',
'sphere\.com','sphere',
'ledix\.net','ledix',
'start\.shaw\.ca','shawca',
'searchalot\.com','searchalot',
'copernic\.com','copernic',
'avantfind\.com','avantfind',
'steadysearch\.com','steadysearch',
'steady-search\.com','steadysearch',
# Chello Portals
'chello\.at','chelloat',
'chello\.be','chellobe',
'chello\.cz','chellocz',
'chello\.fr','chellofr',
'chello\.hu','chellohu',
'chello\.nl','chellonl',
'chello\.no','chellono',
'chello\.pl','chellopl',
'chello\.se','chellose',
'chello\.sk','chellosk',
'chello','chellocom',
# Mirago
'mirago\.be','miragobe',
'mirago\.ch','miragoch',
'mirago\.de','miragode',
'mirago\.dk','miragodk',
'es\.mirago\.com','miragoes',
'mirago\.fr','miragofr',
'mirago\.it','miragoit',
'mirago\.nl','miragonl',
'no\.mirago\.com','miragono',
'mirago\.se','miragose',
'mirago\.co\.uk','miragocouk',
'mirago','mirago', # required as catchall for new countries not yet known
'answerbus\.com','answerbus',
'icq\.com\/search','icq',
'nusearch\.com','nusearch',
'goodsearch\.com','goodsearch',
'scroogle\.org','scroogle',
'questionanswering\.com','questionanswering',
'mywebsearch\.com','mywebsearch',
'as\.starware\.com','comettoolbar',
# Social Bookmarking Services
'del\.icio\.us','delicious',
'digg\.com','digg',
'stumbleupon\.com','stumbleupon',
'swik\.net','swik',
'segnalo\.alice\.it','segnalo',
'ineffabile\.it','ineffabile',
# Minor Australian search engines
'anzwers\.com\.au','anzwers',
# Minor brazilian search engines
'engine\.exe','engine',
'miner\.bol\.com\.br','miner',
# Minor chinese search engines
'\.baidu\.com','baidu',
'iask\.com','iask',
'\.accoona\.com','accoona',
'\.3721\.com','3721',
'\.163\.com','netease',
'\.soso\.com','soso',
'\.zhongsou\.com','zhongsou',
'\.vnet\.cn','vnet',
'\.sogou\.com','sogou',
# Minor czech search engines
'atlas\.cz','atlas',
'seznam\.cz','seznam',
'quick\.cz','quick',
'centrum\.cz','centrum',
'jyxo\.(cz|com)','jyxo',
'najdi\.to','najdi',
'redbox\.cz','redbox',
# Minor danish search-engines
'opasia\.dk','opasia',
'danielsen\.com','danielsen',
'sol\.dk','sol',
'jubii\.dk','jubii',
'find\.dk','finddk',
'edderkoppen\.dk','edderkoppen',
'netstjernen\.dk','netstjernen',
'orbis\.dk','orbis',
'tyfon\.dk','tyfon',
'1klik\.dk','1klik',
'ofir\.dk','ofir',
# Minor dutch search engines
'ilse\.','ilse',
'vindex\.','vindex',
# Minor english search engines
'bbc\.co\.uk/cgi-bin/search','bbc',
'ifind\.freeserve','freeserve',
'looksmart\.co\.uk','looksmartuk',
'splut\.','splut',
'spotjockey\.','spotjockey',
'ukdirectory\.','ukdirectory',
'ukindex\.co\.uk','ukindex',
'ukplus\.','ukplus',
'searchy\.co\.uk','searchy',
# Minor finnish search engines
'haku\.www\.fi','haku',
# Minor french search engines
'recherche\.aol\.fr','aolfr',
'ctrouve\.','ctrouve',
'francite\.','francite',
'\.lbb\.org','lbb',
'rechercher\.libertysurf\.fr','libertysurf',
'search[\w\-]+\.free\.fr','free',
'recherche\.club-internet\.fr','clubinternet',
'toile\.com','toile',
'biglotron\.com', 'biglotron',
'mozbot\.fr', 'mozbot',
# Minor german search engines
'sucheaol\.aol\.de','aolde',
'fireball\.de','fireball',
'infoseek\.de','infoseek',
'suche\d?\.web\.de','webde',
'[a-z]serv\.rrzn\.uni-hannover\.de','meta',
'suchen\.abacho\.de','abacho',
'brisbane\.t-online\.de','t-online',
'allesklar\.de','allesklar',
'meinestadt\.de','meinestadt',
'212\.227\.33\.241','metaspinner',
'(161\.58\.227\.204|161\.58\.247\.101|212\.40\.165\.90|213\.133\.108\.202|217\.160\.108\.151|217\.160\.111\.99|217\.160\.131\.108|217\.160\.142\.227|217\.160\.176\.42)','metacrawler_de',
'wwweasel\.de','wwweasel',
'netluchs\.de','netluchs',
'schoenerbrausen\.de','schoenerbrausen',
# Minor Hungarian search engines
'heureka\.hu','heureka',
'vizsla\.origo\.hu','origo',
'lapkereso\.hu','lapkereso',
'goliat\.hu','goliat',
'index\.hu','indexhu',
'wahoo\.hu','wahoo',
'webmania\.hu','webmania',
'search\.internetto\.hu','internetto',
'tango\.hu','tango_hu',
'keresolap\.hu','keresolap_hu',
'polymeta\.hu','polymeta_hu',
# Minor Indian search engines
'sify\.com','sify',
# Minor Italian search engines
'virgilio\.it','virgilio',
'arianna\.libero\.it','arianna',
'supereva\.com','supereva',
'kataweb\.it','kataweb',
'search\.alice\.it\.master','aliceitmaster',
'search\.alice\.it','aliceit',
'gotuneed\.com','gotuneed',
'godado','godado',
'jumpy\.it','jumpy\.it',
'shinyseek\.it','shinyseek\.it',
'teecno\.it','teecnoit',
# Minor Japanese search engines
'ask\.jp','askjp',
'sagool\.jp','sagool',
# Minor Norwegian search engines
'sok\.start\.no','start', 'eniro\.no','eniro',
# Minor Polish search engines
'szukaj\.wp\.pl','wp',
'szukaj\.onet\.pl','onetpl',
'dodaj\.pl','dodajpl',
'gazeta\.pl','gazetapl',
'gery\.pl','gerypl',
'hoga\.pl','hogapl', # search ID used in SearchEnginesSearchIDOrder_list2
'netsprint\.pl\/hoga\-search','hogapl',
'netsprint\.pl','netsprintpl',
'interia\.pl','interiapl',
'katalog\.onet\.pl','katalogonetpl',
'o2\.pl','o2pl',
'polska\.pl','polskapl',
'szukacz\.pl','szukaczpl',
'wow\.pl','wowpl',
# Minor russian search engines
'ya(ndex)?\.ru','yandex',
'aport\.ru','aport',
'rambler\.ru','rambler',
'turtle\.ru','turtle',
'metabot\.ru','metabot',
# Minor Swedish search engines
'evreka\.passagen\.se','passagen',
'eniro\.se','enirose',
# Minor Slovak search engines
'zoznam\.sk','zoznam',
# Minor Portuguese search engines
'sapo\.pt','sapo',
# Minor swiss search engines
'search\.ch','searchch',
'search\.bluewin\.ch','bluewin',
# Generic search engines
'search\..*\.\w+','search'
);
# SearchEnginesWithKeysNotInQuery
# Search engines (by AWStats id) whose keywords are part of the page path
# rather than a query-string parameter. The value is a simple true flag.
#------------------------------------------------------------------------------
%SearchEnginesWithKeysNotInQuery = (
    'a9' => 1,    # e.g. www.a9.com/searchkey1%20searchkey2
);
# SearchEnginesKnownUrl
# Known rules to extract keywords from a referrer search engine URL.
# Key   = AWStats search engine id (a value of SearchEnginesHashID)
# Value = rule (regex fragment) locating the keywords in the referrer URL;
#         an empty rule is used for engines noted below as passing no keywords.
#------------------------------------------------------------------------------
%SearchEnginesKnownUrl = (
    # Most common search engines
    'alexa'          => 'q=',
    'alltheweb'      => 'q(|uery)=',
    'altavista'      => 'q=',
    'a9'             => 'a9\.com\/',
    'dmoz'           => 'search=',
    'google_base'    => '(p|q|as_p|as_q)=',
    'google_froogle' => '(p|q|as_p|as_q)=',
    'google_groups'  => 'group\/',    # does not work
    'google_image'   => '(p|q|as_p|as_q)=',
    'google_cache'   => '(p|q|as_p|as_q)=cache:[0-9A-Za-z]{12}:',
    'google'         => '(p|q|as_p|as_q)=',
    'lycos'          => 'query=',
    'msn'            => 'q=',
    'live'           => 'q=',
    'netscape'       => 'search=',
    'tiscali'        => 'key=',
    'aol'            => 'query=',
    'terra'          => 'query=',
    'voila'          => '(kw|rdata)=',
    'search.com'     => 'q=',
    'yahoo_mindset'  => 'p=',
    'yahoo'          => 'p=',
    'sympatico'      => 'query=',
    'excite'         => 'search=',

    # Minor international search engines
    'google4counter'    => '(p|q|as_p|as_q)=',
    'att'               => 'qry=',
    'bungeebonesdotcom' => 'query=',
    'go'                => 'qt=',
    # Ask country specific engines are broken out individually.
    'askde'             => '(ask|q)=',
    'askes'             => '(ask|q)=',
    'askfr'             => '(ask|q)=',
    'askit'             => '(ask|q)=',
    'asknl'             => '(ask|q)=',
    'ask'               => '(ask|q)=',
    'atomz'             => 'sp-q=',
    'euroseek'          => 'query=',
    'findarticles'      => 'key=',
    'go2net'            => 'general=',
    'hotbot'            => 'mt=',
    'infospace'         => 'qkw=',
    'kvasir'            => 'q=',
    'looksmart'         => 'key=',
    'mamma'             => 'query=',
    'metacrawler'       => 'general=',
    'nbci'              => 'keyword=',
    'northernlight'     => 'qr=',
    'overture'          => 'keywords=',
    'dogpile'           => 'q(|kw)=',
    'spray'             => 'string=',
    'teoma'             => 'q=',
    'webcrawler'        => 'searchText=',
    'wisenut'           => 'query=',
    'ixquick'           => 'query=',
    'earthlink'         => 'q=',
    'iune'              => '(keywords|q)=',
    'blingo'            => 'q=',
    'centraldatabase'   => 'query=',
    'clusty'            => 'query=',
    'mysearch'          => 'searchfor=',
    'vivisimo'          => 'query=',
    'kartoo'            => '',    # No keywords passed in referring URL.
    'icerocket'         => 'q=',
    'sphere'            => 'q=',
    'ledix'             => 'q=',
    'shawca'            => 'q=',
    'searchalot'        => 'q=',
    'copernic'          => 'web\/',
    'avantfind'         => 'keywords=',
    'steadysearch'      => 'w=',

    # Chello Portals
    'chelloat'  => 'q1=',
    'chellobe'  => 'q1=',
    'chellocz'  => 'q1=',
    'chellofr'  => 'q1=',
    'chellohu'  => 'q1=',
    'chellonl'  => 'q1=',
    'chellono'  => 'q1=',
    'chellopl'  => 'q1=',
    'chellose'  => 'q1=',
    'chellosk'  => 'q1=',
    'chellocom' => 'q1=',

    # Mirago
    'miragobe'   => '(txtsearch|qry)=',
    'miragoch'   => '(txtsearch|qry)=',
    'miragode'   => '(txtsearch|qry)=',
    'miragodk'   => '(txtsearch|qry)=',
    'miragoes'   => '(txtsearch|qry)=',
    'miragofr'   => '(txtsearch|qry)=',
    'miragoit'   => '(txtsearch|qry)=',
    'miragonl'   => '(txtsearch|qry)=',
    'miragono'   => '(txtsearch|qry)=',
    'miragose'   => '(txtsearch|qry)=',
    'miragocouk' => '(txtsearch|qry)=',
    'mirago'     => '(txtsearch|qry)=',

    'answerbus'         => '',    # Does not provide query parameters
    'icq'               => 'q=',
    'nusearch'          => 'nusearch_terms=',
    'goodsearch'        => 'Keywords=',
    'scroogle'          => 'Gw=',    # Does not always provide query parameters
    'questionanswering' => '',
    'mywebsearch'       => 'searchfor=',
    'comettoolbar'      => 'qry=',

    # Social Bookmarking Services
    'delicious'   => 'all=',
    'digg'        => 's=',
    'stumbleupon' => '',
    'swik'        => 'swik\.net/',    # does not work. Keywords follow domain, e.g. http://swik.net/awstats+analytics
    'segnalo'     => '',
    'ineffabile'  => '',

    # Minor Australian search engines
    'anzwers' => 'search=',

    # Minor brazilian search engines
    'engine' => 'p1=',
    'miner'  => 'q=',

    # Minor chinese search engines
    'baidu'    => '(wd|word)=',
    'iask'     => '(w|k)=',
    'accoona'  => 'qt=',
    '3721'     => '(p|name)=',
    'netease'  => 'q=',
    'soso'     => 'q=',
    'zhongsou' => '(word|w)=',
    'sogou'    => 'query=',
    'vnet'     => 'kw=',

    # Minor czech search engines
    'atlas'   => 'searchtext=',
    'seznam'  => 'w=',
    'quick'   => 'query=',
    'centrum' => 'q=',
    'jyxo'    => 's=',
    'najdi'   => 'dotaz=',
    'redbox'  => 'srch=',

    # Minor danish search engines
    'opasia'      => 'q=',
    'danielsen'   => 'q=',
    'sol'         => 'q=',
    'jubii'       => 'soegeord=',
    'finddk'      => 'words=',
    'edderkoppen' => 'query=',
    'orbis'       => 'search_field=',
    '1klik'       => 'query=',
    'ofir'        => 'querytext=',

    # Minor dutch search engines
    'ilse'   => 'search_for=',
    'vindex' => 'in=',

    # Minor english search engines
    'askuk'       => '(ask|q)=',
    'bbc'         => 'q=',
    'freeserve'   => 'q=',
    'looksmartuk' => 'key=',
    'splut'       => 'pattern=',
    'spotjockey'  => 'Search_Keyword=',
    'ukindex'     => 'stext=',
    'ukdirectory' => 'k=',
    'ukplus'      => 'search=',
    'searchy'     => 'search_term=',

    # Minor finnish search engines
    'haku' => 'w=',

    # Minor french search engines
    'francite'     => 'name=',
    'clubinternet' => 'q=',
    'toile'        => 'q=',
    'biglotron'    => 'question=',
    'mozbot'       => 'q=',

    # Minor german search engines
    'aolde'           => 'q=',
    'fireball'        => 'q=',
    'infoseek'        => 'qt=',
    'webde'           => 'su=',
    'abacho'          => 'q=',
    't-online'        => 'q=',
    'metaspinner'     => 'qry=',
    'metacrawler_de'  => 'qry=',
    'wwweasel'        => 'q=',
    'netluchs'        => 'query=',
    'schoenerbrausen' => 'q=',

    # Minor Hungarian search engines
    'heureka'      => 'heureka=',
    'origo'        => '(q|search)=',
    'goliat'       => 'KERESES=',
    'wahoo'        => 'q=',
    'internetto'   => 'searchstr=',
    'keresolap_hu' => 'q=',
    'tango_hu'     => 'q=',
    'polymeta_hu'  => '',

    # Minor Indian search engines
    'sify' => 'keyword=',

    # Minor Italian search engines
    'virgilio'      => 'qs=',
    'arianna'       => 'query=',
    'supereva'      => 'q=',
    'kataweb'       => 'q=',
    'aliceitmaster' => 'qs=',
    'aliceit'       => 'qs=',
    'gotuneed'      => '',    # Not yet known
    'godado'        => 'Keywords=',
    'jumpy\.it'     => 'searchWord=',
    'shinyseek\.it' => 'KEY=',
    'teecnoit'      => 'q=',

    # Minor Japanese search engines
    'askjp'  => '(ask|q)=',
    'sagool' => 'q=',

    # Minor Norwegian search engines
    'start' => 'q=',
    'eniro' => 'q=',

    # Minor Polish search engines
    'wp'            => 'szukaj=',
    'onetpl'        => 'qt=',
    'dodajpl'       => 'keyword=',
    'gazetapl'      => 'slowo=',
    'gerypl'        => 'q=',
    'hogapl'        => 'qt=',
    'netsprintpl'   => 'q=',
    'interiapl'     => 'q=',
    'katalogonetpl' => 'qt=',
    'o2pl'          => 'qt=',
    'polskapl'      => 'qt=',
    'szukaczpl'     => 'q=',
    'wowpl'         => 'q=',

    # Minor russian search engines
    'yandex'  => 'text=',
    'rambler' => 'words=',
    'aport'   => 'r=',
    'metabot' => 'st=',

    # Minor swedish search engines
    'passagen' => 'q=',
    'enirose'  => 'q=',

    # Minor swiss search engines
    'searchch' => 'q=',
    'bluewin'  => 'qry=',
);
# SearchEnginesKnownUrlNotFound
# Known rules to extract "not found" keywords from a referrer search engine
# URL, keyed by AWStats search engine id.
#------------------------------------------------------------------------------
%SearchEnginesKnownUrlNotFound = (
    # Most common search engines
    'msn' => 'origq=',
);
# WordsToExtractSearchUrl
# Used to search for the keyword parameter when no rule is known for the
# search engine.
#------------------------------------------------------------------------------
@WordsToExtractSearchUrl = (
    'tn=', 'ie=', 'ask=', 'claus=', 'general=', 'key=', 'kw=', 'keyword=',
    'keywords=', 'MT=', 'p=', 'q=', 'qr=', 'qt=', 'query=', 's=', 'search=',
    'searchText=', 'string=', 'su=', 'txtsearch=', 'w=',
);
# WordsToCleanSearchUrl
# Used when no rule is known and the search in WordsToExtractSearchUrl failed:
# these are the non-keyword parameters to clean out of the URL.
@WordsToCleanSearchUrl = (
    'act=', 'annuaire=', 'btng=', 'cat=', 'categoria=', 'cfg=', 'cof=',
    'cou=', 'count=', 'cp=', 'dd=', 'domain=', 'dt=', 'dw=', 'enc=', 'exec=',
    'geo=', 'hc=', 'height=', 'hits=', 'hl=', 'hq=', 'hs=', 'id=', 'kl=',
    'lang=', 'loc=', 'lr=', 'matchmode=', 'medor=', 'message=', 'meta=',
    'mode=', 'order=', 'page=', 'par=', 'pays=', 'pg=', 'pos=', 'prg=',
    'qc=', 'refer=', 'sa=', 'safe=', 'sc=', 'sort=', 'src=', 'start=',
    'style=', 'stype=', 'sum=', 'tag=', 'temp=', 'theme=', 'type=', 'url=',
    'user=', 'width=', 'what=', '\\.x=', '\\.y=', 'y=', 'look=',
);
# SearchEnginesKnownUTFCoding
# For each search engine, the known parameter that proves the engine has coded
# its parameters in UTF-8.
# 'search_engine_id' => 'parameter=value',
#------------------------------------------------------------------------------
%SearchEnginesKnownUTFCoding = (
	# Most common search engines
	'google'    => 'ie=utf-8',
	'alltheweb' => 'cs=utf-8',
);
# SearchEnginesHashLib
# List of search engines names: maps each search engine id to its name.
# 'search_engine_id', 'search_engine_name',
# Names are single-quoted strings: only \\ and \' are escapes there, so a
# dot in a name must be written as a plain '.' (a '\.' would keep the
# backslash in the name).
#------------------------------------------------------------------------------
%SearchEnginesHashLib=(
# Major international search engines
'alexa','Alexa',
'alltheweb','AllTheWeb',
'altavista','AltaVista',
'a9', 'A9',
'dmoz','DMOZ',
'google_base','Google (Base)',
'google_froogle','Froogle (Google)',
'google_groups','Google (Groups)',
'google_image','Google (Images)',
'google_cache','Google (cache)',
'google','Google',
'lycos','Lycos',
'msn','MSN Search',
'live','Windows Live',
'netscape','Netscape',
'aol','AOL',
'terra','Terra',
'tiscali','Tiscali',
'voila','Voila',
'search.com','Search.com',
'yahoo_mindset','Yahoo! Mindset',
'yahoo','Yahoo!',
'sympatico','Sympatico',
'excite','Excite',
# Minor international search engines
'google4counter','4-counter (Google)',
'att','AT&T search (powered by Google)',
'bungeebonesdotcom','BungeeBones',
'go','Go.com',
'askde','Ask Deutschland',
'askes','Ask España', # break out Ask country specific engines.
'askfr','Ask France',
'askit','Ask Italia',
'asknl','Ask Nederland',
'ask','Ask',
'atomz','Atomz',
'dejanews','DejaNews',
'euroseek','Euroseek',
'findarticles','Find Articles',
'go2net','Go2Net (Metamoteur)',
'hotbot','Hotbot',
'infospace','InfoSpace',
'kvasir','Kvasir',
'looksmart','Looksmart',
'mamma','Mamma',
'metacrawler','MetaCrawler (Metamoteur)',
'nbci','NBCI',
'northernlight','NorthernLight',
'overture','Overture', # Replace 'goto\.com','Goto.com',
'dogpile','Dogpile',
'spray','Spray',
'teoma','Teoma', # Replace 'directhit\.com','DirectHit',
'webcrawler','WebCrawler',
'wisenut','WISENut',
'ixquick','ix quick',
'earthlink', 'Earth Link',
'iune','i-une',
'blingo','Blingo',
'centraldatabase','GPU p2p search',
'clusty','Clusty',
'mysearch','My Search',
'vivisimo','Vivisimo',
'kartoo','Kartoo',
'icerocket','Icerocket (Blog)',
'sphere','Sphere (Blog)',
'ledix','Ledix',
'shawca','Shaw.ca',
'searchalot','Searchalot',
'copernic','Copernic',
'avantfind','Avantfind',
'steadysearch','Steadysearch',
# Chello Portals
'chelloat','Chello Austria',
'chellobe','Chello Belgium',
'chellocz','Chello Czech Republic',
'chellofr','Chello France',
'chellohu','Chello Hungary',
'chellonl','Chello Netherlands',
'chellono','Chello Norway',
'chellopl','Chello Poland',
'chellose','Chello Sweden',
'chellosk','Chello Slovakia',
'chellocom','Chello (Country not recognized)',
# Mirago
'miragobe','Mirago Belgium',
'miragoch','Mirago Switzerland',
'miragode','Mirago Germany',
'miragodk','Mirago Denmark',
'miragoes','Mirago Spain',
'miragofr','Mirago France',
'miragoit','Mirago Italy',
'miragonl','Mirago Netherlands',
'miragono','Mirago Norway',
'miragose','Mirago Sweden',
'miragocouk','Mirago UK',
'mirago','Mirago (country unknown)',
'answerbus','Answerbus',
'icq','icq',
'nusearch','Nusearch',
'goodsearch','GoodSearch',
'scroogle','Scroogle',
'questionanswering','Questionanswering',
'mywebsearch','MyWebSearch',
'comettoolbar','Comet toolbar search',
# Social Bookmarking Services
'delicious','del.icio.us (Social Bookmark)',
'digg','Digg (Social Bookmark)',
'stumbleupon','Stumbleupon (Social Bookmark)',
'swik','Swik (Social Bookmark)',
'segnalo','Segnalo (Social Bookmark)',
'ineffabile','Ineffabile.it (Social Bookmark)',
# Minor Australian search engines
'anzwers','anzwers.com.au',
# Minor Brazilian search engines
'engine','Cade', 'miner','Meta Miner',
# Minor Chinese search engines
'baidu','Baidu',
'iask','Iask',
'accoona','Accoona',
'3721','3721',
'netease', 'NetEase',
'soso','SoSo',
'zhongsou','ZhongSou',
'sogou', 'SoGou',
'vnet','VNet',
# Minor Czech search engines
'atlas','Atlas.cz', 'seznam','Seznam', 'quick','Quick.cz', 'centrum','Centrum.cz', 'jyxo','Jyxo.cz', 'najdi','Najdi.to', 'redbox','RedBox.cz',
# Minor Danish search engines
'opasia','Opasia', 'danielsen','Thor (danielsen.com)', 'sol','SOL', 'jubii','Jubii', 'finddk','Find', 'edderkoppen','Edderkoppen', 'netstjernen','Netstjernen', 'orbis','Orbis', 'tyfon','Tyfon', '1klik','1Klik', 'ofir','Ofir',
# Minor Dutch search engines
'ilse','Ilse', 'vindex','Vindex.nl',
# Minor English search engines
'askuk','Ask UK',
'bbc','BBC', 'freeserve','Freeserve', 'looksmartuk','Looksmart UK',
'splut','Splut', 'spotjockey','Spotjockey', 'ukdirectory','UK Directory', 'ukindex','UKIndex', 'ukplus','UK Plus', 'searchy','searchy.co.uk',
# Minor Finnish search engines
'haku','Ihmemaa',
# Minor French search engines
'aolfr','AOL (fr)', 'ctrouve','C\'est trouvé', 'francite','Francité', 'lbb', 'LBB', 'libertysurf', 'Libertysurf', 'free', 'Free.fr', 'clubinternet', 'Club-internet',
'toile', 'Toile du Québec',
'biglotron','Biglotron',
'mozbot','Mozbot',
# Minor German search engines
'aolde','AOL (de)',
'fireball','Fireball', 'infoseek','Infoseek', 'webde','Web.de',
'abacho','Abacho', 't-online','T-Online',
'allesklar','allesklar.de', 'meinestadt','meinestadt.de',
'metaspinner','metaspinner',
'metacrawler_de','metacrawler.de',
'wwweasel','WWWeasel',
'netluchs','Netluchs',
'schoenerbrausen','Schoenerbrausen',
# Minor Hungarian search engines
'heureka','Heureka', 'origo','Origo-Vizsla', 'lapkereso','Startlapkeresõ', 'goliat','Góliát', 'indexhu','Index', 'wahoo','Wahoo', 'webmania','webmania.hu', 'internetto','Internetto Keresõ',
'tango_hu','Tango',
'keresolap_hu','Tango keresolap',
'polymeta_hu','Polymeta',
# Minor Indian search engines
'sify','Sify',
# Minor Italian search engines
'virgilio','Virgilio',
'arianna','Arianna',
'supereva','Supereva',
'kataweb','Kataweb',
'aliceitmaster','search.alice.it.master',
'aliceit','alice.it',
'gotuneed','got u need',
'godado','Godado.it',
'jumpy\.it','Jumpy.it',
'shinyseek\.it','Shinyseek.it',
'teecnoit','Teecno',
# Minor Japanese search engines
'askjp','Ask Japan',
'sagool','Sagool',
# Minor Norwegian search engines
'start','start.no', 'eniro','Eniro',
# Minor Polish search engines
'wp','Wirtualna Polska',
'onetpl','Onet.pl',
'dodajpl','Dodaj.pl',
'gazetapl','Gazeta.pl',
'gerypl','Gery.pl',
'hogapl','Hoga.pl',
'netsprintpl','NetSprint.pl',
'interiapl','Interia.pl',
'katalogonetpl','Katalog.Onet.pl',
'o2pl','o2.pl',
'polskapl','Polska',
'szukaczpl','Szukacz',
'wowpl','Wow.pl',
# Minor Russian search engines
'yandex', 'Yandex', 'aport', 'Aport', 'rambler', 'Rambler', 'turtle', 'Turtle', 'metabot', 'MetaBot',
# Minor Swedish search engines
'passagen','Evreka',
'enirose','Eniro Sverige',
# Minor Slovak search engines
'zoznam','Zoznam',
# Minor Portuguese search engines
'sapo','Sapo',
# Minor Swiss search engines
'searchch', 'search.ch', 'bluewin', 'search.bluewin.ch',
# Generic search engines
'search','Unknown search engines'
);
# Sanity check.
# Enable this code and run perl search_engines.pm to check file entries are ok
#-----------------------------------------------------------------------------
#foreach my $key (@SearchEnginesSearchIDOrder_list1) {
# if (! $SearchEnginesHashID{$key}) { error("Entry '$key' has been found in SearchEnginesSearchIDOrder_list1 with no value in SearchEnginesHashID");
# foreach my $key2 (@SearchEnginesSearchIDOrder_list2) { if ($key2 eq $key) { error("$key is in 1 and 2\n"); } }
# foreach my $key2 (@SearchEnginesSearchIDOrder_listgen) { if ($key2 eq $key) { error("$key is in 1 and gen\n"); } }
#} }
#foreach my $key (@SearchEnginesSearchIDOrder_list2) {
# if (! $SearchEnginesHashID{$key}) { error("Entry '$key' has been found in SearchEnginesSearchIDOrder_list2 with no value in SearchEnginesHashID");
# foreach my $key2 (@SearchEnginesSearchIDOrder_list1) { if ($key2 eq $key) { error("$key is in 2 and 1\n"); } }
# foreach my $key2 (@SearchEnginesSearchIDOrder_listgen) { if ($key2 eq $key) { error("$key is in 2 and gen\n"); } }
#} }
#foreach my $key (@SearchEnginesSearchIDOrder_listgen) { if (! $SearchEnginesHashID{$key}) { error("Entry '$key' has been found in SearchEnginesSearchIDOrder_listgen with no value in SearchEnginesHashID"); } }
#foreach my $key (keys %NotSearchEnginesKeys) { if (! $SearchEnginesHashID{$key}) { error("Entry '$key' has been found in NotSearchEnginesKeys with no value in SearchEnginesHashID"); } }
#foreach my $key (keys %SearchEnginesKnownUrl) {
# my $found=0;
# foreach my $key2 (values %SearchEnginesHashID) {
# if ($key eq $key2) { $found=1; last; }
# }
# if (! $found) { die "Entry '$key' has been found in SearchEnginesKnownUrl with no value in SearchEnginesHashID"; }
#}
#foreach my $key (keys %SearchEnginesHashLib) {
# my $found=0;
# foreach my $key2 (values %SearchEnginesHashID) {
# if ($key eq $key2) { $found=1; last; }
# }
# if (! $found) { die "Entry '$key' has been found in SearchEnginesHashLib with no value in SearchEnginesHashID"; }
#}
#print @SearchEnginesSearchIDOrder_list1." ".@SearchEnginesSearchIDOrder_list2." ".@SearchEnginesSearchIDOrder_listgen;
1;