// Main entry point:
// domain_split()
// Used internally for passing results of processing pieces
// of the domain name.
//
// Both members default to empty arrays, so a freshly created object can
// be handed to array_merge() / count() without first being filled in.
class parsed_words {
    // list of the words that were recognised
    var $words = array();
    // list of the fragments that matched no word
    var $leftovers = array();
}
// get_wn_result - run the WordNet "wn" command line tool with the -over
// option for a search string and return whatever the command printed.
//
// $string is passed to wn as ONE shell-quoted argument, so shell
// metacharacters in it are never interpreted by the shell.
//
// Returns the value of shell_exec(): the command output, or null when
// the command produced no output or could not be run.
function get_wn_result($string) {
    $command = '/usr/local/wordnet1.6/bin/wn '
        . escapeshellarg((string)$string)
        . ' -over';
    return shell_exec($command);
}
// get_dict_result - count the dictionary rows whose word equals $string
//
// The table queried is "lang_<code>.dict", where <code> is the global
// $l_SHORT_LANGUAGE_NAME; "lang_en.dict" is used when that global is unset.
// The query goes through the legacy mysql_* API on its default connection.
//
// Returns the first column of the COUNT(*) row, or 0 (after logging the
// failed statement) when the query could not be executed.
function get_dict_result($string) {
    global $l_SHORT_LANGUAGE_NAME;
    if ( isset($l_SHORT_LANGUAGE_NAME) ) {
        $dicttable = "lang_$l_SHORT_LANGUAGE_NAME.dict";
    }
    else {
        // shouldn't happen; call nameboy_set_language() first
        $dicttable = "lang_en.dict";
    }

    // Escape the value so quotes in $string cannot break out of the literal.
    $querystring = "SELECT COUNT(*) FROM $dicttable WHERE word = '"
        . mysql_real_escape_string((string)$string) . "';";

    $dq = mysql_query($querystring);
    if ( ! $dq ) {
        error_log("nameboy engine: failed in get_dict_result query: |$querystring| " . mysql_error(), 0);
        return 0;
    }

    $rslt = mysql_fetch_row($dq);
    return $rslt[0];
}
// array_characters_contained - return combined string length of all array elts
//
// Every element is counted, including elements PHP treats as false such as
// "" and "0"; an empty array yields 0.
function php_array_characters_contained($arr) {
    $cctr = 0;
    foreach ($arr as $v) {
        $cctr += strlen((string)$v);
    }
    return $cctr;
}
// indent - return a string of $ind spaces (used by the debug output)
//
// $ind is truncated to an integer; zero or a negative value gives "".
function indent($ind) {
    $count = (int)$ind;
    if ( $count <= 0 ) {
        return "";
    }
    return str_repeat(" ", $count);
}
// do_domain_split - do the work of cutting up domain names
//
// -->> This usually isn't the function you want; domain_split is better.
//
// Given a word, this function returns an object of type parsed_words
// containing the best combination of words it could find, and whatever
// pieces of the domain name were left over.
//
// How it works:
//  - every substring of $domain, longest first and left to right, is looked
//    up in the dictionary table "lang_<code>.dict" (<code> is the global
//    $l_SHORT_LANGUAGE_NAME, "en" when it is unset);
//  - on a hit the text before and after that substring is split recursively;
//  - a combination with no leftover characters is returned immediately;
//  - otherwise the combination with the fewest leftover characters (ties:
//    fewest leftover pieces) is returned once all substrings were tried;
//  - with no hit at all the whole input is returned as a single leftover.
//
// MINSTRINGMATCH is the shortest substring length to try; lengths below 1
// are never tried because an empty substring carries no information.
define("MINSTRINGMATCH", 0);
function do_domain_split($domain) {
    global $l_SHORT_LANGUAGE_NAME;
    if ( isset($l_SHORT_LANGUAGE_NAME) ) {
        $dicttable = "lang_$l_SHORT_LANGUAGE_NAME.dict";
    }
    else {
        // shouldn't happen; call nameboy_set_language() first
        $dicttable = "lang_en.dict";
    }

    $domain = (string)$domain;
    $parsed = new parsed_words;
    $parsed->words = array();
    $parsed->leftovers = array();
    if ( $domain === "" ) {
        // Got nothing to work on; this is the recursion endpoint
        return $parsed;
    }

    // $best_parsed holds the candidate with the fewest leftovers so far; it
    // is returned if no perfect fit turns up while scanning the whole domain.
    $best_parsed = null;
    $least_leftovers = 999999;
    $least_chunks = 999999;
    $domainlength = strlen($domain);
    $shortest = max(1, MINSTRINGMATCH);

    for ($k = $domainlength; $k >= $shortest; $k--) {
        // iterate through the substrings of length $k, starting at char $j
        for ($j = 0; $j <= ($domainlength - $k); $j++) {
            $domainsubstring = substr($domain, $j, $k);

            // Escape the value so it cannot terminate the SQL literal.
            $querystring = "SELECT COUNT(*) FROM $dicttable WHERE word = '"
                . mysql_real_escape_string($domainsubstring) . "';";
            $dq = mysql_query($querystring);
            if ( $dq ) {
                list($wnresult) = mysql_fetch_row($dq);
            }
            else {
                error_log("nameboy engine: failed in do_domain_split query: |$querystring| " . mysql_error(), 0);
                $wnresult = 0;
            }
            if ( ! $wnresult ) {
                continue;
            }

            if ( $k == $domainlength ) {
                // The whole input is a dictionary word.
                $parsed->words = array($domain);
                return $parsed;
            }

            // Recurse on the text on either side of the hit.  The pieces are
            // cut by position, so the split always belongs to this $j and
            // characters special to regular expressions need no treatment.
            $left = do_domain_split((string)substr($domain, 0, $j));
            $right = do_domain_split((string)substr($domain, $j + $k));

            // A new object per candidate: objects are assigned by handle, so
            // reusing one object would silently overwrite a saved best result.
            $candidate = new parsed_words;
            $candidate->words =
                array_merge($left->words, array($domainsubstring), $right->words);
            $candidate->leftovers =
                array_merge($left->leftovers, $right->leftovers);

            $leftover_chars = php_array_characters_contained($candidate->leftovers);
            if ( $leftover_chars == 0 ) {
                // perfect fit
                return $candidate;
            }

            // This combination of sub-words is the best we've found so far if:
            //  1 - it leaves fewer leftover characters than anything
            //      we've found so far, or
            //  2 - it has the same number of leftover chars, but they
            //      are in fewer pieces. This prefers words at the beginning
            //      and end of the domain, especially important if dict doesn't
            //      know one of the words entered.
            $leftover_chunks = count($candidate->leftovers);
            if ( ($leftover_chars < $least_leftovers) ||
                 ( ($leftover_chars == $least_leftovers) &&
                   ($leftover_chunks < $least_chunks) ) ) {
                $best_parsed = $candidate;
                $least_leftovers = $leftover_chars;
                $least_chunks = $leftover_chunks;
            }
        }
    }

    if ( isset($best_parsed) ) {
        return $best_parsed;
    }

    // no dictionary word anywhere in the input
    $parsed->leftovers = array($domain);
    return $parsed;
}
// reverse_length_function - usort() comparator: longest string first
//
// Returns 1 when $a is shorter than $b, -1 when it is longer and 0 when
// both have the same length.
function reverse_length_function($a, $b) {
    $len_a = strlen($a);
    $len_b = strlen($b);
    if ( $len_a == $len_b ) {
        return 0;
    }
    return ($len_a < $len_b) ? 1 : -1;
}
// length_function - usort() comparator: shortest string first
//
// Returns 1 when $a is longer than $b, -1 when it is shorter and 0 when
// both have the same length.
function length_function($a, $b) {
    $len_a = strlen($a);
    $len_b = strlen($b);
    if ( $len_a == $len_b ) {
        return 0;
    }
    return ($len_a > $len_b) ? 1 : -1;
}
// sort_words_by_length - sort an array of strings by their length, in place
//
// $words   - array to sort; passed by reference and re-indexed by usort()
// $reverse - FALSE (default): shortest first; TRUE: longest first
//
// The comparators are named as strings, which is the callable form usort()
// expects for plain functions.
function sort_words_by_length(&$words, $reverse = FALSE) {
    $comparator = $reverse ? 'reverse_length_function' : 'length_function';
    usort($words, $comparator);
    // no return - words is by reference
}
// sort_substrings - order words by where they occur in a source string
//
// $subs      - array of words to order
// $sourcestr - the string the words are expected to occur in
//
// The words are first sorted longest-first.  Then, until as many words as
// were passed in have been collected, the word occurring earliest in the
// remaining source string is taken (the longer word wins when two start at
// the same position) and the source string is cut off behind it.
//
// Returns the words in order of occurrence.  If at some point none of the
// words occurs in what is left of the source string, the problem is logged
// and the longest-first list is returned instead.
function sort_substrings($subs, $sourcestr) {
    $substring_count = count($subs);
    sort_words_by_length($subs, TRUE);

    $resultarr = array();
    while (count($resultarr) < $substring_count) {
        // position and text of the earliest occurrence found in this round
        $firstidx = false;
        $firstsub = null;
        foreach ($subs as $s) {
            $idx = strpos($sourcestr, $s);
            if ( $idx === false ) {
                continue;
            }
            if ( ($firstidx === false) || ($idx < $firstidx) ) {
                $firstidx = $idx;
                $firstsub = $s;
                if ( $idx == 0 ) {
                    // nothing can occur earlier than the very start
                    break;
                }
            }
        }

        if ( $firstidx === false ) {
            error_log(__FILE__ . ": sort_substrings failed on $sourcestr! This is a bug.", 0);
            return $subs;
        }

        $resultarr[] = $firstsub;
        // continue behind the word just taken
        $sourcestr = (string)substr($sourcestr, $firstidx + strlen($firstsub));
    } // while

    return $resultarr;
}
$DOMAIN_SPLIT_CACHE = array();
// domain_split - cut up domain names into words
//
// $domainname    - the name to split; it is cut at "-" first and every
//                  piece is handed to do_domain_split()
// $order_matters - TRUE (default): return the words in the order in which
//                  they occur in $domainname (see sort_substrings());
//                  FALSE: return them in no particular order
//
// Returns an array of the words found; leftover fragments are not part of
// the result.
//
// Ordered results are remembered in the global $DOMAIN_SPLIT_CACHE for the
// rest of the request.  The cache key is made of the dictionary language
// ($l_SHORT_LANGUAGE_NAME, "en" when unset) and the domain name, so a
// language switch never returns words from the other dictionary.
function domain_split($domainname, $order_matters = TRUE) {
    global $DOMAIN_SPLIT_CACHE;
    global $l_SHORT_LANGUAGE_NAME;

    $domainname = (string)$domainname;
    $language = isset($l_SHORT_LANGUAGE_NAME) ? $l_SHORT_LANGUAGE_NAME : "en";
    $cachekey = $language . ":" . $domainname;
    if ( isset($DOMAIN_SPLIT_CACHE[$cachekey]) ) {
        return $DOMAIN_SPLIT_CACHE[$cachekey];
    }

    $words = array();
    foreach (explode("-", $domainname) as $piece) {
        $thisparsed = do_domain_split($piece);
        // words of later pieces go in front of the ones collected so far
        $words = array_merge($thisparsed->words, $words);
    }

    if ( ! $order_matters ) {
        return $words;
    }

    $wlist = sort_substrings($words, $domainname);
    $DOMAIN_SPLIT_CACHE[$cachekey] = $wlist;
    return $wlist;
}
// domain_split_string - return the domain name with "*" marking the word
// boundaries found by domain_split()
//
// The name is scanned from left to right.  Where one of the words starts,
// the word is copied to the result; every other character is copied as is.
// A "*" is put
//  - in front of a word, unless the result is still empty or the character
//    before the word is "-", and
//  - in front of a non-word character that directly follows a word, unless
//    that character is "-".
// A "-" therefore never gets a "*" next to it.
function domain_split_string($domainname) {
    $domainname = (string)$domainname;
    $words = domain_split($domainname);
    if ( ! is_array($words) ) {
        $words = array();
    }

    $result = "";
    $last_chr_matched = FALSE;
    $len = strlen($domainname);
    for ($i = 0; $i < $len; $i++) {
        // first word of the list that starts at position $i, if any
        $foundword = null;
        foreach ($words as $w) {
            $w = (string)$w;
            // an empty word would match everywhere without consuming input
            if ( ($w !== "") && (substr($domainname, $i, strlen($w)) === $w) ) {
                $foundword = $w;
                break;
            }
        }

        if ( $foundword !== null ) {
            if ( ($result !== "") && ($domainname[$i-1] != "-") ) {
                $result .= "*";
            }
            $result .= $foundword;
            // skip the rest of the word; the loop adds the final step
            $i += (strlen($foundword) - 1);
            $last_chr_matched = TRUE;
        }
        else {
            // $last_chr_matched implies $i > 0.  The previous character is
            // looked up in the input, because offsets into $result no longer
            // line up with $i once a "*" has been inserted.
            if ( $last_chr_matched && ($domainname[$i] != "-") &&
                 ($domainname[$i-1] != "-") ) {
                $result .= "*";
            }
            $last_chr_matched = FALSE;
            $result .= $domainname[$i];
        }
    } // for
    return $result;
}
?>