// do_domain_split: this usually isn't the function you want; domain_split
// is better.
//
// Given a word, this function returns an object of type parsed_words
// containing the best combination of words it could find, and whatever
// pieces of the domain name were left over.

// Shortest substring length that is looked up in the dictionary.  Values
// below 1 are treated as 1: the empty string is never a useful word.
define("MINSTRINGMATCH", 0);

// Returns the name of the dictionary table for the current language.
// Falls back to the English table when no language has been selected
// (shouldn't happen; call nameboy_set_language() first).
// NOTE(review): the language name is interpolated into SQL as an
// identifier; it is assumed to come from trusted configuration.
function domain_split_dicttable() {
    global $l_SHORT_LANGUAGE_NAME;

    if ( isset($l_SHORT_LANGUAGE_NAME) ) {
        return "lang_$l_SHORT_LANGUAGE_NAME.dict";
    }
    return "lang_en.dict";
}

// Returns TRUE when $word has at least one row in $dicttable.
// Successful lookups are remembered for the rest of the request, because
// the recursive splitter asks about the same substrings over and over.
// Failed queries are logged, reported as "not a word" and not remembered.
function domain_split_is_word($dicttable, $word) {
    static $known = array();

    if ( isset($known[$dicttable][$word]) ) {
        return $known[$dicttable][$word];
    }

    // The word comes from a user-supplied domain name, so escape it.
    $querystring = "SELECT COUNT(*) FROM $dicttable WHERE word = '"
                 . mysql_real_escape_string($word) . "';";
    $dq = mysql_query($querystring);
    if ( ! $dq ) {
        error_log("nameboy engine: failed in get_dict_result query: |$querystring| " . mysql_error(), 0);
        return FALSE;
    }

    $row = mysql_fetch_row($dq);
    $found = ( $row && $row[0] ) ? TRUE : FALSE;
    $known[$dicttable][$word] = $found;
    return $found;
}

// Recursively splits $domain (a hyphen-free string) into dictionary words.
//
// Returns a parsed_words object whose ->words lists the words found, left
// to right, and whose ->leftovers lists the pieces that matched nothing.
// Substrings are tried longest first; the first combination that leaves
// no leftovers is returned immediately, otherwise the best partial
// combination seen is returned.
function do_domain_split($domain) {
    $domain = (string)$domain;

    if ( $domain === "" ) {
        // Got nothing to work on; this is the recursion endpoint
        $parsed = new parsed_words;
        $parsed->leftovers = array();
        $parsed->words = array();
        return $parsed;
    }

    $dicttable = domain_split_dicttable();
    $domainlength = strlen($domain);
    $minlength = max(1, MINSTRINGMATCH);

    // best_parsed holds the candidate with the fewest leftovers so far;
    // it is returned if scanning the entire domain finds no perfect fit.
    $least_leftovers = 999999;
    $least_chunks = 999999;
    $best_parsed = NULL;

    for ($k = $domainlength; $k >= $minlength; $k--) {
        // iterate through the word at starting character $j
        for ($j = 0; $j <= ($domainlength - $k); $j++) {
            $domainsubstring = substr($domain, $j, $k);
            if ( ! domain_split_is_word($dicttable, $domainsubstring) ) {
                continue;
            }

            // A fresh object per candidate: since PHP 5 objects are
            // assigned by handle, so reusing one object would let later
            // candidates overwrite the saved best one.
            $parsed = new parsed_words;

            if ( $k == $domainlength ) {
                // The whole domain is a word
                $parsed->words = array($domain);
                $parsed->leftovers = array();
                return $parsed;
            }

            // Now recurse on the bits left over on either side of the hit.
            // Split at the offset actually being examined; the casts guard
            // against substr() returning FALSE at the end of the string on
            // old PHP versions.
            $left = do_domain_split((string)substr($domain, 0, $j));
            $right = do_domain_split((string)substr($domain, $j + $k));

            $parsed->words = array_merge($left->words,
                                         array($domainsubstring),
                                         $right->words);
            $parsed->leftovers = array_merge($left->leftovers,
                                             $right->leftovers);

            // php_array_characters_contained() is defined elsewhere;
            // presumably it totals the characters in the leftover pieces.
            $leftover_chars = php_array_characters_contained($parsed->leftovers);
            if ( $leftover_chars == 0 ) {
                return $parsed;
            }

            // This combination of sub-words is the best we've found so far if:
            //  1 - it leaves fewer leftover characters than anything
            //      we've found so far, or
            //  2 - it has the same number of leftover chars, but they
            //      are in fewer pieces.  This prefers words at the beginning
            //      and end of the domain, especially important if dict
            //      doesn't know one of the words entered.
            $leftover_chunks = count($parsed->leftovers);
            if ( ($leftover_chars < $least_leftovers) ||
                 ( ($leftover_chars == $least_leftovers) &&
                   ($leftover_chunks < $least_chunks) ) ) {
                $best_parsed = $parsed;
                $least_leftovers = $leftover_chars;
                $least_chunks = $leftover_chunks;
            }
        }
    }

    if ( is_object($best_parsed) ) {
        return $best_parsed;
    }

    // No dictionary word anywhere in the domain: it is all leftover.
    $parsed = new parsed_words;
    $parsed->words = array();
    $parsed->leftovers = array($domain);
    return $parsed;
}

// usort() comparator: longer strings first.  Returns 0 for equal lengths.
function reverse_length_function($a, $b) {
    return strlen($b) - strlen($a);
}

// usort() comparator: shorter strings first.  Returns 0 for equal lengths.
function length_function($a, $b) {
    return strlen($a) - strlen($b);
}

// Sorts $words in place by string length, shortest first, or longest
// first when $reverse is TRUE.
function sort_words_by_length(&$words, $reverse = FALSE) {
    // Callbacks are passed by name; a bare identifier is an undefined
    // constant and is a fatal error on PHP 8.
    if ( $reverse ) {
        usort($words, 'reverse_length_function');
    } else {
        usort($words, 'length_function');
    }
    // no return - words is by reference
}

// Orders the words in $subs by where they occur in $sourcestr.
//
// Repeatedly picks the word that occurs earliest in the not yet consumed
// part of $sourcestr (the longest word wins when several start at the same
// offset) and consumes the string up to the end of that word.  If some word
// cannot be placed, the failure is logged and $subs is returned sorted
// longest first.
function sort_substrings($subs, $sourcestr) {
    $substring_count = count($subs);
    sort_words_by_length($subs, TRUE);

    $resultarr = array();
    while (count($resultarr) < $substring_count) {
        $firstidx = -1;     // -1: nothing found in this pass
        $firstsub = "";
        foreach ($subs as $s) {
            if ( (string)$s === "" ) {
                continue;   // an empty needle can never be placed
            }
            $idx = strpos((string)$sourcestr, $s);
            if ( $idx === false ) {
                continue;
            }
            if ( ($firstidx < 0) || ($idx < $firstidx) ) {
                $firstidx = $idx;
                $firstsub = $s;
                if ( $idx == 0 ) {
                    break;  // cannot do better than the very start
                }
            }
        }

        if ( $firstidx < 0 ) {
            error_log(__FILE__ . ": sort_substrings failed on $sourcestr!  This is a bug.", 0);
            return $subs;
        }

        array_push($resultarr, $firstsub);
        $sourcestr = (string)substr($sourcestr, $firstidx + strlen($firstsub));
    } // while

    return $resultarr;
}

// Ordered results of domain_split(), indexed by dictionary table name and
// then by domain name.
$DOMAIN_SPLIT_CACHE = array();

// domain_split - cut up domain names into words
//
// Splits $domainname on hyphens, runs do_domain_split() on every piece and
// returns the list of words found.  With $order_matters (the default) the
// words are returned in the order they occur in the domain name, and that
// result is cached for the rest of the request.
function domain_split($domainname, $order_matters = TRUE) {
    // Without this declaration the cache would be a fresh local variable
    // on every call and would never hit.
    global $DOMAIN_SPLIT_CACHE;

    $domainname = (string)$domainname;

    // The cache is keyed by dictionary table too, so a language switch
    // cannot serve results from another dictionary.
    $dicttable = domain_split_dicttable();
    if ( isset($DOMAIN_SPLIT_CACHE[$dicttable][$domainname]) ) {
        return $DOMAIN_SPLIT_CACHE[$dicttable][$domainname];
    }

    $words = array();
    foreach (explode("-", $domainname) as $chunk) {
        $thisparsed = do_domain_split($chunk);
        // Each chunk's words are placed in front of the earlier ones;
        // sort_substrings() restores the real order when it matters.
        $words = array_merge($thisparsed->words, $words);
    }

    if ( ! $order_matters ) {
        return $words;
    }

    $wlist = sort_substrings($words, $domainname);
    $DOMAIN_SPLIT_CACHE[$dicttable][$domainname] = $wlist;
    return $wlist;
}

// Returns $domainname with a "*" inserted at every boundary between a
// recognised word and its neighbour, except where a hyphen already
// separates them.
function domain_split_string($domainname) {
    $domainname = (string)$domainname;
    $words = domain_split($domainname);

    $result = "";
    $last_chr_matched = FALSE;
    $len = strlen($domainname);

    for ($i = 0; $i < $len; $i++) {
        $foundword = NULL;
        foreach ($words as $w) {
            // Skip empty words: they would match everywhere and the
            // position would never advance.
            if ( ((string)$w !== "") &&
                 (substr($domainname, $i, strlen($w)) === (string)$w) ) {
                $foundword = $w;
                break;
            }
        }

        if ( $foundword !== NULL ) {
            if ( ($result !== "") && ($domainname[$i-1] != "-") ) {
                $result .= "*";
            }
            $result .= $foundword;
            $i += (strlen($foundword) - 1);   // the for loop adds the last 1
            $last_chr_matched = TRUE;
        } else {
            // Look at the last character written so far; $i is an offset
            // into the input and no longer lines up with $result once a
            // "*" has been inserted.
            if ( $last_chr_matched && ($domainname[$i] != "-") &&
                 ($result !== "") && (substr($result, -1) != "-") ) {
                $result .= "*";
            }
            $last_chr_matched = FALSE;
            $result .= $domainname[$i];
        }
    } // for

    return $result;
}

?>