Logo Search packages:      
Sourcecode: bamboo version File versions  Download package

Search.php

<?php

/*****

Search
This class is the API for interacting with the search index.

*****/

require_once("IndexStore.php");

class Search {

// the index store used for all lookups and updates; held by reference.
var $index;

// stop words that are never worth indexing, keyed by word for isset() lookups.
var $exclude = array('and'=>1, 'or'=>1, 'but'=>1, 'i'=>1, 'a'=>1, 'of'=>1, 'the'=>1);

/**
 * @param object $index  the index store; must provide init(), open(), close()
 *                       and the get/set methods called by this class.
 */
function __construct(&$index) {
      $this->index = &$index;
}

// PHP 4 style constructor, kept so that old runtimes and any explicit
// calls to Search() keep working. Newer PHP versions only call __construct().
function Search(&$index) {
      $this->__construct($index);
}

// inits the index, and returns a reference to it.
// returns null if there is no index or it could not be inited.
// init() is called on every access, so that the index is only
// inited when we are actually going to use it.
function &getIndex() {
      // this function returns by reference, so "nothing" has to be a
      // variable as well: returning a literal NULL raises a notice.
      $none = NULL;

      if ($this->index == NULL)
            return $none;

      if ($this->index->init())
            return $this->index;

      return $none;
}


/**
 * finds the indexed page which best matches the (possibly mistyped) path $s.
 *
 * returns the path of the best match, '' if nothing matches,
 * or null if the index is not available.
 */
function findClosestPage($s) {
      $index = &$this->getIndex();
      if ($index == NULL) return;

      // divide $s into two: path search terms and keyword search terms.
      // if $s is 'alt/media/democracy now' then 'democracy' and 'now' become
      // keyword search terms and 'alt' and 'media' become path search terms
      // (in other words, split on the last /)
      $a = explode('/',$s);
      $keyword_search = $this->tokenize( $a[count($a)-1] );
      $path_search    = $this->tokenize( join(' ',array_slice($a,0,count($a)-1)) );

      $index->open();
      $pages = $index->getByKeyword($keyword_search);
      $index->close();

      // increase the counts for pages which also have paths
      // that have hits in the $path_search.
      // (tokens only contain word characters, '@' and '-', so they
      // can be used in the pattern as they are)
      if (count($path_search)) {
            $regexp = '/' . join('|',$path_search) . '/';
            $path_matches = preg_grep($regexp,$pages);
            $pages = array_merge($pages,$path_matches);
      }

      // filter out .trash and .clipboard: skip every path whose second
      // character is a dot. substr() is used instead of a string offset
      // so that very short paths do not raise a warning.
      $visible = array();
      foreach($pages as $path) {
            if (substr($path,1,1) != '.')
                  $visible[] = $path;
      }

      // checked after the filter, so that a result list made up
      // only of hidden pages also counts as "nothing found".
      if (!count($visible))
            return ''; // no possible matches found.

      // count occurrences of a page, and sort by the count
      $counts = array_count_values($visible);
      arsort($counts,SORT_NUMERIC);

      // extract just the highest count getters
      // ie if the highest count is 3, only examine pages with a count of 3
      $highcount = reset($counts);
      $top = array();
      foreach($counts as $path => $count) {
            if ($count < $highcount) break;
            $top[] = $path;
      }

      // calculate the edit distance between each candidate and the
      // parent of the requested path; the smallest distance wins.
      $distance = array();  // map path => distance
      $currentparent = dirname($s);
      foreach($top as $path) {
            $distance[$path] = levenshtein($currentparent,$path);
      }
      asort($distance,SORT_NUMERIC);

      // return best match:
      reset($distance);
      return key($distance);
}

/**
 * searches keywords and content for the terms in $s.
 *
 * $ps must provide getPage($path); the returned pages must provide
 * get('title') and search($tokens).
 * returns the html for a search form followed by up to 20 results,
 * or null if the index is not available.
 */
function findByContent($s,&$ps) {
      $index = &$this->getIndex();
      if ($index == NULL) return;

      $starttime = microtime(true);

      $tokens = $this->tokenize($s);

      $index->open();
      $bykw      = $index->getByKeyword($tokens);
      $bycontent = $index->getByContent($tokens);
      $index->close();
      $pages     = array_merge($bykw, $bykw, $bycontent); // count keywords twice.

      // one pattern for all tokens, used to make search terms bold.
      // a single pass is needed here: replacing token by token would let a
      // short token like 'b' match inside the <b> tags inserted earlier.
      $highlight = '';
      if (count($tokens)) {
            $quoted = array();
            foreach($tokens as $token) {
                  $quoted[] = preg_quote($token,'/');
            }
            $highlight = '/' . join('|',$quoted) . '/i';
      }

      $counts = array_count_values($pages);
      arsort($counts,SORT_NUMERIC);
      $counts = array_slice($counts,0,20); // no paging yet, limit to 20 results.
      $root = $GLOBALS['root']; // hackish
      $html = '';
      foreach($counts as $path => $count) {
            $page = $ps->getPage($path);
            $title = $page->get('title');
            if ($title == '') $title = "Untitled";
            // NOTE(review): $title and $path are written to the html unescaped.
            // confirm that they can not contain markup from untrusted users.
            $html .= "<a href=\"$root$path\">$title</a><br/>\n";
            $matches = $page->search($tokens);
            foreach($matches as $line) {
                  if (trim($line)) {
                        $line = preg_replace('/<[\/\!]*?[^<>]*?>|\*|- |\[|\]|{|}/si', '', $line); // strip html
                        if ($highlight != '')
                              $line = preg_replace($highlight, '<b>$0</b>', $line);
                        $html .= "$line ... ";
                  }
            }
            $html .= "<div style='color:green;font-size:small;'>" . $path . "</div>";
            $html .= "<p></p>";
      }

      // number_format() instead of substr(): very small floats are printed
      // in scientific notation, which would be cut off to something like '5.0E'.
      $total_time = number_format(microtime(true) - $starttime, 2, '.', '');

      // the request value is only echoed back if it is a plain string
      $query = '';
      if (isset($_REQUEST['s']) && is_string($_REQUEST['s']))
            $query = $_REQUEST['s'];

      // the submit label is quoted, so that translations with spaces work.
      $time = "<form action=\"./\" method=get><input name=s value=\"" . htmlentities($query) . "\"><input type=submit value=\"" . _("Search") . "\"></form><p><i>" . _("Search Results") . " (" . $total_time . " " . _("seconds") . ")</i></p>";

      return $time . $html;
}

/**
 * updates the indexes for $page
 * the keywords work like this:
 *   name, title, and base of path are all combined in a soup.
 *   the tokenized and none tokenized versions are added to the soup.
 *   then all duplicates are removed.
 *   finally, the tokenized keywords property is added to the keyword soup.
 * this means that there may be duplicates in the index if the keyword property
 * contains the same word that is in the title or path. Otherwise, there are no duplicates.
 *
 * words of one or two characters and the words in $this->exclude are not indexed.
 *
 * if $keepopen is true, the caller has opened the index for writing
 * and will close it again.
 **/
function indexPage($page, $keepopen=false) {
      $index = &$this->getIndex();
      if ($index == NULL) return;
      if ($page->get('index') == false) return;

      if (!$keepopen) $index->open('w');

      ## indexed timestamp
      $timeindexed = $index->getIndexedTime($page->path);
      if ($timeindexed >= $page->get('mtime')) {
            // already up to date. the index was opened above,
            // so it must be closed before leaving.
            if (!$keepopen) $index->close();
            return;
      }
      $index->setIndexedTime($page->path,time());

      ## content
      $content = $page->get('content');
      $content = preg_replace('/[^\w\s\d_-]/','',(string)$content);
      $content = strtolower($content);
      // split on any whitespace, not just on spaces and newlines
      $words = preg_split('/\s+/', $content, -1, PREG_SPLIT_NO_EMPTY);
      $index->setContent($page->path,$this->_cleanWords($words));

      ## keywords
      $name     = $this->tokenize($page->get('name'));
      $title    = $this->tokenize($page->get('title'));
      $keywords = $this->tokenize($page->get('keywords'));
      $raw      = array(basename($page->path), $page->get('name'), $page->get('title'));
      $words    = array_unique(array_merge($name,$title,$raw));
      $words    = array_merge($words, $keywords);
      $index->setKeywords($page->path,$this->_cleanWords($words));

      ## modified time
      $index->setModTime($page->path,$page->get('mtime'));

      if (!$keepopen) $index->close();
}

// removes the indexes for $page
// if $keepopen is true, the caller opens and closes the index.
function deindexPage($page,$keepopen=false) {
      $index = &$this->getIndex();
      if ($index == NULL) return;

      if (!$keepopen) $index->open('w');
      $index->remove($page->path);
      if (!$keepopen) $index->close();
}

// calls clear() on the index. the index is not opened or closed here.
function clear() {
      $index = &$this->getIndex();
      if ($index == NULL) return;

      $index->clear();
}

// returns whatever the index reports from getByModTime() for
// $starttime, $offset, $limit and the path of $page.
// returns null if the index is not available.
function recentChanges(&$page, $offset, $limit, $starttime) {
      $index = &$this->getIndex();
      if ($index == NULL) return;

      $index->open();
      $pages = $index->getByModTime($starttime, $offset, $limit, $page->path);
      $index->close();

      return $pages;
}

###########################################################################
## PRIVATE FUNCTIONS

/**
 * normalizes a string for adding to or searching the index
 * returns a list of tokens.
 * basically, we reduce everything to lowercase words: every character
 * which is not a word character, whitespace, '@', '_' or '-' becomes
 * a separator. words with hyphens are kept whole.
 *
 * $char is not used; it is only kept for existing callers.
 */
function tokenize($str, $char='') {
      $str = preg_replace('/[^\w\s\d\@_-]/',' ',(string)$str);
      $str = strtolower($str);
      // split on runs of any whitespace, so that tabs and
      // newlines do not end up inside of a token.
      return preg_split('/\s+/', $str, -1, PREG_SPLIT_NO_EMPTY);
}

/**
 * prepares a list of words for the index: words are trimmed, and
 * empty words, words of one or two characters and the stop words
 * in $this->exclude are dropped.
 * returns the remaining words as a new list.
 */
function _cleanWords($words) {
      $ret = array();
      foreach($words as $word) {
            $word = trim($word);
            if (strlen($word) <= 2) continue;
            if (isset($this->exclude[strtolower($word)])) continue;
            $ret[] = $word;
      }
      return $ret;
}

// renames the path $old to $new in the index.
function replacePath($old, $new) {
      $index = &$this->getIndex();
      if ($index == NULL) return;

      $index->open('w');
      $index->rename($old,$new);
      $index->close();
}


// recursively indexes $page and all sub pages
function indexTree(&$page) {
      $index = &$this->getIndex();
      if ($index == NULL) return;

      // open once for the whole tree; indexPage is told to keep it open.
      $index->open('w');
      $this->_indexTree($page);
      $index->close();
}

// does the work for indexTree. expects the index to be open for writing.
function _indexTree(&$page) {
      $index = &$this->getIndex();
      if ($index == NULL) return;

      $this->indexPage($page,true);
      $children = $page->children();
      foreach($children as $child) {
            // children which are not to be indexed are skipped,
            // together with everything below them.
            if ($child->get('index') == true)
                  $this->_indexTree($child);
      }
}

// recursively deindexes $page and all sub pages
function deindexTree(&$page) {
      $index = &$this->getIndex();
      if ($index == NULL) return;

      // open once for the whole tree; deindexPage is told to keep it open.
      $index->open('w');
      $this->_deindexTree($page);
      $index->close();
}

// does the work for deindexTree. expects the index to be open for writing.
function _deindexTree(&$page) {
      $index = &$this->getIndex();
      if ($index == NULL) return;

      $this->deindexPage($page,true);
      $children = $page->children();
      foreach($children as $child) {
            $this->_deindexTree($child);
      }
}

} // end class

/**
 * prepares a list of words for the index: words are trimmed, and
 * empty words, words of one or two characters and stop words are dropped.
 *
 * $words    list of words to filter (not modified).
 * $exclude  optional map of stop word => anything; when left out, the
 *           standard stop words are used. this is a plain function, so it
 *           can not read the stop words from a Search object via $this.
 *
 * returns the remaining words as a new list.
 */
function &clean_words(&$words, $exclude = null) {
      if ($exclude === null) {
            $exclude = array('and'=>1, 'or'=>1, 'but'=>1, 'i'=>1, 'a'=>1, 'of'=>1, 'the'=>1);
      }

      $ret = array();
      foreach($words as $word) {
            $word = trim($word);
            if (strlen($word) <= 2) continue; // also skips empty words
            if (isset($exclude[strtolower($word)])) continue;
            $ret[] = $word;
      }
      return $ret;
}

// end of the executable code of this file: hands control back to the including script.
return;
?>

Generated by  Doxygen 1.6.0   Back to index