//
//
//
//
//---------------------------------------------------------------------------
//
// Here's [most of] Stefan's original comments:
// This requires a database of wikipedia titles, like
// http://www.whitelabel.org/wp/convertedtitles.gz, imported into a mysql
// database 'wikiproxy' created like this:
//
// mysql> create table titles (title varchar(190) NOT NULL, PRIMARY KEY (title)) type=MyISAM;
// mysql> load data infile '/home/stefan/whitelabel.org/wp/convertedtitles' ignore into table titles;
//
// The bits of code I didn't borrow from elsewhere (and I've credited where)
// is licenced under the GPL. Do with it what you will, but this is my first
// php and my first code for 7 years, so I'd appreciate feedback and
// suggestions via comments on my blog:
// http://www.whitelabel.org/archives/002248.html
//
// (especially regex optimisations for lines 64 and 65 - ideally a way of
// making it NOT match if we're within an IMG tag, because then I could drop
// the antiTaginTag stuff)
//
// NOTE(review): the comment above names the database 'wikiproxy', but the
// code below connects as user "wikipedia" and selects database "wikipedia".
// Confirm which one the deployment actually uses.
//---------------------------------------------------------------------------

require_once "PEAR.php";
require_once "HTTP/Request.php";

//---------------------------------------------------------------------------
// Debug mode is switched on by the mere presence of a "debug" query
// parameter; its value is ignored.
$debug = isset($_GET['debug']);

//---------------------------------------------------------------------------
// Print $msg followed by a newline, but only when debug mode is on.
function debug($msg)
{
    global $debug;

    if ($debug) {
        print "$msg\n";
    }
}

//---------------------------------------------------------------------------
// Send the Content-type header (text/plain in debug mode so diagnostics are
// readable, text/xml otherwise) and emit the opening lines of the response.
//
// NOTE(review): both print() calls emit only a newline. Presumably they once
// carried the XML declaration and the opening root element and the markup
// was lost somewhere along the way - confirm against a known-good copy.
function printRoot()
{
    global $debug;

    $headertype = $debug ? "text/plain" : "text/xml";
    header("Content-type: " . $headertype);

    print("\n");
    print("\n");
}

// Emit the closing line of the response.
//
// NOTE(review): prints only a newline; presumably the closing root element
// belongs here - confirm.
function printCloseRoot()
{
    print "\n";
}

//---------------------------------------------------------------------------
// Print $msg, close the response and terminate the script. Never returns.
function reportErrorAndExit($msg)
{
    print " $msg\n";
    printCloseRoot();
    die();
}

//---------------------------------------------------------------------------
printRoot();

// The parameter must be a non-empty string; "?url[]=x" would otherwise hand
// an array to preg_match() below.
if (!isset($_GET['url']) || !is_string($_GET['url']) || $_GET['url'] === '') {
    reportErrorAndExit("URL parameter required");
}

$urlString = $_GET['url'];

// Prepend a scheme only when the caller supplied none. https:// is accepted
// as well; it used to be turned into "http://https://...".
// NOTE(review): whether HTTP_Request can actually fetch https depends on the
// PHP build (OpenSSL) - confirm for the target host.
if (preg_match("%^https?://%i", $urlString) == 0) {
    $urlString = "http://" . $urlString;
}

// Plain assignment: "=& new" is deprecated since PHP 5.3 and a parse error
// from PHP 7 on.
$req = new HTTP_Request($urlString);
if (PEAR::isError($req->sendRequest())) {
    reportErrorAndExit("Error retrieving URL");
}
$result = $req->getResponseBody();

// Curl version
// $curlHandle = curl_init($urlString);
// curl_setopt($curlHandle, CURLOPT_FOLLOWLOCATION, true);
// curl_setopt($curlHandle, CURLOPT_MAXREDIRS, 3);
// curl_setopt($curlHandle, CURLOPT_TIMEOUT, 15);
// curl_setopt($curlHandle, CURLOPT_HEADER, false);
// curl_setopt($curlHandle, CURLOPT_RETURNTRANSFER, true);
// $result = curl_exec($curlHandle);
// if (curl_error($curlHandle) != "")
// {
//     reportErrorAndExit("Error retrieving URL");
// }
// curl_close($curlHandle);

// Only care about the text, not the tags. Should really nuke the entire
// header too.
$result = strip_tags($result);

// Match proper noun phrases.
preg_match_all("/[A-Z][a-zA-Z]+(\s[A-Z][-a-zA-Z]+)+/ms", $result, $propernounphrases);

// Match acronyms. (performance seems to go through the floor if we do these
// in one pass.)
preg_match_all("/[A-Z][A-Z][A-Z]+/ms", $result, $acronyms);

// Merge and de-duplicate. array_merge() is required here: both lists are
// indexed 0..n, and the array union operator (+) keeps the left-hand value
// for every key the two share, which silently dropped acronyms.
$phrases = array_merge($propernounphrases[0], $acronyms[0]);

// \s in the phrase pattern also matches newlines and tabs, so a phrase that
// was wrapped across lines could never equal a stored title. Normalise to a
// single space before de-duplicating.
$phrases = array_unique(preg_replace('/\s/', ' ', $phrases));

// Open up a db connection and whittle our list down against the real titles.
// NOTE(review): the mysql_* extension was removed in PHP 7; it is kept here
// because the rest of the script (PEAR HTTP_Request) targets the same legacy
// runtime. Port to mysqli/PDO together with the HTTP client.
$connection = mysql_connect("localhost", "wikipedia");
if (!$connection) {
    reportErrorAndExit("Could not connect to database");
}
if (!mysql_select_db("wikipedia", $connection)) {
    reportErrorAndExit("No database found");
}

$entries = array();
foreach ($phrases as $phrase) {
    debug("phrase: \"$phrase\"");

    // The phrase comes from a remote page. The patterns above only admit
    // letters, whitespace and hyphens, but escape anyway so the query stays
    // safe if those patterns are ever loosened.
    $safePhrase = mysql_real_escape_string($phrase, $connection);
    $match = mysql_query("SELECT title FROM titles WHERE title = '{$safePhrase}'", $connection);
    if (!$match) {
        reportErrorAndExit("Error performing select, title = '$phrase'");
    }

    while ($row = mysql_fetch_array($match, MYSQL_NUM)) {
        // Wikipedia URLs use underscores where titles have spaces.
        // str_replace() replaces ereg_replace(), which is gone in PHP 7 and
        // was never needed for a literal search string.
        $entry = array(
            "title" => $row[0],
            "url"   => "http://en.wikipedia.org/wiki/" . str_replace(" ", "_", $row[0]),
        );
        $entries[] = $entry;

        debug(" title: " . $entry["title"]);
        debug(" url: " . $entry["url"]);
    }

    mysql_free_result($match);
}

// Display. Just easier to do in code than inline.
//
// NOTE(review): this prints one blank-ish line per entry and never reads
// $curEntry, so the collected titles and URLs are not output at all.
// Presumably the per-entry markup was lost along with the root element;
// restore it from a known-good copy (and escape the values for XML when
// doing so).
foreach ($entries as $curEntry) {
    print(' ' . "\n");
}

printCloseRoot();
?>