//
//
//
//
//---------------------------------------------------------------------------
//
// Here are [most of] Stefan's original comments:
// This requires a database of wikipedia titles, like
// http://www.whitelabel.org/wp/convertedtitles.gz, imported into a mysql
// database 'wikiproxy' created like this:
//
// mysql> create table titles (title varchar(190) NOT NULL, PRIMARY KEY (title)) type=MyISAM;
// mysql> load data infile '/home/stefan/whitelabel.org/wp/convertedtitles' ignore into table titles;
//
// The bits of code I didn't borrow from elsewhere (and I've credited where)
// is licenced under the GPL. Do with it what you will, but this is my first
// php and my first code for 7 years, so I'd appreciate feedback and
// suggestions via comments on my blog:
// http://www.whitelabel.org/archives/002248.html
//
// (especially regex optimisations for lines 64 and 65 - ideally a way of
// making it NOT match if we're within an IMG tag, because then I could drop
// the antiTaginTag stuff)
//---------------------------------------------------------------------------
require_once "PEAR.php";
require_once "HTTP/Request.php";
//---------------------------------------------------------------------------
// Debug mode is switched on by the mere presence of a 'debug' query
// parameter; its value is never inspected.
$debug = isset($_GET['debug']);
//---------------------------------------------------------------------------
// Prints $msg followed by a newline, but only while the global $debug
// flag is truthy; otherwise does nothing.
function debug($msg)
{
    global $debug;
    if (! $debug)
    {
        return;
    }
    print $msg . "\n";
}
//---------------------------------------------------------------------------
// Sends the Content-type header and the opening lines of the response.
// The type is text/xml normally and text/plain while the global $debug
// flag is set.
// NOTE(review): the header advertises XML but only bare newlines are
// printed here; an XML declaration / opening root tag may have been lost
// from these strings - confirm against the original.
function printRoot()
{
    global $debug;
    $contentType = $debug ? "text/plain" : "text/xml";
    header("Content-type: " . $contentType);
    print("\n");
    print("\n");
}
// Writes the closing line of the response (currently a single newline).
function printCloseRoot()
{
    echo "\n";
}
//---------------------------------------------------------------------------
// Prints $msg (prefixed by a space, followed by a newline), closes the
// response via printCloseRoot() and terminates the script. Never returns.
function reportErrorAndExit($msg)
{
    echo " {$msg}\n";
    printCloseRoot();
    exit();
}
//---------------------------------------------------------------------------
// Start the response first so that any error reported below is printed
// after the preamble.
printRoot();
// The 'url' parameter must be a non-empty string; array-style parameters
// (?url[]=x) would otherwise reach preg_match() and string concatenation.
if (! isset($_GET['url']) || ! is_string($_GET['url']) || $_GET['url'] === '')
{
    reportErrorAndExit("URL parameter required");
}
$urlString = $_GET['url'];
// Accept an explicit http:// or https:// scheme (any letter case) and only
// default to http:// when no scheme was supplied. Previously an https://
// URL was turned into "http://https://...".
// NOTE(review): the URL is caller-controlled, so this script can be used
// to make the server request arbitrary (including internal) hosts -
// consider restricting the allowed targets.
if (preg_match("%^https?://%i", $urlString) == 0)
{
    $urlString = "http://" . $urlString;
}
// Plain assignment: "=& new" is deprecated and is a parse error on PHP 7+.
$req = new HTTP_Request ($urlString);
if (PEAR::isError($req->sendRequest()))
{
    reportErrorAndExit("Error retrieving URL");
}
// Raw body of the fetched page; tags are stripped further down.
$result = $req->getResponseBody();
// Curl version
// $curlHandle = curl_init($urlString);
// curl_setopt($curlHandle, CURLOPT_FOLLOWLOCATION, true);
// curl_setopt($curlHandle, CURLOPT_MAXREDIRS, 3);
// curl_setopt($curlHandle, CURLOPT_TIMEOUT, 15);
// curl_setopt($curlHandle, CURLOPT_HEADER, false);
// curl_setopt($curlHandle, CURLOPT_RETURNTRANSFER, true);
// $result = curl_exec($curlHandle);
// if (curl_error($curlHandle) != "")
// {
// reportErrorAndExit("Error retrieving URL");
// }
// curl_close($curlHandle);
/**
 * Collects the candidate phrases to look up from a piece of plain text.
 *
 * Two kinds of candidates are gathered: runs of two or more capitalised
 * words ("proper noun phrases") and runs of three or more capital letters
 * ("acronyms").
 *
 * @param string $text Plain text to scan.
 * @return array De-duplicated list of phrases, proper noun phrases first,
 *               re-indexed from 0.
 */
function extractCandidatePhrases($text)
{
    // Match proper noun phrases.
    preg_match_all("/[A-Z][a-zA-Z]+(\s[A-Z][-a-zA-Z]+)+/ms", $text, $propernounphrases);
    // Match acronyms. (performance seems to go through the floor if we do these in one pass.)
    preg_match_all("/[A-Z][A-Z][A-Z]+/ms", $text, $acronyms);
    // Merge and de-duplicate. array_merge() is required here: the "+"
    // operator is a key-wise union, so on two 0-indexed lists it silently
    // dropped every acronym whose index was already taken by a proper
    // noun phrase.
    $merged = array_merge($propernounphrases[0], $acronyms[0]);
    return array_values(array_unique($merged));
}
// Only care about the text, not the tags. Should really nuke the entire header too.
$result = strip_tags($result);
$phrases = extractCandidatePhrases($result);
// Open up a db connection and whittle our list down against the real titles.
// NOTE(review): the mysql_* extension used here was removed in PHP 7.0;
// running on a newer PHP requires porting this section to mysqli or PDO.
$connection = mysql_connect ("localhost", "wikipedia");
if (! $connection)
{
    reportErrorAndExit("Could not connect to database");
}
if (! mysql_select_db("wikipedia", $connection))
{
    reportErrorAndExit("No database found");
}
// Every phrase that exists as a title becomes one entry with the keys
// "title" and "url".
$entries = array();
foreach ($phrases as $phrase)
{
    debug("phrase: \"$phrase\"");
    // The phrase originates from a fetched page, so escape it before it is
    // embedded in the SQL string.
    $safePhrase = mysql_real_escape_string($phrase, $connection);
    $match = mysql_query ("SELECT * FROM titles WHERE title = '{$safePhrase}'", $connection);
    if (! $match)
    {
        reportErrorAndExit("Error performing select, title = '$phrase'");
    }
    while ($row = mysql_fetch_array($match, MYSQL_NUM))
    {
        // Wiki URLs use underscores where titles have spaces. A literal
        // replacement is all that is needed (ereg_replace is deprecated
        // and gone in PHP 7).
        $wikiString = str_replace(" ", "_", $row[0]);
        // Build a fresh entry per row rather than reusing one array.
        $entry = array(
            "title" => $row[0],
            "url" => "http://en.wikipedia.org/wiki/{$wikiString}",
        );
        $entries[] = $entry;
        debug(" title: " . $entry["title"]);
        debug(" url: " . $entry["url"]);
    }
    // Release the result set before issuing the next query.
    mysql_free_result($match);
}
// Output stage: one line per matched entry, then the closing line of the
// response. Done in code because that is simpler than inline markup.
// NOTE(review): the loop body never reads $curEntry and prints only a
// space and a newline; the per-entry markup (title / url) appears to be
// missing from this string - confirm against the original.
foreach ($entries as $curEntry)
{
    echo ' ' . "\n";
}
printCloseRoot();
?>