Added a check and a function to handle UTF-8 article strings

This commit is contained in:
Jack-Benny Persson 2014-08-04 14:05:03 +02:00
parent 72dae5c75b
commit 7a124e16ca
2 changed files with 36 additions and 11 deletions

View File

@ -1,6 +1,11 @@
0.1 - 2014-07-28
First release of wie
0.3 - 2014-08-04
Added a check to see whether the article string is in
UTF-8 or not. If it is in UTF-8, use a specific function
to handle it. Regular 'ucwords' can't handle UTF-8.
0.2 - 2014-07-31
Changed 'shell_exec(curl -s $URL)' to PHP5 cURL function
Added HISTORY file
0.1 - 2014-07-28
First release of wie

38
wie.php
View File

@ -20,14 +20,9 @@
$defaultLang = "en"; // language used when none is specified (reported by usage())
$progName = $argv[0]; // first argv entry; usage() prints it as the command name
/**
 * Print the command-line help text.
 *
 * The script name and the default language are read from the global
 * variables $progName and $defaultLang.
 */
function usage()
{
    // Fetch the script-level settings once instead of interpolating $GLOBALS inline.
    $scriptName = $GLOBALS['progName'];
    $fallbackLang = $GLOBALS['defaultLang'];

    echo "Wikipedia ingress extractor (wie), version 0.2\n",
        "Usage: $scriptName [--lang=sv] article\n",
        "Default language if none specified is $fallbackLang.\n",
        "Remember to quote the article if there's more than one word,\n",
        "for example Roger Bacon as 'Roger Bacon'.\n";
}
// check if no argument was specified
if (!isset($argv[1]))
{
@ -55,7 +50,16 @@ else
$article = $argv[1];
}
// NOTE(review): this listing looks like a diff with its +/- markers stripped; the
// next line is presumably the pre-change statement that the encoding check below
// replaces -- confirm against the actual file.
$article = ucwords($article); // capitalise the first letter of each word
// Pick the capitalisation routine by encoding: valid UTF-8 goes through the
// multibyte-aware utf8_ucwords(), anything else through the byte-oriented ucwords().
if (mb_check_encoding($article, 'UTF-8'))
{
$article = utf8_ucwords($article);
}
else
{
$article = ucwords($article); // capitalise the first letter of each word
}
$article = preg_replace("/\s/", "_" ,$article); // replace every whitespace character with an underscore
// Build the article URL, using $lang as the Wikipedia subdomain.
$url = "http://$lang.wikipedia.org/wiki/$article";
@ -78,4 +82,20 @@ if (!isset($match[1]))
// Remove the HTML markup from the captured fragment, then emit it wrapped at
// 65 columns followed by a final newline.
$string = strip_tags($match[1]);
echo wordwrap($string, 65, "\n"), "\n";
// misc functions
/**
 * Print the command-line help text.
 *
 * The script name and the default language are read from the global
 * variables $progName and $defaultLang.
 */
function usage()
{
    // Version string kept in step with the newest HISTORY entry (0.3).
    print "Wikipedia ingress extractor (wie), version 0.3\n";
    print "Usage: $GLOBALS[progName] [--lang=sv] article\n";
    print "Default language if none specified is $GLOBALS[defaultLang].\n";
    print "Remember to quote the article if there's more than one word,\n";
    print "for example Roger Bacon as 'Roger Bacon'.\n";
}
/**
 * Uppercase the first character of every word in a UTF-8 string.
 *
 * Multibyte-aware counterpart of ucwords(): only the first character of each
 * word is changed, the remaining characters are left exactly as given, so
 * inner capitalisation such as "NASA" or "McDonald" survives. Words are
 * separated by the same characters ucwords() uses by default (space, tab,
 * CR, LF, form feed, vertical tab).
 *
 * @param string $str UTF-8 encoded text.
 * @return string The text with each word's first character uppercased.
 */
function utf8_ucwords($str)
{
    // Match a non-delimiter character that is NOT preceded by another
    // non-delimiter, i.e. the first character of the string or of a word.
    // The "u" modifier makes PCRE treat the subject as UTF-8 code points.
    $result = preg_replace_callback(
        '/(?<![^ \t\r\n\f\x0B])[^ \t\r\n\f\x0B]/u',
        function ($match) {
            return mb_strtoupper($match[0], 'UTF-8');
        },
        $str
    );

    // preg_replace_callback() returns null on failure (e.g. a subject that is
    // not valid UTF-8); fall back to the byte-oriented ucwords() in that case.
    return $result === null ? ucwords($str) : $result;
}
?>