From 7a124e16ca606eb8e170dafca88a4128a35df9ec Mon Sep 17 00:00:00 2001 From: Jack-Benny Persson Date: Mon, 4 Aug 2014 14:05:03 +0200 Subject: [PATCH] Added a check and a function to handle UTF-8 article strings --- HISTORY | 9 +++++++-- wie.php | 38 +++++++++++++++++++++++++++++--------- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/HISTORY b/HISTORY index 48ede65..93871c6 100644 --- a/HISTORY +++ b/HISTORY @@ -1,6 +1,11 @@ -0.1 - 2014-07-28 - First release of wie +0.3 - 2014-08-04 + Added a check to see whether the article string is in + UTF-8 or not. If it is in UTF-8, use a specific function + to handle it. Regular 'ucwords' can't handle UTF-8. 0.2 - 2014-07-31 Changed 'shell_exec(curl -s $URL)' to PHP5 cURL function Added HISTORY file + +0.1 - 2014-07-28 + First release of wie diff --git a/wie.php b/wie.php index 84dce4c..d7a6b00 100755 --- a/wie.php +++ b/wie.php @@ -20,14 +20,9 @@ $defaultLang = "en"; // default language $progName = $argv[0]; -function usage() -{ - print "Wikipedia ingress extractor (wie), version 0.2\n"; - print "Usage: $GLOBALS[progName] [--lang=sv] article\n"; - print "Default language if none specified is $GLOBALS[defaultLang].\n"; - print "Remember to quote the article if there's more than one word,\n"; - print "for example Roger Bacon as 'Roger Bacon'.\n"; -} + + + // check if no argument was specified if (!isset($argv[1])) { @@ -55,7 +50,16 @@ else $article = $argv[1]; } -$article = ucwords($article); // uppercase article +// check if article is UTF-8 encoded, in which case regular ucwords won't work +if (mb_check_encoding($article, 'UTF-8')) +{ + $article = utf8_ucwords($article); +} +else +{ + $article = ucwords($article); // uppercase article +} + $article = preg_replace("/\s/", "_" ,$article); // make spaces to underscore $url = "http://$lang.wikipedia.org/wiki/$article"; @@ -78,4 +82,20 @@ if (!isset($match[1])) $string = strip_tags($match[1]); print (wordwrap($string, 65, "\n") . 
"\n"); +// misc functions +function usage() +{ + print "Wikipedia ingress extractor (wie), version 0.2\n"; + print "Usage: $GLOBALS[progName] [--lang=sv] article\n"; + print "Default language if none specified is $GLOBALS[defaultLang].\n"; + print "Remember to quote the article if there's more than one word,\n"; + print "for example Roger Bacon as 'Roger Bacon'.\n"; +} + +function utf8_ucwords($str) +{ + $str = mb_convert_case($str, MB_CASE_TITLE, "UTF-8"); + return $str; +} + ?>