3 Commits
v0.1 ... v0.3

Author SHA1 Message Date
1a443709b3 Added comment to clarify 2014-08-04 14:10:02 +02:00
7a124e16ca Added a check and a function to handle UTF-8 article strings 2014-08-04 14:05:03 +02:00
72dae5c75b Rewrote shell_exec(curl...) to PHP5 cURL function
* Added HISTORY file
* Bumped to version 0.2
* Updated README with PHP5 cURL module
2014-07-31 13:37:07 +02:00
3 changed files with 51 additions and 10 deletions

11
HISTORY Normal file
View File

@@ -0,0 +1,11 @@
0.3 - 2014-08-04
Added a check to see whether the article string is in
UTF-8 or not. If it is in UTF-8, use a specific function
to handle it. Regular 'ucwords' can't handle UTF-8.
0.2 - 2014-07-31
Changed 'shell_exec(curl -s $URL)' to PHP5 cURL function
Added HISTORY file
0.1 - 2014-07-28
First release of wie

View File

@@ -13,6 +13,9 @@ Or run it with 'php wie.php'. To display the help, type:
./wie.php --help ./wie.php --help
## Requirements ##
The script requires PHP5 and the PHP5 cURL module (php5-curl on Debian systems).
## Thanks ## ## Thanks ##
Many thanks go to flinga who came up with the idea for this script, please see Many thanks go to flinga who came up with the idea for this script, please see
the THANKS file for more information. the THANKS file for more information.

47
wie.php
View File

@@ -20,14 +20,9 @@
$defaultLang = "en"; // default language $defaultLang = "en"; // default language
$progName = $argv[0]; $progName = $argv[0];
/**
 * Print the help text for the command-line tool to standard output.
 *
 * Reads the program name and the default language from the global
 * variables $progName and $defaultLang.
 */
function usage()
{
    // One entry per output line; each entry carries its own trailing newline.
    $helpLines = array(
        "Wikipedia ingress extractor (wie), version 0.1\n",
        "Usage: $GLOBALS[progName] [--lang=sv] article\n",
        "Default language if none specified is $GLOBALS[defaultLang].\n",
        "Remember to quote the article if there's more than one word,\n",
        "for example Roger Bacon as 'Roger Bacon'.\n",
    );

    foreach ($helpLines as $helpLine)
    {
        print $helpLine;
    }
}
// check if no argument was specified // check if no argument was specified
if (!isset($argv[1])) if (!isset($argv[1]))
{ {
@@ -55,10 +50,26 @@ else
$article = $argv[1]; $article = $argv[1];
} }
$article = ucwords($article); // uppercase article // check if article is UTF-8 encoded, in which case regular ucwords won't work
// Capitalise the article title: input that is valid UTF-8 goes through
// utf8_ucwords(), anything else falls back to the byte-oriented ucwords().
$article = mb_check_encoding($article, 'UTF-8')
    ? utf8_ucwords($article)
    : ucwords($article);
$article = preg_replace("/\s/", "_" ,$article); // make spaces to underscore $article = preg_replace("/\s/", "_" ,$article); // make spaces to underscore
$url = "http://$lang.wikipedia.org/wiki/$article"; $url = "http://$lang.wikipedia.org/wiki/$article";
$data = shell_exec("curl -s $url"); // retrive the page
// Get the wiki page with the PHP cURL extension and keep its body in $data.
$ch = curl_init($url);
if ($ch === false)
{
    // Without a handle nothing below can work; report and stop.
    fwrite(STDERR, "Could not initialise cURL for $url\n");
    exit(1);
}
curl_setopt($ch, CURLOPT_HEADER, 0);             // body only, no response headers
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);  // return the page instead of printing it
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);  // follow HTTP redirects (the URL is plain http://)
$data = curl_exec($ch);
if ($data === false)
{
    // curl_exec() returns false on transfer errors; read the reason before
    // the handle is closed so the user sees why the lookup failed.
    $curlError = curl_error($ch);
    curl_close($ch);
    fwrite(STDERR, "Could not retrieve $url: $curlError\n");
    exit(1);
}
curl_close($ch);
preg_match("/\<p\>(.*)\<\/p\>/", $data, $match); // fetch text inside first <p> preg_match("/\<p\>(.*)\<\/p\>/", $data, $match); // fetch text inside first <p>
// check if we had a match // check if we had a match
@@ -71,4 +82,20 @@ if (!isset($match[1]))
$string = strip_tags($match[1]); $string = strip_tags($match[1]);
print (wordwrap($string, 65, "\n") . "\n"); print (wordwrap($string, 65, "\n") . "\n");
// misc functions
/**
 * Print the help text for the command-line tool to standard output.
 *
 * The program name and the default language are read from the global
 * variables $progName and $defaultLang. The version in the banner is kept
 * in step with the latest entry of the HISTORY file (0.3).
 */
function usage()
{
    print "Wikipedia ingress extractor (wie), version 0.3\n";
    print "Usage: $GLOBALS[progName] [--lang=sv] article\n";
    print "Default language if none specified is $GLOBALS[defaultLang].\n";
    print "Remember to quote the article if there's more than one word,\n";
    print "for example Roger Bacon as 'Roger Bacon'.\n";
}
/**
 * UTF-8 aware counterpart of ucwords().
 *
 * Upper-cases the first character of every word and leaves the remaining
 * characters untouched, so acronyms such as "NASA" survive. Words are
 * separated by the same characters ucwords() uses by default: space, tab,
 * carriage return, newline, form feed and vertical tab.
 *
 * @param string $str UTF-8 encoded text.
 * @return string The text with each word's first character capitalised; the
 *                input is returned unchanged if it cannot be processed as
 *                UTF-8.
 */
function utf8_ucwords($str)
{
    // Match any non-delimiter character that is NOT preceded by another
    // non-delimiter character, i.e. the first character of each word
    // (the start of the string counts as a word start).
    $result = preg_replace_callback(
        '/(?<![^ \t\r\n\f\x0B])[^ \t\r\n\f\x0B]/u',
        function ($match) {
            // Title-casing a single character only affects that character.
            return mb_convert_case($match[0], MB_CASE_TITLE, 'UTF-8');
        },
        $str
    );

    // preg_replace_callback() yields null on error (e.g. malformed UTF-8
    // under the /u modifier); hand the input back rather than losing it.
    return $result === null ? $str : $result;
}
?> ?>