Curious about how Brock University's Sakai-Based LMS was named Isaak? Find out more here

Category:Auto Imported 2009 01 28

From Information about Isaak, Brock University's Sakai-Based LMS

Jump to: navigation, search

I wrote a script (found below) to auto-import all of this content from the Sakai help section into our MediaWiki. The next step is to review it all!

Mclare 11:44, 28 January 2009 (EST)

<source lang="php">
<?php
/***************
 * Sakai help scraper
 * Written by Matt Clare, Brock University
 * Not written by someone with a computer science degree
 *
 * Reads the Sakai help table of contents, downloads every linked article,
 * converts each one to rough wiki markup and echoes a MediaWiki XML import
 * document. When $write_files is on, the raw HTML and the converted text of
 * each article are also saved under $output_dir.
 ***************/

header("content-type: text/xml");
//error_reporting(2047);
//header('Content-Type: text/plain; charset=utf-8');

/** Configuration **/
$help_base_url = 'https://lms.brocku.ca/portal/help/';
$toc = 'TOCDisplay/main';
$output_dir = '/home/mclare/public_html/sakai/help_scrape/data/';
$username = 'Mclare';
$blank = array('<p> </p>', "Â", '<', '>', 'Back to top'); // things we just want blanked out
$remove_dup_titles = true;
$queens_english = true;
$link_to_articles = true;
$write_files = true;

// Category names searched for verbatim (case-insensitively) in each article.
$categories = array(
    'Announcements', 'Assignments', 'Blogger', 'Chat Room', 'Drop Box', 'Forms',
    'Forums', 'Glossary', 'Gradebook', 'Groups', 'Permissions', 'Tests and Quizzes',
    'Syllabus', 'Dropbox', 'News', 'Messages', 'Podcasts', 'Polls', 'Presentation',
    'Profile', 'Resources', 'Schedule', 'Sign-up', 'Site Stats', 'Web Content',
    'Wiki', 'Worksite Setup', 'Firefox', 'Internet Explorer', 'Netscape', 'Mac OS',
    'Windows', 'My Workspace', 'Portfolio', 'Student View', 'see',
);

// Category name => alternative spellings that also put an article in that category.
$categories_variations = array(
    'Gradebook' => array('Grade book', 'Grades'),
    'Assignments' => array('Assignment'),
    'Blogger' => array('Blog'),
    'Chat Room' => array('Chat'),
    'News' => array('RSS', 'Feed', 'Podcasts'),
    'Schedule' => array('Calendar'),
    'Sign-up' => array('signup', 'sign up'),
    'Site Stats' => array('Stats'),
    'Roles' => array('Role', 'Roles'),
    'Text Editor' => array('FCKeditor'),
    'TAs' => array(' TAs ', 'teaching assistant'),
    'Tests and Quizzes' => array('Tests', 'Quizzes', 'Questions'),
    'Non-Brock' => array('Other Official Participants'),
    'Forums' => array('Forum'),
);

// Wiki markup => plain text it replaces. Applied in this order, case-insensitively.
// NOTE(review): '[[XTHML]]' looks like a typo for XHTML, but correcting it would let
// the 'HTML' rule that follows rewrite the inside of the freshly inserted link, so it
// is left exactly as published. '[[ZIP]|.zip] ' also looks malformed - confirm.
$link_to_article = array(
    "'''''Note:''''' " => 'Note: ',
    '[[XTHML]]' => 'XHTML',
    '[[HTML]]' => 'HTML',
    '[[PPT]]' => 'ppt',
    '[[MP3]]' => 'MP3',
    '[[mp4]]' => 'MP4',
    '[[RSS]]' => 'RSS',
    ' [[URL]] ' => ' URL ',
    ' [[Sakai]] ' => ' Sakai ',
    ' [[LMS]] ' => ' CLE ',
    ' [[WebDAV]] ' => ' WebDAV ',
    '[[ZIP]|.zip] ' => '.zip ',
);

$append_to_wiki = "{{Template:Help Page}}\n{{Template:Sakai Help Source}}\n{{Template:Auto Imported and Not Reviewed}}\n[[Category:Auto Imported ".date('Y m d')."]]\n";
$regexp = "<a\s[^>]*href=(\"??)([^\" >]*?)\\1[^>]*>(.*)<\/a>";
$articles = array();

//Load in config file of titles to exclude
// file_get_contents() copes with an empty file, which fread($handle, filesize(...)) does not.
// A missing file yields a list holding one empty string, as the original read did.
$filename = "exclude.txt";
$contents = is_readable($filename) ? file_get_contents($filename) : '';
$exclude = explode("\n", str_replace("\r\n", "\n", (string) $contents));

/**
 * Swap the spellings listed in the local table, case-insensitively.
 *
 * @param string $contents article text
 * @return string text with every table key replaced by its value
 */
function queens_english ($contents) {
    $translate = array('Honor' => 'Honour', 'spoilt' => 'spoiled');
    return str_ireplace(array_keys($translate), array_values($translate), $contents);
}

/**
 * Turn the plain-text terms configured in $link_to_article into wiki markup.
 *
 * Pairs are applied one after another in array order, case-insensitively,
 * so a later pair sees the output of the earlier ones.
 *
 * @param string $contents article text
 * @return string text with each configured term replaced by its markup
 */
function link_to_articles ($contents) {
    global $link_to_article;
    return str_ireplace(array_values($link_to_article), array_keys($link_to_article), $contents);
}

/**
 * Build the [[Category:...]] lines for an article.
 *
 * A category is added when its name, or one of its configured variations,
 * occurs anywhere in the text (case-insensitive). The original test was
 * "position > 1", which silently ignored a match in the first two characters;
 * any position now counts. Each category is emitted once, in the order it was
 * first matched.
 *
 * @param string $contents article text
 * @return string zero or more "[[Category:Name]]\n" lines
 */
function find_categories ($contents) {
    global $categories, $categories_variations;
    $found = array(); // category name => true, insertion-ordered
    foreach ($categories as $name) {
        if (stripos($contents, $name) !== false) {
            $found[$name] = true;
        }
    }
    foreach ($categories_variations as $name => $aliases) {
        foreach ($aliases as $alias) {
            if (stripos($contents, $alias) !== false) {
                $found[$name] = true;
                break; // one matching variation is enough
            }
        }
    }
    $append = '';
    foreach (array_keys($found) as $name) {
        $append .= "[[Category:$name]]\n";
    }
    return $append;
}

/**
 * Strip a repeated title from the start of an article.
 *
 * Only the first $length bytes are searched. Removed, case-insensitively, are
 * the title itself and, for each replacer string, a variation built by writing
 * every title word followed by (word count - 1) separators: the replacer in the
 * slot whose number equals the word's 1-based position, a single space in the
 * others. Any "====" left in that leading section is removed as well.
 *
 * @param string $title    article title
 * @param string $contents article text
 * @param int    $length   number of leading bytes to search
 * @return string the cleaned leading section followed by the untouched remainder
 */
function remove_title($title,$contents,$length = 200) {
    $words = explode(' ', $title);
    $word_count = count($words);
    $replacers = array(': ', ' a ', ' an ', 'ing ', 'ing an ');
    $head = substr($contents, 0, $length);
    $tail = substr($contents, $length, strlen($contents));

    $generated = array();
    foreach ($replacers as $replacer) {
        $variation = '';
        foreach ($words as $index => $word) {
            $position = $index + 1;
            $variation .= $word;
            for ($slot = 1; $slot < $word_count; $slot++) {
                $variation .= ($slot == $position) ? $replacer : ' ';
            }
        }
        $generated[] = $variation;
    }

    // The original regenerated the same variations once per title word. Repeated
    // needles are applied again by str_ireplace(), so the repeats are kept to
    // leave the result exactly as it was.
    $variations = array((string) $title);
    for ($pass = 0; $pass < $word_count; $pass++) {
        $variations = array_merge($variations, $generated);
    }

    $head = str_ireplace($variations, "", $head);
    $head = str_replace("====", '', $head); // We tend to leave a lot of these
    return $head.$tail;
}

/**
 * Download a URL and return its body passed through utf8_encode().
 *
 * Returns an empty string when the URL cannot be opened or read.
 * NOTE(review): utf8_encode() treats its input as ISO-8859-1; if the help
 * pages are already served as UTF-8 this double-encodes them - confirm.
 *
 * @param string $url address to fetch
 * @return string
 */
function get_url_contents($url) {
    $handle = fopen($url, "r");
    if ($handle === false) {
        return '';
    }
    $contents = stream_get_contents($handle);
    fclose($handle);
    if ($contents === false) {
        return '';
    }
    return utf8_encode($contents); //let's explicitly work in UTF8 to reduce errors
}

/**
 * Save $content under $output_dir; does nothing unless $write_files is on.
 *
 * Slashes in the name become underscores. The script stops when the file
 * cannot be opened or written; writing an empty string is not an error
 * (the original treated a zero-byte write as a failure).
 *
 * @param string $filename file name relative to $output_dir
 * @param string $content  bytes to write
 */
function write_file ($filename, $content) {
    global $output_dir,$write_files;
    if (!$write_files) {
        return;
    }
    $filename = addslashes($filename);
    $filename = str_replace('/', '_', $filename);
    $f = fopen(utf8_encode($output_dir.$filename), "wb");
    if ($f === false) {
        die("Unable to write to: ".$filename);
    }
    $written = fwrite($f, $content);
    fclose($f);
    if ($written === false) {
        die("Unable to write to: ".$filename);
    }
}

/**
 * Line-by-line clean-up of converted article text.
 *
 * - The first three lines are trimmed and emitted once without line breaks;
 *   they are emitted again by the general pass below (unchanged behaviour).
 * - From the fourth line on, a run of two or more consecutive lines that are
 *   at most two bytes long is dropped entirely.
 * - Every remaining line is trimmed and emitted with a newline, unless it
 *   contains the asterisk-slash pair anywhere after its first character.
 *
 * The line count is taken once up front. The original re-counted after every
 * removal, so the end of long articles was never scanned, and it never
 * flushed the final run; both are handled here.
 *
 * @param string $contents article text
 * @return string cleaned text
 */
function collapse_blank_lines($contents) {
    $lines = explode("\n", $contents);
    $total = count($lines);
    $output = '';
    $run = array(); // indexes of the current run of short lines

    for ($n = 0; $n < $total; $n++) {
        if ($n <= 2) {
            $output .= trim($lines[$n]);
            continue;
        }
        if (strlen($lines[$n]) > 2) {
            continue;
        }
        if (end($run) === $n - 1) {
            $run[] = $n; // extends the current run
            continue;
        }
        // A new run starts here; drop the previous one if it had several lines.
        if (count($run) > 1) {
            foreach ($run as $index) {
                unset($lines[$index]);
            }
        }
        $run = array($n);
    }
    if (count($run) > 1) {
        foreach ($run as $index) {
            unset($lines[$index]);
        }
    }

    foreach ($lines as $line) {
        $marker = strpos($line, '*/');
        if ($marker === false || $marker < 1) {
            $output .= trim($line)."\n";
        }
    }
    return $output;
}

/**
 * Escape the three characters that would break XML element content.
 *
 * Byte-wise on purpose, so text in an unexpected encoding is never blanked.
 *
 * @param string $text
 * @return string
 */
function xml_escape($text) {
    return str_replace(array('&', '<', '>'), array('&amp;', '&lt;', '&gt;'), $text);
}

//Read in from Sakai
$contents = get_url_contents($help_base_url.$toc);
if (preg_match_all("/$regexp/siU", $contents, $matches)) {
    foreach ($matches[3] as $match_key => $match_value) {
        if (in_array($match_value, $exclude, true)) {
            continue;
        }
        $href = $matches[2][$match_key];
        // Skip in-page anchors and empty links
        if (strpos($href, '#') === false && strlen($href) > 0) {
            $articles[$match_value] = $help_base_url.$href;
        }
    }
}

$xml = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.3/ http://www.mediawiki.org/xml/export-0.3.xsd" version="0.3" xml:lang="en">'
    .' <siteinfo>'
    .' <sitename>Sakai Import</sitename>'
    .' <base>http://kumu.brocku.ca/sakai/Main_Page</base>'
    .' <generator>PHP '.phpversion().'</generator>'
    .' <case>first-letter</case>';
/*' <namespaces>
 <namespace key="-2">Media</namespace>
 <namespace key="-1">Special</namespace>
 <namespace key="0" />
 <namespace key="1">Talk</namespace>
 <namespace key="2">User</namespace>
 <namespace key="3">User talk</namespace>
 <namespace key="4">Sakai at Brock</namespace>
 <namespace key="5">Discussion</namespace>
 <namespace key="6">Image</namespace>
 <namespace key="7">Image talk</namespace>
 <namespace key="8">MediaWiki</namespace>
 <namespace key="9">MediaWiki talk</namespace>
 <namespace key="10">Template</namespace>
 <namespace key="11">Template talk</namespace>
 <namespace key="12">Help</namespace>
 <namespace key="13">Help talk</namespace>
 <namespace key="14">Category</namespace>
 <namespace key="15">Category talk</namespace>
 </namespaces>';
*/
$xml .= '</siteinfo>';

foreach ($articles as $key => $value) {
    // $contents = utf8_encode(get_url_contents($value));
    $contents = get_url_contents($value);
    write_file($key.'.html', $contents);

    if (function_exists('tidy_parse_string')) {
        // 'indent' used to be listed twice (TRUE, then 0); the later 0 won, so only that is kept
        $config = array('indent-spaces' => 0, 'indent' => 0, 'hide-comments' => TRUE, 'newline' => 0, 'tab-size' => 0, 'output-html' => TRUE);
        $tidy = tidy_parse_string($contents, $config, 'UTF8');
        if ($tidy !== false) {
            $tidy->cleanRepair();
            $contents = tidy_get_output($tidy); // a string, not the tidy object itself
        }
    }

    /** Translate heading tags to wiki markup **/
    $eql = '';
    for ($level = 1; $level < 6; $level++) {
        $eql .= '=';
        $contents = str_ireplace(array("<h$level>\n", "\n</h$level>"), $eql, $contents);
        $contents = str_ireplace(array("<h$level>", "</h$level>"), $eql, $contents);
    }

    // NOTE(review): the published listing replaces a plain space with a space here, which
    // does nothing; the search string was presumably the entity below before the wiki
    // rendered it - confirm against the original script.
    $contents = str_replace('&nbsp;', " ", $contents); //blank out these characters as I don't like how PHP handles them
    // Explicit flags and charset: the defaults differ between PHP versions, and an
    // undecoded &#039; would be mangled by the ampersand replacement below.
    $contents = html_entity_decode($contents, ENT_QUOTES, 'UTF-8'); //Aparently XML doesn't like these things?
    $contents = strip_tags($contents); //remove remaining html tags (mainly p tags)
    $contents = str_replace('&', 'and', $contents); //Aparently XML doesn't like these things?
    $contents = str_replace($blank, '', $contents); //blank out these characters

    /** Explode and do line by line **/
    $contents = collapse_blank_lines($contents);

    if ($remove_dup_titles) $contents = remove_title($key, $contents); //Remove titles and reduce some of the blank lines after titles
    if ($queens_english) $contents = queens_english($contents);
    if ($link_to_articles) $contents = link_to_articles($contents);
    $contents = $contents.$append_to_wiki.find_categories($contents);
    write_file($key.'.txt', $contents);

    // The title is raw anchor HTML from the table of contents: reduce it to text and
    // escape it so it cannot break the XML. File names above keep using the raw $key.
    $page_title = xml_escape(html_entity_decode(strip_tags((string) $key), ENT_QUOTES, 'UTF-8'));

    // gmdate() with "H": the trailing Z means UTC, and the hour needs its leading zero.
    $xml .= '<page> <title>'.$page_title.'</title> <id>4</id> <revision> <id>1</id> <timestamp>'.gmdate('Y-m-d\TH:i:s\Z').'</timestamp> <contributor> <username>'.xml_escape($username).'</username> <id>2</id> </contributor> <comment>Import from Sakai help pages '.date("F j, Y, g:i a").'</comment> <text xml:space="preserve">'.$contents.'</text> </revision> </page>'."\n";
}
$xml .= '</mediawiki>';
echo $xml;
?>
</source>


Articles in category "Auto Imported 2009 01 28"

There are 73 articles in this category.

A

B

C

C cont.

D

E

F

G

L

M

O

P

P cont.

R

S

U

V

W

Personal tools
  • Log in / create account
Bookmark and Share