Page Menu
Home
DevCentral
Search
Configure Global Search
Log In
Files
F11723956
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
16 KB
Referenced Files
None
Subscribers
None
View Options
diff --git a/page.php b/page.php
index cc056c7..e7c4b8e 100644
--- a/page.php
+++ b/page.php
@@ -1,157 +1,188 @@
<?php
//strftime() pattern for long dates: day of the month, full month name, four-digit year
define('LONG_DATE_FORMAT', '%e %B %Y');
//User agent set before the first attempt to fetch a page
define('USER_AGENT', 'WikimediaTools/SourceTemplatesGenerator/0.1');
//User agent set for the second attempt, when the first one returned nothing
define('USER_AGENT_FALLBACK', 'Mozilla/5.0');
class Page {
/**
 * @var string The URL the page has been loaded from
 */
public $url;

/**
 * @var array Meta tags
 */
public $meta_tags;

/**
 * @var string The page content
 */
public $data;

/**
 * @var string The page title
 */
public $title;

/**
 * @var string The main author; only filled by site-specific analysers
 */
public $author;

/**
 * @var array The other authors; only filled by site-specific analysers
 */
public $coauthors;

/**
 * @var string The site name; only filled by site-specific analysers
 */
public $site;

/**
 * @var string The publication date as text; only filled by site-specific analysers
 */
public $date;

/**
 * @var int|bool The year, as parsed by date_parse() from the 'date' metatag
 */
public $yyyy;

/**
 * @var int|bool The month, as parsed by date_parse() from the 'date' metatag
 */
public $mm;

/**
 * @var int|bool The day, as parsed by date_parse() from the 'date' metatag
 */
public $dd;

/**
 * @var bool Set by analysers to ask templates to drop the separate day/month/year parameters
 */
public $skipYMD;

/**
 * @var bool Set by analysers to ask templates to drop the author parameters
 */
public $skipAuthor;

/**
 * @var string The error message set when the URL can't be read
 */
public $error;
/**
 * Fetches the page content, then analyses it
 *
 * The fetch is tried with USER_AGENT first, then with USER_AGENT_FALLBACK.
 * When both attempts return nothing, $this->error is set and no analysis is done.
 *
 * @param string $url The URL of the page to load
 */
function __construct ($url) {
    $this->url = $url;

    foreach (array(USER_AGENT, USER_AGENT_FALLBACK) as $agent) {
        ini_set('user_agent', $agent);
        $this->data = @file_get_contents($url);
        if ($this->data) {
            $this->analyse();
            return;
        }
    }

    $this->error = "Can't read URL";
}
/**
 * Gets the analyser matching an URL
 *
 * pages/index.dat is read line by line; each line holds an URL prefix, a tab
 * and an analyser name. The first prefix the URL starts with wins: the file
 * pages/<lowercased name>.php is loaded and a <name>Page object is returned.
 *
 * @param string $url The URL of the page to load
 *
 * @return Page A site-specific analyser when the index knows the URL ; otherwise, a generic Page
 */
static function load ($url) {
    //FILE_USE_INCLUDE_PATH is the flag the boolean true passed before stood for
    $lines = file('pages/index.dat', FILE_USE_INCLUDE_PATH);
    if ($lines === false) {
        //Index unreadable: falls back to the generic analysis
        return new Page($url);
    }

    foreach ($lines as $line) {
        $entry = explode("\t", $line);
        if (count($entry) < 2) {
            //Blank or malformed line
            continue;
        }

        $prefix = $entry[0];
        $name = trim($entry[1]);
        if ($prefix === '' || $name === '') {
            continue;
        }

        if (strncmp($url, $prefix, strlen($prefix)) === 0) {
            //require_once: the same analyser can serve several URLs in one request
            require_once('pages/' . strtolower($name) . '.php');
            $class = $name . 'Page';
            return new $class($url);
        }
    }

    return new Page($url);
}
/**
 * Reads the metatags and the title, then fills the date fields
 * from the 'date' metatag when the page has one
 */
function analyse () {
    $this->meta_tags = $this->get_meta_tags();
    $this->title = $this->get_title();

    if (!array_key_exists('date', $this->meta_tags)) {
        return;
    }

    $parsed = date_parse($this->meta_tags['date']);
    $this->yyyy = $parsed['year'];
    $this->mm = $parsed['month'];
    $this->dd = $parsed['day'];
}
/**
 * Gets page metatags
 *
 * The native get_meta_tags() function reads the URL itself, not $this->data.
 *
 * @return array an array where the keys are the metatags' names and the values the metatags' values ; an empty array when the native function fails
 */
function get_meta_tags () {
    $tags = get_meta_tags($this->url);

    //The native function returns false on failure; callers expect an array
    return is_array($tags) ? $tags : array();
}
/**
 * Gets all metatags, including those using meta property= and meta itemprop= syntax
 *
 * The tags are extracted from the page content stored in $this->data.
 * When a name appears several times, the last value wins.
 *
 * @return array an array where the keys are the metatags' names and the values the metatags' values ; an empty array when no metatag is found
 */
function get_all_meta_tags () {
    //Thank you to Michael Knapp and Mariano
    //See http://php.net/manual/en/function.get-meta-tags.php comments
    $metaTags = array();

    $found = preg_match_all('/<[\s]*meta[\s]*\b(name|property|itemprop)\b="?' . '([^>"]*)"?[\s]*' . 'content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si', $this->data, $match);
    if (!$found) {
        //No match (0) or regex error (false)
        return $metaTags;
    }

    //Group 2 holds the tag names, group 3 the matching contents
    foreach ($match[2] as $i => $name) {
        $metaTags[$name] = $match[3][$i];
    }

    return $metaTags;
}
/**
 * Gets title
 *
 * The 'title' metatag is preferred; otherwise, the content of the first
 * <title> element found in the page content is used.
 *
 * @return string The page title, or an empty string when none is found
 */
function get_title () {
    if (array_key_exists('title', $this->meta_tags)) return $this->meta_tags['title'];

    //s modifier: the title can span several lines
    //(?:\s[^>]*)?: the opening tag can carry attributes
    return (preg_match("#<title(?:\s[^>]*)?>(.+)<\/title>#isU", $this->data, $title)) ? trim($title[1]) : '';
}
/**
 * Determines if the current page is an article published in a journal.
 *
 * The page is an article when one of these holds: the dc_type metatag is
 * 'journalArticle', the dcsext_pn-cat metatag is 'Article', or a
 * citation_journal_title or prism_publicationname metatag exists.
 *
 * @return bool true if the current page is an article ; otherwise, false
 */
function is_article () {
    $tags = $this->meta_tags;

    if (array_key_exists('dc_type', $tags) && $tags['dc_type'] == 'journalArticle') {
        return true;
    }

    if (array_key_exists('dcsext_pn-cat', $tags) && $tags['dcsext_pn-cat'] == 'Article') {
        return true;
    }

    if (array_key_exists('citation_journal_title', $tags)) {
        return true;
    }

    return array_key_exists('prism_publicationname', $tags);
}
- /**
- * Gets relevant metatag
- *
- * @param array the metatags
- * @param string... the list of acceptable metatags
- *
- * @return string the first metatag value found
- */
- static function getMetaTag () {
- $tags = func_get_args();
- $metatags = array_shift($tags);
-
- foreach ($tags as $tag) {
- if (array_key_exists($tag, $metatags)) {
- return $metatags[$tag];
- }
- }
-
- return '';
- }
+ /**
+  * Picks the value of the first metatag available among a list of candidates
+  *
+  * @param array the metatags
+  * @param string... the names of the acceptable metatags, by order of preference
+  *
+  * @return string the value of the first candidate found ; an empty string when none exists
+  */
+ static function getMetaTag () {
+     $candidates = func_get_args();
+     //The first argument is the metatags array, the other ones are the names to try
+     $metatags = array_shift($candidates);
+
+     while ($candidates) {
+         $candidate = array_shift($candidates);
+         if (array_key_exists($candidate, $metatags)) {
+             return $metatags[$candidate];
+         }
+     }
+
+     return '';
+ }
+
+ /**
+  * Extracts from the current page content the text located between two delimiters
+  *
+  * This is grab() applied to $this->data.
+  *
+  * @param string $before The string at the left of the text to be grabbed
+  * @param string $after The string at the right of the text to be grabbed
+  *
+  * @return string The text found between $before and $after, as returned by grab()
+  */
+ function between ($before, $after) {
+     $found = self::grab($this->data, $before, $after);
+
+     return $found;
+ }
+
+ /**
+  * Finds a portion of text included between $before and $after strings
+  *
+  * The first occurrence of $before is used, then the first occurrence of
+  * $after located after it.
+  *
+  * @param string $text The text where to find the substring
+  * @param string $before The string at the left of the text to be grabbed
+  * @param string $after The string at the right of the text to be grabbed
+  *
+  * @return string|bool The text found between $before and $after (can be an empty string) ; false when one of the delimiters isn't found
+  */
+ static function grab ($text, $before, $after) {
+     $start = strpos($text, $before);
+     if ($start === false) { return false; }
+     $start += strlen($before);
+
+     //The search starts right after $before (not one character further),
+     //so an empty text between the delimiters is found too, and the offset
+     //never goes beyond the end of $text
+     $end = strpos($text, $after, $start);
+     if ($end === false) { return false; }
+
+     return substr($text, $start, $end - $start);
+ }
}
diff --git a/pages/index.dat b/pages/index.dat
index 8949989..549bebe 100644
--- a/pages/index.dat
+++ b/pages/index.dat
@@ -1,2 +1,3 @@
http://www.rue89.com/ Rue89
http://www.lesoir.be/ LeSoir
+http://archives.lesoir.be/ LeSoir
diff --git a/pages/lesoir.php b/pages/lesoir.php
index 6c5debe..197ccc6 100644
--- a/pages/lesoir.php
+++ b/pages/lesoir.php
@@ -1,43 +1,64 @@
<?php
//Page analysis for www.lesoir.be
class LeSoirPage extends Page {
/**
 * Analyses a Le Soir article
 *
 * After the generic analysis, sets the site name, builds the publication
 * date from the 'archi_id' metatag and reads the authors from the
 * signature paragraph.
 */
function analyse () {
    parent::analyse();
    //Hardcoded known info
    $this->site = "Le Soir";
    $this->skipYMD = true;
    //Gets date
    //meta tag 'archi_id' has t-YYYYMMDD-HHMMhh as format (where hh = AM/PM)
    // e.g. t-20120722-0211PM
    $yyyy = substr($this->meta_tags['archi_id'], 2, 4);
    $mm = substr($this->meta_tags['archi_id'], 6, 2);
    $dd = substr($this->meta_tags['archi_id'], 8, 2);
    $this->date = strftime(LONG_DATE_FORMAT, mktime(0, 0, 0, $mm, $dd, $yyyy));
    //Gets author
- //TODO: ensure no article has more than one author
- $pos1 = strpos($this->data, '<p class="info st_signature">') + 29;
- $pos2 = strpos($this->data, '</p>', $pos1);
- $author = substr($this->data, $pos1, $pos2 - $pos1);
- if ($author == "Rédaction en ligne") {
- $this->skipAuthor = true;
+     //between() gives false when the signature paragraph is absent;
+     //the cast turns it into an empty string
+     $authors = trim((string) $this->between('st_signature">', '</p>'));
+
+     if ($authors === '' || $authors == "Rédaction en ligne") {
+         $this->skipAuthor = true;
    } else {
- require_once('helpers/namecase.php');
- $this->author = name_case($author);
+         require_once('helpers/namecase.php');
+
+         //Some Le Soir articles use firstname name, others name,firstname.
+         //When there are several authors, ';' is the separator, with
+         //optional whitespace before or after it.
+         //Authors are in uppercase, so we need to clean case.
+
+         $authors = preg_split('/\s*;\s*/', $authors, -1, PREG_SPLIT_NO_EMPTY);
+
+         foreach ($authors as $i => $author) {
+             if (strpos($author, ',') !== false) {
+                 //name,firstname -> firstname name, without stray spaces
+                 $name = explode(',', $author, 2);
+                 $author = trim($name[1]) . ' ' . trim($name[0]);
+             }
+             //The case is cleaned once per author
+             $author = name_case($author);
+             if ($i == 0) {
+                 $this->author = $author;
+             } else {
+                 $this->coauthors[] = $author;
+             }
+         }
    }
}
/**
 * Gets title
 *
 * @return string The 'og:title' metatag when it exists and isn't empty ; otherwise, the title found by the generic analysis
 */
function get_title () {
- return $this->meta_tags['og:title'];
+     //empty() also covers a missing key, without raising an undefined index notice
+     if (!empty($this->meta_tags['og:title'])) {
+         return $this->meta_tags['og:title'];
+     }
+     return parent::get_title();
}
/**
 * Gets page metatags
 *
 * @return array the metatags, as collected by get_all_meta_tags()
 */
function get_meta_tags () {
    //The extended parser is used so tags declared with property= or itemprop=
    //are collected too (get_title reads 'og:title')
    $tags = $this->get_all_meta_tags();

    return $tags;
}
}
?>
diff --git a/templates/wikipedia-fr/Article.php b/templates/wikipedia-fr/Article.php
index 8fdf198..828e8de 100644
--- a/templates/wikipedia-fr/Article.php
+++ b/templates/wikipedia-fr/Article.php
@@ -1,160 +1,160 @@
<?php
setlocale(LC_TIME, 'fr_FR.UTF-8');
class ArticleTemplate extends Template {
public $lang;
public $title;
public $periodique;
public $year;
public $accessdate;
/**
 * Initializes the template name and sets the access date to the current date
 */
function __construct () {
    $today = strftime(LONG_DATE_FORMAT);

    $this->accessdate = trim($today);
    $this->name = "Article";
}
/**
 * Builds an Article template from the metatags of an analysed page
 *
 * @param Page $page The analysed page; its meta_tags, title and url properties are read
 *
 * @return ArticleTemplate The template filled with the page information
 */
static function loadFromPage ($page) {
    $template = new self();
    $t = $page->meta_tags;

    //Language
    $template->lang = page::getMetaTag($t, 'dc_language', 'citation_language');

    //Authors
    if ($author = page::getMetaTag($t, 'author', 'dc_creator', 'citation_authors', 'dc_contributor', 'citation_author')) {
        //TODO: handle Alpha Beta syntax instead Beta, Alpha
        //The array is padded to two items, so a name without ', ' still
        //offers the first name slot __toString reads
        $template->authors[] = array_pad(explode(', ', $author, 2), 2, '');
    }

    //Title
    if (!$template->title = page::getMetaTag($t, 'dc_title', 'citation_title')) {
        $template->title = $page->title;
    }

    //Journal, publisher
    $template->journal = page::getMetaTag($t, 'prism_publicationname', 'citation_journal_title');
    //getMetaTag gives an empty string, without notice, when dc_source is absent
    $template->journalLink = page::getMetaTag($t, 'dc_source');
    $template->publisher = page::getMetaTag($t, 'dc_publisher', 'citation_publisher');

    //Issue name, number and volume
- $template->issue = page::getMetaTag($t, 'prism_number', 'citation_issue');
+ $template->issue = page::getMetaTag($t, 'prism_number', 'citation_issue');
    $template->volume = page::getMetaTag($t, 'citation_volume');
    if (
        (!$template->issueName = page::getMetaTag($t, 'prism_issuename'))
        &&
        array_key_exists('dc_relation_ispartof', $t)
    ) {
        //Fallback value, flagged in the output as requiring a manual cleanup
        $template->issueName = $t['dc_relation_ispartof']
            . " <!-- !!! paramètre à nettoyer !!! -->";
    }

    //Date
    if ($date = page::getMetaTag($t, 'prism_publicationdate', 'dc_date', 'citation_date')) {
        //Year, month and day are read at fixed offsets (0, 5 and 8)
        $template->yyyy = substr($date, 0, 4);
        $template->mm = substr($date, 5, 2);
        $template->dd = substr($date, 8, 2);
    } else {
        $template->yyyy = page::getMetaTag($t, 'citation_year');
    }

    //Pages
    $template->pageStart = page::getMetaTag($t, 'prism_startingpage', 'citation_firstpage');
    $template->pageEnd = page::getMetaTag($t, 'prism_endingpage', 'citation_lastpage');

    //ISBN, ISSN, URLs
    $template->issn = page::getMetaTag($t, 'prism_issn', 'citation_issn');
    $template->isbn = page::getMetaTag($t, 'citation_isbn');
    $template->doi = page::getMetaTag($t, 'citation_doi');
    $template->summary = page::getMetaTag($t, 'citation_abstract_html_url');
    $template->url = self::getTextURL($page->url, $t);

    return $template;
}
/**
 * Fills the template parameters from the properties, then delegates
 * the rendering to the parent class
 *
 * @return string The value returned by parent::__toString()
 */
function __toString () {
    //Language
    $this->params['langue'] = $this->lang;

    //Authors
    //empty() also covers the case where no author has been set at all
    if (!empty($this->authors)) {
        $k = 1;
        foreach ($this->authors as $author) {
            //An author can have a single part (no first name)
            $this->params["prénom$k"] = isset($author[1]) ? $author[1] : '';
            $this->params["nom$k"] = $author[0];
            $this->params["lien auteur$k"] = '';
            $k++;
        }
    }

    //Title, journal, publisher, volume, etc.
    $this->params['titre'] = $this->title;
    $this->params['périodique'] = $this->journal;
    //TODO: check if the article exists on fr.wikipedia and contains the Presse infobox or belongs to a subcategory of [[Catégorie:Revue scientifique]]
    $this->params['lien périodique'] = $this->journal;
    $this->params['éditeur'] = $this->publisher;
    if ($this->volume) $this->params['volume'] = $this->volume;
    $this->params['numéro'] = $this->issue;
    if ($this->issueName) $this->params['titre numéro'] = $this->issueName;

    //Date
    if ($this->mm && $this->dd) {
        $date = mktime(12, 0, 0, $this->mm, $this->dd, $this->yyyy);
        $this->params['jour'] = trim(strftime('%e', $date));
        $this->params['mois'] = strftime('%B', $date);
    }
    $this->params['année'] = $this->yyyy;

    //Pages, ISSN, ISBN, DOI, URL, access date
    $this->params['pages'] = $this->pageEnd ? ($this->pageStart . '-' . $this->pageEnd) : $this->pageStart;
    if ($this->issn) $this->params['ISSN'] = $this->issn;
    if ($this->isbn) $this->params['ISBN'] = $this->isbn;
    if ($this->doi) $this->params['doi'] = $this->doi;
    $this->params['url texte'] = $this->url;
    if (self::isSummaryPertinent($this->url, $this->summary)) {
        $this->params['résumé'] = $this->summary;
    }
    $this->params['consulté le'] = trim(strftime(LONG_DATE_FORMAT));

    return parent::__toString();
}
/**
 * Gets article full text URL
 *
 * The citation_pdf_url metatag is preferred, then citation_fulltext_html_url;
 * the current URL is kept when none is available or for revues.org articles.
 *
 * @param string $url the article current URL
 * @param array $metatags the page metatags
 *
 * @return string the article fulltext URL
 */
static function getTextURL ($url, $metatags) {
    //revues.org PDF generation is broken, so metatags are ignored for this site
    $isRevuesOrg = strpos($url, '.revues.org/') > 0;

    if (!$isRevuesOrg) {
        $candidate = page::getMetaTag($metatags, 'citation_pdf_url', 'citation_fulltext_html_url');
        if ($candidate) {
            return $candidate;
        }
    }

    return $url;
}
/**
 * Determines if a summary URL is worth being included in the parameters
 *
 * @param string $url_article Article URL
 * @param string $url_summary Summary URL
 *
 * @return bool true if the summary URL should be included in template ; otherwise, false
 */
static function isSummaryPertinent ($url_article, $url_summary) {
    //A summary is rejected when empty or identical to the article URL
    $isRedundant = ($url_summary == '' || $url_summary == $url_article);

    //This site is indexed through /resume.php but gives /article.php as summary URL in metadata
    $isCairnSummary = (substr($url_article, 0, 32) == "http://www.cairn.info/resume.php");

    return !($isRedundant || $isCairnSummary);
}
}
?>
diff --git a/templates/wikipedia-fr/Lien_web.php b/templates/wikipedia-fr/Lien_web.php
index c785563..d7364d3 100644
--- a/templates/wikipedia-fr/Lien_web.php
+++ b/templates/wikipedia-fr/Lien_web.php
@@ -1,60 +1,71 @@
<?php
setlocale(LC_TIME, 'fr_FR.UTF-8');
class LienWebTemplate extends Template {
public $author;
+ public $coauthors;
public $url;
public $title;
public $dd;
public $mm;
public $yyyy;
public $site;
public $publishdate;
public $accessdate;
/**
* @var bool Indicates if we've to remove jour/mois/année parameters
*/
public $skipYMD = false;
+ /**
+ * @var bool Indicates if we've to remove auteur and coauteurs parameters
+ */
+ public $skipAuthor = false;
+
/**
 * Initializes the template name and sets the access date to the current date
 */
function __construct () {
    $today = strftime(LONG_DATE_FORMAT);

    $this->accessdate = trim($today);
    $this->name = "Lien web";
}
/**
 * Builds a Lien web template from an analysed page
 *
 * The author, skipAuthor, coauthors, url, title, dd, mm, yyyy, site and
 * skipYMD properties are copied as is; the page date property becomes
 * the template publishdate.
 *
 * @param Page $page The analysed page
 *
 * @return LienWebTemplate The template filled with the page information
 */
static function loadFromPage ($page) {
$template = new LienWebTemplate();
$template->author = $page->author;
$template->skipAuthor = $page->skipAuthor;
+ $template->coauthors = $page->coauthors;
$template->url = $page->url;
$template->title = $page->title;
$template->dd = $page->dd;
$template->mm = $page->mm;
$template->yyyy = $page->yyyy;
$template->site = $page->site;
//The page date is exposed under another name in the template
$template->publishdate = $page->date;
$template->skipYMD = $page->skipYMD;
return $template;
}
/**
 * Fills the template parameters from the properties, then delegates
 * the rendering to the parent class
 *
 * The author parameters are omitted when skipAuthor is set; the separate
 * day/month/year parameters are omitted when skipYMD is set.
 *
 * @return string The value returned by parent::__toString()
 */
function __toString () {
    if (!$this->skipAuthor) {
        $this->params['auteur'] = $this->author;
+
+ if ($this->coauthors) {
+ $this->params['coauteurs'] = implode(', ', $this->coauthors);
+ }
    }
    $this->params['titre'] = $this->title;
    if (!$this->skipYMD) {
        //jour is the day and mois the month
        $this->params['jour'] = $this->dd;
        $this->params['mois'] = $this->mm;
        $this->params['année'] = $this->yyyy;
    }
    $this->params['url'] = $this->url;
    $this->params['site'] = $this->site;
    $this->params['en ligne le'] = $this->publishdate;
    $this->params['consulté le'] = $this->accessdate;
    return parent::__toString();
}
}
?>
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Thu, Sep 18, 12:02 (13 h, 4 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2990761
Default Alt Text
(16 KB)
Attached To
Mode
rSTG Source templates generator
Attached
Detach File
Event Timeline
Log In to Comment