Page MenuHomeDevCentral

No OneTemporary

diff --git a/page.php b/page.php
index cc056c7..e7c4b8e 100644
--- a/page.php
+++ b/page.php
@@ -1,157 +1,188 @@
<?php
define('LONG_DATE_FORMAT', '%e %B %Y');
define('USER_AGENT', 'WikimediaTools/SourceTemplatesGenerator/0.1');
define('USER_AGENT_FALLBACK', 'Mozilla/5.0');
class Page {
public $url;
/**
* @var array Meta tags
*/
public $meta_tags;
/**
* @var string The page content
*/
public $data;
public $title;
public $author;
public $yyyy;
public $mm;
public $dd;
public $skipYMD;
public $skipAuthor;
public $error;
/**
 * Fetches the page at $url and, when it could be read, analyses it.
 *
 * The download is attempted with the tool's own user agent first and, if
 * nothing usable comes back, once more with the fallback user agent.
 * When both attempts fail, $this->error is set and no analysis is run.
 *
 * @param string $url The URL of the page to fetch
 */
function __construct ($url) {
    $this->url = $url;

    foreach (array(USER_AGENT, USER_AGENT_FALLBACK) as $agent) {
        ini_set('user_agent', $agent);
        $this->data = @file_get_contents($url);
        if ($this->data) {
            $this->analyse();
            return;
        }
    }

    $this->error = "Can't read URL";
}
/**
 * Gets the page object matching an URL.
 *
 * pages/index.dat is read line by line; each line holds an URL prefix and a
 * site name, separated by a tab. For the first prefix $url starts with,
 * pages/<lowercased name>.php is included and a <name>Page instance is
 * returned. When no prefix matches, a generic Page is returned.
 *
 * @param string $url The URL of the page to load
 * @return Page a site-specific page when the URL prefix is known; otherwise, a generic Page
 */
static function load ($url) {
    $pages = file('pages/index.dat', FILE_USE_INCLUDE_PATH | FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    if ($pages === false) {
        //Index unreadable: falls back to the generic analysis
        return new Page($url);
    }

    foreach ($pages as $line) {
        $page = explode("\t", $line);

        //Skips malformed lines: no tab, empty site name, or an empty prefix (which would match every URL)
        if (count($page) < 2 || $page[0] === '' || trim($page[1]) === '') {
            continue;
        }

        if (substr($url, 0, strlen($page[0])) == $page[0]) {
            $file = strtolower(trim($page[1])) . '.php';
            $class = trim($page[1]) . 'Page';
            //require_once: the same class file can be requested by several load() calls or several index lines
            require_once("pages/$file");
            return new $class($url);
        }
    }

    return new Page($url);
}
/**
 * Fills in the meta tags and the title, then the publication date.
 *
 * The date properties (yyyy, mm, dd) are only set when a 'date' meta tag is
 * present; they receive the year, month and day date_parse() extracts from it.
 */
function analyse () {
    $this->meta_tags = $this->get_meta_tags();
    $this->title = $this->get_title();

    if (!array_key_exists('date', $this->meta_tags)) {
        return;
    }

    $parsed = date_parse($this->meta_tags['date']);
    $this->yyyy = $parsed['year'];
    $this->mm = $parsed['month'];
    $this->dd = $parsed['day'];
}
/**
 * Gets page metatags
 *
 * Delegates to PHP's built-in get_meta_tags() function, which is given the
 * page URL (not the content already stored in $this->data).
 *
 * @return array an array where the keys are the metatags' names and the values the metatags' values
 */
function get_meta_tags () {
//The unqualified call resolves to the global PHP function, not to this method
return get_meta_tags($this->url);
}
/**
 * Gets all metatags, including those using meta property= and meta itemprop= syntax
 *
 * Tags are looked up in the already downloaded content ($this->data). Only
 * tags where the name/property/itemprop attribute comes right before the
 * content attribute are recognized. When the same name occurs several
 * times, the last value wins.
 *
 * @return array an array where the keys are the metatags' names and the values the metatags' values ; an empty array when no tag is found
 */
function get_all_meta_tags () {
    //Always defined, so callers can safely use array_key_exists() on the result
    $metaTags = array();

    //Thank you to Michael Knapp and Mariano
    //See http://php.net/manual/en/function.get-meta-tags.php comments
    $found = preg_match_all(
        '/<[\s]*meta[\s]*\b(name|property|itemprop)\b="?' . '([^>"]*)"?[\s]*' . 'content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si',
        $this->data,
        $match
    );
    if (!$found) {
        //No meta tag matched (0) or the regular expression engine failed (false)
        return $metaTags;
    }

    //$match[2] holds the attribute values (the tag names), $match[3] the content values
    foreach ($match[2] as $i => $name) {
        $metaTags[$name] = $match[3][$i];
    }

    return $metaTags;
}
/**
 * Gets title
 *
 * The 'title' meta tag is preferred; without it, the content of the first
 * <title> element matched in the page data is used.
 *
 * @return string The page title, or an empty string when none is found
 */
function get_title () {
    if (array_key_exists('title', $this->meta_tags)) {
        return $this->meta_tags['title'];
    }

    $matches = array();
    if (!preg_match("#<title>(.+)<\/title>#iU", $this->data, $matches)) {
        return '';
    }

    return trim($matches[1]);
}
/**
 * Determines if the current page is an article published in a journal.
 *
 * The page counts as an article as soon as one of these holds: the dc_type
 * metatag is 'journalArticle', the dcsext_pn-cat metatag is 'Article', or a
 * citation_journal_title or prism_publicationname metatag exists.
 *
 * @return bool true if the current page is an article ; otherwise, false
 */
function is_article () {
    $tags = $this->meta_tags;

    if (array_key_exists('dc_type', $tags) && $tags['dc_type'] == 'journalArticle') {
        return true;
    }

    if (array_key_exists('dcsext_pn-cat', $tags) && $tags['dcsext_pn-cat'] == 'Article') {
        return true;
    }

    if (array_key_exists('citation_journal_title', $tags)) {
        return true;
    }

    return array_key_exists('prism_publicationname', $tags);
}
- /**
- * Gets relevant metatag
- *
- * @param array the metatags
- * @param string... the list of acceptable metatags
- *
- * @return string the first metatag value found
- */
- static function getMetaTag () {
- $tags = func_get_args();
- $metatags = array_shift($tags);
-
- foreach ($tags as $tag) {
- if (array_key_exists($tag, $metatags)) {
- return $metatags[$tag];
- }
- }
-
- return '';
- }
+ /**
+ * Gets relevant metatag
+ *
+ * The first argument is the metatags array; every following argument is a
+ * metatag name, tried in the given order.
+ *
+ * @param array the metatags
+ * @param string... the list of acceptable metatags
+ *
+ * @return string the value of the first listed metatag found, or an empty string when none is present
+ */
+ static function getMetaTag () {
+ $tags = func_get_args();
+ //Detaches the metatags array, so $tags only holds the names to try
+ $metatags = array_shift($tags);
+
+ foreach ($tags as $tag) {
+ if (array_key_exists($tag, $metatags)) {
+ return $metatags[$tag];
+ }
+ }
+
+ return '';
+ }
+
+ /**
+ * Finds a portion of text included between $before and $after strings on the current page
+ *
+ * The search is done in the page content ($this->data) through grab().
+ *
+ * @param string $before The string at the left of the text to be grabbed
+ * @param string $after The string at the right of the text to be grabbed
+ *
+ * @return string|bool The text found between $before and $after, or false when grab() doesn't find them
+ */
+ function between ($before, $after) {
+ return self::grab($this->data, $before, $after);
+ }
+
+ /**
+ * Finds a portion of text included between $before and $after strings
+ *
+ * The first occurrence of $before is used; $after is then looked for from
+ * the end of that occurrence.
+ *
+ * @param string $text The text where to find the substring
+ * @param string $before The string at the left of the text to be grabbed
+ * @param string $after The string at the right of the text to be grabbed
+ *
+ * @return string|bool The text found between $before and $after (an empty string when they are adjacent), or false when $before or $after is not found
+ */
+ static function grab ($text, $before, $after) {
+ $pos1 = strpos($text, $before);
+ if ($pos1 === false) { return false; }
+ $pos1 += strlen($before);
+
+ //Searches from $pos1 itself: starting one character further would miss an $after
+ //placed right after $before, and would be out of range when $before ends the text.
+ $pos2 = strpos($text, $after, $pos1);
+ if ($pos2 === false) { return false; }
+
+ return substr($text, $pos1, $pos2 - $pos1);
+ }
}
diff --git a/pages/index.dat b/pages/index.dat
index 8949989..549bebe 100644
--- a/pages/index.dat
+++ b/pages/index.dat
@@ -1,2 +1,3 @@
http://www.rue89.com/ Rue89
http://www.lesoir.be/ LeSoir
+http://archives.lesoir.be/ LeSoir
diff --git a/pages/lesoir.php b/pages/lesoir.php
index 6c5debe..197ccc6 100644
--- a/pages/lesoir.php
+++ b/pages/lesoir.php
@@ -1,43 +1,64 @@
<?php
//Page analysis for www.lesoir.be
class LeSoirPage extends Page {
/**
 * Analyses a Le Soir article: site name, publication date and author(s).
 *
 * Runs the generic analysis first, then reads the date from the 'archi_id'
 * meta tag and the signature from the page content.
 */
function analyse () {
parent::analyse();
//Hardcoded known info
$this->site = "Le Soir";
$this->skipYMD = true;
//Gets date
//meta tag 'archi_id' has t-YYYYMMDD-HHMMhh as format (where hh = AM/PM)
// e.g. t-20120722-0211PM
//NOTE(review): assumes the 'archi_id' meta tag is always present - TODO confirm
$yyyy = substr($this->meta_tags['archi_id'], 2, 4);
$mm = substr($this->meta_tags['archi_id'], 6, 2);
$dd = substr($this->meta_tags['archi_id'], 8, 2);
$this->date = strftime(LONG_DATE_FORMAT, mktime(0, 0, 0, $mm, $dd, $yyyy));
//Gets author
- //TODO: ensure no article has more than one author
- $pos1 = strpos($this->data, '<p class="info st_signature">') + 29;
- $pos2 = strpos($this->data, '</p>', $pos1);
- $author = substr($this->data, $pos1, $pos2 - $pos1);
- if ($author == "R&#233;daction en ligne") {
- $this->skipAuthor = true;
+ $authors = self::between('st_signature">', '</p>');
+
+ if ($authors == "R&#233;daction en ligne") {
+ $this->skipAuthor = true;
} else {
- require_once('helpers/namecase.php');
- $this->author = name_case($author);
+ require_once('helpers/namecase.php');
+
+ //Some Le Soir articles use firstname name, others name,firstname.
+ //When there are several authors, they are split on '; ' (a semicolon followed by a space).
+ //Authors are in uppercase, so we need to clean case.
+
+ $authors = explode('; ', $authors);
+ $start = true;
+
+ foreach ($authors as $author) {
+ if (strpos($author, ',') !== false) {
+ $name = explode(',', $author, 2);
+ $author = $name[1] . ' ' . $name[0];
+ }
+ $author = name_case($author);
+ //NOTE(review): name_case() has already been applied just above; the calls below apply it a second time
+ if ($start) {
+ $this->author = name_case($author);
+ $start = false;
+ } else {
+ $this->coauthors[] = name_case($author);
+ }
+ }
}
}
/**
 * Gets title
 *
 * The new version reads the 'og:title' meta tag and, when that value is
 * empty, falls back to the parent class title lookup.
 *
 * @return string The page title
 */
function get_title () {
- return $this->meta_tags['og:title'];
+ if (!$title = $this->meta_tags['og:title']) {
+ $title = parent::get_title();
+ }
+ return $title;
}
/**
 * Gets page metatags, including the ones using property= or itemprop= syntax
 *
 * @return array the result of get_all_meta_tags()
 */
function get_meta_tags () {
//The site doesn't always use <meta name="" value=""> but sometimes property= or itemprop=
//NOTE(review): the original comment named Rue89 although this class handles Le Soir - presumably copied from the Rue89 class, TODO confirm
return $this->get_all_meta_tags();
}
}
?>
diff --git a/templates/wikipedia-fr/Article.php b/templates/wikipedia-fr/Article.php
index 8fdf198..828e8de 100644
--- a/templates/wikipedia-fr/Article.php
+++ b/templates/wikipedia-fr/Article.php
@@ -1,160 +1,160 @@
<?php
setlocale(LC_TIME, 'fr_FR.UTF-8');
class ArticleTemplate extends Template {
public $lang;
public $title;
public $periodique;
public $year;
public $accessdate;
/**
 * Initializes the template name and the access date.
 */
function __construct () {
$this->name = "Article";
//strftime() without a timestamp formats the current local time
$this->accessdate = trim(strftime(LONG_DATE_FORMAT));
}
/**
 * Creates an Article template from an analysed page.
 *
 * Values are read from the page metatags (dc_*, prism_* and citation_*
 * names), with the page title and URL used as fallbacks.
 *
 * @param object $page the analysed page; its meta_tags, title and url properties are read
 * @return ArticleTemplate the filled template
 */
static function loadFromPage ($page) {
    $template = new self();
    $t = $page->meta_tags;

    //Language
    $template->lang = page::getMetaTag($t, 'dc_language', 'citation_language');

    //Authors
    if ($author = page::getMetaTag($t, 'author', 'dc_creator', 'citation_authors', 'dc_contributor', 'citation_author')) {
        //TODO: handle Alpha Beta syntax instead Beta, Alpha
        //Pads to two entries (name, first name), so a value without ', ' still offers both indexes
        $template->authors[] = array_pad(explode(', ', $author, 2), 2, '');
    }

    //Title
    if (!$template->title = page::getMetaTag($t, 'dc_title', 'citation_title')) {
        $template->title = $page->title;
    }

    //Journal, publisher
    $template->journal = page::getMetaTag($t, 'prism_publicationname', 'citation_journal_title');
    //dc_source is optional: stays null when the metatag is absent
    $template->journalLink = array_key_exists('dc_source', $t) ? $t['dc_source'] : null;
    $template->publisher = page::getMetaTag($t, 'dc_publisher', 'citation_publisher');

    //Issue name, number and volume
- $template->issue = page::getMetaTag($t, 'prism_number', 'citation_issue');
+ $template->issue = page::getMetaTag($t, 'prism_number', 'citation_issue');
    $template->volume = page::getMetaTag($t, 'citation_volume');
    //prism_issuename is optional too; dc_relation_ispartof is only a fallback, flagged for manual cleanup
    $template->issueName = array_key_exists('prism_issuename', $t) ? $t['prism_issuename'] : null;
    if (!$template->issueName && array_key_exists('dc_relation_ispartof', $t)) {
        $template->issueName = $t['dc_relation_ispartof']
                             . " <!-- !!! paramètre à nettoyer !!! -->";
    }

    //Date
    if ($date = page::getMetaTag($t, 'prism_publicationdate', 'dc_date', 'citation_date')) {
        //Year, month and day are cut at fixed offsets (0-3, 5-6, 8-9) of the value
        $template->yyyy = substr($date, 0, 4);
        $template->mm = substr($date, 5, 2);
        $template->dd = substr($date, 8, 2);
    } else {
        $template->yyyy = page::getMetaTag($t, 'citation_year');
    }

    //Pages
    $template->pageStart = page::getMetaTag($t, 'prism_startingpage', 'citation_firstpage');
    $template->pageEnd = page::getMetaTag($t, 'prism_endingpage', 'citation_lastpage');

    //ISBN, ISSN, URLs
    $template->issn = page::getMetaTag($t, 'prism_issn', 'citation_issn');
    $template->isbn = page::getMetaTag($t, 'citation_isbn');
    $template->doi = page::getMetaTag($t, 'citation_doi');
    $template->summary = page::getMetaTag($t, 'citation_abstract_html_url');
    $template->url = self::getTextURL($page->url, $t);

    return $template;
}
/**
 * Fills the template parameters and returns the parent's string representation.
 *
 * The article properties are copied into $this->params, under the
 * parameter names of the French Wikipedia template.
 *
 * @return string the value returned by the parent __toString()
 */
function __toString () {
    //Language
    $this->params['langue'] = $this->lang;

    //Authors
    //empty() also covers the case where no author was ever added, which count() does not accept
    if (!empty($this->authors)) {
        $k = 1;
        foreach ($this->authors as $author) {
            //An author value without ', ' separator only has the [0] entry
            $this->params["prénom$k"] = isset($author[1]) ? $author[1] : '';
            $this->params["nom$k"] = $author[0];
            $this->params["lien auteur$k"] = '';
            $k++;
        }
    }

    //Title, journal, publisher, volume, etc.
    $this->params['titre'] = $this->title;
    $this->params['périodique'] = $this->journal;
    //TODO: check if the article exists on fr.wikipedia and contains the Presse infobox or belongs to a subcategory of [[Catégorie:Revue scientifique]]
    $this->params['lien périodique'] = $this->journal;
    $this->params['éditeur'] = $this->publisher;
    if ($this->volume) $this->params['volume'] = $this->volume;
    $this->params['numéro'] = $this->issue;
    if ($this->issueName) $this->params['titre numéro'] = $this->issueName;

    //Date
    if ($this->mm && $this->dd) {
        $date = mktime(12, 0, 0, $this->mm, $this->dd, $this->yyyy);
        $this->params['jour'] = trim(strftime('%e', $date));
        $this->params['mois'] = strftime('%B', $date);
    }
    $this->params['année'] = $this->yyyy;

    //Pages, ISSN, ISBN, DOI, URL, access date
    $this->params['pages'] = $this->pageEnd ? ($this->pageStart . '-' . $this->pageEnd) : $this->pageStart;
    if ($this->issn) $this->params['ISSN'] = $this->issn;
    if ($this->isbn) $this->params['ISBN'] = $this->isbn;
    if ($this->doi) $this->params['doi'] = $this->doi;
    $this->params['url texte'] = $this->url;
    if (self::isSummaryPertinent($this->url, $this->summary)) {
        $this->params['résumé'] = $this->summary;
    }
    $this->params['consulté le'] = trim(strftime(LONG_DATE_FORMAT));

    return parent::__toString();
}
/**
 * Gets article full text URL
 *
 * For revues.org URLs, the current URL is always kept. Otherwise, the
 * citation_pdf_url metatag, then the citation_fulltext_html_url one, is
 * preferred to the current URL.
 *
 * @param string $url the article current URL
 * @param array $metatags the page metatags
 *
 * @return string the article fulltext URL
 */
static function getTextURL ($url, $metatags) {
    //revues.org PDF generation is broken
    $isRevuesOrg = strpos($url, '.revues.org/') > 0;

    if (!$isRevuesOrg) {
        $candidate = page::getMetaTag($metatags, 'citation_pdf_url', 'citation_fulltext_html_url');
        if ($candidate) {
            return $candidate;
        }
    }

    return $url;
}
/**
 * Determines if a summary is pertinent to include in parameters
 *
 * @param string $url_article Article URL
 * @param string $url_summary Summary URL
 *
 * @return bool true if the summary URL should be included in template ; otherwise, false
 */
static function isSummaryPertinent ($url_article, $url_summary) {
    //Empty summary or identical to URL are rejected
    $isEmpty = ($url_summary == '');
    $isSameAsArticle = ($url_summary == $url_article);

    //This site is indexed through /resume.php but gives /article.php as summary URL in metadata
    $isCairnResume = (substr($url_article, 0, 32) == "http://www.cairn.info/resume.php");

    return !($isEmpty || $isSameAsArticle || $isCairnResume);
}
}
?>
diff --git a/templates/wikipedia-fr/Lien_web.php b/templates/wikipedia-fr/Lien_web.php
index c785563..d7364d3 100644
--- a/templates/wikipedia-fr/Lien_web.php
+++ b/templates/wikipedia-fr/Lien_web.php
@@ -1,60 +1,71 @@
<?php
setlocale(LC_TIME, 'fr_FR.UTF-8');
class LienWebTemplate extends Template {
public $author;
+ public $coauthors;
public $url;
public $title;
public $dd;
public $mm;
public $yyyy;
public $site;
public $publishdate;
public $accessdate;
/**
* @var bool Indicates if we've to remove jour/mois/année parameters
*/
public $skipYMD = false;
+ /**
+ * @var bool Indicates if we've to remove auteur and coauteurs parameters
+ */
+ public $skipAuthor = false;
+
/**
 * Initializes the template name and the access date.
 */
function __construct () {
$this->name = "Lien web";
//strftime() without a timestamp formats the current local time
$this->accessdate = trim(strftime(LONG_DATE_FORMAT));
}
/**
 * Creates a Lien web template from an analysed page.
 *
 * Each template property is copied from the page property read below.
 *
 * @param object $page the page to copy the information from (presumably a Page or one of its subclasses - TODO confirm)
 * @return LienWebTemplate the filled template
 */
static function loadFromPage ($page) {
$template = new LienWebTemplate();
$template->author = $page->author;
$template->skipAuthor = $page->skipAuthor;
+ $template->coauthors = $page->coauthors;
$template->url = $page->url;
$template->title = $page->title;
$template->dd = $page->dd;
$template->mm = $page->mm;
$template->yyyy = $page->yyyy;
$template->site = $page->site;
//The template publication date comes from the page 'date' property
$template->publishdate = $page->date;
$template->skipYMD = $page->skipYMD;
return $template;
}
/**
 * Fills the template parameters and returns the parent's string representation.
 *
 * The auteur/coauteurs parameters are left out when skipAuthor is set, and
 * the jour/mois/année ones when skipYMD is set.
 *
 * @return string the value returned by the parent __toString()
 */
function __toString () {
    if (!$this->skipAuthor) {
        $this->params['auteur'] = $this->author;
+
+ if ($this->coauthors) {
+ $this->params['coauteurs'] = implode(', ', $this->coauthors);
+ }
    }

    $this->params['titre'] = $this->title;

    if (!$this->skipYMD) {
        //'jour' is the day and 'mois' the month: they were fed with each other's value
        $this->params['jour'] = $this->dd;
        $this->params['mois'] = $this->mm;
        $this->params['année'] = $this->yyyy;
    }

    $this->params['url'] = $this->url;
    $this->params['site'] = $this->site;
    $this->params['en ligne le'] = $this->publishdate;
    $this->params['consulté le'] = $this->accessdate;

    return parent::__toString();
}
}
?>

File Metadata

Mime Type
text/x-diff
Expires
Thu, Sep 18, 12:02 (13 h, 4 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2990761
Default Alt Text
(16 KB)

Event Timeline