diff --git a/page.php b/page.php index 35f4630..68a2387 100644 --- a/page.php +++ b/page.php @@ -1,400 +1,405 @@ <?php define('LONG_DATE_FORMAT', '%e %B %Y'); define('USER_AGENT', 'WikimediaTools/SourceTemplatesGenerator/0.1'); define('USER_AGENT_FALLBACK', 'Mozilla/5.0'); define('USER_AGENT_FALLBACK_FULL', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13'); require_once('helpers/Encoding.php'); class Page { /* * @var string The page URL */ public $url; /** * @var array Meta tags */ public $meta_tags; /** * @var string The page content */ public $data; /** * @var string The page title */ public $title; /** * @var string The page author */ public $author; /** * @var Array The page coauthors */ public $coauthors = []; /** * @var string The site ISSN */ public $issn; //If we use the parameters yyyy mm dd, we describe CONTENT date: /** * @var int The page content's year */ public $yyyy; /** * @var int The page content's month */ public $mm; /** * @var int The page content's day */ public $dd; //If not, we describe ONLINE RESOURCE PUBLISH date: /** * @var string The page publication date in relevant locale */ public $date; /** * @var int The page publication unixtime */ public $unixtime; /** * @var bool Indicates if we have to skip year/month/date template parameters */ public $skipYMD = false; /** * @var bool Indicates if we have to skip month/date (but maybe keep year) template parameters */ public $skipMD = false; /** * @var bool Indicates if we have to skip author template parameter */ public $skipAuthor; /** * @var bool If true, this site uses an anti ad blocker technology to force users to show ads */ public $antiAdBlocker = false; + /** + * @var bool If true, this site is behind a paywall + */ + public $paywall = false; + /** * @var mixed If not null, contains an array for another service to use */ public $switchTo = null; /** * @var string The last error occured while opening and parsing the page */ public $error; /** * Initializes a new Page instance. If an error occured, you can read it in $this->error. * * @param string $url the page URL */ function __construct ($url) { $this->url = $url; $this->get_data(); if ($this->data) { $this->analyse(); } } function get_data () { ini_set('user_agent', USER_AGENT); $data = file_get_contents($this->url); if (!$data) { ini_set('user_agent', USER_AGENT_FALLBACK); if (!$data = @file_get_contents($this->url)) { $this->error = "Can't read URL"; return; } } $this->data = $data; $this->encodeData(); } function encodeData () { $encoding = mb_detect_encoding($this->data, "ISO-8859-15, ISO-8859-1, UTF-8, ASCII, auto"); if ($encoding && $encoding != 'UTF-8') { $this->data = Encoding::toUTF8($this->data); } } /** * Return a new Page instance, or if such class exists, an instance class specialized for your site. * * @param $url the page URL */ static function load ($url) { //Classes list are stored in pages/index.dat file //Each line contains the URL beginning, a tabulation, and the page analyser name // * class is this name, appended by 'Page' // * source file is the lowercase version of this name, appended by '.php' $pages = file('pages/index.dat', true); foreach ($pages as $line) { $page = explode("\t", $line); if (substr($url, 0, strlen($page[0])) == $page[0]) { $file = strtolower(trim($page[1])) . '.php'; $class = trim($page[1]) . 'Page'; return new $class($url); } } return new Page($url); } /** * Analyses metatags to process content */ function analyse () { //Meta tags (including <meta property="" value=""> and <meta itemprop="" value="" syntax) $this->meta_tags = $this->get_meta_tags(); $t = $this->meta_tags; //Title $this->title = $this->get_title(); //Date if ($date = $this->getMetaTag($t, 'date', 'pubdate', 'content_create_date')) { $date = date_parse($date); $this->yyyy = $date['year']; $this->mm = $date['month']; $this->dd = $date['day']; } //Site name $this->site = $this->getMetaTag($t, 'og:site_name'); //Author $this->author = $this->getMetaTag($t, 'author'); } /** * Gets page metatags * * @return array an array where the keys are the metatags' names and the values the metatags' values */ function get_meta_tags () { return $this::get_all_meta_tags($this->url); } /** * Gets all metatags, including those using meta property= and meta itemprop= syntax * * @return array an array where the keys are the metatags' names and the values the metatags' values */ function get_all_meta_tags () { // Thank you to Bobble Bubble // See http://php.net/manual/en/function.get-meta-tags.php comments $pattern = ' ~<\s*meta\s # Lookahead to capture type to $1 (?=[^>]*? \b(?:name|property|itemprop|http-equiv)\s*=\s* (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'| ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=)) ) # Capture content to $2 [^>]*?\bcontent\s*=\s* (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'| ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=)) [^>]*> ~ix'; if (!preg_match_all($pattern, $this->data, $match)) { return []; } $metaTags = array_combine($match[1], $match[2]); array_walk($metaTags, 'self::clean_tag'); return $metaTags; } /** * Cleans a tag value (callback for array_walk) * * @param mixed &$value array item's value * @param string $key array item's key */ static function clean_tag (&$item, $key) { if (is_array($item)) { $item = join("; ", $item); } return trim($item); } /** * Gets title * * @return string The page title */ function get_title () { $title = $this->getMetaTag($this->meta_tags, 'title', 'og:title', 'DC.title', 'Title'); return $title ?: ((preg_match("#<title>(.+)<\/title>#iU", $this->data, $title)) ? trim($title[1]) : ''); } /** * Determines if the current page is an article published in a journal. * * @return bool true if the current page is an article ; otherwise, false */ function is_article () { return (array_key_exists('dc_type', $this->meta_tags) && $this->meta_tags['dc_type'] == 'journalArticle') || (array_key_exists('dcsext_pn-cat', $this->meta_tags) && $this->meta_tags['dcsext_pn-cat'] == 'Article') || array_key_exists('citation_journal_title', $this->meta_tags) || array_key_exists('prism_publicationname', $this->meta_tags); } /** * Gets relevant metatag * * @param array the metatags * @param string... the list of acceptable metatags * * @return string the first metatag value found */ static function getMetaTag () { $tags = func_get_args(); $metatags = array_shift($tags); foreach ($tags as $tag) { $tag_lowercase = strtolower($tag); foreach ($metatags as $key => $value) { if ($tag_lowercase == strtolower($key)) return $value; } } return ''; } /** * Finds a portion of text included between $before and $after strings on the current page * * @param string $before The string at the left of the text to be grabbed * @param string $after The string at the right of the text to be grabbed * * @return string The text found between $before and $after */ function between ($before, $after) { return self::grab($this->data, $before, $after); } /** * Finds a portion of text included between $before and $after strings * * @param string $text The text where to find the substring * @param string $before The string at the left of the text to be grabbed * @param string $after The string at the right of the text to be grabbed [facultative] * * @return string The text found between $before and $after */ static function grab ($text, $before, $after = null) { $pos1 = strpos($text, $before); if ($pos1 === false) { return false; } else { $pos1 += strlen($before); } if ($after === null) { return substr($text, $pos1); } $pos2 = strpos($text, $after, $pos1 + 1); if ($pos2 === false) { return false; } return substr($text, $pos1, $pos2 - $pos1); } /** * Downloads, through CURL library, accepting cookies. * * @param $url The URL to fetch */ static function curl_download ($url, $agent = '') { $ch = curl_init(); $timeout = 5; $cookie_file = tmpfile(); $cookie_file = tempnam(sys_get_temp_dir(), "cookie-sourcesgen-"); curl_setopt($ch, CURLOPT_COOKIESESSION, true); curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_file); curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_file); curl_setopt($ch, CURLOPT_URL, $url); curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout); if ($agent != '') { curl_setopt($ch, CURLOPT_USERAGENT, $agent); } $data = curl_exec($ch); curl_close($ch); unlink($cookie_file); return $data; } /// /// DATES /// /** * @param string $toParse The date to parse * @param string $tz The timezone to use [optional] */ function dateFromDateParse ($toParse, $tz = 'Europe/Paris') { $old_tz = date_default_timezone_get(); date_default_timezone_set($tz); $date = date_parse($toParse); $this->yyyy = $date['year']; $this->mm = $date['month']; $this->dd = $date['day']; $this->unixtime = mktime($date['hour'], $date['minute'], $date['second'], $date['month'], $date['day'], $date['year']); date_default_timezone_set($old_tz); } function extractYYYYMMDDDateFromURL() { $pattern = "@/([12][0-9]{3})\-([0-9]{2})\-([0-9]{2})/@"; if (preg_match($pattern, $this->url, $matches)) { $this->yyyy = $matches[1]; $this->mm = $matches[2]; $this->dd = $matches[3]; } } function extractYYYYDDMMateFromURL() { $pattern = "@/([12][0-9]{3})\-([0-9]{2})\-([0-9]{2})/@"; if (preg_match($pattern, $this->url, $matches)) { $this->yyyy = $matches[1]; $this->mm = $matches[3]; $this->dd = $matches[2]; } } } diff --git a/templates/wikipedia-fr/Lien_web.php b/templates/wikipedia-fr/Lien_web.php index fb2281b..63a9e04 100644 --- a/templates/wikipedia-fr/Lien_web.php +++ b/templates/wikipedia-fr/Lien_web.php @@ -1,111 +1,118 @@ <?php setlocale(LC_TIME, 'fr_FR.UTF-8'); class LienWebTemplate extends Template { public $author; public $coauthors; public $url; public $title; public $dd; public $mm; public $yyyy; public $site; public $pageDate = null; public $accessdate; + public $antiAdBlocker; + public $paywall; /** * @var bool Indicates if we've to remove jour/mois/année parameters */ public $skipYMD = false; /** * @var bool Indicates if we've to remove jour/mois parameters but maybe keep année */ public $skipMD = false; /** * @var bool Indicates if we've to remove auteur and coauteurs parameters */ public $skipAuthor = false; function __construct () { $this->name = "Lien web"; $this->accessdate = trim(strftime(LONG_DATE_FORMAT)); } static function loadFromPage ($page) { $template = new LienWebTemplate(); $template->author = $page->author; $template->skipAuthor = $page->skipAuthor; $template->coauthors = $page->coauthors; $template->url = $page->url; $template->title = $page->title; $template->dd = $page->dd; $template->mm = $page->mm; $template->yyyy = $page->yyyy; $template->site = $page->site; $template->pageDate = $page->date; $template->skipYMD = $page->skipYMD; $template->skipMD = $page->skipMD; $template->antiAdBlocker = $page->antiAdBlocker; + $template->paywall = $page->paywall; return $template; } function computeDate () { //Legacy code issue if ($this->pageDate !== "" && $this->pageDate !== null) { echo '<div data-alert class="alert-box info radius">'; echo "<p>The Page metadata contains the following date information:<br />$this->pageDate</p><p>{{Lien web}} should now use jour, mois, année instead of a date parameter to provide richer machine data.</p>"; echo ' <a href="#" class="close">×</a></div>'; } } /** * Gets the month, as a string in current locale * * @return string the month name */ function getMonth () { if (!$this->mm) { return ""; } if (is_numeric($this->mm)) { return strftime('%B', mktime(0, 0, 0, (int)$this->mm)); } return $this->mm; } function __toString () { if (!$this->skipAuthor) { $this->params['auteur'] = $this->author; if ($this->coauthors) { $this->params['coauteurs'] = implode(', ', $this->coauthors); } } $this->params['titre'] = $this->title; $this->computeDate(); if (!$this->skipYMD && !$this->skipMD) { $this->params['jour'] = $this->dd; $this->params['mois'] = $this->getMonth(); } if (!$this->skipYMD) { $this->params['année'] = $this->yyyy; } $this->params['url'] = $this->url; $this->params['site'] = $this->site; $this->params['consulté le'] = $this->accessdate; $template = parent::__toString(); if ($this->antiAdBlocker) { $template .= " {{Publicité forcée}}"; } + if ($this->paywall) { + $template .= " {{inscription nécessaire}}"; + } + return $template; } }