Page Menu · Home · DevCentral

D309.id723.diff
No One · Temporary

D309.id723.diff

diff --git a/page.php b/page.php
--- a/page.php
+++ b/page.php
@@ -108,11 +108,11 @@
$this->get_data();
if ($this->data) {
$this->analyse();
- }
+ }
}
function get_data () {
- ini_set('user_agent', USER_AGENT);
+ ini_set('user_agent', USER_AGENT);
$data = file_get_contents($this->url);
if (!$data) {
ini_set('user_agent', USER_AGENT_FALLBACK);
@@ -121,11 +121,13 @@
return;
}
}
- $encoding = mb_detect_encoding($data, "ISO-8859-15, ISO-8859-1, UTF-8, ASCII, auto");
+ $this->encodeData();
+ }
+
+ function encodeData () {
+ $encoding = mb_detect_encoding($this->data, "ISO-8859-15, ISO-8859-1, UTF-8, ASCII, auto");
if ($encoding && $encoding != 'UTF-8') {
- $this->data = Encoding::toUTF8($data);
- } else {
- $this->data = $data;
+ $this->data = Encoding::toUTF8($this->data);
}
}
@@ -146,7 +148,6 @@
$file = strtolower(trim($page[1])) . '.php';
$class = trim($page[1]) . 'Page';
- require("pages/$file");
return new $class($url);
}
}
@@ -327,4 +328,27 @@
unlink($cookie_file);
return $data;
}
+
+ ///
+ /// DATES
+ ///
+
+ function extractYYYYMMDDDateFromURL() {
+ $pattern = "@/([12][0-9]{3})\-([0-9]{2})\-([0-9]{2})/@";
+ if (preg_match($pattern, $this->url, $matches)) {
+ $this->yyyy = $matches[1];
+ $this->mm = $matches[2];
+ $this->dd = $matches[3];
+ }
+ }
+
+ function extractYYYYDDMMateFromURL() {
+ $pattern = "@/([12][0-9]{3})\-([0-9]{2})\-([0-9]{2})/@";
+ if (preg_match($pattern, $this->url, $matches)) {
+ $this->yyyy = $matches[1];
+ $this->mm = $matches[3];
+ $this->dd = $matches[2];
+ }
+ }
+
}
diff --git a/pages/archiveslesoir.php b/pages/archiveslesoir.php
--- a/pages/archiveslesoir.php
+++ b/pages/archiveslesoir.php
@@ -1,25 +1,16 @@
<?php
-require 'lesoir.php';
-
/**
* Represents a page from the http://archives.lesoir.be/ site.
*/
class ArchivesLeSoirPage extends LeSoirPage {
+
/**
- * Analyses the page and extracts metadata
+ * Determines if this is the archive
+ * @return bool always true
*/
- function analyse ($skipSpecificProcessing = false) {
- parent::analyse(true);
-
- $authors = $this->between('<p class="st_signature">', '</p>');
- $date = trim($this->between('<p class="st_date">', '</p>'));
-
- $this->processAuthors($authors);
- $this->processDate($date);
+ function isArchive () {
+ return true;
}
- function get_title () {
- return $this->between('<h3 class="story_title main">', '</h3>');
- }
}
diff --git a/pages/lesoir.php b/pages/lesoir.php
--- a/pages/lesoir.php
+++ b/pages/lesoir.php
@@ -2,17 +2,30 @@
//Page analysis for www.lesoir.be
class LeSoirPage extends Page {
- function analyse ($skipSpecificProcessing = false) {
+
+ use DownloadWithWget;
+
+ /**
+ * Determines if the article belongs to the archives
+ * @return bool
+ */
+ function isArchive () {
+ return strpos($this->url, "//www.lesoir.be/archives") !== false;
+ }
+
+ function analyse () {
parent::analyse();
- //Hardcoded known info
$this->site = "[[Le Soir]]";
- //Allows to skip the analyis for ArchivesLeSoirPage
- if ($skipSpecificProcessing) {
- return;
+ if ($this->isArchive()) {
+ $this->analyseForArchive();
+ } else {
+ $this->analyseForMainSite();
}
+ }
+ function analyseForMainSite () {
//Gets metadata
$meta = $this->between('<div class="meta">', '</div>');
$authors = trim(self::grab($meta, '<strong>', '</strong>'));
@@ -25,6 +38,17 @@
}
}
+ function analyseForArchive () {
+ $authors = $this->between('st_signature">', '</p>');
+ $this->processAuthors($authors);
+
+ if ($date = trim($this->between('<p class="st_date">', '</p>'))) {
+ $this->processDate($date);
+ } else {
+ $this->extractYYYYMMDDDateFromURL();
+ }
+ }
+
protected function processDate ($date) {
$dateFragments = explode(' ', $date);
if (count($dateFragments) == 4) {
@@ -67,9 +91,20 @@
* Gets page title
*/
function get_title () {
+ if ($this->isArchive()) {
+ $title = $this->between('<h3 class="story_title main">', '</h3>');
+
+ if ($title === false) {
+ $title = $this->between('<h1>', '</h1>');
+ }
+
+ return $title;
+ }
+
if (!$title = $this->meta_tags['og:title']) {
$title = parent::get_title();
}
+
return $title;
}
}

File Metadata

Mime Type
text/plain
Expires
Thu, Jan 23, 08:53 (5 h, 32 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2371294
Default Alt Text
D309.id723.diff (4 KB)

Event Timeline