Page MenuHomeDevCentral

D309.id726.diff
No OneTemporary

D309.id726.diff

diff --git a/autoload.php b/autoload.php
--- a/autoload.php
+++ b/autoload.php
@@ -12,6 +12,19 @@
case 'ArticleTemplate': require('templates/wikipedia-fr/Article.php'); return;
case 'LienWebTemplate': require('templates/wikipedia-fr/Lien_web.php'); return;
case 'OuvrageTemplate': require('templates/wikipedia-fr/Ouvrage.php'); return;
+
+ case 'DownloadWithWget': require('pages/DownloadWithWget.php'); return;
+ }
+
+ if (substr($class, -4) === "Page") {
+ if (file_exists("pages/$class.php")) {
+ require "pages/$class.php";
+ return;
+ }
+
+ $file = strtolower(substr($class, 0, -4));
+ require "pages/$file.php";
+ return;
}
}
diff --git a/page.php b/page.php
--- a/page.php
+++ b/page.php
@@ -108,11 +108,11 @@
$this->get_data();
if ($this->data) {
$this->analyse();
- }
+ }
}
function get_data () {
- ini_set('user_agent', USER_AGENT);
+ ini_set('user_agent', USER_AGENT);
$data = file_get_contents($this->url);
if (!$data) {
ini_set('user_agent', USER_AGENT_FALLBACK);
@@ -121,11 +121,13 @@
return;
}
}
- $encoding = mb_detect_encoding($data, "ISO-8859-15, ISO-8859-1, UTF-8, ASCII, auto");
+ $this->encodeData();
+ }
+
+ function encodeData () {
+ $encoding = mb_detect_encoding($this->data, "ISO-8859-15, ISO-8859-1, UTF-8, ASCII, auto");
if ($encoding && $encoding != 'UTF-8') {
- $this->data = Encoding::toUTF8($data);
- } else {
- $this->data = $data;
+ $this->data = Encoding::toUTF8($this->data);
}
}
@@ -146,7 +148,6 @@
$file = strtolower(trim($page[1])) . '.php';
$class = trim($page[1]) . 'Page';
- require("pages/$file");
return new $class($url);
}
}
@@ -327,4 +328,27 @@
unlink($cookie_file);
return $data;
}
+
+ ///
+ /// DATES
+ ///
+
+ function extractYYYYMMDDDateFromURL() {
+ $pattern = "@/([12][0-9]{3})\-([0-9]{2})\-([0-9]{2})/@";
+ if (preg_match($pattern, $this->url, $matches)) {
+ $this->yyyy = $matches[1];
+ $this->mm = $matches[2];
+ $this->dd = $matches[3];
+ }
+ }
+
+ function extractYYYYDDMMateFromURL() {
+ $pattern = "@/([12][0-9]{3})\-([0-9]{2})\-([0-9]{2})/@";
+ if (preg_match($pattern, $this->url, $matches)) {
+ $this->yyyy = $matches[1];
+ $this->mm = $matches[3];
+ $this->dd = $matches[2];
+ }
+ }
+
}
diff --git a/pages/DownloadWithWget.php b/pages/DownloadWithWget.php
new file mode 100644
--- /dev/null
+++ b/pages/DownloadWithWget.php
@@ -0,0 +1,37 @@
+<?php
+
+trait DownloadWithWget {
+
+ /**
+ * @return string
+ */
+ private function getTemporaryFilename () {
+ $dir = sys_get_temp_dir();
+ return tempnam($dir, "http-client-wget-");
+ }
+
+ /**
+ * Gets the content of the specified URL, using wget to download it
+ *
+ * @return string
+ */
+ function getFileContents ($url) {
+ $file = $this->getTemporaryFilename();
+ $url = escapeshellarg($url);
+
+ system("wget -q -O $file $url");
+ $data = file_get_contents($file);
+ unlink($file);
+
+ return $data;
+ }
+
+ /**
+ * Downloads the URL through wget and fills the data property
+ */
+ function get_data () {
+ $this->data = $this->getFileContents($this->url);
+ $this->encodeData();
+ }
+
+}
diff --git a/pages/archiveslesoir.php b/pages/archiveslesoir.php
--- a/pages/archiveslesoir.php
+++ b/pages/archiveslesoir.php
@@ -1,25 +1,16 @@
<?php
-require 'lesoir.php';
-
/**
* Represents a page from the http://archives.lesoir.be/ site.
*/
class ArchivesLeSoirPage extends LeSoirPage {
+
/**
- * Analyses the page and extracts metadata
+ * Determines if this is the archive
+ * @return bool always true
*/
- function analyse ($skipSpecificProcessing = false) {
- parent::analyse(true);
-
- $authors = $this->between('<p class="st_signature">', '</p>');
- $date = trim($this->between('<p class="st_date">', '</p>'));
-
- $this->processAuthors($authors);
- $this->processDate($date);
+ function isArchive () {
+ return true;
}
- function get_title () {
- return $this->between('<h3 class="story_title main">', '</h3>');
- }
}
diff --git a/pages/lesoir.php b/pages/lesoir.php
--- a/pages/lesoir.php
+++ b/pages/lesoir.php
@@ -2,17 +2,30 @@
//Page analysis for www.lesoir.be
class LeSoirPage extends Page {
- function analyse ($skipSpecificProcessing = false) {
+
+ use DownloadWithWget;
+
+ /**
+ * Determines if the article belongs to the archives
+ * @return bool
+ */
+ function isArchive () {
+ return strpos($this->url, "//www.lesoir.be/archives") !== false;
+ }
+
+ function analyse () {
parent::analyse();
- //Hardcoded known info
$this->site = "[[Le Soir]]";
- //Allows to skip the analyis for ArchivesLeSoirPage
- if ($skipSpecificProcessing) {
- return;
+ if ($this->isArchive()) {
+ $this->analyseForArchive();
+ } else {
+ $this->analyseForMainSite();
}
+ }
+ function analyseForMainSite () {
//Gets metadata
$meta = $this->between('<div class="meta">', '</div>');
$authors = trim(self::grab($meta, '<strong>', '</strong>'));
@@ -25,6 +38,17 @@
}
}
+ function analyseForArchive () {
+ $authors = $this->between('st_signature">', '</p>');
+ $this->processAuthors($authors);
+
+ if ($date = trim($this->between('<p class="st_date">', '</p>'))) {
+ $this->processDate($date);
+ } else {
+ $this->extractYYYYMMDDDateFromURL();
+ }
+ }
+
protected function processDate ($date) {
$dateFragments = explode(' ', $date);
if (count($dateFragments) == 4) {
@@ -67,9 +91,20 @@
* Gets page title
*/
function get_title () {
+ if ($this->isArchive()) {
+ $title = $this->between('<h3 class="story_title main">', '</h3>');
+
+ if ($title === false) {
+ $title = $this->between('<h1>', '</h1>');
+ }
+
+ return $title;
+ }
+
if (!$title = $this->meta_tags['og:title']) {
$title = parent::get_title();
}
+
return $title;
}
}
diff --git a/tests/DownloadWithWgetTest.php b/tests/DownloadWithWgetTest.php
new file mode 100644
--- /dev/null
+++ b/tests/DownloadWithWgetTest.php
@@ -0,0 +1,34 @@
+<?php
+
+require 'pages/DownloadWithWget.php';
+
+class DownloadWithWgetTest extends \PHPUnit_Framework_TestCase {
+
+ /**
+ * The object under test.
+ *
+ * @var object
+ */
+ private $instance;
+
+ /**
+ * Sets up the fixture.
+ *
+ * This method is called before a test is executed.
+ *
+ * @return void
+ */
+ public function setUp () {
+ $this->instance = $this->getObjectForTrait('DownloadWithWget');
+ }
+
+ /**
+ * Tests getFileContents method
+ */
+ public function testGetFileContents () {
+ $this->assertContains(
+ "* <----- vous &ecirc;tes ici",
+ $this->instance->getFileContents("http://www.perdu.com")
+ );
+ }
+}

File Metadata

Mime Type
text/plain
Expires
Mon, Apr 28, 23:58 (4 h, 20 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2613589
Default Alt Text
D309.id726.diff (7 KB)

Event Timeline