Page Menu
Home
DevCentral
Search
Configure Global Search
Log In
Files
F3768268
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
14 KB
Referenced Files
None
Subscribers
None
View Options
diff --git a/src/commands/wikidata/mod.rs b/src/commands/wikidata/mod.rs
index dee3b77..31b6bff 100644
--- a/src/commands/wikidata/mod.rs
+++ b/src/commands/wikidata/mod.rs
@@ -1,214 +1,233 @@
//! Query Wikidata SPARQL end-point and import result into PostgreSQL
mod qualification;
+mod report;
use std::collections::HashMap;
use std::process::exit;
+
use oxrdf::Term;
use sqlx::PgPool;
+use crate::commands::wikidata::qualification::determine_p31_winner;
+use crate::commands::wikidata::report::*;
use crate::db::*;
use crate::WikidataArgs;
-use crate::commands::wikidata::qualification::determine_p31_winner;
use crate::fantoir::{fix_fantoir_code, FixedFantoirCode};
use crate::services::query::search_fantoir_code;
use crate::services::sparql::*;
pub static WIKIDATA_TABLE: &'static str = "fantoir_wikidata";
pub static WIKIDATA_SPARQL_ENDPOINT: &'static str = "https://query.wikidata.org/sparql";
/* -------------------------------------------------------------
Import task
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
pub async fn import (args: &WikidataArgs, database_url: &str) {
let pool = connect_to_db(database_url).await;
// Create/truncate table as needed and as allowed by options
let callback = async {
let queries = include_str!("../../schema/wikidata.sql");
run_multiple_queries(&pool, &queries).await;
};
if let Err(error) = initialize_table(&pool, callback, args).await {
eprintln!("{}", &error);
exit(1);
}
// Query Wikidata and get (Wikidata/FANTOIR code, list of P31 (instance of) values) hashmap
let client = Client::new(WIKIDATA_SPARQL_ENDPOINT);
let mut what_map = HashMap::new();
client.query(include_str!("../../queries/wikidata.sparql"))
.await
.into_solutions()
.expect("A list of solutions is expected for a SELECT query")
.iter()
.filter(|entry| !is_term_empty(&entry["code_fantoir"]))
.for_each(|entry| {
// Build a map of the different P31 (instance of) values for a specified code.
let key = WikidataEntryKey::parse(entry);
let what = parse_wikidata_entity_uri(&entry["what"]).expect("Can't parse P31 what result");
what_map.entry(key).or_insert(Vec::new())
.push(what);
});
// Consolidate entries and insert them into the database.
// To avoid an async closure, we don't use HOF pattern.
+ let mut maintenance_report = HashMap::new();
for (key, candidates) in what_map {
if let Some(entry) = WikidataEntry::consolidate_set(&pool, &key, candidates).await {
- entry.insert_to_db(&pool).await;
+ if let Err(error) = entry.insert_to_db(&pool).await {
+ if args.maintenance_report {
+ update_report(&mut maintenance_report, key, error);
+ } else {
+ eprintln!();
+ eprintln!("Can't insert Wikidata information for the following entry:");
+ eprintln!("{:?}", entry);
+ eprintln!("{}", error);
+ }
+ }
continue;
}
- eprintln!();
- eprintln!("Can't insert Wikidata information for the following entry:");
- eprintln!("{:?}", &key);
- eprintln!("Can't resolve FANTOIR code.");
+ if args.maintenance_report {
+ let entry = maintenance_report
+ .entry("Can't resolve FANTOIR code")
+ .or_insert(Vec::new());
+ entry.push(key);
+ } else {
+ eprintln!();
+ eprintln!("Can't insert Wikidata information for the following entry:");
+ eprintln!("{:?}", &key);
+ eprintln!("Can't resolve FANTOIR code.");
+ }
}
+ if args.maintenance_report {
+ print_maintenance_report(maintenance_report);
+ }
}
/* -------------------------------------------------------------
Arguments parsing
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
impl ToTableInitializationArgs for &WikidataArgs {
fn to_table_initialization_args(&self) -> TableInitializationArgs {
TableInitializationArgs {
table_name: String::from(WIKIDATA_TABLE),
create_table: self.create_table,
overwrite_table: self.overwrite_table,
}
}
}
/* -------------------------------------------------------------
Wikidata entry structures
WikidataEntry represents the data ready to be inserted
in our database.
WikidataEntryKey is a subset of WikidataEntry to identify
a set (FANTOIR code, Wikidata item) to be used as HashMap key
when a SPARQL query returns several rows for such set.
For example, here, we ask for P31 values, and if a Wikidata
entity offers several P31 values, we'll get one row per value.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
#[derive(Debug, Clone)]
struct WikidataEntry {
code_fantoir: String,
code_fantoir_wikidata: String,
item: String,
item_label: String,
what: String,
}
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
-struct WikidataEntryKey {
- code_fantoir_wikidata: String,
- item: String,
- item_label: String,
+pub struct WikidataEntryKey {
+ pub code_fantoir_wikidata: String,
+ pub item: String,
+ pub item_label: String,
}
impl WikidataEntryKey {
fn parse(entry: &HashMap<String, Term>) -> Self {
Self {
code_fantoir_wikidata: parse_literal(&entry["code_fantoir"]).expect("Can't parse code"),
item: parse_wikidata_entity_uri(&entry["item"]).expect("Can't parse item"),
item_label: parse_literal(&entry["itemLabel"]).expect("Can't parse item label"),
}
}
}
impl WikidataEntry {
async fn consolidate_set(pool: &PgPool, key: &WikidataEntryKey, what_candidates: Vec<String>) -> Option<Self> {
let what = determine_p31_winner(&what_candidates);
let code_fantoir = match fix_fantoir_code(&key.code_fantoir_wikidata) {
FixedFantoirCode::Computed(code) => code,
FixedFantoirCode::ToSearch { code_insee, identifiant_communal_voie } => {
search_fantoir_code(pool, &code_insee, &identifiant_communal_voie).await?
}
};
Some(Self {
code_fantoir,
code_fantoir_wikidata: key.code_fantoir_wikidata.clone(),
item: key.item.clone(),
item_label: key.item_label.clone(),
what,
})
}
- async fn insert_to_db (&self, pool: &PgPool) {
+ async fn insert_to_db (&self, pool: &PgPool) -> Result<(), sqlx::Error> {
let mut query = format!("INSERT INTO {}", WIKIDATA_TABLE);
query.push_str(
r#"
(code_fantoir, code_fantoir_wikidata, item, item_label, what)
VALUES
($1, $2, $3, $4, $5)"#
);
- if let Err(error) = sqlx::query(&query)
+ sqlx::query(&query)
.bind(&self.code_fantoir)
.bind(&self.code_fantoir_wikidata)
.bind(&self.item)
.bind(&self.item_label)
.bind(&self.what)
.execute(pool)
- .await {
- eprintln!();
- eprintln!("Can't insert Wikidata information for the following entry:");
- eprintln!("{:?}", self);
- eprintln!("{}", error);
- }
+ .await
+ .map(|_result| ())
}
}
/* -------------------------------------------------------------
Wikidata helper methods
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/// Parses a Wikidata entity URI from a RDF term.
///
/// For example, to parse a term representing Q1234:
///
/// ```
/// let term = Term::NamedNode(
/// NamedNode::new("http://www.wikidata.org/entity/Q1234").unwrap()
/// );
/// let entity = parse_wikidata_entity_uri(&term).unwrap();
///
/// assert_eq!("Q1234", &entity);
/// ```
pub fn parse_wikidata_entity_uri (term: &Term) -> Option<String> {
parse_term_uri(term)
.map(|uri| {
let pos = uri.rfind('/').expect("URI doesn't contain any /") + 1;
uri[pos..].to_string()
})
}
/* -------------------------------------------------------------
Tests
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
#[cfg(test)]
mod tests {
use oxrdf::NamedNode;
use super::*;
#[test]
pub fn test_parse_wikidata_entity_uri () {
let node = NamedNode::new("http://www.wikidata.org/entity/Q849777").unwrap();
let term = Term::NamedNode(node);
assert_eq!("Q849777", &parse_wikidata_entity_uri(&term).unwrap());
}
}
diff --git a/src/commands/wikidata/report.rs b/src/commands/wikidata/report.rs
new file mode 100644
index 0000000..dc29b12
--- /dev/null
+++ b/src/commands/wikidata/report.rs
@@ -0,0 +1,83 @@
+use std::cmp::Ordering;
+use std::collections::HashMap;
+
+use sqlx::Error;
+
+use crate::commands::wikidata::WikidataEntryKey;
+
+type MaintenanceReport = HashMap<&'static str, Vec<WikidataEntryKey>>;
+
+/* -------------------------------------------------------------
+ Report update and wiki code
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+pub fn update_report (maintenance_report: &mut MaintenanceReport, key: WikidataEntryKey, error: Error) {
+ let error_category = match error {
+ Error::Database(error) => {
+ if let Some(index) = error.constraint() {
+ match index {
+ "index_fantoir_wikidata_pk" => "Duplicate FANTOIR code",
+ "fantoir_wikidata_code_fantoir_fk" => "Not in FANTOIR national file",
+ _ => {
+ eprintln!("Unknown constraint index: {}", index);
+
+ unreachable!()
+ },
+ }
+ } else if let Some(code) = error.code() {
+ let code = code.to_string();
+ match code.as_str() {
+ "22001" => "FANTOIR code too long",
+ _ => unimplemented!(),
+ }
+ } else {
+ unimplemented!()
+ }
+ },
+ _ => unimplemented!(),
+ };
+
+ let entry = maintenance_report
+ .entry(error_category)
+ .or_insert(Vec::new());
+ entry.push(key);
+}
+
+pub fn print_maintenance_report (maintenance_report: MaintenanceReport) {
+ for (section_title, mut entries) in maintenance_report {
+ println!("== {} ==", section_title);
+ println!(r#"
+{{| class="wikitable sortable"
+|+ Items with issue
+|-
+! Item !! Item label in French !! FANTOIR code"#);
+
+ entries.sort();
+ for entry in entries {
+ println!(r#"|-
+| [[{}]] || {} || {}"#, &entry.item, &entry.item_label, &entry.code_fantoir_wikidata);
+ }
+
+ println!(r#"|}}"#);
+ println!();
+ }
+
+ println!("== Notes ==");
+ println!("This maintenance report has been generated automatically by fantoir-datasource tool, based on the issues encountered to cross-validate Wikidata entries and FANTOIR national file.");
+}
+
+/* -------------------------------------------------------------
+ Sort for report entries
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+impl PartialOrd for WikidataEntryKey {
+ fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+ Some(self.cmp(other))
+ }
+}
+
+impl Ord for WikidataEntryKey {
+ fn cmp(&self, other: &Self) -> Ordering {
+ self.code_fantoir_wikidata.cmp(&other.code_fantoir_wikidata)
+ }
+}
diff --git a/src/main.rs b/src/main.rs
index 358a65c..ce86816 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,103 +1,107 @@
use std::env;
use clap::{Args, Parser};
use crate::commands::promote::promote;
mod commands;
mod db;
mod fantoir;
mod services;
#[derive(Debug, Parser)]
#[command(name = "fantoir-datasource")]
#[clap(author="Nasqueron project", version, about="Import FANTOIR database into PostgreSQL", long_about=None)]
enum FantoirCommand {
/// Import from FANTOIR file generated by the DGFIP
#[command(arg_required_else_help = true)]
Import(ImportArgs),
/// Promote an imported FANTOIR table as the current FANTOIR table to use
#[command(arg_required_else_help = true)]
Promote(PromoteArgs),
/// Query Wikidata SPARQL end-point to enrich FANTOIR information
Wikidata(WikidataArgs),
/// Query the imported FANTOIR table
Query(QueryArgs)
}
#[derive(Debug, Args)]
pub struct ImportArgs {
/// Create table if it doesn't exist
#[arg(short = 'c')]
create_table: bool,
/// Truncate table if it already exists, allowing the overwrite mode.
/// If not specified, the script will fail if table exists.
#[arg(short = 't')]
overwrite_table: bool,
/// The FANTOIR file to import
fantoir_file: String,
/// The name of the table to populate
fantoir_table: String,
}
#[derive(Debug, Args)]
pub struct PromoteArgs {
/// The name of the table to promote
fantoir_table: String,
}
#[derive(Debug, Args)]
pub struct WikidataArgs {
/// Create table if it doesn't exist
#[arg(short = 'c')]
create_table: bool,
/// Truncate table if it already exists, allowing the overwrite mode.
/// If not specified, the script will fail if table exists.
#[arg(short = 't')]
overwrite_table: bool,
+
+ /// Generate a Wikidata maintenance report instead to print errors to stderr
+ #[arg(long)]
+ maintenance_report: bool,
}
#[derive(Debug, Args)]
#[clap(trailing_var_arg=true)]
pub struct QueryArgs {
/// INSEE code to identify a commune
#[arg(long)]
code_insee: Option<String>,
/// Identifier of the voie by the commune
#[arg(long)]
code_voie: Option<String>,
/// Expression to search
libelle: Vec<String>,
}
#[tokio::main]
async fn main() {
let command = FantoirCommand::parse(); // Will exit if argument is missing or --help/--version provided.
let database_url = env::var("DATABASE_URL")
.expect("The environment variable DATABASE_URL need to be set to your PostgreSQL database.");
match command {
FantoirCommand::Import(args) => {
commands::import::import(&args, &database_url).await;
},
FantoirCommand::Promote(args) => {
promote(&args.fantoir_table, &database_url).await;
},
FantoirCommand::Wikidata(args) => {
commands::wikidata::import(&args, &database_url).await
},
FantoirCommand::Query(args) => {
commands::query::search(args, &database_url).await
},
};
}
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Mon, Nov 25, 07:18 (1 d, 17 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2259704
Default Alt Text
(14 KB)
Attached To
Mode
rDS Nasqueron Datasources
Attached
Detach File
Event Timeline
Log In to Comment