Page MenuHomeDevCentral

No OneTemporary

diff --git a/src/commands/wikidata/mod.rs b/src/commands/wikidata/mod.rs
index dee3b77..31b6bff 100644
--- a/src/commands/wikidata/mod.rs
+++ b/src/commands/wikidata/mod.rs
@@ -1,214 +1,233 @@
//! Query Wikidata SPARQL end-point and import result into PostgreSQL
mod qualification;
+mod report;
use std::collections::HashMap;
use std::process::exit;
+
use oxrdf::Term;
use sqlx::PgPool;
+use crate::commands::wikidata::qualification::determine_p31_winner;
+use crate::commands::wikidata::report::*;
use crate::db::*;
use crate::WikidataArgs;
-use crate::commands::wikidata::qualification::determine_p31_winner;
use crate::fantoir::{fix_fantoir_code, FixedFantoirCode};
use crate::services::query::search_fantoir_code;
use crate::services::sparql::*;
pub static WIKIDATA_TABLE: &'static str = "fantoir_wikidata";
pub static WIKIDATA_SPARQL_ENDPOINT: &'static str = "https://query.wikidata.org/sparql";
/* -------------------------------------------------------------
Import task
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
pub async fn import (args: &WikidataArgs, database_url: &str) {
let pool = connect_to_db(database_url).await;
// Create/truncate table as needed and as allowed by options
let callback = async {
let queries = include_str!("../../schema/wikidata.sql");
run_multiple_queries(&pool, &queries).await;
};
if let Err(error) = initialize_table(&pool, callback, args).await {
eprintln!("{}", &error);
exit(1);
}
// Query Wikidata and get (Wikidata/FANTOIR code, list of P31 (instance of) values) hashmap
let client = Client::new(WIKIDATA_SPARQL_ENDPOINT);
let mut what_map = HashMap::new();
client.query(include_str!("../../queries/wikidata.sparql"))
.await
.into_solutions()
.expect("A list of solutions is expected for a SELECT query")
.iter()
.filter(|entry| !is_term_empty(&entry["code_fantoir"]))
.for_each(|entry| {
// Build a map of the different P31 (instance of) values for a specified code.
let key = WikidataEntryKey::parse(entry);
let what = parse_wikidata_entity_uri(&entry["what"]).expect("Can't parse P31 what result");
what_map.entry(key).or_insert(Vec::new())
.push(what);
});
// Consolidate entries and insert them into the database.
// To avoid an async closure, we don't use HOF pattern.
+ let mut maintenance_report = HashMap::new();
for (key, candidates) in what_map {
if let Some(entry) = WikidataEntry::consolidate_set(&pool, &key, candidates).await {
- entry.insert_to_db(&pool).await;
+ if let Err(error) = entry.insert_to_db(&pool).await {
+ if args.maintenance_report {
+ update_report(&mut maintenance_report, key, error);
+ } else {
+ eprintln!();
+ eprintln!("Can't insert Wikidata information for the following entry:");
+ eprintln!("{:?}", entry);
+ eprintln!("{}", error);
+ }
+ }
continue;
}
- eprintln!();
- eprintln!("Can't insert Wikidata information for the following entry:");
- eprintln!("{:?}", &key);
- eprintln!("Can't resolve FANTOIR code.");
+ if args.maintenance_report {
+ let entry = maintenance_report
+ .entry("Can't resolve FANTOIR code")
+ .or_insert(Vec::new());
+ entry.push(key);
+ } else {
+ eprintln!();
+ eprintln!("Can't insert Wikidata information for the following entry:");
+ eprintln!("{:?}", &key);
+ eprintln!("Can't resolve FANTOIR code.");
+ }
}
+ if args.maintenance_report {
+ print_maintenance_report(maintenance_report);
+ }
}
/* -------------------------------------------------------------
Arguments parsing
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
impl ToTableInitializationArgs for &WikidataArgs {
fn to_table_initialization_args(&self) -> TableInitializationArgs {
TableInitializationArgs {
table_name: String::from(WIKIDATA_TABLE),
create_table: self.create_table,
overwrite_table: self.overwrite_table,
}
}
}
/* -------------------------------------------------------------
Wikidata entry structures
WikidataEntry represents the data ready to be inserted
in our database.
WikidataEntryKey is a subset of WikidataEntry to identify
a set (FANTOIR code, Wikidata item) to be used as HashMap key
when a SPARQL query returns several rows for such set.
For example, here, we ask for P31 values, and if a Wikidata
entity offers several P31 values, we'll get one row per value.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
#[derive(Debug, Clone)]
struct WikidataEntry {
code_fantoir: String,
code_fantoir_wikidata: String,
item: String,
item_label: String,
what: String,
}
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
-struct WikidataEntryKey {
- code_fantoir_wikidata: String,
- item: String,
- item_label: String,
+pub struct WikidataEntryKey {
+ pub code_fantoir_wikidata: String,
+ pub item: String,
+ pub item_label: String,
}
impl WikidataEntryKey {
fn parse(entry: &HashMap<String, Term>) -> Self {
Self {
code_fantoir_wikidata: parse_literal(&entry["code_fantoir"]).expect("Can't parse code"),
item: parse_wikidata_entity_uri(&entry["item"]).expect("Can't parse item"),
item_label: parse_literal(&entry["itemLabel"]).expect("Can't parse item label"),
}
}
}
impl WikidataEntry {
async fn consolidate_set(pool: &PgPool, key: &WikidataEntryKey, what_candidates: Vec<String>) -> Option<Self> {
let what = determine_p31_winner(&what_candidates);
let code_fantoir = match fix_fantoir_code(&key.code_fantoir_wikidata) {
FixedFantoirCode::Computed(code) => code,
FixedFantoirCode::ToSearch { code_insee, identifiant_communal_voie } => {
search_fantoir_code(pool, &code_insee, &identifiant_communal_voie).await?
}
};
Some(Self {
code_fantoir,
code_fantoir_wikidata: key.code_fantoir_wikidata.clone(),
item: key.item.clone(),
item_label: key.item_label.clone(),
what,
})
}
- async fn insert_to_db (&self, pool: &PgPool) {
+ async fn insert_to_db (&self, pool: &PgPool) -> Result<(), sqlx::Error> {
let mut query = format!("INSERT INTO {}", WIKIDATA_TABLE);
query.push_str(
r#"
(code_fantoir, code_fantoir_wikidata, item, item_label, what)
VALUES
($1, $2, $3, $4, $5)"#
);
- if let Err(error) = sqlx::query(&query)
+ sqlx::query(&query)
.bind(&self.code_fantoir)
.bind(&self.code_fantoir_wikidata)
.bind(&self.item)
.bind(&self.item_label)
.bind(&self.what)
.execute(pool)
- .await {
- eprintln!();
- eprintln!("Can't insert Wikidata information for the following entry:");
- eprintln!("{:?}", self);
- eprintln!("{}", error);
- }
+ .await
+ .map(|_result| ())
}
}
/* -------------------------------------------------------------
Wikidata helper methods
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/// Parses a Wikidata entity URI from a RDF term.
///
/// For example, to parse a term representing Q1234:
///
/// ```
/// let term = Term::NamedNode(
/// NamedNode::new("http://www.wikidata.org/entity/Q1234").unwrap()
/// );
/// let entity = parse_wikidata_entity_uri(&term).unwrap();
///
/// assert_eq!("Q1234", &entity);
/// ```
pub fn parse_wikidata_entity_uri (term: &Term) -> Option<String> {
parse_term_uri(term)
.map(|uri| {
let pos = uri.rfind('/').expect("URI doesn't contain any /") + 1;
uri[pos..].to_string()
})
}
/* -------------------------------------------------------------
Tests
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
#[cfg(test)]
mod tests {
use oxrdf::NamedNode;
use super::*;
#[test]
pub fn test_parse_wikidata_entity_uri () {
let node = NamedNode::new("http://www.wikidata.org/entity/Q849777").unwrap();
let term = Term::NamedNode(node);
assert_eq!("Q849777", &parse_wikidata_entity_uri(&term).unwrap());
}
}
diff --git a/src/commands/wikidata/report.rs b/src/commands/wikidata/report.rs
new file mode 100644
index 0000000..dc29b12
--- /dev/null
+++ b/src/commands/wikidata/report.rs
@@ -0,0 +1,83 @@
+use std::cmp::Ordering;
+use std::collections::HashMap;
+
+use sqlx::Error;
+
+use crate::commands::wikidata::WikidataEntryKey;
+
+type MaintenanceReport = HashMap<&'static str, Vec<WikidataEntryKey>>;
+
+/* -------------------------------------------------------------
+ Report update and wiki code
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+pub fn update_report (maintenance_report: &mut MaintenanceReport, key: WikidataEntryKey, error: Error) {
+ let error_category = match error {
+ Error::Database(error) => {
+ if let Some(index) = error.constraint() {
+ match index {
+ "index_fantoir_wikidata_pk" => "Duplicate FANTOIR code",
+ "fantoir_wikidata_code_fantoir_fk" => "Not in FANTOIR national file",
+ _ => {
+ eprintln!("Unknown constraint index: {}", index);
+
+ unreachable!()
+ },
+ }
+ } else if let Some(code) = error.code() {
+ let code = code.to_string();
+ match code.as_str() {
+ "22001" => "FANTOIR code too long",
+ _ => unimplemented!(),
+ }
+ } else {
+ unimplemented!()
+ }
+ },
+ _ => unimplemented!(),
+ };
+
+ let entry = maintenance_report
+ .entry(error_category)
+ .or_insert(Vec::new());
+ entry.push(key);
+}
+
+/// Prints the maintenance report to stdout as wiki code.
+///
+/// Each error category becomes a `== title ==` section containing a sortable
+/// wikitable with one row per entry (item, label, FANTOIR code), followed by
+/// a final notes section. Sections are printed in alphabetical order of their
+/// title; rows are sorted using the `WikidataEntryKey` ordering.
+pub fn print_maintenance_report (maintenance_report: MaintenanceReport) {
+    // HashMap iteration order is arbitrary: sort the sections by title
+    // so the generated page doesn't get reordered from one run to the next.
+    let mut sections: Vec<_> = maintenance_report.into_iter().collect();
+    sections.sort_unstable_by_key(|(section_title, _)| *section_title);
+
+    for (section_title, mut entries) in sections {
+        println!("== {} ==", section_title);
+        println!(r#"
+{{| class="wikitable sortable"
+|+ Items with issue
+|-
+! Item !! Item label in French !! FANTOIR code"#);
+
+        entries.sort();
+        for entry in entries {
+            println!(r#"|-
+| [[{}]] || {} || {}"#, &entry.item, &entry.item_label, &entry.code_fantoir_wikidata);
+        }
+
+        println!(r#"|}}"#);
+        println!();
+    }
+
+    println!("== Notes ==");
+    println!("This maintenance report has been generated automatically by fantoir-datasource tool, based on the issues encountered to cross-validate Wikidata entries and FANTOIR national file.");
+}
+
+/* -------------------------------------------------------------
+ Sort for report entries
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+// Delegates to the total order defined by the Ord implementation.
+impl PartialOrd for WikidataEntryKey {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+// Entries are ordered by FANTOIR code as found on Wikidata. The item and
+// its label are used as tie-breakers, so two keys compare as equal only
+// when every field matches, which keeps the order consistent with the
+// equality used for this type.
+impl Ord for WikidataEntryKey {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.code_fantoir_wikidata.cmp(&other.code_fantoir_wikidata)
+            .then_with(|| self.item.cmp(&other.item))
+            .then_with(|| self.item_label.cmp(&other.item_label))
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index 358a65c..ce86816 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,103 +1,107 @@
use std::env;
use clap::{Args, Parser};
use crate::commands::promote::promote;
mod commands;
mod db;
mod fantoir;
mod services;
#[derive(Debug, Parser)]
#[command(name = "fantoir-datasource")]
#[clap(author="Nasqueron project", version, about="Import FANTOIR database into PostgreSQL", long_about=None)]
enum FantoirCommand {
/// Import from FANTOIR file generated by the DGFIP
#[command(arg_required_else_help = true)]
Import(ImportArgs),
/// Promote an imported FANTOIR table as the current FANTOIR table to use
#[command(arg_required_else_help = true)]
Promote(PromoteArgs),
/// Query Wikidata SPARQL end-point to enrich FANTOIR information
Wikidata(WikidataArgs),
/// Query the imported FANTOIR table
Query(QueryArgs)
}
#[derive(Debug, Args)]
pub struct ImportArgs {
/// Create table if it doesn't exist
#[arg(short = 'c')]
create_table: bool,
/// Truncate table if it already exists, allowing the overwrite mode.
/// If not specified, the script will fail if table exists.
#[arg(short = 't')]
overwrite_table: bool,
/// The FANTOIR file to import
fantoir_file: String,
/// The name of the table to populate
fantoir_table: String,
}
#[derive(Debug, Args)]
pub struct PromoteArgs {
/// The name of the table to promote
fantoir_table: String,
}
#[derive(Debug, Args)]
pub struct WikidataArgs {
/// Create table if it doesn't exist
#[arg(short = 'c')]
create_table: bool,
/// Truncate table if it already exists, allowing the overwrite mode.
/// If not specified, the script will fail if table exists.
#[arg(short = 't')]
overwrite_table: bool,
+
+ /// Generate a Wikidata maintenance report instead of printing errors to stderr
+ #[arg(long)]
+ maintenance_report: bool,
}
#[derive(Debug, Args)]
#[clap(trailing_var_arg=true)]
pub struct QueryArgs {
/// INSEE code to identify a commune
#[arg(long)]
code_insee: Option<String>,
/// Identifier of the voie by the commune
#[arg(long)]
code_voie: Option<String>,
/// Expression to search
libelle: Vec<String>,
}
#[tokio::main]
async fn main() {
let command = FantoirCommand::parse(); // Will exit if argument is missing or --help/--version provided.
let database_url = env::var("DATABASE_URL")
.expect("The environment variable DATABASE_URL need to be set to your PostgreSQL database.");
match command {
FantoirCommand::Import(args) => {
commands::import::import(&args, &database_url).await;
},
FantoirCommand::Promote(args) => {
promote(&args.fantoir_table, &database_url).await;
},
FantoirCommand::Wikidata(args) => {
commands::wikidata::import(&args, &database_url).await
},
FantoirCommand::Query(args) => {
commands::query::search(args, &database_url).await
},
};
}

File Metadata

Mime Type
text/x-diff
Expires
Mon, Nov 25, 07:18 (1 d, 17 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2259704
Default Alt Text
(14 KB)

Event Timeline