diff --git a/Cargo.toml b/Cargo.toml --- a/Cargo.toml +++ b/Cargo.toml @@ -9,11 +9,18 @@ license = "BSD-2-Clause" [dependencies] +lazy_static = "~1.4.0" +oxrdf = "~0.1.1" +sparesults = "~0.1.3" [dependencies.clap] version = "~4.0.32" features = ["derive"] +[dependencies.reqwest] +version = "~0.11.13" +features = ["gzip", "deflate"] + [dependencies.sqlx] version = "~0.6.2" features = ["runtime-tokio-native-tls", "postgres", "chrono"] diff --git a/src/commands/import.rs b/src/commands/import.rs --- a/src/commands/import.rs +++ b/src/commands/import.rs @@ -12,12 +12,33 @@ use crate::db::*; use crate::fantoir::FantoirEntry; +impl ToTableInitializationArgs for &ImportArgs { + fn to_table_initialization_args (&self) -> TableInitializationArgs { + TableInitializationArgs { + table_name: self.fantoir_table.clone(), + create_table: self.create_table, + overwrite_table: self.overwrite_table, + } + } +} + +async fn create_table(pool: &PgPool, table: &str) { + let queries = include_str!("../schema/fantoir.sql") + .replace("/*table*/fantoir", table) + .replace("/*index*/index_fantoir_", format!("index_{}_", table).as_ref()); + + run_multiple_queries(pool, &queries).await; +} + pub async fn import(args: &ImportArgs, database_url: &str) { let fd = File::open(&args.fantoir_file).await.expect("Can't open file."); let pool = connect_to_db(database_url).await; // Create/truncate table as needed and as allowed by options - if let Err(error) = initialize_table(args, &pool).await { + let callback = async { + create_table(&pool, &args.fantoir_table).await; + }; + if let Err(error) = initialize_table(&pool, callback, args).await { eprintln!("{}", &error); exit(1); } @@ -46,39 +67,3 @@ .await } } - -async fn initialize_table(args: &ImportArgs, pool: &PgPool) -> Result<(), String> { - if is_table_exists(pool, &args.fantoir_table).await { - if is_table_empty(&pool, &args.fantoir_table).await { - return Ok(()); - } - - if args.overwrite_table { - truncate_table(&pool, 
&args.fantoir_table).await; - return Ok(()); - } - - return Err(format!( - "Table {} already exists and contains rows. To overwrite it, run the import tool with -t option.", - &args.fantoir_table - )); - } - - if args.create_table { - create_table(&pool, &args.fantoir_table).await; - return Ok(()); - } - - Err(format!( - "Table {} doesn't exist. To create it, run the import tool with -c option.", - &args.fantoir_table - )) -} - -async fn create_table(pool: &PgPool, table: &str) { - let queries = include_str!("../schema/fantoir.sql") - .replace("/*table*/fantoir", table) - .replace("/*index*/index_fantoir_", format!("index_{}_", table).as_ref()); - - run_multiple_queries(pool, &queries).await; -} diff --git a/src/commands/mod.rs b/src/commands/mod.rs --- a/src/commands/mod.rs +++ b/src/commands/mod.rs @@ -1,3 +1,5 @@ //! Commands for the fantoir2db tool. pub(crate) mod import; +pub(crate) mod wikidata; +pub(crate) mod update_foreign_keys; diff --git a/src/commands/update_foreign_keys.rs b/src/commands/update_foreign_keys.rs new file mode 100644 --- /dev/null +++ b/src/commands/update_foreign_keys.rs @@ -0,0 +1,12 @@ +//! Update foreign keys relations between tables + +use crate::db::*; + +pub async fn update (database_url: &str, fantoir_table: &str) { + let pool = connect_to_db(database_url).await; + + let queries = include_str!("../schema/foreign_keys.sql") + .replace("/*table*/fantoir", fantoir_table); + + run_multiple_queries(&pool, &queries).await; +} diff --git a/src/commands/wikidata/mod.rs b/src/commands/wikidata/mod.rs new file mode 100644 --- /dev/null +++ b/src/commands/wikidata/mod.rs @@ -0,0 +1,229 @@ +//! 
Query Wikidata SPARQL end-point and import result into PostgreSQL + +mod qualification; + +use std::collections::HashMap; +use std::process::exit; +use oxrdf::Term; +use sqlx::{Execute, PgPool}; + +use crate::db::*; +use crate::{sparql, WikidataArgs}; +use crate::commands::wikidata::qualification::determine_p31_winner; +use crate::fantoir::{fix_fantoir_code, FixedFantoirCode}; +use crate::services::http_client::get_user_agent; +use crate::sparql::{is_term_empty, parse_literal, parse_term_uri}; + +static WIKIDATA_TABLE: &str = "fantoir_wikidata"; + +/* ------------------------------------------------------------- + Import task + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +pub async fn import (args: &WikidataArgs, database_url: &str) { + let pool = connect_to_db(database_url).await; + + // Create/truncate table as needed and as allowed by options + let callback = async { + let queries = include_str!("../../schema/wikidata.sql"); + run_multiple_queries(&pool, &queries).await; + }; + if let Err(error) = initialize_table(&pool, callback, args).await { + eprintln!("{}", &error); + exit(1); + } + + // Query Wikidata + let client = sparql::Client::new("https://query.wikidata.org/sparql", get_user_agent()); + + let mut what_map = HashMap::new(); + client.query(include_str!("../../queries/wikidata.sparql")) + .await + .iter() + .filter(|entry| !is_term_empty(&entry["code_fantoir"])) + .for_each(|entry| { + // Build a map of the different P31 (instance of) values for a specified code. + + let key = WikidataEntryKey::parse(entry); + let what = parse_wikidata_entity_uri(&entry["what"]).expect("Can't parse P31 what result"); + + what_map + .entry(key) + .or_insert(Vec::new()) + .push(what); + }); + + // Consolidate entries and insert them into the database. + // To avoid a async closure, we need to stop the HOF in the map. 
+ let entries = what_map + .into_iter() + .map(|(key, candidates)| WikidataEntry::consolidate(key, candidates)); + + for entry in entries { + entry + .insert_to_db(&pool) + .await + } + +} + +/* ------------------------------------------------------------- + Arguments parsing + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +impl ToTableInitializationArgs for &WikidataArgs { + fn to_table_initialization_args(&self) -> TableInitializationArgs { + TableInitializationArgs { + table_name: String::from(WIKIDATA_TABLE), + create_table: self.create_table, + overwrite_table: self.overwrite_table, + } + } +} + +/* ------------------------------------------------------------- + Wikidata entry structures + + WikidataEntry represents the data ready to be inserted + in our database. + + WikidataEntryKey is a subset of WikidataEntry to identify + a set (FANTOIR code, Wikidata item) to be used as HashMap key + when a SPARQL query returns several rows for such set. + + For example, here, we ask for P31 values, and if a Wikidata + entity offers several P31 values, we'll get one row per value. 
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +#[derive(Debug, Clone)] +struct WikidataEntry { + code_fantoir: String, + code_fantoir_wikidata: String, + item: String, + item_label: String, + what: String, +} + +#[derive(Debug, Clone, Eq, PartialEq, Hash)] +struct WikidataEntryKey { + code_fantoir_wikidata: String, + item: String, + item_label: String, +} + +impl WikidataEntryKey { + fn parse(entry: &HashMap<String, Term>) -> Self { + Self { + code_fantoir_wikidata: parse_literal(&entry["code_fantoir"]).expect("Can't parse code"), + item: parse_wikidata_entity_uri(&entry["item"]).expect("Can't parse item"), + item_label: parse_literal(&entry["itemLabel"]).expect("Can't parse item label"), + } + } +} + +impl WikidataEntry { + fn parse(entry: &HashMap<String, Term>) -> Self { + let code_fantoir_wikidata = parse_literal(&entry["code_fantoir"]).expect("Can't parse code"); + let code_fantoir = match fix_fantoir_code(&code_fantoir_wikidata) { + FixedFantoirCode::Computed(code) => code, + FixedFantoirCode::ToSearch { code_insee, identifiant_communal_voie } => { + todo!() + } + }; + + Self { + code_fantoir, + code_fantoir_wikidata, + item: parse_wikidata_entity_uri(&entry["item"]).expect("Can't parse item"), + item_label: parse_literal(&entry["itemLabel"]).expect("Can't parse item label"), + what: parse_wikidata_entity_uri(&entry["what"]).expect("Can't parse P31 what result"), + } + } + + fn consolidate (key: WikidataEntryKey, what_candidates: Vec<String>) -> Self { + let what = determine_p31_winner(&what_candidates); + + let code_fantoir = match fix_fantoir_code(&key.code_fantoir_wikidata) { + FixedFantoirCode::Computed(code) => code, + FixedFantoirCode::ToSearch { code_insee, identifiant_communal_voie } => { + todo!(); + } + }; + + Self { + code_fantoir, + code_fantoir_wikidata: key.code_fantoir_wikidata, + item: key.item, + item_label: key.item_label, + what, + } + } + + async fn insert_to_db (&self, pool: &PgPool) { + let mut query = 
format!("INSERT INTO {}", WIKIDATA_TABLE); + query.push_str( + r#" + (code_fantoir, code_fantoir_wikidata, item, item_label, what) + VALUES + ($1, $2, $3, $4, $5)"# + ); + + if let Err(error) = sqlx::query(&query) + .bind(&self.code_fantoir) + .bind(&self.code_fantoir_wikidata) + .bind(&self.item) + .bind(&self.item_label) + .bind(&self.what) + + .execute(pool) + .await { + eprintln!(); + eprintln!("Can't insert Wikidata information for the following entry:"); + eprintln!("{:?}", self); + eprintln!("{}", error); + } + } +} + +/* ------------------------------------------------------------- + Wikidata helper methods + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +/// Parses a Wikidata entity URI from a RDF term. +/// +/// For example, to parse a term representing Q1234: +/// +/// ``` +/// let term = Term::NamedNode( +/// NamedNode::new("http://www.wikidata.org/entity/Q1234").unwrap() +/// ); +/// let entity = parse_wikidata_entity_uri(&term).unwrap(); +/// +/// assert_eq!("Q1234", &entity); +/// ``` +pub fn parse_wikidata_entity_uri (term: &Term) -> Option<String> { + parse_term_uri(term) + .map(|uri| { + let pos = uri.rfind('/').expect("URI doesn't contain any /") + 1; + + uri[pos..].to_string() + }) +} + +/* ------------------------------------------------------------- + Tests + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +#[cfg(test)] +mod tests { + use oxrdf::NamedNode; + use super::*; + + #[test] + pub fn test_parse_wikidata_entity_uri () { + let node = NamedNode::new("http://www.wikidata.org/entity/Q849777").unwrap(); + let term = Term::NamedNode(node); + + assert_eq!("Q849777", &parse_wikidata_entity_uri(&term).unwrap()); + } +} diff --git a/src/commands/wikidata/qualification.rs b/src/commands/wikidata/qualification.rs new file mode 100644 --- /dev/null +++ b/src/commands/wikidata/qualification.rs @@ -0,0 +1,54 @@ +//! Helper for items qualification. +//! +//! 
Wikidata uses the P31 "instance of" property to qualify items, +//! which is helpful to identify voies, especially the pseudo-voies +//! not furthermore described in FANTOIR. + +use lazy_static::lazy_static; + +lazy_static! { + static ref P31_WINNERS: Vec<&'static str> = vec![ + // Important values + "Q510662", // ring road + "Q928830", // metro station + "Q18615527", // tram bridge + + // Not interesting values, as FANTOIR already describes the road, + // but we still need to consolidate. + "Q2026833", // square, type jardin public + "Q207934", // allée + "Q54114", // boulevard + "Q79007", // street, wins against road but loses against boulevard + + /* + Can't determine P31 winner amongst ["Q22698", "Q22746"], Q22698 is picked. + Can't determine P31 winner amongst ["Q703941", "Q13634881"], Q703941 is picked. + Can't determine P31 winner amongst ["Q703941", "Q17383262"], Q703941 is picked. + Can't determine P31 winner amongst ["Q703941", "Q7543083"], Q703941 is picked. + Can't determine P31 winner amongst ["Q703941", "Q3558430"], Q703941 is picked. + */ + ]; +} + +/// Determine amongst a sets of items which one is the more relevant +/// to describe a pseudo-voie. +/// +/// This is useful when a Wikidata entity has several values for P31 +/// to decide which one is the most interesting to keep in our context. +pub fn determine_p31_winner(candidates: &Vec<String>) -> String { + if candidates.len() == 1 { + // If there is only one, that's the one to use. + return candidates[0].clone(); + } + + for winner_candidate in P31_WINNERS.iter() { + for actual_candidate in candidates { + if winner_candidate == actual_candidate { + return actual_candidate.clone(); + } + } + } + + eprintln!("Can't determine P31 winner amongst {:?}, {} is picked.", candidates, candidates[0]); + candidates[0].clone() +} diff --git a/src/db.rs b/src/db.rs --- a/src/db.rs +++ b/src/db.rs @@ -3,11 +3,22 @@ //! This module provides helpers to interact with a PostgreSQL database. //! 
Functions expect to work with an executor from sqlx crate. +use std::future::Future; use sqlx::PgPool; use sqlx::postgres::PgPoolOptions; static QUERIES_SEPARATOR: &str = "\n\n\n"; +pub struct TableInitializationArgs { + pub table_name: String, + pub create_table: bool, + pub overwrite_table: bool, +} + +pub trait ToTableInitializationArgs { + fn to_table_initialization_args(&self) -> TableInitializationArgs; +} + pub async fn connect_to_db (database_url: &str) -> PgPool { PgPoolOptions::new() .max_connections(3) @@ -61,6 +72,41 @@ .expect("Can't truncate table."); } +pub async fn initialize_table<F, T> ( + pool: &PgPool, + callback: F, + args: T +) -> Result<(), String> + where F: Future, T: ToTableInitializationArgs +{ + let args = args.to_table_initialization_args(); + if is_table_exists(pool, &args.table_name).await { + if is_table_empty(&pool, &args.table_name).await { + return Ok(()); + } + + if args.overwrite_table { + truncate_table(&pool, &args.table_name).await; + return Ok(()); + } + + return Err(format!( + "Table {} already exists and contains rows. To overwrite it, run the import tool with -t option.", + &args.table_name + )); + } + + if args.create_table { + callback.await; + return Ok(()); + } + + Err(format!( + "Table {} doesn't exist. To create it, run the import tool with -c option.", + &args.table_name + )) +} + pub async fn run_multiple_queries(pool: &PgPool, queries: &str) { for query in queries.split(QUERIES_SEPARATOR) { sqlx::query(&query) diff --git a/src/fantoir.rs b/src/fantoir.rs --- a/src/fantoir.rs +++ b/src/fantoir.rs @@ -3,9 +3,20 @@ //! This module offers a structure for a FANTOIR record, methods to parse the file and export it. //! Database functions expect to work with an executor from sqlx crate. +use lazy_static::lazy_static; use sqlx::PgPool; use sqlx::types::chrono::NaiveDate; +lazy_static! 
{
+    static ref DEPARTMENTS_WITH_CODE_DIRECTION: Vec<&'static str> = vec!["13", "59", "75", "92", "97"];
+
+    /// The alphabet without I O and Q.
+    static ref RIVOLI_STRING: Vec<char> = vec![
+        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M',
+        'N', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
+    ];
+}
+
 /// A voie in the FANTOIR database
 #[derive(Debug)]
 pub struct FantoirEntry {
@@ -158,6 +169,99 @@
     }
 }
 
+/// A fixed FANTOIR code result
+pub enum FixedFantoirCode {
+    /// The code has been fully computed
+    Computed(String),
+
+    /// Information needed to query the code has been extracted, but code direction is unknown
+    /// Such result can be queried through search_code_fantoir()
+    ToSearch { code_insee: String, identifiant_communal_voie: String }
+}
+
+/// Transforms FANTOIR code from BAN into regular FANTOIR codes.
+/// BAN sometimes uses <insee code>_<identifiant voie commune> without Rivoli key.
+pub fn fix_fantoir_code(code: &str) -> FixedFantoirCode {
+    let mut code = code.to_string();
+
+    if code.contains("_") {
+        // 97231_B026 -> 972231B026
+        code = if code.starts_with("97") {
+            // Code direction = department last digit
+            format!("{}{}{}", &code[0..=2], &code[2..5], &code[6..])
+        } else if uses_specific_code_direction(&code) {
+            // We can't fix it by computation, we need to search it in the database
+            return FixedFantoirCode::ToSearch {
+                code_insee: code[0..5].to_string(),
+                identifiant_communal_voie: code[6..10].to_string(),
+            }
+        } else {
+            // Code direction = 0, e.g. 44109_0516 -> 4401090516
+            format!("{}0{}{}", &code[0..2], &code[2..5], &code[6..])
+        };
+    }
+
+    if code.len() == 10 {
+        let last_char = code.chars().last().unwrap();
+        println!("Code: {}, ends by {}", code, last_char);
+
+        match last_char {
+            '0'..='9' => {
+                code.push(compute_rivoli_key(&code));
+            }
+
+            'A'..='Z' => {
+                // 441090516U -> 4401090516U
+                code = if uses_specific_code_direction(&code) {
+                    // We can't fix it by computation, we need to search it in the database
+                    // 920514135A -> 92051 4135
+                    return FixedFantoirCode::ToSearch {
+                        code_insee: code[0..5].to_string(),
+                        identifiant_communal_voie: code[5..9].to_string(),
+                    }
+                    //( , &code[5..9])
+                } else {
+                    format!("{}0{}", &code[0..2], &code[2..])
+                };
+            }
+
+            _ => unreachable!(),
+        }
+    }
+
+    FixedFantoirCode::Computed(code)
+}
+
+pub fn uses_specific_code_direction (code: &str) -> bool {
+    DEPARTMENTS_WITH_CODE_DIRECTION
+        .iter()
+        .any(|&dpt| code.starts_with(dpt))
+}
+
+pub fn compute_rivoli_key (code: &str) -> char {
+    // See https://georezo.net/forum/viewtopic.php?id=102292
+
+    if code.starts_with("2A") || code.starts_with("2B") {
+        // 2A would be 2 10 and 2B would be 2 11, but how to build a number to multiply by 19?
+        unimplemented!()
+    }
+
+    println!();
+    println!("Compute code rivoli key for {}", code);
+
+    let part_commune: i32 = code[0..6].parse().unwrap();
+    let type_voie = code.chars().nth(6).unwrap();
+    let type_voie = if type_voie.is_alphabetic() {
+        type_voie as u32 - 55
+    } else {
+        type_voie.to_digit(10).unwrap()
+    };
+    let numero_identifiant_communal_voie: i32 = code[7..].parse().unwrap();
+
+    let index = (part_commune * 19 + type_voie as i32 * 11 + numero_identifiant_communal_voie) % 23;
+    return RIVOLI_STRING[index as usize];
+}
+
 #[cfg(test)]
 mod tests {
     // Note this useful idiom: importing names from outer (for mod tests) scope. 
@@ -166,7 +270,7 @@
     #[test]
     fn test_parse_fantoir_date() {
         let expected = NaiveDate::from_ymd_opt(1987, 1, 1).unwrap();
-        let actual = parse_fantoir_date("1987001");
+        let actual = parse_fantoir_date("1987001").unwrap();
 
         assert_eq!(expected, actual);
     }
@@ -189,4 +293,23 @@
     fn test_parse_optional_string_when_only_spaces() {
         assert_eq!(true, parse_optional_string(" ").is_none());
     }
+
+    #[test]
+    pub fn test_fix_fantoir_code () {
+        assert!(matches!(fix_fantoir_code("755112P144L"), FixedFantoirCode::Computed(code) if code == "755112P144L"));
+        assert!(matches!(fix_fantoir_code("97231_B026"), FixedFantoirCode::Computed(code) if code == "972231B026U"));
+        assert!(matches!(fix_fantoir_code("441090516U"), FixedFantoirCode::Computed(code) if code == "4401090516U"));
+        assert!(matches!(fix_fantoir_code("972222B305"), FixedFantoirCode::Computed(code) if code == "972222B305L"));
+    }
+
+    #[test]
+    pub fn test_compute_rivoli_key() {
+        assert_eq!('W', compute_rivoli_key("380003B001"));
+        assert_eq!('U', compute_rivoli_key("972231B026"));
+    }
+
+    #[test]
+    pub fn test_compute_rivoli_key_with_type_voie_zero() {
+        assert_eq!('C', compute_rivoli_key("9722230261"));
+    }
 }
diff --git a/src/main.rs b/src/main.rs
--- a/src/main.rs
+++ b/src/main.rs
@@ -2,11 +2,11 @@
 
 use clap::{Args, Parser};
 
-use crate::commands::import::import;
-
 mod commands;
 mod db;
 mod fantoir;
+mod services;
+mod sparql;
 
 #[derive(Debug, Parser)]
 #[command(name = "fantoir2db")]
@@ -15,6 +15,13 @@
     /// Import from FANTOIR file generated by the DGFIP
     #[command(arg_required_else_help = true)]
     Import(ImportArgs),
+
+    /// Query Wikidata SPARQL end-point to enrich FANTOIR information
+    Wikidata(WikidataArgs),
+
+    /// Update foreign keys to the specified fantoir table
+    #[command(arg_required_else_help = true)]
+    UpdateForeignKeys(UpdateForeignKeysArgs),
 }
 
 #[derive(Debug, Args)]
@@ -35,6 +42,24 @@
     fantoir_table: String,
 }
 
+#[derive(Debug, Args)]
+pub struct WikidataArgs {
+    /// Create table if it doesn't exist
+    #[arg(short = 'c')]
+    create_table: bool,
+
+    /// Truncate table if it already exists, allowing the overwrite mode.
+    /// If not specified, the script will fail if table exists. 
+ #[arg(short = 't')] + overwrite_table: bool, +} + +#[derive(Debug, Args)] +pub struct UpdateForeignKeysArgs { + /// The name of the FANTOIR table to use + fantoir_table: String, +} + #[tokio::main] async fn main() { let command = FantoirCommand::parse(); // Will exit if argument is missing or --help/--version provided. @@ -44,7 +69,13 @@ match command { FantoirCommand::Import(args) => { - import(&args, &database_url).await; + commands::import::import(&args, &database_url).await; }, + FantoirCommand::Wikidata(args) => { + commands::wikidata::import(&args, &database_url).await + } + FantoirCommand::UpdateForeignKeys(args) => { + commands::update_foreign_keys::update(&database_url, &args.fantoir_table).await; + } }; } diff --git a/src/queries/wikidata.sparql b/src/queries/wikidata.sparql new file mode 100644 --- /dev/null +++ b/src/queries/wikidata.sparql @@ -0,0 +1,13 @@ +PREFIX bd: <http://www.bigdata.com/rdf#> +PREFIX wikibase: <http://wikiba.se/ontology#> +PREFIX wdt: <http://www.wikidata.org/prop/direct/> + +# Streets with FANTOIR code +SELECT DISTINCT ?code_fantoir ?item ?itemLabel ?what +WHERE +{ + ?item wdt:P3182 ?code_fantoir . + ?item wdt:P31 ?what + SERVICE wikibase:label { bd:serviceParam wikibase:language "fr". } +} +LIMIT 3 diff --git a/src/schema/foreign_keys.sql b/src/schema/foreign_keys.sql new file mode 100644 --- /dev/null +++ b/src/schema/foreign_keys.sql @@ -0,0 +1,7 @@ +alter table fantoir_wikidata + drop constraint if exists fantoir_wikidata_code_fantoir_fk; + + +alter table fantoir_wikidata + add constraint fantoir_wikidata_code_fantoir_fk + foreign key (code_fantoir) references /*table*/fantoir (code_fantoir); diff --git a/src/schema/wikidata.sql b/src/schema/wikidata.sql new file mode 100644 --- /dev/null +++ b/src/schema/wikidata.sql @@ -0,0 +1,29 @@ +-- This table matches Wikidata entities and FANTOIR codes. +-- +-- If you provide several instructions, separate those with TWO blank lines. 
+-- Indexes have to match every WHERE clause used against the database. +-- +-- This schema is compiled as part of the program, as such you need to rebuild +-- (`cargo build`) the project after any schema modification. + +CREATE TABLE IF NOT EXISTS /*table*/fantoir_wikidata +( + -- Identifiers + code_fantoir char(11) NOT NULL + constraint /*index*/index_fantoir_wikidata_pk + primary key, + code_fantoir_wikidata char(11) NOT NULL, + + -- Wikidata information + item varchar(12) NOT NULL, + item_label text, + what varchar(12) NOT NULL, + + -- Constraints + UNIQUE (code_fantoir_wikidata) +); + + +CREATE INDEX CONCURRENTLY /*index*/index_fantoir_wikidata_voie_trigram + ON /*table*/fantoir_wikidata + USING gin (item_label gin_trgm_ops); diff --git a/src/services/http_client.rs b/src/services/http_client.rs new file mode 100644 --- /dev/null +++ b/src/services/http_client.rs @@ -0,0 +1,12 @@ +use lazy_static::lazy_static; + +lazy_static! { + pub static ref USER_AGENT: String = format!( + "{}/{} (https://databases.nasqueron.org/)", + env!("CARGO_PKG_NAME"), env!("CARGO_PKG_VERSION") + ); +} + +pub fn get_user_agent () -> &'static str { + &USER_AGENT +} diff --git a/src/services/mod.rs b/src/services/mod.rs new file mode 100644 --- /dev/null +++ b/src/services/mod.rs @@ -0,0 +1 @@ +pub(crate) mod http_client; diff --git a/src/sparql.rs b/src/sparql.rs new file mode 100644 --- /dev/null +++ b/src/sparql.rs @@ -0,0 +1,120 @@ +//! 
# SPARQL client
+
+use std::collections::HashMap;
+use std::env;
+use oxrdf::Term;
+
+use reqwest::{ClientBuilder, Url};
+use reqwest::Client as HttpClient;
+use reqwest::header::{HeaderMap, HeaderValue};
+use sparesults::{QueryResultsFormat, QueryResultsParser, QueryResultsReader};
+
+static DEFAULT_USER_AGENT: &str = concat!(
+    env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"),
+);
+
+pub struct Client {
+    pub endpoint: String,
+    client: HttpClient,
+}
+
+impl Client {
+    pub fn new (endpoint: &str, user_agent: &str) -> Self {
+        let client = ClientBuilder::new()
+            .user_agent(user_agent)
+            .default_headers( {
+                let mut headers = HeaderMap::new();
+                headers.insert("Accept", HeaderValue::from_static("application/sparql-results+xml"));
+                headers
+            })
+            .gzip(true)
+            .deflate(true)
+            .build()
+            .expect("Can't build HTTP client");
+
+        Self {
+            endpoint: String::from(endpoint),
+            client,
+        }
+    }
+
+    pub fn with_default_user_agent(endpoint: &str) -> Self {
+        let user_agent = Self::determine_user_agent();
+
+        Self::new(endpoint, &user_agent)
+    }
+
+    fn determine_user_agent () -> String {
+        env::current_exe()
+            .ok()
+            .and_then(|path| path.file_name().map(|s| s.to_os_string()))
+            .and_then(|program_name| program_name.into_string().ok())
+            .unwrap_or(String::from(DEFAULT_USER_AGENT))
+    }
+
+    pub async fn query (&self, query: &str) -> Vec<HashMap<String, Term>> {
+        let url = Url::parse_with_params(&self.endpoint, &[("query", query)])
+            .expect("Can't parse endpoint as absolute URL.");
+
+        let result = self.client
+            .get(url)
+            .send()
+            .await
+            .expect("Can't query endpoint")
+            .text()
+            .await
+            .expect("End-point didn't return a reply.");
+
+        let mut entries = Vec::new();
+
+        if let QueryResultsReader::Solutions(solutions) = QueryResultsParser
+            ::from_format(QueryResultsFormat::Xml)
+            .read_results(result.as_bytes())
+            .expect("Can't read Wikidata reply")
+        {
+            for solution in solutions {
+                let entry: HashMap<_, _> = solution
+                    .expect("Can't read solution")
+                    .iter()
+                    .map(|(variable, term)| (
+                        variable.as_str().to_string(),
+                        term.clone(),
+                    ))
+                    .collect();
+                entries.push(entry);
+            }
+        } else {
+            panic!("Can't parse SPARQL result as a solution.");
+        }
+
+        entries
+    }
+}
+
+pub fn parse_term_uri (term: &Term) -> Option<String> {
+    if let Term::NamedNode(node) = term {
+        Some(node.as_str().to_string())
+    } else {
+        None
+    }
+}
+
+pub fn parse_literal (term: &Term) -> Option<String> {
+    if let Term::Literal(literal) = term {
+        Some(literal.value().to_string())
+    } else {
+        None
+    }
+}
+
+pub fn is_term_empty(term: &Term) -> bool {
+    match term {
+        Term::NamedNode(node) => {
+            // Special values IRI are considered as empty values.
+            node.as_str().contains("/.well-known/genid/")
+        }
+        Term::BlankNode(_) => true,
+        Term::Literal(_) => false,
+        _ => unimplemented!(),
+    }
+}