diff --git a/fantoir-datasource/Cargo.toml b/fantoir-datasource/Cargo.toml index 64da1e4..c8b9d2d 100644 --- a/fantoir-datasource/Cargo.toml +++ b/fantoir-datasource/Cargo.toml @@ -1,36 +1,37 @@ [package] name = "fantoir-datasource" version = "0.1.0" edition = "2021" description = "Generates a Postgres table from FANTOIR raw file" authors = [ "Sébastien Santoro " ] license = "BSD-2-Clause" [dependencies] chrono = "~0.4.23" lazy_static = "~1.4.0" opendatasoft-explore-api = { version = "0.1.0", path = "../opendatasoft-explore-api" } oxrdf = "~0.1.1" +regex = "~1.7.1" sparesults = "~0.1.3" [dependencies.async-scoped] version = "~0.7.1" features = ["use-tokio"] [dependencies.clap] version = "~4.0.32" features = ["derive"] [dependencies.reqwest] version = "~0.11.13" features = ["gzip", "deflate"] [dependencies.sqlx] version = "~0.6.2" features = ["runtime-tokio-native-tls", "postgres", "chrono"] [dependencies.tokio] version = "~1.23.0" features = ["full"] diff --git a/fantoir-datasource/src/commands/query.rs b/fantoir-datasource/src/commands/query.rs index 05e141e..2f47b7d 100644 --- a/fantoir-datasource/src/commands/query.rs +++ b/fantoir-datasource/src/commands/query.rs @@ -1,76 +1,90 @@ use std::process::exit; use sqlx::PgPool; use crate::db::connect_to_db; +use crate::fantoir::looks_like_canonical_fantoir_code; use crate::QueryArgs; use crate::services::query::*; static EXIT_CODE_NO_RESULT_FOUND: i32 = 4; pub async fn search(args: QueryArgs, database_url: &str) { let pool = connect_to_db(database_url).await; if args.code_insee.is_some() && args.code_voie.is_some() { let code_fantoir = search_fantoir_code( &pool, &args.code_insee.unwrap(), &args.code_voie.unwrap(), ).await; if let Some(code) = code_fantoir { search_one_row(&pool, &code).await; return; } exit(EXIT_CODE_NO_RESULT_FOUND); } - if args.libelle.len() > 0 { + if args.expression.len() > 0 { + if let Some(code) = pick_fantoir_code_from_args(&args.expression) { + search_one_row(&pool, &code).await; + return; + } + search_libelle(&pool, args).await; return; } unimplemented!() } async fn search_one_row(pool: &PgPool, code_fantoir: &str) { match query_fantoir_code(pool, code_fantoir).await { None => { exit(EXIT_CODE_NO_RESULT_FOUND); } Some(result) => { println!("{}", result); } } } async fn search_libelle(pool: &PgPool, args: QueryArgs) { - let expression = args.libelle.join(" "); + let expression = args.expression.join(" "); let mut found = false; query_libelle(pool, &expression) .await .iter() .filter(|&entry| entry_matches_conditions(entry, &args)) .for_each(|entry| { found = true; println!("{}", entry); }); if !found { exit(EXIT_CODE_NO_RESULT_FOUND); } } fn entry_matches_conditions(entry: &FantoirVoieResult, conditions: &QueryArgs) -> bool { if let Some(code_insee) = &conditions.code_insee { if &entry.code_insee != code_insee { return false; } } return true; } + +fn pick_fantoir_code_from_args (expressions: &Vec) -> Option { + if expressions.len() == 1 && looks_like_canonical_fantoir_code(&expressions[0]) { + Some(expressions[0].clone()) + } else { + None + } +} diff --git a/fantoir-datasource/src/fantoir.rs b/fantoir-datasource/src/fantoir.rs index daed379..224a0e3 100644 --- a/fantoir-datasource/src/fantoir.rs +++ b/fantoir-datasource/src/fantoir.rs @@ -1,332 +1,358 @@ //! # Helper methods for FANTOIR database. //! //! This module offers a structure for a FANTOIR record, methods to parse the file and export it. //! Database functions expect to work with an executor from sqlx crate. use chrono::NaiveDate; use lazy_static::lazy_static; +use regex::Regex; use sqlx::PgPool; lazy_static! { static ref DEPARTMENTS_WITH_CODE_DIRECTION: Vec<&'static str> = vec!["13", "59", "75", "92", "97"]; /// The alphabet without I O and Q. static ref RIVOLI_STRING: Vec = vec![ 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z' ]; + + static ref RE_FANTOIR: Regex = Regex::new( + "^[0-9][0-9A-Z][0-9][0-9][0-9][0-9][0-9A-Z][0-9][0-9][0-9][A-Z]$" + ).unwrap(); } /// A voie in the FANTOIR database #[derive(Debug)] pub struct FantoirEntry { /* Identifiers */ code_fantoir: String, /* Part 1 - commune */ departement: String, // Generally an integer, but INSEE uses 2A and 2B for Corse code_commune: i32, code_insee: String, // Afa in Corse has 2A001 type_commune: Option, is_pseudo_recensee: bool, /* Part 2 - voie */ identifiant_communal_voie: String, cle_rivoli: String, code_nature_voie: Option, libelle_voie: String, type_voie: i32, // 1: voie, 2: ens. immo, 3: lieu-dit, 4: pseudo-voie, 5: provisoire is_public: bool, /* Part 3 - population */ is_large: bool, population_a_part: i32, population_fictive: i32, /* Part 4 - metadata */ is_cancelled: bool, cancel_date: Option, creation_date: Option, code_majic: i32, last_alpha_word: String, } impl FantoirEntry { pub fn parse_line(line: &str) -> Self { let departement = match &line[0..2] { "97" => String::from(&line[0..3]), // include for DOM/TOM the next digit department => String::from(department), }; let len = line.len(); Self { /* Identifier */ code_fantoir: String::from(&line[0..11]), /* Part 1 - commune */ departement, code_commune: line[3..6].parse().expect("Can't parse code commune"), code_insee: format!("{:02}{:03}", &line[0..2], &line[3..6]), type_commune: parse_optional_string(&line[43..44]), is_pseudo_recensee: &line[45..46] == "3", /* Part 2 - voie */ identifiant_communal_voie: String::from(&line[6..10]), cle_rivoli: String::from(&line[10..11]), code_nature_voie: parse_optional_string(&line[11..15]), libelle_voie: String::from(line[15..41].trim()), type_voie: line[108..109].parse().expect("Can't parse type de voie."), is_public: &line[48..49] == "0", /* Part 3 - population */ is_large: &line[49..50] == "*", population_a_part: line[59..66].parse().expect("Can't parse population à part"), population_fictive: line[66..73].parse().expect("Can't parse population fictive"), /* Part 4 - metadata */ is_cancelled: &line[73..74] != " ", cancel_date: parse_fantoir_date(&line[74..81]), creation_date: parse_fantoir_date(&line[81..88]), code_majic: line[103..108].parse().expect("Can't parse MAJIC"), last_alpha_word: String::from(&line[112..len]), } } pub async fn insert_to_db(&self, pool: &PgPool, table: &str) { let mut query = format!("INSERT INTO {}", table); query.push_str( r#" (code_fantoir, departement, code_commune, code_insee, type_commune, is_pseudo_recensee, identifiant_communal_voie, cle_rivoli, code_nature_voie, libelle_voie, type_voie, is_public, is_large, population_a_part, population_fictive, is_cancelled, cancel_date, creation_date, code_majic, last_alpha_word ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20 )"# ); sqlx::query(&query) /* Identifiers */ .bind(&self.code_fantoir) /* Part 1 - commune */ .bind(&self.departement) .bind(&self.code_commune) .bind(&self.code_insee) .bind(&self.type_commune) .bind(&self.is_pseudo_recensee) /* Part 2 - Voie */ .bind(&self.identifiant_communal_voie) .bind(&self.cle_rivoli) .bind(&self.code_nature_voie) .bind(&self.libelle_voie) .bind(&self.type_voie) .bind(&self.is_public) /* Part 3 - Population */ .bind(&self.is_large) .bind(&self.population_a_part) .bind(&self.population_fictive) /* Part 4 - Metadata */ .bind(&self.is_cancelled) .bind(&self.cancel_date) .bind(&self.creation_date) .bind(&self.code_majic) .bind(&self.last_alpha_word) .execute(pool) .await .expect("Can't insert entry to database"); } } pub fn parse_fantoir_date (date: &str) -> Option { if date == "0000000" { return None; } let year = date[0..4].parse().expect("Can't parse date: year part"); let ord = date[4..7].parse().expect("Can't parse date: ordinal part"); NaiveDate::from_yo_opt(year, ord) } fn parse_optional_string (expression: &str) -> Option { let expression = expression.trim(); if expression.len() > 0 { Some(String::from(expression)) } else { None } } /// A fixed FANTOIR code result #[derive(Debug, Eq, PartialEq)] pub enum FixedFantoirCode { /// The code has been fully computed Computed(String), /// Information needed to query the code has been extracted, but code direction is unknown /// Such result can be queried through search_code_fantoir() ToSearch { code_insee: String, identifiant_communal_voie: String }, } /// Transforms FANTOIR code from BAN into regular FANTOIR codes. /// BAN sometimes uses _ without Rivoli key. pub fn fix_fantoir_code(code: &str) -> FixedFantoirCode { let mut code = code.to_string(); if code.contains("_") { // 97231_B026 -> 972231B026 code = if code.starts_with("97") { // Code direction = department last digit format!("{}{}{}", &code[0..=2], &code[2..5], &code[6..]) } else if uses_specific_code_direction(&code) { // We can't fix it by computation, we need to search it in the database return FixedFantoirCode::ToSearch { code_insee: code[0..5].to_string(), identifiant_communal_voie: code[6..10].to_string(), } } else { // Code direction = 0 format!("{}0{}{}", &code[0..=2], &code[3..5], &code[6..]) }; } if code.len() == 10 { let last_char = code.chars().last().unwrap(); match last_char { '0'..='9' => { code.push(compute_rivoli_key(&code)); } 'A'..='Z' => { // 441090516U -> 4401090516U code = if uses_specific_code_direction(&code) { // We can't fix it by computation, we need to search it in the database // 920514135A -> 92051 4135 return FixedFantoirCode::ToSearch { code_insee: code[0..5].to_string(), identifiant_communal_voie: code[5..9].to_string(), } } else { format!("{}0{}", &code[0..2], &code[2..]) }; } _ => unreachable!(), } } FixedFantoirCode::Computed(code) } pub fn uses_specific_code_direction (code: &str) -> bool { DEPARTMENTS_WITH_CODE_DIRECTION .iter() .any(|&dpt| code.starts_with(dpt)) } pub fn compute_rivoli_key (code: &str) -> char { // See https://georezo.net/forum/viewtopic.php?id=102292 if code.starts_with("2A") || code.starts_with("2B") { // 2A would be 2 10 and 2B would be 2 11, but how to build a number to multiply by 19? unimplemented!() } let part_commune: i32 = code[0..6].parse().unwrap(); let type_voie = code.chars().nth(6).unwrap(); let type_voie = if type_voie.is_alphabetic() { type_voie as u32 - 55 } else { type_voie.to_digit(10).unwrap() }; let numero_identifiant_communal_voie: i32 = code[7..].parse().unwrap(); let index = (part_commune * 19 + type_voie as i32 * 11 + numero_identifiant_communal_voie) % 23; return RIVOLI_STRING[index as usize]; } +/// Determines if the specified expression looks like a FANTOIR code, +/// as used by DGFiP official FANTOIR file. +/// +/// The IGN or OpenStreetMap format variants will return false. +/// +/// This method does NOT check the RIVOLI key, only the format. +pub fn looks_like_canonical_fantoir_code (expression: &str) -> bool { + RE_FANTOIR.is_match(expression) +} + #[cfg(test)] mod tests { // Note this useful idiom: importing names from outer (for mod tests) scope. use super::*; #[test] fn test_parse_fantoir_date() { let expected = NaiveDate::from_ymd_opt(1987, 1, 1).unwrap(); let actual = parse_fantoir_date("1987001").unwrap(); assert_eq!(expected, actual); } #[test] fn test_parse_optional_string() { assert_eq!(Some(String::from("quux")), parse_optional_string("quux")); } #[test] fn test_parse_optional_string_with_trailing_spaces() { assert_eq!(Some(String::from("quux")), parse_optional_string("quux ")); } #[test] fn test_parse_optional_string_when_empty() { assert_eq!(true, parse_optional_string("").is_none()); } #[test] fn test_parse_optional_string_when_only_spaces() { assert_eq!(true, parse_optional_string(" ").is_none()); } #[test] pub fn test_fix_fantoir_code () { assert_fixed_fantoir_code("755112P144L", fix_fantoir_code("755112P144L")); assert_fixed_fantoir_code("972231B026U", fix_fantoir_code("97231_B026")); assert_fixed_fantoir_code("4401090516U", fix_fantoir_code("441090516U")); assert_fixed_fantoir_code("972222B305L", fix_fantoir_code("972222B305")); } fn assert_fixed_fantoir_code (expected: &str, actual: FixedFantoirCode) { match actual { FixedFantoirCode::Computed(code) => { assert_eq!(expected, &code); }, _ => assert!(false, "Expected a computed FANTOIR code") } } #[test] pub fn test_fix_fantoir_code_when_it_cannot_be_computed () { let expected = FixedFantoirCode::ToSearch { code_insee: "92002".to_string(), identifiant_communal_voie: "5130".to_string() }; assert_eq!(expected, fix_fantoir_code("920025130X"), "As code direction can't be computed, this code should be to search"); assert_eq!(expected, fix_fantoir_code("92002_5130"), "As code direction can't be computed, this code should be to search"); } - #[test] pub fn test_compute_rivoli_key() { assert_eq!('W', compute_rivoli_key("380003B001")); assert_eq!('U', compute_rivoli_key("972231B026")); } #[test] pub fn test_compute_rivoli_key_with_type_voie_zero() { assert_eq!('C', compute_rivoli_key("9722230261")); } + + #[test] + pub fn test_looks_like_canonical_fantoir_code () { + assert!(looks_like_canonical_fantoir_code("770246B015C")); + } + + #[test] + pub fn test_looks_like_canonical_fantoir_code_for_variants () { + assert!(!looks_like_canonical_fantoir_code("770246B015")); + assert!(!looks_like_canonical_fantoir_code("77246_B015")); + assert!(!looks_like_canonical_fantoir_code("77246B015C")); + } } diff --git a/fantoir-datasource/src/main.rs b/fantoir-datasource/src/main.rs index 364c97e..1f1cdcd 100644 --- a/fantoir-datasource/src/main.rs +++ b/fantoir-datasource/src/main.rs @@ -1,120 +1,120 @@ use std::env; use clap::{Args, Parser}; use crate::commands::promote::promote; mod commands; mod db; mod fantoir; mod services; #[derive(Debug, Parser)] #[command(name = "fantoir-datasource")] #[clap(author="Nasqueron project", version, about="Import FANTOIR database into PostgreSQL", long_about=None)] enum FantoirCommand { /// Fetch the last version of the FANTOIR file Fetch(FetchArgs), /// Import from FANTOIR file generated by the DGFIP #[command(arg_required_else_help = true)] Import(ImportArgs), /// Promote an imported FANTOIR table as the current FANTOIR table to use #[command(arg_required_else_help = true)] Promote(PromoteArgs), /// Query Wikidata SPARQL end-point to enrich FANTOIR information Wikidata(WikidataArgs), /// Query the imported FANTOIR table Query(QueryArgs) } #[derive(Debug, Args)] pub struct FetchArgs { /// Overwrite file if already existing #[arg(long)] overwrite: bool, } #[derive(Debug, Args)] pub struct ImportArgs { /// Create table if it doesn't exist #[arg(short = 'c')] create_table: bool, /// Truncate table if it already exists, allowing the overwrite mode. /// If not specified, the script will fail if table exists. #[arg(short = 't')] overwrite_table: bool, /// The FANTOIR file to import fantoir_file: String, /// The name of the table to populate fantoir_table: String, } #[derive(Debug, Args)] pub struct PromoteArgs { /// The name of the table to promote fantoir_table: String, } #[derive(Debug, Args)] pub struct WikidataArgs { /// Create table if it doesn't exist #[arg(short = 'c')] create_table: bool, /// Truncate table if it already exists, allowing the overwrite mode. /// If not specified, the script will fail if table exists. #[arg(short = 't')] overwrite_table: bool, /// Generate a Wikidata maintenance report instead to print errors to stderr #[arg(long)] maintenance_report: bool, } #[derive(Debug, Args)] #[clap(trailing_var_arg=true)] pub struct QueryArgs { /// INSEE code to identify a commune #[arg(long)] code_insee: Option, /// Identifier of the voie by the commune #[arg(long)] code_voie: Option, - /// Expression to search - libelle: Vec, + /// Expression to search, FANTOIR code or label of the voie + expression: Vec, } #[tokio::main] async fn main() { let command = FantoirCommand::parse(); // Will exit if argument is missing or --help/--version provided. let database_url = env::var("DATABASE_URL") .expect("The environment variable DATABASE_URL need to be set to your PostgreSQL database."); match command { FantoirCommand::Fetch(args) => { commands::fetch::fetch(args.overwrite).await; }, FantoirCommand::Import(args) => { commands::import::import(&args, &database_url).await; }, FantoirCommand::Promote(args) => { promote(&args.fantoir_table, &database_url).await; }, FantoirCommand::Wikidata(args) => { commands::wikidata::import(&args, &database_url).await }, FantoirCommand::Query(args) => { commands::query::search(args, &database_url).await }, }; }