D2754.id8426.diff

diff --git a/.gitignore b/.gitignore
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,10 @@
+# Rust
/target
+Cargo.lock
+
+# Python
+__pycache__
+
+# Data
/FANTOIR*
!/fantoir-datasource
-Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,5 +2,7 @@
members = [
"fantoir-datasource",
+ "language-subtag-registry-datasource",
+ "rfc-datasource",
"opendatasoft-explore-api",
]
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -14,6 +14,23 @@
More information: [fantoir-datasource README](fantoir-datasource/README.md)
+### IANA language subtag registry (language-subtag-registry-datasource)
+
+Import the IANA language subtag registry, as defined by RFC 5646, and convert
+it to a specified text-based format.
+
+Can be used to refresh the language Darkbot database for IRC bots.
+
+More information: [language-subtag-registry-datasource README](language-subtag-registry-datasource/README.md)
+
+### RFC import tool (rfc-datasource)
+
+Import the RFC index and convert it to a specified text-based format.
+
+Can be used to refresh the RFC Darkbot database for IRC bots.
+
+More information: [rfc-datasource README](rfc-datasource/README.md)
+
### Opendatasoft Explore API client (opendatasoft-explore-api)
The opendatasoft-explore-api crate allows querying the Opendatasoft Explore API from Rust code.
diff --git a/_pipelines/README.md b/_pipelines/README.md
new file mode 100644
--- /dev/null
+++ b/_pipelines/README.md
@@ -0,0 +1,14 @@
+## Nasqueron Datasources :: pipelines
+
+The dags/ directory contains pipelines as code (DAGs) for Apache Airflow.
+
+Those pipelines can be used:
+
+ - at Nasqueron, on our Airflow instance
+ - elsewhere, as sample documentation on how to use our datasources
+   components and how to glue them together
+
+The nasqueron_datasources module is published to the dags folder too,
+so it is available from the different DAGs. It contains helper methods.
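+
+For example, a DAG task can invoke a datasource command through the `run`
+helper (the working directory below is illustrative):
+
+    from nasqueron_datasources.pipelines.commands import run
+
+    exit_code, stdout, stderr = run(
+        ["fantoir-datasource", "fetch"], cwd="/srv/datasources/fantoir"
+    )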
+
+Unit tests are available in the tests/ folder.
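+
+For example, assuming the dags/ directory is on the PYTHONPATH:
+
+    cd _pipelines/tests
+    PYTHONPATH=../dags python3 -m unittest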
diff --git a/_pipelines/dags/fantoir_fetch.py b/_pipelines/dags/fantoir_fetch.py
new file mode 100644
--- /dev/null
+++ b/_pipelines/dags/fantoir_fetch.py
@@ -0,0 +1,111 @@
+# -------------------------------------------------------------
+# Nasqueron Datasources :: pipelines
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+# Project: Nasqueron
+# Pipeline: Datasources > FANTOIR > fetch
+# License: BSD-2-Clause
+# -------------------------------------------------------------
+
+from datetime import datetime
+import json
+import requests
+
+from airflow.decorators import dag, task
+from airflow.models import Variable, TaskInstance
+from airflow.operators.python import ShortCircuitOperator
+from airflow.operators.trigger_dagrun import TriggerDagRunOperator
+
+from nasqueron_datasources.pipelines.commands import run, parse_environment
+from nasqueron_datasources.pipelines.errors import CommandException, WorkflowException
+
+NOTIFICATION_URL = "https://notifications.nasqueron.org/gate/Notification/Nasqueron"
+
+
+@dag(
+    dag_id="fantoir_fetch",
+    schedule=None,
+    start_date=datetime(2023, 1, 1),
+    tags=["datasources", "fantoir", "download", "external"],
+    # Render templates as native objects, so call_import_dag passes the
+    # environment dict pulled from XCom as a dict, not as its string repr.
+    render_template_as_native_obj=True,
+)
+def fantoir_fetch_dag():
+ """
+ ### Pipeline for FANTOIR datasource - fetch
+
+    This pipeline checks if a new version of the FANTOIR file is published.
+
+    If so, it downloads it, extracts it and triggers the import DAG.
+
+ Reference: https://agora.nasqueron.org/Fantoir-datasource
+ """
+
+    # multiple_outputs pushes each key of the returned dict as its own XCom
+    # entry, so downstream tasks can pull "new_version" and "environment".
+    @task(multiple_outputs=True)
+ def fetch() -> dict:
+ """Fetches FANTOIR from data.economie.gouv.fr, if a new version is available."""
+ exit_code, stdout, stderr = run(
+ ["fantoir-datasource", "fetch"],
+ cwd=Variable.get("fantoir_directory"),
+ env={
+ "DATABASE_URL": "", # a value is unneeded for fetch operation
+ },
+ )
+
+ if exit_code == 12:
+ # No new version available
+ return {
+ "new_version": False,
+ "environment": {},
+ }
+
+ if exit_code != 0:
+ # Failure
+ raise CommandException("Can't fetch FANTOIR", exit_code, stderr)
+
+ return {
+ "new_version": True,
+ "environment": parse_environment(stdout),
+ }
+
+ def is_new_version_available(task_instance: TaskInstance) -> bool:
+ return task_instance.xcom_pull(task_ids="fetch", key="new_version")
+
+ check_fetch = ShortCircuitOperator(
+ task_id="check_fetch",
+ python_callable=is_new_version_available,
+ doc_md="""Determine if a new version is available from previous task.""",
+ )
+
+    call_import_dag = TriggerDagRunOperator(
+        task_id="call_import_dag",
+        trigger_dag_id="fantoir_import",
+        conf={
+            "fantoir_environment": "{{ task_instance.xcom_pull(task_ids='fetch', key='environment') }}"
+        },
+        doc_md="""Launch the workflow to import the new FANTOIR version.
+
+        Triggered by the fantoir_fetch DAG, as a new version is available.""",
+    )
+
+ @task
+ def notify(task_instance: TaskInstance):
+ """Sends a notification a new version is available."""
+
+ fantoir_file = task_instance.xcom_pull(task_ids="fetch", key="environment").get(
+ "FANTOIR_FILE", "(unknown)"
+ )
+        dag_run_id = task_instance.xcom_pull(
+            task_ids="call_import_dag", key="trigger_run_id"
+        )
+ notification = {
+ "service": "Airflow",
+ "project": "Nasqueron",
+ "group": "Datasources",
+ "type": "fantoir-fetch",
+ "text": f"A new version of FANTOIR has been fetched: {fantoir_file}. Triggering import workflow: {dag_run_id}.",
+ }
+
+ response = requests.post(NOTIFICATION_URL, data=json.dumps(notification))
+ if response.status_code != 200:
+ raise WorkflowException(
+ "Can't send notification: HTTP error " + str(response.status_code)
+ )
+
+ fetch() >> check_fetch >> call_import_dag >> notify()
diff --git a/_pipelines/dags/fantoir_import.py b/_pipelines/dags/fantoir_import.py
new file mode 100644
--- /dev/null
+++ b/_pipelines/dags/fantoir_import.py
@@ -0,0 +1,104 @@
+# -------------------------------------------------------------
+# Nasqueron Datasources :: pipelines
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+# Project: Nasqueron
+# Pipeline: Datasources > FANTOIR > import
+# License: BSD-2-Clause
+# -------------------------------------------------------------
+
+from datetime import datetime
+
+from airflow.decorators import dag, task
+from airflow.models import Connection, Variable
+
+from nasqueron_datasources.pipelines.commands import run
+
+
+@dag(
+ dag_id="fantoir_import",
+ schedule=None,
+ start_date=datetime(2023, 1, 1),
+ tags=["datasources", "fantoir", "postgresql", "external"],
+)
+def fantoir_import_dag():
+ """
+ ### Pipeline for FANTOIR datasource - import
+
+ This pipeline imports FANTOIR into PostgreSQL, enriches it
+ and promotes the new table as the one to use.
+
+ Enrichment is done by fetching information from:
+ - Wikidata
+
+ Reference: https://agora.nasqueron.org/Fantoir-datasource
+ """
+
+ fantoir_directory = Variable.get("fantoir_directory")
+ database_url = Connection.get_connection_from_secrets("postgresql_fantoir").get_uri()
+
+    @task
+    def import_to_pgsql(dag_run=None):
+        # Jinja templates aren't rendered inside a @task body, so read the
+        # environment directly from the conf sent by the fantoir_fetch DAG.
+        fantoir_environment = dag_run.conf.get("fantoir_environment", {})
+        run(
+            [
+                "fantoir-datasource",
+                "import",
+                fantoir_environment.get("FANTOIR_FILE", ""),
+                fantoir_environment.get("FANTOIR_TABLE", ""),
+            ],
+            cwd=fantoir_directory,
+            env={
+                "DATABASE_URL": database_url,
+            },
+        )
+
+ @task
+ def enrich_from_wikidata():
+ run(
+ ["fantoir-datasource", "wikidata"],
+ cwd=fantoir_directory,
+ env={
+ "DATABASE_URL": database_url,
+ },
+ )
+
+ @task
+ def promote():
+ run(
+ ["fantoir-datasource", "promote"],
+ cwd=fantoir_directory,
+ env={
+ "DATABASE_URL": database_url,
+ },
+ )
+
+ @task
+ def publish_to_configuration():
+ """
+ NOT IMPLEMENTED.
+
+ Publish new table name to use to etcd/consul
+ """
+ pass
+
+ @task
+ def notify():
+ """
+ NOT IMPLEMENTED.
+
+ Send notification payload to Notifications Center
+ """
+ pass
+
+    (
+        import_to_pgsql()
+        >> [
+            # Enrichment sources can run in parallel.
+            enrich_from_wikidata(),
+        ]
+        >> promote()
+        >> [
+            # Post-action tasks can run in parallel too.
+            publish_to_configuration(),
+            notify(),
+        ]
+    )
diff --git a/_pipelines/dags/nasqueron_datasources/__init__.py b/_pipelines/dags/nasqueron_datasources/__init__.py
new file mode 100644
diff --git a/_pipelines/dags/nasqueron_datasources/pipelines/__init__.py b/_pipelines/dags/nasqueron_datasources/pipelines/__init__.py
new file mode 100644
diff --git a/_pipelines/dags/nasqueron_datasources/pipelines/commands.py b/_pipelines/dags/nasqueron_datasources/pipelines/commands.py
new file mode 100644
--- /dev/null
+++ b/_pipelines/dags/nasqueron_datasources/pipelines/commands.py
@@ -0,0 +1,57 @@
+# -------------------------------------------------------------
+# Nasqueron Datasources :: pipelines :: command utilities
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+# Project: Nasqueron
+# Description: Helpers to handle commands in Python pipelines
+# License: BSD-2-Clause
+# -------------------------------------------------------------
+
+
+import os
+import subprocess
+
+
+# -------------------------------------------------------------
+# Subprocess wrappers
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+
+def run(command, cwd=None, env=None):
+ """
+ Runs the specified command and return exit_code, stdout, stderr.
+
+ :type env: dict|None
+ :param env: The environment variables to pass to the software
+ :type command: string|list
+ :param command: The command to run, as a string to pass to shell (to avoid) or a list [command, arg1, arg2, ...]
+ :param cwd: The working directory for the command to run
+
+ :return: (exit_code, stdout, stderr)
+ """
+    if env is None:
+        env = {}
+
+    # Merge the current environment with the explicit variables, so PATH
+    # stays available to resolve the command.
+    env = {**os.environ, **env}
+
+    shell = isinstance(command, str)
+    process = subprocess.run(
+        command, shell=shell, cwd=cwd, env=env, capture_output=True, text=True
+    )
+
+ return process.returncode, process.stdout, process.stderr
+
+
+# -------------------------------------------------------------
+# Environment
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+
+def parse_environment(environment_lines):
+ """
+ Parses environment as a dictionary.
+
+ This method is intended to be used with `env`, with .env files,
+ or with any command offering a similar format:
+
+ VARIABLE=value
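+
+    Example:
+
+        >>> parse_environment(["FOO=bar", "QUUX=666"])
+        {'FOO': 'bar', 'QUUX': '666'}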
+ """
+    return {
+        parts[0]: parts[1]
+        for parts in [
+            line.strip().split("=", 1) for line in environment_lines if "=" in line
+        ]
+    }
diff --git a/_pipelines/dags/nasqueron_datasources/pipelines/errors.py b/_pipelines/dags/nasqueron_datasources/pipelines/errors.py
new file mode 100644
--- /dev/null
+++ b/_pipelines/dags/nasqueron_datasources/pipelines/errors.py
@@ -0,0 +1,19 @@
+# -------------------------------------------------------------
+# Nasqueron Datasources :: pipelines :: errors
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+# Project: Nasqueron
+# License: BSD-2-Clause
+# -------------------------------------------------------------
+
+
+class WorkflowException(Exception):
+    def __init__(self, message):
+        super().__init__(message)
+
+
+class CommandException(WorkflowException):
+    def __init__(self, message, exit_code, stderr):
+        consolidated_message = "{} (exit code {}): {}".format(
+            message, exit_code, stderr
+        )
+        super().__init__(consolidated_message)
diff --git a/_pipelines/requirements.txt b/_pipelines/requirements.txt
new file mode 100644
--- /dev/null
+++ b/_pipelines/requirements.txt
@@ -0,0 +1,2 @@
+apache-airflow~=2.8.0
+requests~=2.28.2
diff --git a/_pipelines/tests/files/env b/_pipelines/tests/files/env
new file mode 100644
--- /dev/null
+++ b/_pipelines/tests/files/env
@@ -0,0 +1,3 @@
+FOO=This is a sentence.
+QUUX=666
+BAR=
diff --git a/_pipelines/tests/test_commands.py b/_pipelines/tests/test_commands.py
new file mode 100644
--- /dev/null
+++ b/_pipelines/tests/test_commands.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+
+# -------------------------------------------------------------
+# Nasqueron Datasources :: pipelines :: Tests
+# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+# Project: Nasqueron
+# License: BSD-2-Clause
+# -------------------------------------------------------------
+
+import os
+import unittest
+
+from nasqueron_datasources.pipelines import commands
+
+ENV_FILE = os.path.join(os.path.dirname(__file__), "files", "env")
+
+
+class TestCommands(unittest.TestCase):
+    def test_parse_environment(self):
+        expected = {
+            "FOO": "This is a sentence.",
+            "QUUX": "666",  # everything is parsed as a string
+            "BAR": "",  # an empty string is used instead of None for empty values
+        }
+
+        # Resolve the fixture relative to this file, so the test can run
+        # from any working directory.
+        with open(ENV_FILE) as fd:
+            self.assertDictEqual(expected, commands.parse_environment(fd))
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/fantoir-datasource/Cargo.toml b/fantoir-datasource/Cargo.toml
--- a/fantoir-datasource/Cargo.toml
+++ b/fantoir-datasource/Cargo.toml
@@ -13,7 +13,7 @@
lazy_static = "~1.4.0"
opendatasoft-explore-api = { version = "0.1.0", path = "../opendatasoft-explore-api" }
oxrdf = "~0.1.1"
-regex = "~1.7.1"
+regex = "~1.8.1"
sparesults = "~0.1.3"
[dependencies.async-scoped]
@@ -21,11 +21,11 @@
features = ["use-tokio"]
[dependencies.clap]
-version = "~4.0.32"
+version = "~4.3.0"
features = ["derive"]
[dependencies.reqwest]
-version = "~0.11.13"
+version = "~0.11.18"
features = ["gzip", "deflate"]
[dependencies.sqlx]
@@ -33,5 +33,5 @@
features = ["runtime-tokio-native-tls", "postgres", "chrono"]
[dependencies.tokio]
-version = "~1.23.0"
+version = "~1.28.1"
features = ["full"]
diff --git a/language-subtag-registry-datasource/Cargo.toml b/language-subtag-registry-datasource/Cargo.toml
new file mode 100644
--- /dev/null
+++ b/language-subtag-registry-datasource/Cargo.toml
@@ -0,0 +1,25 @@
+[package]
+name = "language-subtag-registry-datasource"
+version = "0.1.0"
+edition = "2021"
+description = "Downloads and transforms IANA language subtag registry"
+authors = [
+ "Sébastien Santoro <dereckson@espace-win.org>"
+]
+license = "BSD-2-Clause"
+
+[dependencies]
+lazy_static = "1.4.0"
+regex = "1.8.1"
+
+[dependencies.clap]
+version = "4.3.0"
+features = ["derive"]
+
+[dependencies.reqwest]
+version = "~0.11.18"
+features = ["gzip", "deflate"]
+
+[dependencies.tokio]
+version = "1.28.1"
+features = ["full"]
diff --git a/language-subtag-registry-datasource/README.md b/language-subtag-registry-datasource/README.md
new file mode 100644
--- /dev/null
+++ b/language-subtag-registry-datasource/README.md
@@ -0,0 +1,65 @@
+The `language-subtag-registry-datasource` utility allows you to download the
+IANA language subtag registry defined in RFC 5646, parse it, and transform
+the output.
+
+This registry shares language codes with the different ISO 639 lists,
+but is more inclusive and descriptive.
+
+It has been designed to output the registry in an arbitrary format,
+so we can export a Darkbot database for Odderon, one of our IRC bots.
+
+## Usage
+
+```
+language-subtag-registry-datasource
+ --format <format string>
+ [--languages-only]
+ [--aggregation-separator <separator string>]
+    [--source /path/to/registry.txt]
+```
+
+The format string can be arbitrary text or variables:
+
+| **Variable**        | **Description**                                                       |
+|---------------------|-----------------------------------------------------------------------|
+| %%id%%              | The Tag or Subtag field of the entry                                  |
+| %%<key>%%           | A field in the registry entry                                         |
+| %%fulldescription%% | A string built from the description, deprecation status and comments  |
+
+If an entry doesn't have the requested field, it is left blank.
+
+Examples for the variables:
+ - `%%Description%%` will output `Inupiaq` for the `ik` subtag
+ - `%%Description%%` will output `Sichuan Yi / Nuosu` for the `ii` subtag
+ - `%%Comments%%` will output an empty string for both `ik` and `ii` subtags
+ - `%%fulldescription%%` will output "Serbo-Croatian - sr, hr, bs are preferred for most modern uses" for `sh`
+
+If a field has several values, they are coalesced with a separator string.
+The default separator is " / ". It can be overridden with
+`--aggregation-separator`.
+
+The registry contains entries of other types than languages, like variants, regions or redundant tags.
+To parse only languages, use the `-l` or `--languages-only` flag.
+
+The utility uses as its source, in order of priority:
+ - the path specified to the `--source` argument
+ - any `registry.txt` file available in the current directory
+ - https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
+
+## Recipes
+
+### Darkbot database
+
+ language-subtag-registry-datasource -l --format "lang+%%id%% %%fulldescription%%"
+
+### CSV export
+
+Identify the fields and the order you wish to use.
+
+For example, to create a CSV with the following header:
+
+ Type,Subtag,Tag,Added,Suppress-Script,Preferred-Value,Comments,Scope,Macrolanguage,Deprecated,Description
+
+Use:
+
+ language-subtag-registry-datasource --format '"%%Type%%","%%Subtag%%","%%Tag%%","%%Added%%","%%Suppress-Script%%","%%Preferred-Value%%","%%Comments%%","%%Scope%%","%%Macrolanguage%%","%%Deprecated%%","%%Description%%"'
diff --git a/language-subtag-registry-datasource/src/language_parser.rs b/language-subtag-registry-datasource/src/language_parser.rs
new file mode 100644
--- /dev/null
+++ b/language-subtag-registry-datasource/src/language_parser.rs
@@ -0,0 +1,179 @@
+use std::collections::HashMap;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+/* -------------------------------------------------------------
+ Regexp definitions, used in builder
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+lazy_static! {
+ static ref RE_KEY: Regex = Regex::new(
+ // %%key%%
+ r"%%(.*?)%%"
+ ).unwrap();
+}
+
+/* -------------------------------------------------------------
+ Language
+
+ Each language entry from the registry is a key/value map.
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+#[derive(Debug)]
+pub struct Language {
+ pub fields: HashMap<String, Vec<String>>,
+}
+
+impl Language {
+
+ ///
+ /// Parser
+ ///
+
+ pub fn parse_document(document: &str, restrict_to_language: bool) -> Vec<Self> {
+ document
+ .split("\n%%\n")
+ .skip(1) // Metadata File-Date: <date>
+ .filter(|&entry| !restrict_to_language || entry.contains("Type: language"))
+ .map(|entry| Self::parse_entry(entry))
+ .collect()
+ }
+
+ pub fn parse_entry(entry: &str) -> Self {
+ let mut fields = HashMap::new();
+
+ let mut key = String::new();
+        let mut value = String::new();
+ let mut has_value = false;
+
+ // Pitfall: some values can extend to several lines
+ for line in entry.split("\n") {
+ if line.contains(": ") {
+ // Save previous value
+ if has_value {
+ fields
+ .entry(key)
+ .or_insert(Vec::new())
+ .push(value);
+ }
+
+ // <key>: <value> line
+ let mut tokens = line.splitn(2, ": ");
+ key = String::from(tokens.next().unwrap());
+ value = String::from(tokens.next().unwrap());
+ has_value = true;
+ } else {
+ // Multiline value. Append the line to previous value.
+ value = format!("{} {}", &value.trim(), line.trim())
+ }
+ }
+ if has_value {
+ fields
+ .entry(key)
+ .or_insert(Vec::new())
+ .push(value);
+ }
+
+ Self {
+ fields,
+ }
+ }
+
+ ///
+ /// Builder
+ ///
+
+ pub fn get_field(&self, tag: &str, separator: &str) -> Option<String> {
+ self.fields
+ .get(tag)
+ .map(|values| values.join(separator))
+ }
+
+ pub fn get_id(&self) -> Option<String> {
+ self.get_field("Subtag", "-")
+ .or_else(|| self.get_field("Tag", "-"))
+ }
+
+ pub fn build_full_description(&self, separator: &str) -> String {
+ let mut full_description = self.get_field("Description", separator)
+ .unwrap_or("<no description in IANA registry>".to_string());
+
+ if self.fields.contains_key("Deprecated") {
+ full_description.push_str(" [deprecated]");
+ }
+
+        if let Some(should_use) = self.get_field("Preferred-Value", separator) {
+            full_description.push_str("; preferred value: ");
+            full_description.push_str(&should_use);
+        }
+
+        if let Some(comments) = self.get_field("Comments", separator) {
+            full_description.push_str("; ");
+            full_description.push_str(&comments);
+        }
+
+        full_description
+    }
+
+ pub fn format(&self, format: &str, separator: &str) -> String {
+ let mut formatted = String::from(format);
+
+ if formatted.contains("%%id%%") {
+ let id = self.get_id().unwrap_or(String::new());
+ formatted = formatted.replace("%%id%%", &id);
+ }
+
+ if formatted.contains("%%fulldescription%%") {
+ let description = self.build_full_description(separator);
+ formatted = formatted.replace("%%fulldescription%%", &description);
+ }
+
+        for (key, values) in &self.fields {
+ let value = values.join(separator);
+
+ formatted = formatted.replace(
+ &format!("%%{}%%", &key),
+ &value
+ );
+ }
+
+ RE_KEY
+ .replace_all(&formatted, "")
+ .to_string()
+ }
+
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ pub fn test_format() {
+ let liquids = vec!["Water".to_string(), "Air".to_string()];
+
+ let mut fields = HashMap::new();
+ fields.insert("Liquid".to_string(), liquids);
+ fields.insert("Model".to_string(), vec!["Newtonian".to_string()]);
+
+ let language = Language { fields };
+
+ assert_eq!(
+ "Water or Air use Newtonian physic.",
+ &language.format("%%Liquid%% use %%Model%% physic.", " or ")
+ );
+
+ assert_eq!(
+ "Water or Air use Newtonian physic.",
+ &language.format("%%Liquid%% use %%Prefix%%%%Model%% physic.", " or ")
+ );
+
+ assert_eq!(
+ "", &language.format("", "")
+ );
+ }
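+
+    #[test]
+    pub fn test_parse_entry() {
+        // A minimal sketch of a registry entry, in the style of the IANA
+        // file; the ik/Inupiaq values match the README examples.
+        let entry = "Type: language\nSubtag: ik\nDescription: Inupiaq";
+
+        let language = Language::parse_entry(entry);
+
+        assert_eq!(
+            Some("Inupiaq".to_string()),
+            language.get_field("Description", " / ")
+        );
+        assert_eq!(Some("ik".to_string()), language.get_id());
+    }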
+}
diff --git a/language-subtag-registry-datasource/src/main.rs b/language-subtag-registry-datasource/src/main.rs
new file mode 100644
--- /dev/null
+++ b/language-subtag-registry-datasource/src/main.rs
@@ -0,0 +1,40 @@
+use clap::Parser;
+
+use crate::registry::get_registry;
+use crate::language_parser::Language;
+
+mod registry;
+mod language_parser;
+
+#[derive(Debug, Parser)]
+#[command(name = "language-subtag-registry-datasource")]
+#[clap(author="Nasqueron project", version, about="Download and print language subtag registry", long_about=None)]
+pub struct Args {
+ /// The format string to use
+ #[arg(long, short = 'f')]
+ format: String,
+
+ /// The aggregation separator
+ #[arg(long, short = 'a', default_value = " / ")]
+ aggregation_separator: String,
+
+ /// The path to the registry source
+ #[arg(long, short = 's')]
+ source: Option<String>,
+
+ /// Restricts parsing to language type
+ #[arg(long, short = 'l', default_value_t = false)]
+ languages_only: bool,
+}
+
+#[tokio::main]
+async fn main() {
+    let args = Args::parse(); // Exits if an argument is missing or --help/--version is provided.
+
+ let document = get_registry(args.source).await
+ .expect("Can't read or fetch registry");
+
+ for language in Language::parse_document(&document, args.languages_only) {
+ println!("{}", language.format(&args.format, &args.aggregation_separator));
+ }
+}
diff --git a/language-subtag-registry-datasource/src/registry.rs b/language-subtag-registry-datasource/src/registry.rs
new file mode 100644
--- /dev/null
+++ b/language-subtag-registry-datasource/src/registry.rs
@@ -0,0 +1,60 @@
+use std::error::Error;
+use std::fs;
+use std::path::Path;
+
+use reqwest::ClientBuilder;
+
+static REGISTRY_URL: &str = "https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry";
+
+/* -------------------------------------------------------------
+ User agent
+
+   The USER_AGENT string is built from compile-time crate metadata.
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+lazy_static::lazy_static! {
+ pub static ref USER_AGENT: String = format!(
+ "{}/{} (https://databases.nasqueron.org/)",
+ env!("CARGO_PKG_NAME"), env!("CARGO_PKG_VERSION")
+ );
+}
+
+pub fn get_user_agent() -> &'static str {
+ &USER_AGENT
+}
+
+/* -------------------------------------------------------------
+ Read or fetch registry
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+pub async fn get_registry(source: Option<String>) -> Result<String, Box<dyn Error>> {
+ match source {
+ // Case 1 - A source file has been explicitly set
+        Some(file) => Ok(fs::read_to_string(&file)?),
+
+ None => {
+ if Path::new("registry.txt").exists() {
+ // Case 2 - The file registry.txt can be found locally
+ Ok(fs::read_to_string("registry.txt")?.parse()?)
+ } else {
+ // Case 3 - Fetch the index remotely
+ Ok(fetch_registry().await?)
+ }
+ }
+ }
+}
+
+async fn fetch_registry() -> Result<String, Box<dyn Error>> {
+ let client = ClientBuilder::new()
+ .user_agent(get_user_agent())
+ .gzip(true)
+ .deflate(true)
+ .build()
+ .expect("Can't build HTTP client");
+
+ let body = client.get(REGISTRY_URL)
+ .send().await?
+ .text().await?;
+
+ Ok(body)
+}
diff --git a/opendatasoft-explore-api/Cargo.toml b/opendatasoft-explore-api/Cargo.toml
--- a/opendatasoft-explore-api/Cargo.toml
+++ b/opendatasoft-explore-api/Cargo.toml
@@ -16,14 +16,17 @@
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
-bytes = "~1.3.0" # Keep in sync with reqwest
+bytes = "~1.4.0" # Keep in sync with reqwest
chrono = { version = "~0.4", features = ["serde"] }
-reqwest = {version = "~0.11.13" }
+reqwest = {version = "~0.11.18" }
serde = "~1.0.152"
serde_derive = "~1.0.152"
serde_json = "~1.0.91"
[dev-dependencies]
-mockito = "~0.31.1"
+mockito = "~1.1.0"
lazy_static = "~1.4.0"
-tokio = { version = "~1.23.0", features = ["macros", "rt"] }
+
+[dependencies.tokio]
+version = "~1.28.1"
+features = ["macros", "rt"]
diff --git a/opendatasoft-explore-api/tests/requests_test.rs b/opendatasoft-explore-api/tests/requests_test.rs
--- a/opendatasoft-explore-api/tests/requests_test.rs
+++ b/opendatasoft-explore-api/tests/requests_test.rs
@@ -3,7 +3,7 @@
use std::collections::HashMap;
use lazy_static::lazy_static;
-use mockito::{mock, Mock};
+use mockito::{Server, ServerGuard};
use serde_json::json;
use opendatasoft_explore_api::requests::ExploreApiEndPoint;
@@ -16,49 +16,78 @@
static TEST_RECORD_ID: &'static str = "eb04cba18e872814448a7fda829f3f1918cfae0b";
lazy_static! {
- static ref MOCK_URL: String = mockito::server_url();
-
static ref MOCK_FILES: HashMap<&'static str, &'static str> = {
let mut m = HashMap::new();
- m.insert("/catalog/datasets", include_str!("requests/catalog_datasets.json"));
- m.insert("/catalog/facets", include_str!("requests/catalog_facets.json"));
- m.insert("/catalog/exports/rdf", include_str!("requests/catalog_exports.rdf"));
- m.insert("/catalog/datasets/controle_techn/records", include_str!("requests/catalog_datasets_records.json"));
- m.insert("/catalog/datasets/fichier-fantoir-des-voies-et-lieux-dits", include_str!("requests/catalog_dataset_fantoir.json"));
- m.insert("/catalog/datasets/fichier-fantoir-des-voies-et-lieux-dits/attachments", include_str!("requests/catalog_dataset_fantoir_attachments.json"));
- m.insert("/catalog/datasets/fichier-fantoir-des-voies-et-lieux-dits/facets", include_str!("requests/catalog_dataset_fantoir_facets.json"));
- m.insert("/catalog/datasets/controle_techn/records/eb04cba18e872814448a7fda829f3f1918cfae0b", include_str!("requests/catalog_dataset_record.json"));
+ m.insert(
+ "/catalog/datasets",
+ include_str!("requests/catalog_datasets.json"),
+ );
+ m.insert(
+ "/catalog/facets",
+ include_str!("requests/catalog_facets.json"),
+ );
+ m.insert(
+ "/catalog/exports/rdf",
+ include_str!("requests/catalog_exports.rdf"),
+ );
+ m.insert(
+ "/catalog/datasets/controle_techn/records",
+ include_str!("requests/catalog_datasets_records.json"),
+ );
+ m.insert(
+ "/catalog/datasets/fichier-fantoir-des-voies-et-lieux-dits",
+ include_str!("requests/catalog_dataset_fantoir.json"),
+ );
+ m.insert(
+ "/catalog/datasets/fichier-fantoir-des-voies-et-lieux-dits/attachments",
+ include_str!("requests/catalog_dataset_fantoir_attachments.json"),
+ );
+ m.insert(
+ "/catalog/datasets/fichier-fantoir-des-voies-et-lieux-dits/facets",
+ include_str!("requests/catalog_dataset_fantoir_facets.json"),
+ );
+ m.insert(
+ "/catalog/datasets/controle_techn/records/eb04cba18e872814448a7fda829f3f1918cfae0b",
+ include_str!("requests/catalog_dataset_record.json"),
+ );
m
};
}
-pub fn prepare_mock (url: &str) -> Mock {
- mock("GET", url)
- .with_status(200)
+pub async fn prepare_mock(url: &str) -> ServerGuard {
+ let mut server = Server::new_async().await;
+ server
+ .mock("GET", url)
.with_body(MOCK_FILES[url])
- .create()
+ .create_async()
+ .await;
+ server
}
#[tokio::test]
-async fn test_get_datasets () {
- let _mock = prepare_mock("/catalog/datasets");
+async fn test_get_datasets() {
+ let server = prepare_mock("/catalog/datasets").await;
- let endpoint = ExploreApiEndPoint::new(&MOCK_URL);
+ let endpoint = ExploreApiEndPoint::new(&server.url());
let catalog = endpoint.get_datasets().await;
assert_eq!(426, catalog.total_count);
- assert_eq!(Link {
- href: "https://data.economie.gouv.fr/api/v2/catalog/datasets/mef-catalogue-temporaire".to_string(),
- rel: "self".to_string(),
- }, catalog.datasets[0].links[0]);
+ assert_eq!(
+ Link {
+ href: "https://data.economie.gouv.fr/api/v2/catalog/datasets/mef-catalogue-temporaire"
+ .to_string(),
+ rel: "self".to_string(),
+ },
+ catalog.datasets[0].links[0]
+ );
assert_eq!(3, catalog.datasets.len());
}
#[tokio::test]
-async fn test_export_datasets_catalog () {
- let _mock = prepare_mock("/catalog/exports/rdf");
+async fn test_export_datasets_catalog() {
+ let server = prepare_mock("/catalog/exports/rdf").await;
- let mut response = ExploreApiEndPoint::new(&MOCK_URL)
+ let mut response = ExploreApiEndPoint::new(&server.url())
.export_datasets_catalog("rdf")
.await;
@@ -75,32 +104,32 @@
}
#[tokio::test]
-async fn test_get_facets () {
- let _mock = prepare_mock("/catalog/facets");
+async fn test_get_facets() {
+ let server = prepare_mock("/catalog/facets").await;
- let endpoint = ExploreApiEndPoint::new(&MOCK_URL);
+ let endpoint = ExploreApiEndPoint::new(&server.url());
let facets = endpoint.get_facets().await;
assert!(facets.links[0].href.starts_with(TEST_URL));
let expected_facets_categories = vec![
- "features".to_string(), "modified".to_string(),
- "publisher".to_string(), "keyword".to_string(),
+ "features".to_string(),
+ "modified".to_string(),
+ "publisher".to_string(),
+ "keyword".to_string(),
"theme".to_string(),
];
- let actual_facets_categories: Vec<_> = facets.facets
- .into_iter()
- .map(|facet| facet.name)
- .collect();
+ let actual_facets_categories: Vec<_> =
+ facets.facets.into_iter().map(|facet| facet.name).collect();
assert_eq!(expected_facets_categories, actual_facets_categories);
}
#[tokio::test]
-async fn test_get_dataset_records () {
- let _mock = prepare_mock("/catalog/datasets/controle_techn/records");
+async fn test_get_dataset_records() {
+ let server = prepare_mock("/catalog/datasets/controle_techn/records").await;
- let results = ExploreApiEndPoint::new(&MOCK_URL)
+ let results = ExploreApiEndPoint::new(&server.url())
.get_dataset_records(TEST_DATASET_WITH_RECORDS_ID)
.await;
@@ -110,7 +139,10 @@
ResultsRecord::Aggregation(_) => unreachable!(),
ResultsRecord::Record(record) => record.clone(),
};
- assert_eq!("b839362b229db63bc9b344e980ae6273be7f80fd", record.record.id.as_str());
+ assert_eq!(
+ "b839362b229db63bc9b344e980ae6273be7f80fd",
+ record.record.id.as_str()
+ );
assert_eq!(
Some(&json!("Voiture Particulière")),
record.record.fields.get("cat_vehicule_libelle")
@@ -122,10 +154,10 @@
}
#[tokio::test]
-async fn test_get_dataset_information () {
- let _mock = prepare_mock("/catalog/datasets/fichier-fantoir-des-voies-et-lieux-dits");
+async fn test_get_dataset_information() {
+ let server = prepare_mock("/catalog/datasets/fichier-fantoir-des-voies-et-lieux-dits").await;
- let dataset = ExploreApiEndPoint::new(&MOCK_URL)
+ let dataset = ExploreApiEndPoint::new(&server.url())
.get_dataset_information(TEST_DATASET_ID)
.await;
@@ -133,21 +165,26 @@
}
#[tokio::test]
-async fn test_get_dataset_attachments () {
- let _mock = prepare_mock("/catalog/datasets/fichier-fantoir-des-voies-et-lieux-dits/attachments");
+async fn test_get_dataset_attachments() {
+ let server =
+ prepare_mock("/catalog/datasets/fichier-fantoir-des-voies-et-lieux-dits/attachments").await;
- let attachments = ExploreApiEndPoint::new(&MOCK_URL)
+ let attachments = ExploreApiEndPoint::new(&server.url())
.get_dataset_attachments(TEST_DATASET_ID)
.await;
- assert!(attachments.attachments[0].metas.url.starts_with("odsfile://"));
+ assert!(attachments.attachments[0]
+ .metas
+ .url
+ .starts_with("odsfile://"));
}
#[tokio::test]
-async fn test_get_dataset_facets () {
- let _mock = prepare_mock("/catalog/datasets/fichier-fantoir-des-voies-et-lieux-dits/facets");
+async fn test_get_dataset_facets() {
+ let server =
+ prepare_mock("/catalog/datasets/fichier-fantoir-des-voies-et-lieux-dits/facets").await;
- let facets = ExploreApiEndPoint::new(&MOCK_URL)
+ let facets = ExploreApiEndPoint::new(&server.url())
.get_dataset_facets(TEST_DATASET_ID)
.await;
@@ -155,10 +192,13 @@
}
#[tokio::test]
-async fn test_get_dataset_record () {
- let _mock = prepare_mock("/catalog/datasets/controle_techn/records/eb04cba18e872814448a7fda829f3f1918cfae0b");
+async fn test_get_dataset_record() {
+ let server = prepare_mock(
+ "/catalog/datasets/controle_techn/records/eb04cba18e872814448a7fda829f3f1918cfae0b",
+ )
+ .await;
- let record = ExploreApiEndPoint::new(&MOCK_URL)
+ let record = ExploreApiEndPoint::new(&server.url())
.get_dataset_record(TEST_DATASET_WITH_RECORDS_ID, TEST_RECORD_ID)
.await;
diff --git a/rfc-datasource/Cargo.toml b/rfc-datasource/Cargo.toml
new file mode 100644
--- /dev/null
+++ b/rfc-datasource/Cargo.toml
@@ -0,0 +1,24 @@
+[package]
+name = "rfc-datasource"
+version = "0.1.0"
+edition = "2021"
+description = "Downloads and transforms RFC index"
+authors = [
+ "Sébastien Santoro <dereckson@espace-win.org>"
+]
+license = "BSD-2-Clause"
+
+[dependencies]
+lazy_static = "1.4.0"
+regex = "1.8.1"
+
+[dependencies.clap]
+version = "4.3.0"
+features = ["derive"]
+
+[dependencies.reqwest]
+version = "~0.11.18"
+
+[dependencies.tokio]
+version = "1.28.1"
+features = ["full"]
diff --git a/rfc-datasource/README.md b/rfc-datasource/README.md
new file mode 100644
--- /dev/null
+++ b/rfc-datasource/README.md
@@ -0,0 +1,39 @@
+The `rfc-datasource` utility allows you to download the RFC index, parse it,
+and transform the output.
+
+It has been designed to output the index in an arbitrary format,
+so we can export a Darkbot database for Odderon, one of our IRC bots.
+
+## Usage
+
+`rfc-datasource --format <format string> [--source /path/to/rfc-index.txt]`
+
+The format string can be arbitrary text or variables:
+
+| **Variable** | **Description** |
+|-----------------|---------------------------------------------------------------|
+| %%id%% | The number of the RFC without leading 0 |
+| %%<len>id%%     | The number of the RFC with leading 0s to fill <len> digits (1) |
+| %%description%% | The RFC title, authors and date |
+| %%status%% | The RFC status (2) |
+| %%fullstatus%% | A string summarizing the different status notes (3) |
+
+Examples for the variables:
+ - (1) e.g. `%%4id%%` will output `0065` for the RFC 65
+ - (2) e.g. `INFORMATIONAL` for RFC 2286
+ - (3) e.g. `Obsoletes RFC1938. Status: DRAFT STANDARD.` for RFC 2289
+
+The utility uses as source, by order of priority:
+ - the path specified to the `--source` argument
+ - any `rfc-index.txt` file available in the current directory
+ - https://www.ietf.org/download/rfc-index.txt
+
+## Recipes
+
+### Darkbot database
+
+ rfc-datasource --format "rfc+%%id%% %%description%% %%fullstatus%%"
+
+### CSV export
+
+ rfc-datasource --format '%%id%%,"%%description%%", "%%status%%"'
diff --git a/rfc-datasource/src/main.rs b/rfc-datasource/src/main.rs
new file mode 100644
--- /dev/null
+++ b/rfc-datasource/src/main.rs
@@ -0,0 +1,32 @@
+use clap::Parser;
+
+use crate::rfc_index::get_rfc_index;
+use crate::rfc_parser::Rfc;
+
+mod rfc_index;
+mod rfc_parser;
+
+#[derive(Debug, Parser)]
+#[command(name = "rfc-datasource")]
+#[clap(author="Nasqueron project", version, about="Download and print RFC index", long_about=None)]
+pub struct RfcArgs {
+ /// The format string to use
+ #[arg(long, short = 'f')]
+ format: String,
+
+ /// The path to the RFC index source
+ #[arg(long, short = 's')]
+ source: Option<String>,
+}
+
+#[tokio::main]
+async fn main() {
+    let args = RfcArgs::parse(); // Exits if an argument is missing or --help/--version is provided.
+
+ let document = get_rfc_index(args.source).await
+ .expect("Can't read or fetch RFC index");
+
+ for rfc in Rfc::parse_document(&document) {
+ println!("{}", rfc.format(&args.format));
+ }
+}
diff --git a/rfc-datasource/src/rfc_index.rs b/rfc-datasource/src/rfc_index.rs
new file mode 100644
--- /dev/null
+++ b/rfc-datasource/src/rfc_index.rs
@@ -0,0 +1,31 @@
+use std::error::Error;
+use std::fs;
+use std::path::Path;
+
+static RFC_INDEX_URL: &str = "https://www.ietf.org/download/rfc-index.txt";
+
+pub async fn get_rfc_index(source: Option<String>) -> Result<String, Box<dyn Error>> {
+ match source {
+ // Case 1 - A source file has been explicitly set
+        Some(file) => Ok(fs::read_to_string(&file)?),
+
+ None => {
+ if Path::new("rfc-index.txt").exists() {
+ // Case 2 - The file rfc-index.txt can be found locally
+ Ok(fs::read_to_string("rfc-index.txt")?.parse()?)
+ } else {
+ // Case 3 - Fetch the index remotely
+ Ok(fetch_rfc_index().await?)
+ }
+ }
+ }
+}
+
+async fn fetch_rfc_index() -> Result<String, Box<dyn Error>> {
+ let body = reqwest::get(RFC_INDEX_URL)
+ .await?
+ .text()
+ .await?;
+
+ Ok(body)
+}
diff --git a/rfc-datasource/src/rfc_parser.rs b/rfc-datasource/src/rfc_parser.rs
new file mode 100644
--- /dev/null
+++ b/rfc-datasource/src/rfc_parser.rs
@@ -0,0 +1,214 @@
+use std::collections::HashMap;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+/* -------------------------------------------------------------
+ Regexp definitions, used in parser and builder
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+lazy_static!{
+ static ref RE_RFC: Regex = Regex::new(
+ // <id> <description> <metadata...>
+ r"(\d+) (.*?) (\(.*\))"
+ ).unwrap();
+
+ static ref RE_RFC_METADATA: Regex = Regex::new(
+ // (...) (...) (...)
+ r"\((.*?)\)"
+ ).unwrap();
+
+ static ref RE_ID: Regex = Regex::new(
+ // %%9id%%
+ r"\%(\d+)id\%"
+ ).unwrap();
+}
+
+/* -------------------------------------------------------------
+ RFC
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+#[derive(Debug)]
+pub struct Rfc {
+ pub id: i32,
+ pub description: String,
+
+ pub metadata: HashMap<String, String>,
+ pub untagged_metadata: Vec<String>,
+}
+
+impl Rfc {
+
+ ///
+ /// Parser
+ ///
+
+ pub fn parse_document(document: &str) -> Vec<Self> {
+ let lines: Vec<_> = document.lines().collect();
+
+ let start_index = lines
+ .iter()
+ .position(|&line| line.starts_with("0001"))
+ .unwrap_or(0);
+
+ let document = lines[start_index..].join("\n");
+
+ Self::parse_blocks(&document)
+ }
+
+    fn parse_blocks(document: &str) -> Vec<Self> {
+        document
+            .split("\n\n")
+            .filter_map(Self::parse_block)
+            .collect()
+    }
+
+ pub fn parse_block(block: &str) -> Option<Self> {
+ let rfc_expression: Vec<&str> = block
+ .split("\n")
+ .map(|line| line.trim_start())
+ .collect();
+
+ Self::parse_line(&rfc_expression.join(" "))
+ }
+
+ fn parse_line(line: &str) -> Option<Self> {
+ match RE_RFC.captures(line) {
+ None => None,
+
+ Some(caps) => {
+ match caps.len() {
+ 4 => {
+ let (metadata, untagged_metadata) = Self::parse_metadata_line(
+ caps.get(3)?.as_str()
+ );
+
+ Some(Rfc {
+ id: caps.get(1)?.as_str().parse::<i32>().ok()?,
+ description: caps.get(2)?.as_str().to_string(),
+ metadata,
+ untagged_metadata,
+ })
+ },
+ _ => None,
+ }
+ }
+ }
+ }
+
+ fn parse_metadata_line(expression: &str) -> (HashMap<String, String>, Vec<String>) {
+ let mut metadata = HashMap::new();
+ let mut untagged_metadata = Vec::new();
+
+ RE_RFC_METADATA
+ .captures_iter(expression)
+ .map(|cap| cap.get(1).unwrap().as_str())
+ .for_each(|value| {
+ if value.contains(":") {
+ let parts: Vec<_> = value.splitn(2, ": ").collect(); // K: V
+ metadata.insert(parts[0].to_owned(), parts[1].to_owned());
+ } else {
+ untagged_metadata.push(String::from(value));
+ }
+ });
+
+ (metadata, untagged_metadata)
+ }
+
+ ///
+ /// Builder
+ ///
+
+    pub fn get_status(&self) -> Option<String> {
+ self.metadata
+ .get("Status")
+ .map(|value| String::from(value))
+ }
+
+    pub fn get_full_status_metadata(&self) -> Vec<String> {
+ let mut all_metadata: Vec<String> = self.untagged_metadata
+ .iter()
+ .map(|value| format!("{}.", value))
+ .collect();
+
+ all_metadata.extend(
+ self.metadata
+ .iter()
+ .filter(|&(key, _value)| key != "DOI" && key != "Format")
+ .map(|(key, value)| format!("{}: {}.", key, value))
+ );
+
+ all_metadata
+ }
+
+    pub fn get_full_status(&self) -> String {
+ self.get_full_status_metadata()
+ .join(" ")
+ }
+
+ ///
+ /// Format
+ ///
+
+ pub fn format(&self, format: &str) -> String {
+ // Replace expressions like %%4id%% %%5id%%
+ let matches = RE_ID
+ .captures_iter(&format)
+ .map(|caps| caps.get(1).unwrap()
+ .as_str()
+ .parse::<usize>().unwrap());
+
+ let mut formatted_rfc = String::from(format);
+        for len in matches {
+            formatted_rfc = formatted_rfc.replace(
+                &format!("%%{}id%%", len),
+                &zerofill(self.id, len),
+            );
+        }
+
+ // Replace straightforward variables
+ formatted_rfc
+ .replace("%%id%%", &self.id.to_string())
+ .replace("%%description%%", &self.description)
+ .replace("%%status%%", &self.get_status().unwrap_or(String::new()))
+ .replace("%%fullstatus%%", &self.get_full_status())
+ }
+}
+
+/* -------------------------------------------------------------
+ Helper methods
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+fn zerofill(number: i32, width: usize) -> String {
+ format!("{:0>width$}", number, width = width)
+}
+
+/* -------------------------------------------------------------
+ Unit tests
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+    pub fn test_zerofill() {
+ // Test case 1: number is smaller than width (usual case)
+ assert_eq!(zerofill(42, 5), "00042");
+
+ // Test case 2: number is equal to width
+ assert_eq!(zerofill(12345, 5), "12345");
+
+ // Test case 3: number is larger than width
+ assert_eq!(zerofill(987654, 4), "987654");
+
+ // Test case 4: number is zero
+ assert_eq!(zerofill(0, 3), "000");
+
+ // Test case 5: width is zero
+ assert_eq!(zerofill(987, 0), "987");
+ }
+
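+    #[test]
+    pub fn test_parse_block() {
+        // A sketch of an index entry, in the style of rfc-index.txt; the
+        // status matches the README example for RFC 2286.
+        let block = "2286 Test Cases for HMAC-RIPEMD160 and HMAC-RIPEMD128. J. Kapp.\n     February 1998. (Format: TXT) (Status: INFORMATIONAL)";
+
+        let rfc = Rfc::parse_block(block).expect("The block should parse as a RFC entry");
+
+        assert_eq!(2286, rfc.id);
+        assert_eq!(Some("INFORMATIONAL".to_string()), rfc.get_status());
+        assert_eq!("0002286", rfc.format("%%7id%%"));
+    }
+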
+}
