Page MenuHomeDevCentral

D3156.diff
No OneTemporary

D3156.diff

diff --git a/Cargo.toml b/Cargo.toml
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,6 +2,7 @@
members = [
"fantoir-datasource",
+ "language-subtag-registry-datasource",
"rfc-datasource",
"opendatasoft-explore-api",
]
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -14,6 +14,15 @@
More information: [fantoir-datasource README](fantoir-datasource/README.md)
+### IANA language subtag registry (language-subtag-registry-datasource)
+
+Import IANA language subtag registry datasource from RFC 5646 and convert it to
+the specified text-based format.
+
+Can be used to refresh language Darkbot database for IRC bots.
+
+More information: [language-subtag-registry-datasource README](language-subtag-registry-datasource/README.md)
+
### RFC import fool (rfc-datasource)
Import RFC index and convert it to the specified text-based format.
diff --git a/language-subtag-registry-datasource/Cargo.toml b/language-subtag-registry-datasource/Cargo.toml
new file mode 100644
--- /dev/null
+++ b/language-subtag-registry-datasource/Cargo.toml
@@ -0,0 +1,25 @@
+[package]
+name = "language-subtag-registry-datasource"
+version = "0.1.0"
+edition = "2021"
+description = "Downloads and transforms IANA language subtag registry"
+authors = [
+ "Sébastien Santoro <dereckson@espace-win.org>"
+]
+license = "BSD-2-Clause"
+
+[dependencies]
+lazy_static = "1.4.0"
+regex = "1.8.1"
+
+[dependencies.clap]
+version = "4.3.0"
+features = ["derive"]
+
+[dependencies.reqwest]
+version = "~0.11.18"
+features = ["gzip", "deflate"]
+
+[dependencies.tokio]
+version = "1.28.1"
+features = ["full"]
diff --git a/language-subtag-registry-datasource/README.md b/language-subtag-registry-datasource/README.md
new file mode 100644
--- /dev/null
+++ b/language-subtag-registry-datasource/README.md
@@ -0,0 +1,65 @@
+The `language-subtag-registry-datasource` utility downloads the
+IANA language subtag registry defined in RFC 5646,
+parses it, and transforms the output.
+
+This registry shares language codes with the different ISO-639 lists,
+but is more inclusive and descriptive.
+
+It has been designed to output the index in an arbitrary format,
+so we can export a Darkbot database for Odderon, one of our IRC bots.
+
+## Usage
+
+```
+language-subtag-registry-datasource
+ --format <format string>
+ [--languages-only]
+ [--aggregation-separator <separator string>]
+ [--source /path/to/registry.txt]
+```
+
+The format string can be arbitrary text or variables:
+
+| **Variable** | **Description** |
+|-----------------|-------------------------------------------|
+| %%id%% | The Tag or Subtag field of the entry |
+| %%<key>%% | A field in the registry entry |
+| %%fulldescription%% | A string built with description, comments |
+
+If an entry doesn't have the required field, it is left blank.
+
+Examples for the variables:
+ - `%%Description%%` will output `Inupiaq` for the `ik` subtag
+ - `%%Description%%` will output `Sichuan Yi / Nuosu` for the `ii` subtag
+ - `%%Comments%%` will output an empty string for both `ik` and `ii` subtags
+ - `%%fulldescription%%` will output "Serbo-Croatian; sr, hr, bs are preferred for most modern uses" for `sh`
+
+If a field has several values, they are coalesced and a specific string
+is used as separator. Default separator is " / ". It can be overridden with
+`--aggregation-separator`.
+
+The database contains entries of other types than languages, like variants, regions or redundant.
+To only parse languages, use `-l` or `--languages-only` flag.
+
+The utility reads the registry from the first available source, in order of priority:
+ - the path given with the `--source` argument
+ - any `registry.txt` file available in the current directory
+ - https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
+
+## Recipes
+
+### Darkbot database
+
+ language-subtag-registry-datasource -l --format "lang+%%id%% %%fulldescription%%"
+
+### CSV export
+
+Identify the fields and the order you wish to use.
+
+For example, to create a CSV with the following header:
+
+ Type,Subtag,Tag,Added,Suppress-Script,Preferred-Value,Comments,Scope,Macrolanguage,Deprecated,Description
+
+Use:
+
+ language-subtag-registry-datasource --format '"%%Type%%","%%Subtag%%","%%Tag%%","%%Added%%","%%Suppress-Script%%","%%Preferred-Value%%","%%Comments%%","%%Scope%%","%%Macrolanguage%%","%%Deprecated%%","%%Description%%"'
diff --git a/language-subtag-registry-datasource/src/language_parser.rs b/language-subtag-registry-datasource/src/language_parser.rs
new file mode 100644
--- /dev/null
+++ b/language-subtag-registry-datasource/src/language_parser.rs
@@ -0,0 +1,179 @@
+use std::collections::HashMap;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+/* -------------------------------------------------------------
+ Regexp definitions, used in builder
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+lazy_static! {
+    // Matches one `%%key%%` placeholder of a format string.
+    // The lazy group captures the shortest text found between two `%%` markers.
+    static ref RE_KEY: Regex = Regex::new(r"%%(.*?)%%").unwrap();
+}
+
+/* -------------------------------------------------------------
+ Language
+
+ Each language entry from the registry is a key/value map.
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+/// One entry of the registry, stored as a map of field name to values.
+///
+/// A field name can be present several times in an entry,
+/// hence the list of values for each key.
+#[derive(Debug)]
+pub struct Language {
+    pub fields: HashMap<String, Vec<String>>,
+}
+
+impl Language {
+
+    // ---- Parser ----
+
+    /// Splits a registry document into entries and parses each of them.
+    ///
+    /// Entries are separated by a line containing only `%%`. The first chunk
+    /// is skipped, as it holds the File-Date metadata and not an entry.
+    /// When `restrict_to_language` is set, only the entries containing
+    /// the text `Type: language` are kept.
+    pub fn parse_document(document: &str, restrict_to_language: bool) -> Vec<Self> {
+        let mut languages = Vec::new();
+
+        // skip(1): the leading chunk is the `File-Date: <date>` metadata
+        for record in document.split("\n%%\n").skip(1) {
+            if restrict_to_language && !record.contains("Type: language") {
+                continue;
+            }
+
+            languages.push(Self::parse_entry(record));
+        }
+
+        languages
+    }
+
+    /// Parses one registry entry into a map of field name to values.
+    ///
+    /// A line shaped `<key>: <value>` opens a new field. A line starting with
+    /// whitespace, or without any `: ` separator, continues the value of the
+    /// field opened before it, and is joined to it with a single space.
+    /// Blank lines are ignored. A field met several times keeps all its values.
+    pub fn parse_entry(entry: &str) -> Self {
+        let mut fields: HashMap<String, Vec<String>> = HashMap::new();
+
+        // Field being read; stored once the next field starts or the entry ends.
+        let mut current: Option<(String, String)> = None;
+
+        // lines() also drops the `\r` of CRLF line endings.
+        for line in entry.lines() {
+            if line.trim().is_empty() {
+                // Typically the empty chunk found after the final newline of
+                // the document: appending it would leave a trailing space.
+                continue;
+            }
+
+            // Folded lines are indented, so a `: ` found in one of them
+            // belongs to the value and doesn't open a new field.
+            let is_folded = line.starts_with(char::is_whitespace);
+
+            match line.split_once(": ") {
+                Some((key, value)) if !is_folded => {
+                    // Save previous value
+                    if let Some((previous_key, previous_value)) = current.take() {
+                        fields.entry(previous_key).or_default().push(previous_value);
+                    }
+
+                    current = Some((key.to_string(), value.to_string()));
+                }
+
+                _ => {
+                    // Multiline value. A continuation met before any field
+                    // has nothing to extend and is dropped.
+                    if let Some((_, value)) = current.as_mut() {
+                        *value = format!("{} {}", value.trim(), line.trim());
+                    }
+                }
+            }
+        }
+
+        if let Some((key, value)) = current {
+            fields.entry(key).or_default().push(value);
+        }
+
+        Self { fields }
+    }
+
+    // ---- Builder ----
+
+    /// Returns the values of the field `tag` joined with `separator`,
+    /// or `None` when the entry has no such field.
+    pub fn get_field(&self, tag: &str, separator: &str) -> Option<String> {
+        let values = self.fields.get(tag)?;
+
+        Some(values.join(separator))
+    }
+
+    /// Returns the identifier of the entry: its `Subtag` field when present,
+    /// its `Tag` field otherwise. Several values are joined with `-`.
+    pub fn get_id(&self) -> Option<String> {
+        ["Subtag", "Tag"]
+            .into_iter()
+            .find_map(|tag| self.get_field(tag, "-"))
+    }
+
+    /// Builds a one-line summary of the entry.
+    ///
+    /// The summary starts with the description, or with a placeholder text
+    /// when the entry has none. It is followed by ` [deprecated]` when a
+    /// `Deprecated` field exists, then by the preferred value and by the
+    /// comments when those fields exist, each introduced by `; `.
+    pub fn build_full_description(&self, separator: &str) -> String {
+        let mut summary = match self.get_field("Description", separator) {
+            Some(description) => description,
+            None => String::from("<no description in IANA registry>"),
+        };
+
+        if self.fields.contains_key("Deprecated") {
+            summary += " [deprecated]";
+        }
+
+        if let Some(preferred) = self.get_field("Preferred-Value", separator) {
+            summary += "; preferred value: ";
+            summary += &preferred;
+        }
+
+        if let Some(comments) = self.get_field("Comments", separator) {
+            summary += "; ";
+            summary += &comments;
+        }
+
+        summary
+    }
+
+    /// Renders the entry according to a format string.
+    ///
+    /// `%%id%%` is replaced by the entry identifier, `%%fulldescription%%`
+    /// by the summary of the entry, and `%%<key>%%` by the values of that
+    /// field joined with `separator`. Any `%%...%%` placeholder still
+    /// present afterwards is removed from the output.
+    pub fn format(&self, format: &str, separator: &str) -> String {
+        let mut output = format.to_string();
+
+        // Virtual keys are resolved before the entry fields.
+        if output.contains("%%id%%") {
+            output = output.replace("%%id%%", &self.get_id().unwrap_or_default());
+        }
+
+        if output.contains("%%fulldescription%%") {
+            output = output.replace(
+                "%%fulldescription%%",
+                &self.build_full_description(separator),
+            );
+        }
+
+        let expanded = self.fields.iter().fold(output, |text, (key, values)| {
+            text.replace(&format!("%%{key}%%"), &values.join(separator))
+        });
+
+        // Placeholders without a matching field are left blank.
+        RE_KEY.replace_all(&expanded, "").into_owned()
+    }
+
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    pub fn test_format() {
+        // Liquid has two values to exercise the separator, Model has one.
+        let fields = HashMap::from([
+            ("Liquid".to_string(), vec!["Water".to_string(), "Air".to_string()]),
+            ("Model".to_string(), vec!["Newtonian".to_string()]),
+        ]);
+        let language = Language { fields };
+
+        let expected = "Water or Air use Newtonian physic.";
+
+        // Known keys are replaced by their joined values.
+        assert_eq!(
+            expected,
+            language.format("%%Liquid%% use %%Model%% physic.", " or ")
+        );
+
+        // The unknown key %%Prefix%% is removed from the output.
+        assert_eq!(
+            expected,
+            language.format("%%Liquid%% use %%Prefix%%%%Model%% physic.", " or ")
+        );
+
+        // An empty format gives an empty output.
+        assert_eq!("", language.format("", ""));
+    }
+}
diff --git a/language-subtag-registry-datasource/src/main.rs b/language-subtag-registry-datasource/src/main.rs
new file mode 100644
--- /dev/null
+++ b/language-subtag-registry-datasource/src/main.rs
@@ -0,0 +1,40 @@
+use clap::Parser;
+
+use crate::registry::get_registry;
+use crate::language_parser::Language;
+
+mod registry;
+mod language_parser;
+
+// Command-line arguments. The doc comment of each field is its help text.
+#[derive(Debug, Parser)]
+#[command(
+    name = "language-subtag-registry-datasource",
+    author = "Nasqueron project",
+    version,
+    about = "Download and print language subtag registry",
+    long_about = None
+)]
+pub struct Args {
+    /// The format string to use
+    #[arg(long, short = 'f')]
+    format: String,
+
+    /// The aggregation separator
+    #[arg(long, short = 'a', default_value = " / ")]
+    aggregation_separator: String,
+
+    /// The path to the registry source
+    #[arg(long, short = 's')]
+    source: Option<String>,
+
+    /// Restricts parsing to language type
+    #[arg(long, short = 'l', default_value_t = false)]
+    languages_only: bool,
+}
+
+/// Reads or fetches the registry, then prints one formatted line per entry.
+#[tokio::main]
+async fn main() {
+    // Exits on its own if an argument is missing or --help/--version is provided.
+    let Args {
+        format,
+        aggregation_separator,
+        source,
+        languages_only,
+    } = Args::parse();
+
+    let document = get_registry(source)
+        .await
+        .expect("Can't read or fetch registry");
+
+    Language::parse_document(&document, languages_only)
+        .iter()
+        .map(|language| language.format(&format, &aggregation_separator))
+        .for_each(|line| println!("{}", line));
+}
diff --git a/language-subtag-registry-datasource/src/registry.rs b/language-subtag-registry-datasource/src/registry.rs
new file mode 100644
--- /dev/null
+++ b/language-subtag-registry-datasource/src/registry.rs
@@ -0,0 +1,60 @@
+use std::error::Error;
+use std::fs;
+use std::path::Path;
+
+use reqwest::ClientBuilder;
+
+/// URL of the registry published by IANA, used when the registry is fetched remotely.
+static REGISTRY_URL: &str = "https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry";
+
+/* -------------------------------------------------------------
+ User agent
+
+ The USER_AGENT string is built on first use, from the package
+ name and version captured at build time.
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+lazy_static::lazy_static! {
+    // Shape: <package name>/<package version> (<project URL>)
+    // Name and version are the ones Cargo exposes for this crate.
+    pub static ref USER_AGENT: String = format!(
+        "{}/{} (https://databases.nasqueron.org/)",
+        env!("CARGO_PKG_NAME"), env!("CARGO_PKG_VERSION")
+    );
+}
+
+/// Returns the user agent string as a plain string slice.
+pub fn get_user_agent() -> &'static str {
+    USER_AGENT.as_str()
+}
+
+/* -------------------------------------------------------------
+ Read or fetch registry
+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+/// Gets the content of the registry.
+///
+/// The first available source is used, by order of priority: the file given
+/// as `source`, a `registry.txt` file in the current directory, the remote
+/// registry. A source which exists but can't be read is an error, the next
+/// one isn't tried.
+pub async fn get_registry(source: Option<String>) -> Result<String, Box<dyn Error>> {
+    // Case 1 - A source file has been explicitly set
+    if let Some(file) = source {
+        return Ok(fs::read_to_string(file)?);
+    }
+
+    // Case 2 - The file registry.txt can be found locally
+    let local_copy = Path::new("registry.txt");
+    if local_copy.exists() {
+        return Ok(fs::read_to_string(local_copy)?);
+    }
+
+    // Case 3 - Fetch the registry remotely
+    fetch_registry().await
+}
+
+/// Downloads the registry and returns the body of the response.
+///
+/// Fails when the HTTP client can't be built, the request can't be sent,
+/// the server answers with a 4xx or 5xx status, or the body can't be read.
+async fn fetch_registry() -> Result<String, Box<dyn Error>> {
+    // Build errors are returned to the caller like any other failure.
+    let client = ClientBuilder::new()
+        .user_agent(get_user_agent())
+        .gzip(true)
+        .deflate(true)
+        .build()?;
+
+    let body = client.get(REGISTRY_URL)
+        .send().await?
+        // Without this check, the body of an error page would be
+        // returned as if it were the registry.
+        .error_for_status()?
+        .text().await?;
+
+    Ok(body)
+}

File Metadata

Mime Type
text/plain
Expires
Sat, Nov 23, 17:30 (18 h, 30 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2258543
Default Alt Text
D3156.diff (13 KB)

Event Timeline