D3156.diff
No OneTemporary
Actions

Size

13 KB

Referenced Files

None

Subscribers

None

D3156.diff
View Options

	diff --git a/Cargo.toml b/Cargo.toml
	--- a/Cargo.toml
	+++ b/Cargo.toml
	@@ -2,6 +2,7 @@

	members = [
	"fantoir-datasource",
	+ "language-subtag-registry-datasource",
	"rfc-datasource",
	"opendatasoft-explore-api",
	]
	diff --git a/README.md b/README.md
	--- a/README.md
	+++ b/README.md
	@@ -14,6 +14,15 @@

	More information: [fantoir-datasource README](fantoir-datasource/README.md)

	+### IANA language subtag registry (language-subtag-registry-datasource)
	+
	+Import IANA language subtag registry datasource from RFC 5646 and convert it to
	+the specified text-based format.
	+
	+Can be used to refresh language Darkbot database for IRC bots.
	+
	+More information: [language-subtag-registry-datasource README](language-subtag-registry-datasource/README.md)
	+
	### RFC import fool (rfc-datasource)

	Import RFC index and convert it to the specified text-based format.
	diff --git a/language-subtag-registry-datasource/Cargo.toml b/language-subtag-registry-datasource/Cargo.toml
	new file mode 100644
	--- /dev/null
	+++ b/language-subtag-registry-datasource/Cargo.toml
	@@ -0,0 +1,25 @@
	+[package]
	+name = "language-subtag-registry-datasource"
	+version = "0.1.0"
	+edition = "2021"
	+description = "Downloads and transforms IANA language subtag registry"
	+authors = [
	+ "Sébastien Santoro <dereckson@espace-win.org>"
	+]
	+license = "BSD-2-Clause"
	+
	+[dependencies]
	+lazy_static = "1.4.0"
	+regex = "1.8.1"
	+
	+[dependencies.clap]
	+version = "4.3.0"
	+features = ["derive"]
	+
	+[dependencies.reqwest]
	+version = "~0.11.18"
	+features = ["gzip", "deflate"]
	+
	+[dependencies.tokio]
	+version = "1.28.1"
	+features = ["full"]
	diff --git a/language-subtag-registry-datasource/README.md b/language-subtag-registry-datasource/README.md
	new file mode 100644
	--- /dev/null
	+++ b/language-subtag-registry-datasource/README.md
	@@ -0,0 +1,65 @@
	+The `language-subtag-registry-datasource` utility allows you to download the
	+IANA language subtag registry datasource defined in RFC 5646,
	+parse it, and transform the output.
	+
	+This registry shares language codes with the different ISO-639 lists,
	+but is more inclusive and descriptive.
	+
	+It has been designed to output the index in an arbitrary format,
	+so we can export a Darkbot database for Odderon, one of our IRC bots.
	+
	+## Usage
	+
	+```
	+language-subtag-registry-datasource
	+ --format <format string>
	+ [--languages-only]
	+ [--aggregation-separator <separator string>]
	+ [--source /path/to/registry.txt]
	+```
	+
	+The format string can be arbitrary text or variables:
	+
	+| Variable            | Description                                                                     |
	+|---------------------|---------------------------------------------------------------------------------|
	+| %%id%%              | The Tag or Subtag field of the entry                                            |
	+| %%<key>%%           | A field in the registry entry                                                   |
	+| %%fulldescription%% | A string built with description, deprecation status, preferred value, comments  |
	+
	+If an entry doesn't have the required field, it is left blank.
	+
	+Examples for the variables:
	+ - `%%Description%%` will output `Inupiaq` for the `ik` subtag
	+ - `%%Description%%` will output `Sichuan Yi / Nuosu` for the `ii` subtag
	+ - `%%Comments%%` will output an empty string for both `ik` and `ii` subtags
	+ - `%%fulldescription%%` will output "Serbo-Croatian - sr, hr, bs are preferred for most modern uses" for `sh`
	+
	+If a language has several values, they are coalesced and a specific string
	+is used as separator. Default separator is " / ". It can be overridden with
	+`--aggregation-separator`.
	+
	+The database contains entries of other types than languages, like variants, regions or redundant.
	+To only parse languages, use `-l` or `--languages-only` flag.
	+
	+The utility uses as source, by order of priority:
	+ - the path specified to the `--source` argument
	+ - any `registry.txt` file available in the current directory
	+ - https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
	+
	+## Recipes
	+
	+### Darkbot database
	+
	+ language-subtag-registry-datasource -l --format "lang+%%id%% %%fulldescription%%"
	+
	+### CSV export
	+
	+Identify the fields and the order you wish to use.
	+
	+For example, to create a CSV with the following header:
	+
	+ Type,Subtag,Tag,Added,Suppress-Script,Preferred-Value,Comments,Scope,Macrolanguage,Deprecated,Description
	+
	+Use:
	+
	+ language-subtag-registry-datasource --format '"%%Type%%","%%Subtag%%","%%Tag%%","%%Added%%","%%Suppress-Script%%","%%Preferred-Value%%","%%Comments%%","%%Scope%%","%%Macrolanguage%%","%%Deprecated%%","%%Description%%"'
	diff --git a/language-subtag-registry-datasource/src/language_parser.rs b/language-subtag-registry-datasource/src/language_parser.rs
	new file mode 100644
	--- /dev/null
	+++ b/language-subtag-registry-datasource/src/language_parser.rs
	@@ -0,0 +1,179 @@
	+use std::collections::HashMap;
	+
	+use lazy_static::lazy_static;
	+use regex::Regex;
	+
	+/* -------------------------------------------------------------
	+ Regexp definitions, used in builder
	+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
	+
	+lazy_static! {
	+ // Matches one %%key%% placeholder and captures the text between the
	+ // delimiters. The lazy quantifier stops at the first closing %%, so two
	+ // adjacent placeholders are matched as two separate keys.
	+ static ref RE_KEY: Regex = Regex::new(
	+ // %%key%%
	+ r"%%(.*?)%%"
	+ ).unwrap();
	+}
	+
	+/* -------------------------------------------------------------
	+ Language
	+
	+ Each language entry from the registry is a key/value map.
	+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
	+
	+/// A single registry entry, stored as a map from field name to the values
	+/// collected for that field. A field that appears several times in the
	+/// entry keeps every value, in the order they were parsed.
	+#[derive(Debug)]
	+pub struct Language {
	+ /// Field name mapped to the list of values found for it.
	+ pub fields: HashMap<String, Vec<String>>,
	+}
	+
	+impl Language {
	+
	+ ///
	+ /// Parser
	+ ///
	+
	+ pub fn parse_document(document: &str, restrict_to_language: bool) -> Vec<Self> {
	+ document
	+ .split("\n%%\n")
	+ .skip(1) // Metadata File-Date: <date>
	+ .filter(\|&entry\| !restrict_to_language \|\| entry.contains("Type: language"))
	+ .map(\|entry\| Self::parse_entry(entry))
	+ .collect()
	+ }
	+
	+ /// Parses one registry record into a `Language`.
	+ ///
	+ /// Each `<key>: <value>` line starts a new field. A field repeated in the
	+ /// record accumulates its values in order of appearance.
	+ ///
	+ /// Long values are folded over several lines: a line starting with
	+ /// whitespace continues the previous value, even when it contains ": "
	+ /// itself. Continuation lines are trimmed and joined with one space.
	+ /// Blank lines are ignored, so the newline ending the last record does
	+ /// not leave a trailing space on its last value.
	+ pub fn parse_entry(entry: &str) -> Self {
	+     let mut fields: HashMap<String, Vec<String>> = HashMap::new();
	+
	+     // The field currently being read, as (key, value).
	+     let mut current: Option<(String, String)> = None;
	+
	+     for line in entry.lines() {
	+         if line.trim().is_empty() {
	+             continue;
	+         }
	+
	+         let is_continuation = line.starts_with(char::is_whitespace);
	+
	+         match line.split_once(": ") {
	+             // <key>: <value> line
	+             Some((new_key, new_value)) if !is_continuation => {
	+                 // Save previous field before starting the new one
	+                 if let Some((key, value)) = current.take() {
	+                     fields.entry(key).or_default().push(value);
	+                 }
	+
	+                 current = Some((new_key.to_string(), new_value.trim().to_string()));
	+             }
	+
	+             // Multiline value. Append the line to previous value.
	+             _ => {
	+                 if let Some((_, value)) = current.as_mut() {
	+                     value.push(' ');
	+                     value.push_str(line.trim());
	+                 }
	+             }
	+         }
	+     }
	+
	+     if let Some((key, value)) = current {
	+         fields.entry(key).or_default().push(value);
	+     }
	+
	+     Self { fields }
	+ }
	+
	+ ///
	+ /// Builder
	+ ///
	+
	+ pub fn get_field(&self, tag: &str, separator: &str) -> Option<String> {
	+ self.fields
	+ .get(tag)
	+ .map(\|values\| values.join(separator))
	+ }
	+
	+ pub fn get_id(&self) -> Option<String> {
	+ self.get_field("Subtag", "-")
	+ .or_else(\|\| self.get_field("Tag", "-"))
	+ }
	+
	+ /// Builds a one-line summary of the entry.
	+ ///
	+ /// The summary starts with the `Description` values joined with
	+ /// `separator`, or a placeholder text when there is no description.
	+ /// It is followed, when the matching fields exist, by a ` [deprecated]`
	+ /// marker, the preferred value and the comments.
	+ pub fn build_full_description(&self, separator: &str) -> String {
	+     // The fallback string is only allocated when there is no description.
	+     let mut full_description = self
	+         .get_field("Description", separator)
	+         .unwrap_or_else(|| "<no description in IANA registry>".to_string());
	+
	+     if self.fields.contains_key("Deprecated") {
	+         full_description.push_str(" [deprecated]");
	+     }
	+
	+     if let Some(should_use) = self.get_field("Preferred-Value", separator) {
	+         full_description.push_str("; preferred value: ");
	+         full_description.push_str(&should_use);
	+     }
	+
	+     if let Some(comments) = self.get_field("Comments", separator) {
	+         full_description.push_str("; ");
	+         full_description.push_str(&comments);
	+     }
	+
	+     full_description
	+ }
	+
	+ /// Expands the `%%key%%` placeholders of `format` for this entry.
	+ ///
	+ /// - `%%id%%` is replaced by the entry identifier (see `get_id`)
	+ /// - `%%fulldescription%%` is replaced by the summary built by
	+ ///   `build_full_description`
	+ /// - any other `%%<key>%%` is replaced by the values of that field,
	+ ///   joined with `separator`
	+ ///
	+ /// A placeholder without a matching field is replaced by an empty string.
	+ ///
	+ /// The format string is scanned once, left to right. Substituted values
	+ /// are never scanned again, so a value that itself contains `%%…%%` is
	+ /// printed verbatim, and the result does not depend on the iteration
	+ /// order of the fields map.
	+ pub fn format(&self, format: &str, separator: &str) -> String {
	+     RE_KEY
	+         .replace_all(format, |captures: &regex::Captures| {
	+             match &captures[1] {
	+                 "id" => self.get_id().unwrap_or_default(),
	+                 "fulldescription" => self.build_full_description(separator),
	+                 key => self.get_field(key, separator).unwrap_or_default(),
	+             }
	+         })
	+         .into_owned()
	+ }
	+
	+}
	+
	+#[cfg(test)]
	+mod tests {
	+    use super::*;
	+
	+    /// Checks placeholder expansion: multi-valued fields are joined with the
	+    /// separator, unknown placeholders vanish, an empty format stays empty.
	+    #[test]
	+    pub fn test_format() {
	+        let language = Language {
	+            fields: HashMap::from([
	+                (
	+                    "Liquid".to_string(),
	+                    vec!["Water".to_string(), "Air".to_string()],
	+                ),
	+                ("Model".to_string(), vec!["Newtonian".to_string()]),
	+            ]),
	+        };
	+
	+        // (format string, separator, expected output)
	+        let cases = [
	+            (
	+                "%%Liquid%% use %%Model%% physic.",
	+                " or ",
	+                "Water or Air use Newtonian physic.",
	+            ),
	+            (
	+                "%%Liquid%% use %%Prefix%%%%Model%% physic.",
	+                " or ",
	+                "Water or Air use Newtonian physic.",
	+            ),
	+            ("", "", ""),
	+        ];
	+
	+        for (template, separator, expected) in cases {
	+            assert_eq!(expected, language.format(template, separator));
	+        }
	+    }
	+}
	diff --git a/language-subtag-registry-datasource/src/main.rs b/language-subtag-registry-datasource/src/main.rs
	new file mode 100644
	--- /dev/null
	+++ b/language-subtag-registry-datasource/src/main.rs
	@@ -0,0 +1,40 @@
	+use clap::Parser;
	+
	+use crate::registry::get_registry;
	+use crate::language_parser::Language;
	+
	+mod registry;
	+mod language_parser;
	+
	+// Command-line arguments of the utility, parsed by clap's derive API.
	+// NOTE(review): the `///` comments on the fields are presumably picked up by
	+// clap as help text — treat them as user-facing strings when editing.
	+#[derive(Debug, Parser)]
	+#[command(name = "language-subtag-registry-datasource")]
	+#[clap(author="Nasqueron project", version, about="Download and print language subtag registry", long_about=None)]
	+pub struct Args {
	+ /// The format string to use
	+ #[arg(long, short = 'f')]
	+ format: String,
	+
	+ /// The aggregation separator
	+ // Passed to Language::format to join the values of a multi-valued field.
	+ #[arg(long, short = 'a', default_value = " / ")]
	+ aggregation_separator: String,
	+
	+ /// The path to the registry source
	+ // Passed as-is to get_registry, which picks a source when this is None.
	+ #[arg(long, short = 's')]
	+ source: Option<String>,
	+
	+ /// Restricts parsing to language type
	+ #[arg(long, short = 'l', default_value_t = false)]
	+ languages_only: bool,
	+}
	+
	+// Entry point: reads or fetches the registry, then prints one formatted
	+// line per parsed entry.
	+//
	+// When the registry can't be obtained, the error is printed to stderr and
	+// the process exits with status 1, instead of panicking.
	+#[tokio::main]
	+async fn main() {
	+    let args = Args::parse(); // Will exit if argument is missing or --help/--version provided.
	+
	+    let document = match get_registry(args.source).await {
	+        Ok(document) => document,
	+        Err(error) => {
	+            eprintln!("Can't read or fetch registry: {}", error);
	+            std::process::exit(1);
	+        }
	+    };
	+
	+    for language in Language::parse_document(&document, args.languages_only) {
	+        println!("{}", language.format(&args.format, &args.aggregation_separator));
	+    }
	+}
	diff --git a/language-subtag-registry-datasource/src/registry.rs b/language-subtag-registry-datasource/src/registry.rs
	new file mode 100644
	--- /dev/null
	+++ b/language-subtag-registry-datasource/src/registry.rs
	@@ -0,0 +1,60 @@
	+use std::error::Error;
	+use std::fs;
	+use std::path::Path;
	+
	+use reqwest::ClientBuilder;
	+
	+static REGISTRY_URL: &str = "https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry";
	+
	+/* -------------------------------------------------------------
	+ User agent
	+
	+ The package name and version are resolved at build time by env!().
	+ The USER_AGENT string itself is assembled by lazy_static, which
	+ per its documentation initializes the value on first access.
	+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
	+
	+lazy_static::lazy_static! {
	+ pub static ref USER_AGENT: String = format!(
	+ "{}/{} (https://databases.nasqueron.org/)",
	+ env!("CARGO_PKG_NAME"), env!("CARGO_PKG_VERSION")
	+ );
	+}
	+
	+/// Returns the User-Agent string sent with HTTP requests.
	+pub fn get_user_agent() -> &'static str {
	+    USER_AGENT.as_str()
	+}
	+
	+/* -------------------------------------------------------------
	+ Read or fetch registry
	+ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
	+
	+/// Returns the content of the registry, from the first available source:
	+///
	+///   1. the file at `source`, when a path has been given
	+///   2. a `registry.txt` file in the current directory, when it exists
	+///   3. the remote registry, fetched over HTTP
	+///
	+/// # Errors
	+///
	+/// Returns the I/O error when the selected file can't be read,
	+/// or the error reported by `fetch_registry` for the remote case.
	+pub async fn get_registry(source: Option<String>) -> Result<String, Box<dyn Error>> {
	+    // Case 1 - A source file has been explicitly set
	+    if let Some(file) = source {
	+        return Ok(fs::read_to_string(file)?);
	+    }
	+
	+    // Case 2 - The file registry.txt can be found locally
	+    if Path::new("registry.txt").exists() {
	+        return Ok(fs::read_to_string("registry.txt")?);
	+    }
	+
	+    // Case 3 - Fetch the index remotely
	+    fetch_registry().await
	+}
	+
	+/// Downloads the registry from `REGISTRY_URL` and returns the response body.
	+///
	+/// # Errors
	+///
	+/// Returns an error when the HTTP client can't be built, when the request
	+/// fails, when the server answers with a 4xx or 5xx status, or when the
	+/// body can't be read as text. Rejecting error statuses prevents an error
	+/// page from being parsed as if it were the registry.
	+async fn fetch_registry() -> Result<String, Box<dyn Error>> {
	+    let client = ClientBuilder::new()
	+        .user_agent(get_user_agent())
	+        .gzip(true)
	+        .deflate(true)
	+        .build()?;
	+
	+    let body = client.get(REGISTRY_URL)
	+        .send().await?
	+        .error_for_status()?
	+        .text().await?;
	+
	+    Ok(body)
	+}

File Metadata

Mime Type: text/plain
Expires: Sat, Nov 23, 17:30 (18 h, 30 m)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 2258543
Default Alt Text: D3156.diff (13 KB)

D3156.diffNo OneTemporaryActions

D3156.diffView Options

File Metadata

Event Timeline

D3156.diff
No OneTemporary
Actions

D3156.diff
View Options