diff --git a/README.md b/README.md index d0ba400..b7a3221 100644 --- a/README.md +++ b/README.md @@ -1,82 +1,133 @@ # Merge dictionaries ## Root problem You uses everal IDEs and each maintain its own spelling dictionary. You want to merge them so words from PyCharm are available in PhpStorm too. ## Usage ### Merge all dictionaries To discover dictionaries in your computer, extract words and merge them: ```shell $ merge-dictionaries --merge ``` This is a potentially destructive operation: your dictionary files will be overwritten. ### Extract dictionaries words To print all the words: ```shell $ merge-dictionaries --extract ``` This is a safe operation. ### Build an Hunspell-compatible dictionary To create a personal dictionary file for your Hunspell dictionary: ```shell -$ merge-dictionaries --extract > perso.dic +$ merge-dictionaries --extract > $HOME/.hunspell_default ``` -This is a safe read-only operation, -as long as perso.dic doesn't already exist in your current folder. +This is a safe read-only operation for your IDE files. This can +overwrite your default Hunspell dictionary if it already exists. ### Build a dictionary in a IDE specific format You can specify `--format=` as argument to the extract task: ```shell $ merge-dictionaries --extract --format=JetBrains ``` It will output a dictionary file you can use in any IDE compatible with that format. This is a safe read-only operation. +### Sync with a Git repository + +Create a `$HOME/.config/merge-dictionaries.conf` with the following content: + +```yaml +git: + - git@github.com:luser/dictionary.git +``` + +See below if you wish to host the Git repository locally. + ## IDE support Currently, the following IDEs are supported * All JetBrains IDEs: application-level dictionary +* Hunspell: read personal dictionaries +* Git repository ## Extend the code ### How to add an IDE? 
To add an IDE, you need to provide the following methods: * sources * a list of paths candidates for the IDE dictionary * a method extracting words from the dictionary * output * a method to dump the extracted words in the IDE format * write * a method to save the files, normally you can call the ones created ### How can I contribute? You can commit your changes to the upstream by following instructions at https://agora.nasqueron.org/How_to_contribute_code The canonical repository is https://devcentral.nasqueron.org/source/merge-dictionaries.git +## FAQ + +### Delete a word + +Not yet implemented. Here is a proposal to implement this. + +Currently, the workflow is: + +[ extract ] -> { words } -> [ publish ] + +You want to add a new transformation step: + +[ extract ] -> { words } -> [ transform ] -> { words cleaned up } -> [ publish ] + +Add a transform step with a denylist of the words to remove. + +It's not easy to detect if the user has removed a word explicitly +from a dictionary, as we don't cache extracted words. + +### Host the Git repository locally + +If you want to host the repository on your local machine, use a bare repository: + +```shell +$ git init --bare ~/.cache/dictionary +Initialized empty Git repository in /usr/home/luser/.cache/dictionary/ +``` + +You can push to a bare repository, but non-bare ones are protected against pushes, +to avoid a desync between your index and the working files. + +Alternatively, you can prepare a script to do this sequence of operations: +```shell +$ merge-dictionaries --merge +$ cd ~/.cache/dictionary +$ git reset +``` + ## License BSD-2-Clause, see [LICENSE](LICENSE) file. 
diff --git a/pyproject.toml b/pyproject.toml index 4a2286e..c30dc68 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,14 +1,14 @@ # ------------------------------------------------------------- -# Resolve hash +# Merge dictionaries # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Project: Nasqueron # License: BSD-2-Clause # ------------------------------------------------------------- [build-system] requires = [ "setuptools>=42", "wheel" ] build-backend = "setuptools.build_meta" diff --git a/setup.cfg b/setup.cfg index 7c99224..9990d74 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,31 +1,33 @@ [metadata] name = merge-dictionaries -version = 0.1.0 +version = 0.2.0 author = Sébastien Santoro author_email = dereckson@espace-win.org description = Merge dictionaries long_description = file: README.md long_description_content_type = text/markdown license = BSD-2-Clause license_files = LICENSE url = https://devcentral.nasqueron.org/source/merge-dictionaries/ project_urls = Bug Tracker = https://devcentral.nasqueron.org/tag/development_tools/ classifiers = Programming Language :: Python :: 3 License :: OSI Approved :: BSD License Operating System :: OS Independent Environment :: Console Intended Audience :: Developers Topic :: Software Development :: Build Tools [options] package_dir = = src packages = find: scripts = bin/merge-dictionaries python_requires = >=3.6 +install_requires = + PyYAML>=6.0,<7.0 [options.packages.find] where = src diff --git a/src/mergedictionaries/app/app.py b/src/mergedictionaries/app/app.py index f9ba648..c1b649d 100644 --- a/src/mergedictionaries/app/app.py +++ b/src/mergedictionaries/app/app.py @@ -1,119 +1,161 @@ #!/usr/bin/env python3 # ------------------------------------------------------------- # Merge dictionaries # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Project: Nasqueron # Description: Merge dictionaries from various sources, # mainly IDEs, and allow to propagate them. 
#   -------------------------------------------------------------
#   Extract words
#   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


def get_dictionary_formatters():
    """Map each supported ``--format`` name to its formatter.

    Returns:
        A dict whose keys are format names and whose values are callables
        receiving the extracted words and returning the formatted output.
    """
    return {
        "JetBrains": output.jetbrains.dump,
    }


#   -------------------------------------------------------------
#   Configuration
#   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


def get_configuration_path():
    """Return the path of the per-user configuration file.

    The home directory is resolved with ``os.path.expanduser`` rather than
    a direct ``os.environ["HOME"]`` lookup, so an environment without HOME
    does not raise ``KeyError``.
    """
    return os.path.join(
        os.path.expanduser("~"), ".config", "merge-dictionaries.conf"
    )


def parse_configuration():
    """Load the YAML configuration file.

    Returns:
        The configuration mapping, or an empty dict when the file cannot
        be opened or is empty.

    Raises:
        ValueError: when the file parses to something else than a mapping.
    """
    try:
        with open(get_configuration_path(), encoding="utf-8") as fd:
            config = yaml.safe_load(fd)
    except OSError:
        # A missing or unreadable configuration is not an error:
        # every option is optional.
        return {}

    # safe_load() returns None for an empty document; callers expect
    # something they can call .get() on.
    if config is None:
        return {}

    if not isinstance(config, dict):
        raise ValueError(
            f"Configuration must be a mapping: {get_configuration_path()}"
        )

    return config
def parse_arguments(argv=None):
    """Parse the command-line arguments.

    Args:
        argv: optional list of arguments to parse instead of ``sys.argv[1:]``.

    Returns:
        The argparse namespace, with ``task`` set to "extract", "merge" or
        None, and ``format`` defaulting to "text".
    """
    parser = argparse.ArgumentParser(description="Merge dictionaries.")

    parser.add_argument(
        "--extract",
        action="store_const",
        dest="task",
        const="extract",
        help="Extract all words from found dictionaries",
    )
    parser.add_argument(
        "--format", action="store", help="Specifies the output format", default="text"
    )
    parser.add_argument(
        "--merge",
        action="store_const",
        dest="task",
        const="merge",
        help="Merge all found dictionaries",
    )

    return parser.parse_args(argv)


class Application:
    """Command-line application: extract words from sources, publish them.

    ``context`` carries the parsed arguments ("args"), the configuration
    ("config") and the cache of cloned Git repositories ("git") shared by
    the Git source and the Git writer.
    """

    def __init__(self):
        self.context = {"git": {}}

    def run(self):
        """Run the task selected on the command line."""
        args = parse_arguments()

        if args.task is None:
            print("No task has been specified.", file=sys.stderr)
            sys.exit(1)

        # An empty configuration file can yield None; normalize to a dict.
        self.context["config"] = parse_configuration() or {}
        self.context["args"] = args

        if args.task == "extract":
            self.run_extract_all_words(args.format)
        elif args.task == "merge":
            self.run_merge()

    def _get_git_remotes(self):
        """Return the configured Git remotes, [] when none are configured."""
        config = self.context.get("config") or {}
        # "git:" with no entries parses as None, hence the `or []`.
        return config.get("git") or []

    def get_dictionary_writers(self):
        """Return the callables publishing a list of words."""
        return [
            lambda words: write.jetbrains.write(words),
            lambda words: write.git.write(
                words, self._get_git_remotes(), self.context["git"]
            ),
        ]

    def run_merge(self):
        """Extract all the words, then pass them to every writer."""
        try:
            words = self.extract_all_words()

            for method in self.get_dictionary_writers():
                method(words)
        finally:
            # Always remove the temporary clones, even when a source
            # or a writer fails.
            self.on_exit()

    def get_words_sources(self):
        """Return the callables extracting words, one per kind of source."""
        return [
            lambda: sources.git.extract_words_from_all_dictionaries(
                self._get_git_remotes(), self.context["git"]
            ),
            lambda: sources.jetbrains.extract_words_from_all_dictionaries(),
            lambda: sources.hunspell.extract_words_from_all_dictionaries(),
        ]

    def extract_all_words(self):
        """Return the sorted list of unique words from all the sources."""
        return sorted(
            {word for method in self.get_words_sources() for word in method()}
        )

    def run_extract_all_words(self, words_format):
        """Print the extracted words in the requested format, then exit.

        Exits with status 0 on success, 2 when the format is unknown.
        """
        exit_code = 0

        try:
            words = self.extract_all_words()

            if words_format in ("text", "hunspell"):
                # Trivial case: one word per line. The hunspell format
                # is the same list preceded by the number of words.
                if words_format == "hunspell":
                    print(len(words))

                for word in words:
                    print(word)
            else:
                # We need a specific formatter
                formatters = get_dictionary_formatters()
                if words_format in formatters:
                    print(formatters[words_format](words))
                else:
                    print(f"Unknown format: {words_format}", file=sys.stderr)
                    exit_code = 2
        finally:
            self.on_exit()

        sys.exit(exit_code)

    def on_exit(self):
        """Events to run before exiting to cleanup resources."""
        sources.git.on_exit(self.context["git"])


def run():
    """Application entry point."""
    app = Application()
    app.run()
#   -------------------------------------------------------------
#   Manipulate a dictionary sync repository
#   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


class GitRepository:
    """Temporary clone of a Git repository storing a shared dictionary.

    Clones are registered in a cache dict (hash of the remote -> local
    path) supplied by the caller, so a remote is cloned only once even
    when several GitRepository objects are built for it.
    """

    # Name of the dictionary file, relative to the repository root
    DICTIONARY_PATH = "dictionary.txt"

    def __init__(self, repository_remote, cached_repositories):
        """Reuse the cached clone of the remote, or clone it.

        Args:
            repository_remote: URL or path of the remote to clone.
            cached_repositories: dict shared between instances, mapping
                cache hashes to clone paths; updated when a clone occurs.
        """
        self.remote = repository_remote
        self.cache = cached_repositories
        self.path = None

        self.prepare_repository()

    def get_cache_hash(self):
        """Return the cache key for the remote.

        MD5 only serves as a cache key here. UTF-8 gives the same digest
        as ASCII for ASCII remotes, and does not fail on other ones.
        """
        return hashlib.md5(self.remote.encode("utf-8")).hexdigest()

    def prepare_repository(self):
        """Set ``self.path`` from the cache, cloning on a cache miss."""
        cache_hash = self.get_cache_hash()

        try:
            self.path = self.cache[cache_hash]
        except KeyError:
            self.clone()
            self.cache[cache_hash] = self.path

    def get_dictionary_path(self):
        """Return the path of the dictionary file inside the clone."""
        return os.path.join(self.path, self.DICTIONARY_PATH)

    def extract_words(self):
        """Return the words of the dictionary, one per non-empty line.

        A repository without dictionary file yet (for example a freshly
        created one) contains no words.
        """
        try:
            with open(self.get_dictionary_path(), encoding="utf-8") as fd:
                stripped_lines = (line.strip() for line in fd)
                return [word for word in stripped_lines if word]
        except FileNotFoundError:
            return []

    def publish(self, tmp_dictionary_path):
        """Copy the dictionary into the clone; commit and push if changed."""
        shutil.copy(tmp_dictionary_path, self.get_dictionary_path())

        if self.is_dirty():
            self.commit()
            self.push()

    #   -------------------------------------------------------------
    #   Git operations
    #   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    def is_dirty(self):
        """Return True when the clone has something to commit.

        Each check is a git command exiting with a non-zero status
        when a commit is needed.
        """
        checks = [
            # Detect empty repository
            ["git", "show-ref", "HEAD"],
            # Detect a dictionary not tracked yet: diff-index below
            # only compares tracked files.
            ["git", "ls-files", "--error-unmatch", "--", self.DICTIONARY_PATH],
            # Detect index change
            ["git", "diff-index", "--quiet", "HEAD", "--"],
        ]

        for check_command in checks:
            process = subprocess.run(
                check_command,
                stderr=subprocess.DEVNULL,
                stdout=subprocess.DEVNULL,
                cwd=self.path,
            )

            if process.returncode != 0:
                return True

        return False

    @staticmethod
    def get_commit_message():
        """Return the commit message, mentioning the current hostname."""
        import socket

        # HOSTNAME is frequently a shell variable not exported to the
        # environment; fall back to asking the system.
        hostname = os.environ.get("HOSTNAME") or socket.gethostname()

        return f"Sync personal dictionary\n\nSync application: merge-dictionaries\nSync hostname: {hostname}"

    def run(self, commands):
        """Run each command (a list of arguments) inside the clone.

        Exit statuses are not checked: git reports failures on stderr.
        """
        for command in commands:
            subprocess.run(
                command,
                cwd=self.path,
            )

    def commit(self):
        """Stage the dictionary file and commit it."""
        self.run(
            [
                # Stage the dictionary
                ["git", "add", self.DICTIONARY_PATH],
                # Record the change
                ["git", "commit", "-m", self.get_commit_message()],
            ]
        )

    def push(self):
        """Push the current branch to origin."""
        self.run(
            [
                ["git", "push", "origin", self.get_branch()],
            ]
        )

    def clone(self):
        """Clone the remote into a new temporary directory.

        Raises:
            subprocess.CalledProcessError: when git clone fails; the
                temporary directory is removed first.
        """
        path = tempfile.mkdtemp(prefix="merge-dictionaries-")

        try:
            subprocess.run(["git", "clone", self.remote, path], check=True)
        except (OSError, subprocess.CalledProcessError):
            # The directory isn't in the cache yet, so nothing else
            # would clean it up.
            shutil.rmtree(path, ignore_errors=True)
            raise

        self.path = path

    def get_branch(self):
        """Return the short name of the branch currently checked out."""
        # stdout/stderr pipes instead of capture_output, which
        # requires Python 3.7 while the package supports 3.6.
        process = subprocess.run(
            ["git", "symbolic-ref", "--short", "HEAD"],
            cwd=self.path,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        return process.stdout.decode().strip()


#   -------------------------------------------------------------
#   Wrapper to read Git repositories
#   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


def extract_words_from_all_dictionaries(target_repos, cached_repos):
    """Return the set of words found in all the target repositories.

    Args:
        target_repos: iterable of Git remotes.
        cached_repos: cache dict passed to every GitRepository.
    """
    return {
        word
        for repo in target_repos
        for word in GitRepository(repo, cached_repos).extract_words()
    }


#   -------------------------------------------------------------
#   Events
#       :: on_exit
#   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


def on_exit(cached_repos):
    """Delete every cached clone and empty the cache.

    Safe to call more than once, or when a clone is already gone.
    """
    for repository_path in cached_repos.values():
        shutil.rmtree(repository_path, ignore_errors=True)

    cached_repos.clear()
+1,56 @@ +# ------------------------------------------------------------- +# Merge dictionaries :: Sources :: Hunspell +# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +# Project: Nasqueron +# Description: Find Hunspell personal dictionaries +# License: BSD-2-Clause +# ------------------------------------------------------------- + + +import os + + +def get_hunspell_environment_variables(): + return [ + "DICTIONARY", + "LC_ALL", + "LC_MESSAGES", + "LANG", + ] + + +def resolve_personal_dictionary_paths_from_environment(): + names = {"default"} + + for variable in get_hunspell_environment_variables(): + if variable in os.environ: + names.add(os.environ[variable]) + + dictionary_paths = [ + os.path.join(os.environ["HOME"], f".hunspell_{name}") for name in names + ] + + if "WORDLIST" in os.environ: + dictionary_paths.append(os.environ["WORDLIST"]) + + return dictionary_paths + + +def find_personal_dictionaries(): + return [ + file + for file in resolve_personal_dictionary_paths_from_environment() + if os.path.exists(file) + ] + + +def extract_words(dictionary_path): + return [word.strip() for word in open(dictionary_path)] + + +def extract_words_from_all_dictionaries(): + return { + word + for dictionary_path in find_personal_dictionaries() + for word in extract_words(dictionary_path) + } diff --git a/src/mergedictionaries/write/__init__.py b/src/mergedictionaries/write/__init__.py index e69de29..63d05a2 100644 --- a/src/mergedictionaries/write/__init__.py +++ b/src/mergedictionaries/write/__init__.py @@ -0,0 +1,2 @@ +from . import git +from . 
def build_temporary_dictionary(words):
    """Write the words, one per line, UTF-8 encoded, to a temporary file.

    Returns:
        The path of the file. It is not deleted automatically: the caller
        is responsible for removing it.
    """
    with NamedTemporaryFile(delete=False) as fd:
        fd.writelines(f"{word}\n".encode("utf-8") for word in words)

    return fd.name


def write(words, target_repos, cached_repos):
    """Publish the words to every target Git repository.

    Args:
        words: iterable of words to store in the dictionary.
        target_repos: list of Git remotes; nothing is done when empty.
        cached_repos: cache dict passed to every GitRepository.
    """
    if not target_repos:
        return

    tmp_dictionary_path = build_temporary_dictionary(words)

    try:
        for repo in target_repos:
            GitRepository(repo, cached_repos).publish(tmp_dictionary_path)
    finally:
        # Remove the temporary file even when a publication fails.
        os.unlink(tmp_dictionary_path)