bazarr/libs/pycountry/__init__.py

"""pycountry"""

import os.path
import unicodedata
from importlib import metadata as importlib_metadata
from typing import Dict, List, Optional, Type

import pycountry.db

# We prefer the backported `importlib_resources` package because the
# function we use (`importlib.resources.files`) is only available in the
# standard library from Python 3.9 onwards, although the module itself has
# existed since 3.7. `importlib_resources` is installed on Python < 3.9.
# TODO: Remove usage of importlib_resources once support for Python 3.8 is dropped
try:
    import importlib_resources  # type: ignore
except ModuleNotFoundError:
    from importlib import resources as importlib_resources


def resource_filename(package_or_requirement: str, resource_name: str) -> str:
    """Return the location of a resource inside a package as a string.

    ``package_or_requirement`` is passed to ``importlib_resources.files()``
    and ``resource_name`` is joined onto the result with the ``/`` operator.
    """
    package_root = importlib_resources.files(package_or_requirement)
    resource = package_root / resource_name
    return str(resource)


def get_version(distribution_name: str) -> Optional[str]:
    """Return the installed version of ``distribution_name``.

    Falls back to the placeholder string ``"n/a"`` when no metadata for the
    distribution can be found.
    """
    try:
        version = importlib_metadata.version(distribution_name)
    except importlib_metadata.PackageNotFoundError:
        version = "n/a"
    return version


# Package resource locations and version, resolved once at import time.
# LOCALES_DIR and DATABASE_DIR are the "locales" and "databases" resources of
# the "pycountry" package, expressed as path strings.
LOCALES_DIR: str = resource_filename("pycountry", "locales")
DATABASE_DIR: str = resource_filename("pycountry", "databases")
# Version of the installed "pycountry" distribution; get_version() yields
# "n/a" when no distribution metadata can be found.
__version__: Optional[str] = get_version("pycountry")


def remove_accents(input_str: str) -> str:
    """Return ``input_str`` with combining marks (accents) removed.

    Pure-ASCII input is returned unchanged. Anything else is decomposed with
    Unicode NFKD normalisation and every combining character is dropped.
    """
    if input_str.isascii():
        return input_str

    # Borrowed from https://stackoverflow.com/a/517974/1509718
    decomposed = unicodedata.normalize("NFKD", input_str)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))


class ExistingCountries(pycountry.db.Database):
    """Provides access to an ISO 3166 database (Countries)."""

    data_class = pycountry.db.Country
    root_key = "3166-1"

    def search_fuzzy(self, query: str) -> List["pycountry.db.Country"]:
        """Return the countries matching ``query``, best match first.

        The query is stripped, lower-cased and has its accents removed. It is
        then scored against four sources; points for the same country add up:

        1. exact lookup through ``self.lookup`` (50 points),
        2. exact subdivision matches (49 points per matching subdivision),
        3. substring match on the country's ``name``, ``official_name`` or
           ``comment`` field (5 to 30 points, earlier positions score higher),
        4. substring match on subdivision names (1 to 5 points each).

        Results are ordered by descending points, then by ascending alpha-2
        code.

        Raises:
            LookupError: if the query did not match anything.
        """
        query = remove_accents(query.strip().lower())

        # A country-code to points mapping for later sorting countries
        # based on the query's matching incidence.
        results: Dict[str, int] = {}

        def add_result(country: "pycountry.db.Country", points: int) -> None:
            results[country.alpha_2] = results.get(country.alpha_2, 0) + points

        # NOTE(review): subdivision candidates resolve ``.country`` through
        # the module-level ``countries`` database, while the final records
        # are fetched from ``self``. For an instance backed by a different
        # database these may disagree -- confirm before relying on it.

        # Prio 1: exact matches on country names
        try:
            exact = self.lookup(query)
        except LookupError:
            # No exact match simply means no points from this stage.
            pass
        else:
            add_result(exact, 50)

        # Prio 2: exact matches on subdivision names
        for candidate in subdivisions.match(query):
            add_result(candidate.country, 49)

        # Prio 3: partial matches on country names
        for candidate in self:
            # Fields are checked in this order and only the first matching
            # one scores, so a match on the common name takes priority.
            for field in ("name", "official_name", "comment"):
                v = candidate._fields.get(field)
                if v is None:
                    continue
                v = remove_accents(v.lower())
                if query in v:
                    # This prefers countries with a match early in their name
                    # and also balances against countries with a number of
                    # partial matches and their name containing 'new' in the
                    # middle
                    add_result(candidate, max(5, 30 - 2 * v.find(query)))
                    break

        # Prio 4: partial matches on subdivision names
        for candidate in subdivisions.partial_match(query):
            v = remove_accents(candidate._fields.get("name").lower())
            # partial_match() normalises the query a second time, so re-check
            # against the query as normalised here before scoring.
            if query in v:
                add_result(candidate.country, max(1, 5 - v.find(query)))

        if not results:
            raise LookupError(query)

        # Sort by points first and by alpha-2 code second. Negating the
        # points sorts them in descending order while the country code stays
        # ascending, which keeps the ordering stable.
        ranked = sorted(results.items(), key=lambda item: (-item[1], item[0]))
        return [self.get(alpha_2=alpha_2) for alpha_2, _ in ranked]


class HistoricCountries(ExistingCountries):
    """ISO 3166-3 database access.

    Covers countries that have been removed from the standard.
    """

    root_key = "3166-3"
    data_class = pycountry.db.Country


class Scripts(pycountry.db.Database):
    """ISO 15924 database access (Scripts)."""

    root_key = "15924"
    data_class = "Script"


class Currencies(pycountry.db.Database):
    """ISO 4217 database access (Currencies)."""

    root_key = "4217"
    data_class = "Currency"


class Languages(pycountry.db.Database):
    """ISO 639-1/2T/3 database access (Languages)."""

    root_key = "639-3"
    data_class = "Language"

    # Field names handed to the base class as ``no_index``; presumably these
    # are skipped when lookup indices are built -- confirm in pycountry.db.
    no_index = [
        "status",
        "scope",
        "type",
        "inverted_name",
        "common_name",
    ]


class LanguageFamilies(pycountry.db.Database):
    """ISO 639-5 database access (Language Families and Groups)."""

    root_key = "639-5"
    data_class = "LanguageFamily"


class SubdivisionHierarchy(pycountry.db.Data):
    """A subdivision record that can resolve its country and its parent.

    On construction the raw ``parent`` value (if any) is copied to
    ``parent_code``, ``country_code`` is derived from ``code`` and a non-None
    ``parent_code`` is prefixed with that country code.
    """

    def __init__(self, **kw):
        # Mirror the raw "parent" entry as "parent_code"; None when absent.
        kw["parent_code"] = kw.get("parent")
        super().__init__(**kw)
        # Everything before the first "-" of the code is the country code.
        self.country_code = self.code.partition("-")[0]
        if self.parent_code is not None:
            self.parent_code = f"{self.country_code}-{self.parent_code}"

    @property
    def country(self):
        """The record for ``country_code`` in the module-level ``countries``."""
        return countries.get(alpha_2=self.country_code)

    @property
    def parent(self):
        """The parent subdivision, or None when ``parent_code`` is empty."""
        code = self.parent_code
        return subdivisions.get(code=code) if code else None


class Subdivisions(pycountry.db.Database):
    """Provides access to an ISO 3166-2 database (Subdivisions)."""

    # Note: subdivisions can be hierarchical to other subdivisions. The
    # parent_code attribute is related to other subdivisions, *not*
    # the country!

    data_class = SubdivisionHierarchy
    no_index = ["name", "parent_code", "parent", "type"]
    root_key = "3166-2"

    def _load(self, *args, **kw):
        """Load the database, then build the extra ``country_code`` index.

        The index maps a lower-cased country code to the set of subdivisions
        carrying that country code.
        """
        super()._load(*args, **kw)

        # Add index for the country code.
        self.indices["country_code"] = {}
        for subdivision in self:
            divs = self.indices["country_code"].setdefault(
                subdivision.country_code.lower(), set()
            )
            divs.add(subdivision)

    def get(self, **kw):
        """Look up subdivisions, with a special case for ``country_code``.

        Behaves like the base class ``get`` (``default`` is None unless
        given). When the lookup yields the default, a ``country_code`` was
        given and that country exists in the module-level ``countries``
        database, an empty list is returned instead of the default.
        """
        default = kw.setdefault("default", None)
        result = super().get(**kw)
        if result is default and "country_code" in kw:
            # This handles the case where we know about a country but there
            # are no subdivisions: we return an empty list in this case
            # (sticking to the expected type here) instead of None.
            if countries.get(alpha_2=kw["country_code"]) is not None:
                return []
        return result

    def match(self, query):
        """Return subdivisions having a field value equal to ``query``.

        Both the query and the field values are lower-cased and stripped of
        accents before comparison. A subdivision is appended once for every
        field that matches.
        """
        query = remove_accents(query.strip().lower())
        matching_candidates = []
        for candidate in self:
            for v in candidate._fields.values():
                if v is not None:
                    v = remove_accents(v.lower())
                    # Some names include alternative versions which we want to
                    # match exactly.
                    for w in v.split(";"):
                        if w == query:
                            matching_candidates.append(candidate)
                            break

        return matching_candidates

    def partial_match(self, query):
        """Return subdivisions whose ``name`` contains ``query``.

        Both the query and the names are lower-cased and stripped of accents
        before comparison. Records without a name are skipped.
        """
        query = remove_accents(query.strip().lower())
        matching_candidates = []
        for candidate in self:
            v = candidate._fields.get("name")
            if v is None:
                continue
            if query in remove_accents(v.lower()):
                matching_candidates.append(candidate)

        return matching_candidates

    def search_fuzzy(self, query: str) -> List[SubdivisionHierarchy]:
        """Return the subdivisions matching ``query``, best match first.

        Exact matches (see ``match``) score 50 points each; partial name
        matches (see ``partial_match``) score 1 to 5 points, with earlier
        positions in the name scoring higher. Points for the same subdivision
        add up. Results are ordered by descending points, then by ascending
        subdivision code.

        Raises:
            LookupError: if the query did not match anything.
        """
        query = remove_accents(query.strip().lower())

        # A Subdivision's code to points mapping for later sorting subdivisions
        # based on the query's matching incidence.
        results: Dict[str, int] = {}

        def add_result(subdivision: SubdivisionHierarchy, points: int) -> None:
            results[subdivision.code] = results.get(subdivision.code, 0) + points

        # Prio 1: exact matches on subdivision names
        for candidate in self.match(query):
            add_result(candidate, 50)

        # Prio 2: partial matches on subdivision names
        for candidate in self.partial_match(query):
            v = remove_accents(candidate._fields.get("name").lower())
            # partial_match() normalises the query a second time, so re-check
            # against the query as normalised here before scoring.
            if query in v:
                add_result(candidate, max(1, 5 - v.find(query)))

        if not results:
            raise LookupError(query)

        # Sort by points first and by subdivision code second. Negating the
        # points sorts them in descending order while the code stays
        # ascending, which keeps the ordering stable.
        ranked = sorted(results.items(), key=lambda item: (-item[1], item[0]))
        return [self.get(code=code) for code, _ in ranked]


# Module-level database instances, one per ISO data set. Each one is
# constructed with the path of its JSON file inside DATABASE_DIR.

# ISO 3166: countries (3166-1), subdivisions (3166-2) and countries removed
# from the standard (3166-3).
countries: ExistingCountries = ExistingCountries(
    os.path.join(DATABASE_DIR, "iso3166-1.json")
)
subdivisions: Subdivisions = Subdivisions(
    os.path.join(DATABASE_DIR, "iso3166-2.json")
)
historic_countries: HistoricCountries = HistoricCountries(
    os.path.join(DATABASE_DIR, "iso3166-3.json")
)

# ISO 4217: currencies.
currencies: Currencies = Currencies(os.path.join(DATABASE_DIR, "iso4217.json"))

# ISO 639-3 (languages) and ISO 639-5 (language families and groups).
languages: Languages = Languages(os.path.join(DATABASE_DIR, "iso639-3.json"))
language_families: LanguageFamilies = LanguageFamilies(
    os.path.join(DATABASE_DIR, "iso639-5.json")
)

# ISO 15924: scripts.
scripts: Scripts = Scripts(os.path.join(DATABASE_DIR, "iso15924.json"))