# Source code for ahvn.utils.exts.autoi18n

# Names exported by a star-import of this module.
__all__ = ["autoi18n"]

from ..basic.log_utils import get_logger
from ..basic.config_utils import HEAVEN_CM, hpj
from ..basic.path_utils import *
from ..basic.file_utils import *
from ..basic.serialize_utils import load_txt
from ..basic.parser_utils import parse_md
from ...llm import LLM
from ...ukf.templates.basic.prompt import PromptUKFT

# Module-level logger, named after this module.
logger = get_logger(__name__)
# Default source/target languages used by `autoi18n` when the caller passes None.
# They are read once, at import time, from the "prompts.main" and "prompts.lang"
# config keys. NOTE(review): the second argument ("en") is presumably the fallback
# returned when the key is missing -- confirm against `HEAVEN_CM.get`.
_src_lang = HEAVEN_CM.get("prompts.main", "en")
_tgt_lang = HEAVEN_CM.get("prompts.lang", "en")

from typing import Optional, List, Dict
import polib
import re



# [docs]
def _decode_msgstr(raw):
    """\
    Decode the escaped text captured between the quotes of a `msgstr_N: "..."` line.

    The text is first read as the body of a Python double-quoted string literal with
    `ast.literal_eval`, which only parses literals and never executes code. If that
    fails (e.g. the model emitted a raw line break or a malformed escape such as a
    truncated `\\x`), fall back to `polib.unescape`, which only rewrites the PO
    escapes and leaves everything else untouched.

    Args:
        raw (str): The escaped text, without the surrounding double quotes.

    Returns:
        str: The decoded translation.
    """
    import ast

    try:
        return ast.literal_eval(f'"{raw}"')
    except (SyntaxError, ValueError):
        return polib.unescape(raw)


def autoi18n(
    path,
    src_lang: Optional[str] = None,
    tgt_lang: Optional[str] = None,
    overwrite: bool = False,
    batch_size: int = 20,
    hints: Optional[List[str]] = None,
    llm_args: Optional[Dict] = None,
    **kwargs,
):
    """\
    Translate Babel `.po` files with an LLM.

    Entries of `<path>/locale/<tgt_lang>/LC_MESSAGES/messages.po` are sent to the LLM in
    batches, together with the contents of the template/text files found under `path` as
    context. Translations that can be extracted from the response are written to the
    entries' `msgstr`, and the PO file is saved in place once all batches are processed.
    A batch whose prompt rendering, LLM call, or response parsing fails is logged and skipped.

    Args:
        path (str): The root directory containing the 'locale' folder.
        src_lang (Optional[str]): The main language of the POT file. Defaults to None, which will use the language in the config file ("prompts.main").
        tgt_lang (Optional[str]): The target language to translate to. Defaults to None, which will use the language in the config file ("prompts.lang").
        overwrite (bool): Overwrite existing translations. Defaults to False.
        batch_size (int): Number of entries to process in each batch. Must be positive. Defaults to 20.
        hints (Optional[List[str]]): Extra instruction lines appended to the instructions of every prompt. Defaults to None.
        llm_args (Optional[Dict]): Arguments for the LLM model. Defaults to None, which will resolve to {"preset": "translator"}
        **kwargs: Accepted for backward compatibility; not used by this function.

    Raises:
        ValueError: If `batch_size` is less than 1.
    """
    # Fail early with a clear message: a zero step would otherwise surface as an opaque
    # `range()` error after all files are loaded, and a negative one would silently translate nothing.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}.")

    if src_lang is None:
        src_lang = _src_lang
    if tgt_lang is None:
        tgt_lang = _tgt_lang

    if llm_args is None:
        llm_args = {"preset": "translator"}

    path = hpj(path, abs=True)
    locale_path = hpj(path, "locale")
    touch_dir(locale_path)
    cfg_path = hpj(locale_path, "babel.cfg")
    if not exists_file(cfg_path):
        logger.warning(f"Babel configuration file not found at {cfg_path}. Please run `babel_init` first.")
        return

    lang_po_path = hpj(locale_path, tgt_lang, "LC_MESSAGES", "messages.po")
    if not exists_file(lang_po_path):
        logger.warning(f"PO file not found at {lang_po_path}. Please run `babel_init` first.")
        return

    # Load pofile
    pofile = polib.pofile(lang_po_path)

    # Find untranslated or empty entries (or every entry when overwriting)
    entries = [entry for entry in pofile if not entry.msgstr or overwrite]
    if not entries:
        logger.info(f"No untranslated entries found in {lang_po_path}.")
        return

    logger.info(f"Found {len(entries)} entries to translate in {lang_po_path}.")

    files = sorted(list_files(path, ext="jinja;jinja2;j2;txt"))
    contents = [load_txt(pj(path, file)) for file in files]

    # Construct file context description (shared by every batch)
    file_context_desc = "".join(
        [
            "## Original Files\n===== FILE CONTENTS START =====\n",
            *(f"\n### File: `{file_path}`\n```jinja\n{content}\n```\n" for file_path, content in zip(files, contents)),
            "===== FILE CONTENTS END =====",
        ]
    )

    # Build the system prompt
    system_prompt = f"""\
You are a professional translator working with Babel PO files. Your task is to translate the `msgid` strings from source language ({src_lang}) into the `msgstr` in target language ({tgt_lang}).

Example:
## Source Language: en
## Input:
```pot
msgid_0: "Inputs:"
msgid_1: "Output:"
```
## Target Language: zh
## Output:
```pot
msgstr_0: "输入:"
msgstr_1: "输出:"
```

Now, given new `msgid` lines, produce the translated PO entries in the same format.
Output only the translated PO entries without any additional text. Your output should be wrapped in markdown code block with `pot` syntax highlighting."""

    # Build instructions (identical for every batch, so built once)
    instr_list = [
        f"Translate from {src_lang} to {tgt_lang}.",
        "Output only the translated PO entries in the format shown in the example.",
        "Wrap the output in a markdown `pot` code block.",
        # msgids are sent PO-escaped (see below); ask for the same convention back so the
        # one-line `msgstr_N: "..."` format stays parseable.
        r'Keep escape sequences (such as `\n` or `\"`) exactly as they appear in the input, and never put a raw line break inside the quotes.',
    ]
    if hints:
        instr_list.extend(hints)

    llm = LLM(**llm_args)

    for i in range(0, len(entries), batch_size):
        batch_no = i // batch_size + 1
        batch_entries = entries[i : i + batch_size]

        # Build input section. msgids are PO-escaped so that embedded quotes, backslashes
        # and line breaks cannot break the `msgid_N: "..."` line format.
        input_lines = [f'msgid_{j}: "{polib.escape(entry.msgid)}"\n' for j, entry in enumerate(batch_entries)]
        input_pot = "## Input\n```pot\n" + "".join(input_lines) + "```"

        # Build descriptions
        desc_list = [file_context_desc, input_pot]

        # Create prompt using PromptUKFT
        prompt = PromptUKFT.from_path(
            "& prompts/system",
            default_entry="prompt.jinja",
            binds={
                "system": system_prompt,
                "descriptions": desc_list,
                "instructions": instr_list,
            },
        )

        try:
            prompt_str = prompt.text(lang="en").rstrip()
        except Exception as e:
            logger.error(f"Failed to render prompt for batch {batch_no}: {e}")
            continue

        logger.debug(f"Prompt:\n{prompt_str}")

        try:
            response = llm.oracle(prompt_str)
        except Exception as e:
            logger.error(f"LLM failed for batch {batch_no}: {e}")
            continue

        logger.debug(f"LLM response:\n{response}")

        try:
            parsed = parse_md(response)
            pot_response = parsed.get("pot", "").strip()
        except Exception as e:
            logger.error(f"Error processing batch {batch_no}: {e}")
            continue

        # Entries are handled independently: one unusable translation must not discard
        # the remaining entries of the batch.
        for j, entry in enumerate(batch_entries):
            # Capture everything between the quotes, allowing backslash-escaped characters.
            pattern = rf'msgstr_{j}:\s*"((?:[^"\\]|\\.)*)"'
            match = re.search(pattern, pot_response, re.DOTALL)
            if not match:
                logger.warning(f"Failed to extract translation for: {entry.msgid}")
                continue
            # Decoded without `eval`: the text comes from the LLM and is untrusted.
            entry.msgstr = _decode_msgstr(match.group(1))
            logger.info(f"Translated: {entry.msgid} -> {entry.msgstr}")

    pofile.save(lang_po_path)
    logger.info(f"Translations saved to {lang_po_path}.")
    return